##
# Encode: support routines for ENCODE scripts.
#
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/Encode.pm instead.
#
# $Id: Encode.pm,v 1.67 2010/06/01 23:47:57 braney Exp $
##

package Encode;

use warnings;
use strict;

use File::stat;
use IO::File;
use Cwd;

use RAFile;
use HgAutomate;

## Global constants ##

# Names of the RA/SQL control files generated or consumed by the pipeline.
our $loadFile = "load.ra";
our $unloadFile = "unload.ra";
our $trackFile = "trackDb.ra";
our $fileDbFile = "fileDb.ra";
our $mdbFile = "mdb.txt";
our $pushQFile = "pushQ.sql";

# Current (and previous) DAF format version; older values kept for history.
#our $dafVersion = "0.2.2";
#our $dafVersion = "1.1";
our $dafVersionOld = "2.0";
our $dafVersion = "3.0";

# Prefix for table and filenames (was 'wgEncode' in v1 pipeline)
our $compositePrefix = "wgEncode";

# Configuration files read from the pipeline config directory.
our $fieldConfigFile = "fields.ra";
our $vocabConfigFile = "cv.ra";
our $grantConfigFile = "labs.ra";
our $labConfigFile = "pi.ra";        # for reporting purposes
our $expVarsFile = "expVars.ra";
our $autoCreatedPrefix = "auto";

# Months that submitted data stays restricted before public release.
our $restrictedMonths = 9;

# change this for each freeze
#our $dataVersion = "ENCODE Sep 2009 Freeze";
#our $dataVersion = "ENCODE Jan 2010 Freeze";
#our $dataVersion = "post ENCODE June 2010 Freeze";
#our $dataVersion = "ENCODE Jan 2011 Freeze";
#our $dataVersion = "post ENCODE Jan 2011 Freeze";
#our $dataVersion = "ENCODE Mar 2012 Freeze";
our $dataVersion = "ENCODE Jul 2012 Freeze";

# we haven't decided what mouse is going to use, so just the same as human for now
#our $mouseDataVersion = "ENCODE Mar 2012 Freeze";
our $mouseDataVersion = "ENCODE Jul 2012 Freeze";

# place to put the big temporary files generated by sort etc.
our $tempDir = "/data/tmp";

# Directory holding the .sql/.as definitions used by the loaders.
our $sqlCreate = "/cluster/bin/sqlCreate";

# Add type names to the @extendedTypes list (declared below) for types that can
# be loaded via .sql files (e.g. bed5FloatScore.sql).  You also have to make
# sure the .sql file is copied into the $sqlCreate directory.
# Type names that can be loaded via .sql files (e.g. bed5FloatScore.sql);
# the .sql file must also be copied into the $sqlCreate directory.
our @extendedTypes = ("narrowPeak", "broadPeak", "gappedPeak");

# Type names that can be loaded as bigBed via .as files (e.g. tagAlign.as);
# the .as file must also be copied into the $sqlCreate directory.
our @bigBedTypes = ("bigBed");
our @bigWigTypes = ("bigWig");
our @bamTypes = ("bam");

sub newestFile {
    # Return the most recently modified file from a list of filenames.
    # Relies on File::stat's object-returning stat() for ->mtime.
    my @files = @_;
    my $newestTime = 0;
    my $newestFile = "";
    my $file = "";
    foreach $file (@files) {
        my $fileTime = (stat($file))->mtime;
        if ($fileTime > $newestTime) {
            $newestTime = $fileTime;
            $newestFile = $file;
        }
    }
    return $newestFile;
}

sub splitKeyVal {
    # Split a line into (key, value) at the FIRST run of whitespace;
    # key and value are trimmed.  Returns (undef, undef) if the line
    # has no whitespace-separated value.
    my ($str) = @_;
    my $key = undef;
    my $val = undef;
    if($str =~ /([^\s]+)\s+(.+)/) {
        $key = $1;
        $val = $2;
        $key =~ s/^\s+//;
        $key =~ s/\s+$//;
        $val =~ s/^\s+//;
        $val =~ s/\s+$//;
    }
    return ($key, $val);
}

sub validateFieldList {
    # Validate the entries in a RA record or DAF header using fields.ra.
    # $file should be 'dafHeader' or 'dafList'.  Returns list of any errors found.
    #
    # Removed ddf header validation function (3-14-12)
    # due to the fact that the ddf header should validate against the cv.
    my ($fields, $schema, $file) = @_;
    my %hash = map {$_ => 1} @{$fields};
    my @errors;
    if ($file eq 'ddf') {
        push (@errors, "fields.ra is no longer used to validate ddf headers")
    } elsif ($file ne 'dafHeader' && $file ne 'dafList') {
        push(@errors, "file argument '$file' is invalid");
    } else {
        # look for missing required fields
        for my $field (keys %{$schema}) {
            if(defined($schema->{$field}{file}) && $schema->{$field}{file} eq $file
               && $schema->{$field}{required} && !exists($hash{$field})) {
                push(@errors, "field '$field' not defined");
            }
        }
        # now look for fields in list that aren't in schema
        for my $field (@{$fields}) {
            if(!defined($schema->{$field}) || !defined($schema->{$field}{file})
               || $schema->{$field}{file} ne $file) {
                push(@errors, "invalid field '$field'");
            }
        }
    }
    return @errors;
}

sub validateValueList {
    # Validate hash of values using fields.ra; $file should be 'ddf' or 'dafHeader'.
    # Normalizes booleans to 1/0 in place and defaults a non-numeric 'replicate' to 1.
    # Returns list of any errors that are found.
    my ($values, $schema, $file) = @_;
    my @errors;
    for my $field (keys %{$values}) {
        my $val = $values->{$field};
        if(defined($schema->{$field}{file}) && $schema->{$field}{file} eq $file) {
            my $type = $schema->{$field}{type} || 'string';
            if($type eq 'bool') {
                if(lc($val) ne 'yes' && lc($val) ne 'no' && $val != 1 && $val != 0) {
                    push(@errors, "invalid boolean; field '$field', value '$val'; value must be 'yes' or 'no'");
                } else {
                    $values->{$field} = lc($val) eq 'yes' ? 1 : 0;
                }
            } elsif($type eq 'int') {
                if($val !~ /^\d+$/) {
                    if($field eq 'replicate') {
                        $values->{$field} = 1;
                    } else {
                        push(@errors, "invalid integer; field '$field', value '$val'");
                    }
                }
            }
        }
    }
    return @errors;
}

sub readFile {
    # Return reference to list of lines from given file, with EOLs chomp'ed off.
    # Handles Macintosh, MS-DOS or Unix EOL characters.
    # Reads whole file into memory, so should NOT be used for huge files.
    #
    # FIXED: the slurp readline had been lost ("my $content = ;"), which was a
    # syntax error; restored using a lexical filehandle and 3-arg open, with
    # the record separator localized instead of saved/restored by hand.
    my ($file) = @_;
    open(my $fh, '<', $file) or die "ERROR: Can't open file \'$file\'\n";
    my $content = do { local $/; <$fh> };  # slurp entire file
    close($fh);
    $content = "" if !defined($content);   # readline returns undef on an empty file
    # MS-DOS => Unix
    $content =~ s/\r\n/\n/g;
    # Mac => Unix
    $content =~ s/\r/\n/g;
    my @lines = split(/\n/, $content);
    return \@lines;
}

sub pipelineDb {
    # Name of the pipeline database for the given instance.
    my ($instance) = @_;
    return "encpipeline_$instance";
}

sub projectDir {
    # Working directory for the given pipeline instance and submission id.
    my ($instance, $id) = @_;
    return "/cluster/data/encode/pipeline/encpipeline_$instance/$id";
}

#this function is to be deprecated
sub getGrants {
    # The grants are called "labs" in the labs.ra file (for historical reasons).
    my ($configPath) = @_;
    my %grants;
    if(-e "$configPath/$grantConfigFile") {
        # tolerate missing labs.ra in dev trees.
        %grants = RAFile::readRaFile("$configPath/$grantConfigFile", "lab");
    }
    return \%grants;
}

#this function is to be deprecated
sub getLabs {
    # file with lab/pi/project/grant -- used for reporting purposes
    # Captures conventions in reporting spreadsheet and pipeline metadata
    my ($configPath) = @_;
    my %labs = ();
    if(-e "$configPath/$labConfigFile") {
        %labs = RAFile::readRaFile("$configPath/$labConfigFile", "lab");
    }
    return \%labs;
}

#this function is to be deprecated
sub getExpVars {
    # Returns the ordered expVar1..expVarN values for the composite's record in
    # the experiments.ra file; dies if the composite has no record.
    my ($configPath, $composite) = @_;
    my %expVars = RAFile::readRaFile("$configPath/$expVarsFile", "composite");
    die "could not find $composite $configPath/$expVarsFile" unless defined($expVars{$composite});
    %expVars = %{$expVars{$composite}};
    my @results;
    for(my $i = 1; $i < scalar(keys %expVars); ++$i) {
        push @results, $expVars{"expVar$i"};
    }
    return @results;
}

sub getControlledVocab {
    # Returns hash indexed by the type's in the cv.ra file (e.g. "Cell Line", "Antibody");
    # each type maps term => term record.  Terms without a type are skipped.
    my ($configPath) = @_;
    my %terms = ();
    my %termRa = RAFile::readRaFile("$configPath/$vocabConfigFile", "term");
    foreach my $term (keys %termRa) {
        my $type = $termRa{$term}->{type};
        if(defined($type)) {
            $terms{$type}->{$term} = $termRa{$term};
        }
    }
    return %terms;
}

sub getControlledVocabTags {
    # Returns hash of cv tags, indexed by the types in the cv.ra file.
    # Dies if any term lacks a type.
    my ($configPath) = @_;
    my %tags = ();
    my %termRa = RAFile::readRaFile("$configPath/$vocabConfigFile", "term");
    foreach my $term (keys %termRa) {
        my $type = $termRa{$term}->{type};
        die "no type for $term" unless defined($type);
        # use term for tag if there is no tag
        my $tag = $termRa{$term}->{tag};
        if (!defined($tag)) {
            $tag = $term
        }
        $tags{$type}->{$tag} = $termRa{$term};
    }
    return %tags;
}

sub getFields {
    # Gather fields defined for DDF file.  File is in ra format: field, required.
    my ($configPath) = @_;
    my %fields = RAFile::readRaFile("$configPath/$fieldConfigFile", "field");
    # For convenience, convert "required" to a real boolean (1 or 0);
    for my $key (keys %fields) {
        if(exists($fields{$key}->{required})) {
            my $val = $fields{$key}->{required};
            $fields{$key}->{required} = lc($val) eq 'yes' ? 1 : 0;
        }
    }
    return \%fields;
}

sub validateAssembly {
    # Returns an error string if $val is not a valid assembly for the pipeline
    # instance ('standard' accepts hg19/mm9; any other instance expects
    # 'encodeTest'); returns the empty list when valid.
    my ($val, $pipelineInstance) = @_;
    if ($pipelineInstance eq 'standard') {
        if ($val ne 'hg19' && $val ne 'mm9') {
            return "Assembly '$val' is invalid (must be 'hg19 or mm9')";
        } else {
            return ();
        }
    } else {
        if ($val ne 'encodeTest') {
            return "Assembly '$val' is invalid (must be 'encodeTest')";
        } else {
            return ();
        }
    }
}

sub getDaf {
    # Return reference to DAF hash, using newest DAF file found in $submitDir.
    # hash keys are RA style plus an additional TRACKS key which is a nested hash for
    # the track list at the end of the DAF file; e.g.:
    # (lab => 'Myers', TRACKS => {'Alignments => {}, Signal => {}})
    my ($submitDir, $fields, $pipelineInstance) = @_;
    my $wd = cwd();
    chdir($submitDir);
    my @glob = glob "*.DAF";
    push(@glob, glob "*.daf");
    my $dafFile = newestFile(@glob);
    if(!(-e $dafFile)) {
        die "Can't find the DAF file\n";
    }
    $dafFile = cwd() . "/" . $dafFile;
    HgAutomate::verbose(2, "Using newest DAF file \'$dafFile\'\n");
    chdir($wd);
    return parseDaf($dafFile, $fields, $pipelineInstance);
}

sub parseDaf {
    # Identical to getDaf, but first argument is the DAF filename.
    # Verifies required fields are present; dies with a combined error message
    # if any validation fails.
    my ($dafFile, $fields, $pipelineInstance) = @_;
    my %daf = ();
    $daf{TRACKS} = {};
    my $lines = readFile("$dafFile");
    #my $order = 1; Removing order for view level as this is not being used for prioritization
    while (@{$lines}) {
        my $line = shift @{$lines};
        # strip leading and trailing spaces
        $line =~ s/^ +//;
        $line =~ s/ +$//;
        # ignore comments and blank lines
        next if $line =~ /^#/;
        next if $line =~ /^$/;
        my ($key, $val) = splitKeyVal($line);
        if(!defined($key)) {
            next;
        }
        if ($key eq "view") {
            # A "view" line opens a nested stanza; consume its key/val lines
            # until the next "view" line (pushed back) or end of file.
            my %track = ();
            my $track = $val;
            # $track{order} = $order++; Removing order for view level as this is not being used for prioritization
            $daf{TRACKS}->{$track} = \%track;
            HgAutomate::verbose(5, " Found view: \'$track\'\n");
            while ($line = shift @{$lines}) {
                $line =~ s/^ +//;
                $line =~ s/ +$//;
                next if $line =~ /^#/;
                next if $line =~ /^$/;
                if ($line =~ /^view/) {
                    unshift @{$lines}, $line;
                    last;
                }
                my ($key, $val) = splitKeyVal($line);
                $track{$key} = $val;
                HgAutomate::verbose(5, " line: '$line'; Property: $key = $val\n");
            }
            $track{required} = lc($track{required}) eq 'yes' ? 1 : 0;
            $track{hasReplicates} = lc($track{hasReplicates}) eq 'yes' ? 1 : 0;
        } else {
            HgAutomate::verbose(3, "DAF field: $key = $val\n");
            $daf{$key} = $val;
        }
    }
    # Validate DAF version, and return immediately if not current (production)
    # OR: During beta test, provide notice about this instead of error
    #unless ($daf{dafVersion} eq $dafVersion || $daf{dafVersion} eq $dafVersionOld) {
    #    die "NOTICE:\n\n" .
    #    "ENCODE pipeline 2.0 testing is in progress." .
    #    "Your wrangler will complete this submission and provide you" .
    #    " with a version 2.0 DAF file to use for future submissions.\n";
    #    #die "ERROR(s) in DAF '$dafFile':\n\n" .
    #    #"DAF version '$daf{dafVersion}' does not match current version: $dafVersion\n");
    #}

    # Validate fields
    my @tmp = grep(!/^TRACKS$/, keys %daf);
    my @errors = validateFieldList(\@tmp, $fields, 'dafHeader');
    if(!keys(%{$daf{TRACKS}})) {
        push(@errors, "no views defined for project \'$daf{project}\' in DAF '$dafFile'");
    }
    push(@errors, validateAssembly($daf{assembly}, $pipelineInstance));
    foreach my $view (keys %{$daf{TRACKS}}) {
        HgAutomate::verbose(4, " View: $view\n");
        my %track = %{$daf{TRACKS}->{$view}};
        foreach my $key (keys %track) {
            HgAutomate::verbose(4, " Setting: $key Value: $track{$key}\n");
        }
        #my @keys = grep(!/^order$/, keys %track); I would like to skip order
        my @keys = keys %track;
        push(@errors, validateFieldList(\@keys, $fields, 'dafList'));
    }
    if (defined($daf{variables})) {
        # "variables" is a comma-separated list; underscores stand in for spaces.
        my @variables = split (/\s*,\s*/, $daf{variables});
        my %variables;
        my $i = 0;
        foreach my $variable (@variables) {
            # replace underscore with space
            $variable =~ s/_/ /g;
            $variables[$i++] = $variable;
            $variables{$variable} = 1;
        }
        $daf{variableHash} = \%variables;
        $daf{variableArray} = \@variables;
    }
    if(@errors) {
        die "ERROR(s) in DAF '$dafFile':\n\n" . join("\n\n", @errors) . "\n";
    }
    return \%daf;
}

sub compositeTrackName {
    # Composite track name: prefix + compositeSuffix (+ optional dataAgreementSuffix).
    my ($daf) = @_;
    return $compositePrefix . $daf->{compositeSuffix}
        . (defined($daf->{dataAgreementSuffix}) ? $daf->{dataAgreementSuffix} : "");
}

sub downloadDir {
    # Download area on hgdownload for this composite.
    my ($daf) = @_;
    return "/usr/local/apache/htdocs-hgdownload/goldenPath/$daf->{assembly}/encodeDCC/"
        . compositeTrackName($daf);
}

sub gbdbDir {
    # gbdb subdirectory under the download area.
    my ($daf) = @_;
    return downloadDir($daf) . "/gbdb";
}

sub daysInMonth {
    # Days in the given month; $mon (0-11) and $year are in the format
    # returned by localtime (year is offset from 1900).
    my ($mon, $year) = @_;
    $year += 1900;
    if($mon == 1) {
        # Gregorian leap-year rule for February.
        if ((!($year % 4) && ($year % 100)) || !($year % 400)) {
            return 29;
        } else {
            return 28;
        }
    } elsif ($mon == 3 || $mon == 5 || $mon == 8 || $mon == 10) {
        return 30;
    } else {
        return 31;
    }
}

sub restrictionDate {
    # calculate the "restrict until ..." date.
    # now argument s/d be time().
    # returns the standard time list; i.e.: ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)
    my ($now) = @_;
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);
    my $restrictedYear = $year;
    my $restrictedMon = $mon + $restrictedMonths;
    if($restrictedMon >= 12) {
        $restrictedYear++;
        $restrictedMon = ($mon + $restrictedMonths) % 12;
    }
    if($mday > daysInMonth($restrictedMon,$restrictedYear)) {
        # wrap to first when to avoid, for example, 2008-05-31 + 9mo = 2009-02-31 => 2009-03-03
        $mday = $mday - daysInMonth($restrictedMon,$restrictedYear);
        $restrictedMon++;
        if($restrictedMon >= 12) {
            $restrictedYear++;
            $restrictedMon = 0;
        }
    }
    return ($sec,$min,$hour,$mday,$restrictedMon,$restrictedYear,$wday,$yday,$isdst);
}

sub wigMinMaxPlaceHolder {
    # This is a placeholder used to let the loader fixup the min/max ranges for wig's
    my ($tableName) = @_;
    return uc("${tableName}_MinMaxRange");
}

sub isTarZipped {
    # true if file(1) says the file is a gzipped tar (e.g. .tar.gz or .tgz)
    my ($filePath) = @_;
    my $fileinfo = `file -iz $filePath`;
    if ($fileinfo =~ m/tar.*zip/) {
        return 1;
    } else {
        return 0;
    }
}

sub isTar {
    # true if file(1) says the file is a tar archive (e.g. .tar)
    my ($filePath) = @_;
    my $fileinfo = `file -i $filePath`;
    if ($fileinfo =~ m/tar/) {
        return 1;
    } else {
        return 0;
    }
}

sub isZipped {
    # true if file(1) says the file is compressed
    my ($filePath) = @_;
    my $fileinfo = `file -i $filePath`;
    if ($fileinfo =~ m/zip/) {
        return 1;
    } else {
        return 0;
    }
}

sub isControlInput {
    # true if the string names the control/input pseudo-experiment.
    my ($str) = @_;
    return lc($str) eq 'control' || lc($str) eq 'input';
}

sub openUtil {
    # Handles opening gzipped, tar gzipped, tar, as well as plain files.
    # $path is the optional path of $file.  Returns an IO::File handle.
    my ($file, $path) = @_;
    my $fh = IO::File->new();
    my $filePath = defined($path) ? "$path/$file" : $file;
    # NOTE(review): 2-arg open is intentional here -- the "cmd |" form opens a
    # pipe from the decompressor; $filePath is interpolated into a shell
    # command, so it must not contain untrusted shell metacharacters.
    open($fh, Encode::isTar($filePath) ? "/bin/tar -Oxf $filePath |" :
        ( Encode::isTarZipped($filePath) ? "/bin/tar -Ozxf $filePath |" :
        ( Encode::isZipped($filePath) ? "/bin/gunzip -c $filePath |" : $filePath )) )
        or die "Couldn't open file '$file'; error: $!\n";
    return $fh;
}

sub metadataLineToArrays {
    # Creates pair of arrays that contain the settings in a metadata setting line (retains order).
    # Values may be bare words or double-quoted strings with \" escapes.
    my ($line) = @_;
    my @tags;
    my @vals;
    my $tix = 0;
    while($line && length($line)>0) {
        my $tag;
        my $val;
        ( $tag,$line ) = split(/=/, $line,2);
        $tag =~ s/\s+//g;
        my @chars = split(//,$line);
        if($chars[0] ne "\"") {
            # bare value: runs to the next whitespace
            ( $val,$line ) = split(/\s+/, $line,2);
        } else {
            # quoted value: scan for the closing quote, skipping escaped \"
            my $ix=1;
            while($ix < length($line) && ($chars[$ix] ne '"' || $chars[$ix - 1] eq '\\')) {
                $ix++;
            }
            if($ix < length($line)) {
                $val = substr($line,1,$ix - 1);
                $line = substr($line, $ix + 1);
                $val =~ s/\\"/\"/g;   # unescape embedded quotes
            } else {
                # unterminated quote: take the rest of the line as-is
                $val = $line;
                $line = "";
            }
        }
        $tags[$tix ] = $tag;
        $vals[$tix++] = $val;
    }
    return ( \@tags, \@vals );
}

sub metadataLineToHash {
    # Create a hash of settings from metadata line (order is lost).
    my ($line) = @_;
    my ($tags, $vals) = Encode::metadataLineToArrays($line);
    my %hash = ();
    my $i = 0;
    while($tags->[$i]) {
        $hash{$tags->[$i]} = $vals->[$i];
        $i++
    }
    return \%hash;
}

# Ordering of pipeline statuses, earliest (1) to latest (5).
my %pipelineStatuses = (
    'loaded'    => 1,
    'displayed' => 2,
    'approved'  => 3,
    'reviewing' => 4,
    'released'  => 5
);

sub latestPipelineStatus {
    # Return status that is farther along in the pipeline.
    # Any status not in the list is assigned 0.
    my ($status1, $status2) = @_;
    my $first = defined($pipelineStatuses{$status1}) ? $pipelineStatuses{$status1} : 0;
    my $second = defined($pipelineStatuses{$status2}) ? $pipelineStatuses{$status2} : 0;
    return ($first > $second ? $status1 : $status2);
}

sub laterPipelineStatus {
    # Return true if first status is later than second status, else false.
    my ($status1, $status2) = @_;
    my $latest = latestPipelineStatus($status1, $status2);
    return ($latest eq $status1 ? 1 : 0);
}

1;