##
# Encode: support routines for ENCODE scripts.
#
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/Encode.pm instead.
#
# $Id: Encode.pm,v 1.67 2010/06/01 23:47:57 braney Exp $
##

package Encode;

use warnings;
use strict;

use File::stat;
use IO::File;
use Cwd;

use RAFile;
use HgAutomate;

## Global constants ##

# Names of the RA/SQL control files generated or consumed by the pipeline.
our $loadFile = "load.ra";
our $unloadFile = "unload.ra";
our $trackFile = "trackDb.ra";
our $fileDbFile = "fileDb.ra";
our $mdbFile = "mdb.txt";
our $pushQFile = "pushQ.sql";

# Current (and previous) DAF format version; older values kept for history.
#our $dafVersion = "0.2.2";
#our $dafVersion = "1.1";
our $dafVersionOld = "2.0";
our $dafVersion = "3.0";

# Prefix for table and filenames (was 'wgEncode' in v1 pipeline)
our $compositePrefix = "wgEncode";

# Configuration files read from the pipeline config directory.
our $fieldConfigFile = "fields.ra";
our $vocabConfigFile = "cv.ra";
our $grantConfigFile = "labs.ra";
our $labConfigFile = "pi.ra";        # for reporting purposes
our $expVarsFile = "expVars.ra";
our $autoCreatedPrefix = "auto";

# Months that submitted data stays restricted before public release.
our $restrictedMonths = 9;

# change this for each freeze
#our $dataVersion = "ENCODE Sep 2009 Freeze";
#our $dataVersion = "ENCODE Jan 2010 Freeze";
#our $dataVersion = "post ENCODE June 2010 Freeze";
#our $dataVersion = "ENCODE Jan 2011 Freeze";
#our $dataVersion = "post ENCODE Jan 2011 Freeze";
#our $dataVersion = "ENCODE Mar 2012 Freeze";
our $dataVersion = "ENCODE Jul 2012 Freeze";

# we haven't decided what mouse is going to use, so just the same as human for now
#our $mouseDataVersion = "ENCODE Mar 2012 Freeze";
our $mouseDataVersion = "ENCODE Jul 2012 Freeze";

# place to put the big temporary files generated by sort etc.
our $tempDir = "/data/tmp";

# Directory holding the .sql/.as definitions used by the loaders.
our $sqlCreate = "/cluster/bin/sqlCreate";

# Add type names to the @extendedTypes list (declared below) for types that can
# be loaded via .sql files (e.g. bed5FloatScore.sql).  You also have to make
# sure the .sql file is copied into the $sqlCreate directory.
# Type names that can be loaded via .sql files (e.g. bed5FloatScore.sql);
# the .sql file must also be copied into the $sqlCreate directory.
our @extendedTypes = ("narrowPeak", "broadPeak", "gappedPeak");

# Type names that can be loaded as bigBed via .as files (e.g. tagAlign.as);
# the .as file must also be copied into the $sqlCreate directory.
our @bigBedTypes = ("bigBed");
our @bigWigTypes = ("bigWig");
our @bamTypes = ("bam");

sub newestFile {
    # Return the most recently modified file from a list of filenames.
    # Relies on File::stat's object-returning stat() for ->mtime.
    my @files = @_;
    my $newestTime = 0;
    my $newestFile = "";
    my $file = "";
    foreach $file (@files) {
        my $fileTime = (stat($file))->mtime;
        if ($fileTime > $newestTime) {
            $newestTime = $fileTime;
            $newestFile = $file;
        }
    }
    return $newestFile;
}

sub splitKeyVal {
    # Split a line into (key, value) at the FIRST run of whitespace;
    # key and value are trimmed.  Returns (undef, undef) if the line
    # has no whitespace-separated value.
    my ($str) = @_;
    my $key = undef;
    my $val = undef;
    if($str =~ /([^\s]+)\s+(.+)/) {
        $key = $1;
        $val = $2;
        $key =~ s/^\s+//;
        $key =~ s/\s+$//;
        $val =~ s/^\s+//;
        $val =~ s/\s+$//;
    }
    return ($key, $val);
}

sub validateFieldList {
    # Validate the entries in a RA record or DAF header using fields.ra.
    # $file should be 'dafHeader' or 'dafList'.  Returns list of any errors found.
    #
    # Removed ddf header validation function (3-14-12)
    # due to the fact that the ddf header should validate against the cv.
    my ($fields, $schema, $file) = @_;
    my %hash = map {$_ => 1} @{$fields};
    my @errors;
    if ($file eq 'ddf') {
        push (@errors, "fields.ra is no longer used to validate ddf headers")
    } elsif ($file ne 'dafHeader' && $file ne 'dafList') {
        push(@errors, "file argument '$file' is invalid");
    } else {
        # look for missing required fields
        for my $field (keys %{$schema}) {
            if(defined($schema->{$field}{file}) && $schema->{$field}{file} eq $file
               && $schema->{$field}{required} && !exists($hash{$field})) {
                push(@errors, "field '$field' not defined");
            }
        }
        # now look for fields in list that aren't in schema
        for my $field (@{$fields}) {
            if(!defined($schema->{$field}) || !defined($schema->{$field}{file})
               || $schema->{$field}{file} ne $file) {
                push(@errors, "invalid field '$field'");
            }
        }
    }
    return @errors;
}

sub validateValueList {
    # Validate hash of values using fields.ra; $file should be 'ddf' or 'dafHeader'.
    # Normalizes booleans to 1/0 in place and defaults a non-numeric 'replicate' to 1.
    # Returns list of any errors that are found.
    my ($values, $schema, $file) = @_;
    my @errors;
    for my $field (keys %{$values}) {
        my $val = $values->{$field};
        if(defined($schema->{$field}{file}) && $schema->{$field}{file} eq $file) {
            my $type = $schema->{$field}{type} || 'string';
            if($type eq 'bool') {
                if(lc($val) ne 'yes' && lc($val) ne 'no' && $val != 1 && $val != 0) {
                    push(@errors, "invalid boolean; field '$field', value '$val'; value must be 'yes' or 'no'");
                } else {
                    $values->{$field} = lc($val) eq 'yes' ? 1 : 0;
                }
            } elsif($type eq 'int') {
                if($val !~ /^\d+$/) {
                    if($field eq 'replicate') {
                        $values->{$field} = 1;
                    } else {
                        push(@errors, "invalid integer; field '$field', value '$val'");
                    }
                }
            }
        }
    }
    return @errors;
}

sub readFile {
    # Return reference to list of lines from given file, with EOLs chomp'ed off.
    # Handles Macintosh, MS-DOS or Unix EOL characters.
    # Reads whole file into memory, so should NOT be used for huge files.
    #
    # FIXED: the slurp readline had been lost ("my $content = ;"), which was a
    # syntax error; restored using a lexical filehandle and 3-arg open, with
    # the record separator localized instead of saved/restored by hand.
    my ($file) = @_;
    open(my $fh, '<', $file) or die "ERROR: Can't open file \'$file\'\n";
    my $content = do { local $/; <$fh> };  # slurp entire file
    close($fh);
    $content = "" if !defined($content);   # readline returns undef on an empty file
    # MS-DOS => Unix
    $content =~ s/\r\n/\n/g;
    # Mac => Unix
    $content =~ s/\r/\n/g;
    my @lines = split(/\n/, $content);
    return \@lines;
}

sub pipelineDb {
    # Name of the pipeline database for the given instance.
    my ($instance) = @_;
    return "encpipeline_$instance";
}

sub projectDir {
    # Working directory for the given pipeline instance and submission id.
    my ($instance, $id) = @_;
    return "/cluster/data/encode/pipeline/encpipeline_$instance/$id";
}

#this function is to be deprecated
sub getGrants {
    # The grants are called "labs" in the labs.ra file (for historical reasons).
    my ($configPath) = @_;
    my %grants;
    if(-e "$configPath/$grantConfigFile") {
        # tolerate missing labs.ra in dev trees.
        %grants = RAFile::readRaFile("$configPath/$grantConfigFile", "lab");
    }
    return \%grants;
}

#this function is to be deprecated
sub getLabs {
    # file with lab/pi/project/grant -- used for reporting purposes
    # Captures conventions in reporting spreadsheet and pipeline metadata
    my ($configPath) = @_;
    my %labs = ();
    if(-e "$configPath/$labConfigFile") {
        %labs = RAFile::readRaFile("$configPath/$labConfigFile", "lab");
    }
    return \%labs;
}

#this function is to be deprecated
sub getExpVars {
    # Returns the ordered expVar1..expVarN values for the composite's record in
    # the experiments.ra file; dies if the composite has no record.
    my ($configPath, $composite) = @_;
    my %expVars = RAFile::readRaFile("$configPath/$expVarsFile", "composite");
    die "could not find $composite $configPath/$expVarsFile" unless defined($expVars{$composite});
    %expVars = %{$expVars{$composite}};
    my @results;
    for(my $i = 1; $i < scalar(keys %expVars); ++$i) {
        push @results, $expVars{"expVar$i"};
    }
    return @results;
}

sub getControlledVocab {
    # Returns hash indexed by the type's in the cv.ra file (e.g. "Cell Line", "Antibody");
    # each type maps term => term record.  Terms without a type are skipped.
    my ($configPath) = @_;
    my %terms = ();
    my %termRa = RAFile::readRaFile("$configPath/$vocabConfigFile", "term");
    foreach my $term (keys %termRa) {
        my $type = $termRa{$term}->{type};
        if(defined($type)) {
            $terms{$type}->{$term} = $termRa{$term};
        }
    }
    return %terms;
}

sub getControlledVocabTags {
    # Returns hash of cv tags, indexed by the types in the cv.ra file.
    # Dies if any term lacks a type.
    my ($configPath) = @_;
    my %tags = ();
    my %termRa = RAFile::readRaFile("$configPath/$vocabConfigFile", "term");
    foreach my $term (keys %termRa) {
        my $type = $termRa{$term}->{type};
        die "no type for $term" unless defined($type);
        # use term for tag if there is no tag
        my $tag = $termRa{$term}->{tag};
        if (!defined($tag)) {
            $tag = $term
        }
        $tags{$type}->{$tag} = $termRa{$term};
    }
    return %tags;
}

sub getFields {
    # Gather fields defined for DDF file.  File is in ra format: field, required.
    my ($configPath) = @_;
    my %fields = RAFile::readRaFile("$configPath/$fieldConfigFile", "field");
    # For convenience, convert "required" to a real boolean (1 or 0);
    for my $key (keys %fields) {
        if(exists($fields{$key}->{required})) {
            my $val = $fields{$key}->{required};
            $fields{$key}->{required} = lc($val) eq 'yes' ? 1 : 0;
        }
    }
    return \%fields;
}

sub validateAssembly {
    # Returns an error string if $val is not a valid assembly for the pipeline
    # instance ('standard' accepts hg19/mm9; any other instance expects
    # 'encodeTest'); returns the empty list when valid.
    my ($val, $pipelineInstance) = @_;
    if ($pipelineInstance eq 'standard') {
        if ($val ne 'hg19' && $val ne 'mm9') {
            return "Assembly '$val' is invalid (must be 'hg19 or mm9')";
        } else {
            return ();
        }
    } else {
        if ($val ne 'encodeTest') {
            return "Assembly '$val' is invalid (must be 'encodeTest')";
        } else {
            return ();
        }
    }
}

sub getDaf {
    # Return reference to DAF hash, using newest DAF file found in $submitDir.
    # hash keys are RA style plus an additional TRACKS key which is a nested hash for
    # the track list at the end of the DAF file; e.g.:
    # (lab => 'Myers', TRACKS => {'Alignments => {}, Signal => {}})
    my ($submitDir, $fields, $pipelineInstance) = @_;
    my $wd = cwd();
    chdir($submitDir);
    my @glob = glob "*.DAF";
    push(@glob, glob "*.daf");
    my $dafFile = newestFile(@glob);
    if(!(-e $dafFile)) {
        die "Can't find the DAF file\n";
    }
    $dafFile = cwd() . "/" . $dafFile;
    HgAutomate::verbose(2, "Using newest DAF file \'$dafFile\'\n");
    chdir($wd);
    return parseDaf($dafFile, $fields, $pipelineInstance);
}

sub parseDaf {
    # Identical to getDaf, but first argument is the DAF filename.
    # Verifies required fields are present; dies with a combined error message
    # if any validation fails.
    my ($dafFile, $fields, $pipelineInstance) = @_;
    my %daf = ();
    $daf{TRACKS} = {};
    my $lines = readFile("$dafFile");
    #my $order = 1; Removing order for view level as this is not being used for prioritization
    while (@{$lines}) {
        my $line = shift @{$lines};
        # strip leading and trailing spaces
        $line =~ s/^ +//;
        $line =~ s/ +$//;
        # ignore comments and blank lines
        next if $line =~ /^#/;
        next if $line =~ /^$/;
        my ($key, $val) = splitKeyVal($line);
        if(!defined($key)) {
            next;
        }
        if ($key eq "view") {
            # A "view" line opens a nested stanza; consume its key/val lines
            # until the next "view" line (pushed back) or end of file.
            my %track = ();
            my $track = $val;
            # $track{order} = $order++; Removing order for view level as this is not being used for prioritization
            $daf{TRACKS}->{$track} = \%track;
            HgAutomate::verbose(5, " Found view: \'$track\'\n");
            while ($line = shift @{$lines}) {
                $line =~ s/^ +//;
                $line =~ s/ +$//;
                next if $line =~ /^#/;
                next if $line =~ /^$/;
                if ($line =~ /^view/) {
                    unshift @{$lines}, $line;
                    last;
                }
                my ($key, $val) = splitKeyVal($line);
                $track{$key} = $val;
                HgAutomate::verbose(5, " line: '$line'; Property: $key = $val\n");
            }
            $track{required} = lc($track{required}) eq 'yes' ? 1 : 0;
            $track{hasReplicates} = lc($track{hasReplicates}) eq 'yes' ? 1 : 0;
        } else {
            HgAutomate::verbose(3, "DAF field: $key = $val\n");
            $daf{$key} = $val;
        }
    }
    # Validate DAF version, and return immediately if not current (production)
    # OR: During beta test, provide notice about this instead of error
    #unless ($daf{dafVersion} eq $dafVersion || $daf{dafVersion} eq $dafVersionOld) {
    #    die "NOTICE:\n\n" .
    #    "ENCODE pipeline 2.0 testing is in progress." .
    #    "Your wrangler will complete this submission and provide you" .
    #    " with a version 2.0 DAF file to use for future submissions.\n";
    #    #die "ERROR(s) in DAF '$dafFile':\n\n" .
    #    #"DAF version '$daf{dafVersion}' does not match current version: $dafVersion\n");
    #}

    # Validate fields
    my @tmp = grep(!/^TRACKS$/, keys %daf);
    my @errors = validateFieldList(\@tmp, $fields, 'dafHeader');
    if(!keys(%{$daf{TRACKS}})) {
        push(@errors, "no views defined for project \'$daf{project}\' in DAF '$dafFile'");
    }
    push(@errors, validateAssembly($daf{assembly}, $pipelineInstance));
    foreach my $view (keys %{$daf{TRACKS}}) {
        HgAutomate::verbose(4, " View: $view\n");
        my %track = %{$daf{TRACKS}->{$view}};
        foreach my $key (keys %track) {
            HgAutomate::verbose(4, " Setting: $key Value: $track{$key}\n");
        }
        #my @keys = grep(!/^order$/, keys %track); I would like to skip order
        my @keys = keys %track;
        push(@errors, validateFieldList(\@keys, $fields, 'dafList'));
    }
    if (defined($daf{variables})) {
        # "variables" is a comma-separated list; underscores stand in for spaces.
        my @variables = split (/\s*,\s*/, $daf{variables});
        my %variables;
        my $i = 0;
        foreach my $variable (@variables) {
            # replace underscore with space
            $variable =~ s/_/ /g;
            $variables[$i++] = $variable;
            $variables{$variable} = 1;
        }
        $daf{variableHash} = \%variables;
        $daf{variableArray} = \@variables;
    }
    if(@errors) {
        die "ERROR(s) in DAF '$dafFile':\n\n" . join("\n\n", @errors) . "\n";
    }
    return \%daf;
}

sub compositeTrackName {
    # Composite track name: prefix + compositeSuffix (+ optional dataAgreementSuffix).
    my ($daf) = @_;
    return $compositePrefix . $daf->{compositeSuffix}
        . (defined($daf->{dataAgreementSuffix}) ? $daf->{dataAgreementSuffix} : "");
}

sub downloadDir {
    # Download area on hgdownload for this composite.
    my ($daf) = @_;
    return "/usr/local/apache/htdocs-hgdownload/goldenPath/$daf->{assembly}/encodeDCC/"
        . compositeTrackName($daf);
}

sub gbdbDir {
    # gbdb subdirectory under the download area.
    my ($daf) = @_;
    return downloadDir($daf) . "/gbdb";
}

sub daysInMonth {
    # Days in the given month; $mon (0-11) and $year are in the format
    # returned by localtime (year is offset from 1900).
    my ($mon, $year) = @_;
    $year += 1900;
    if($mon == 1) {
        # Gregorian leap-year rule for February.
        if ((!($year % 4) && ($year % 100)) || !($year % 400)) {
            return 29;
        } else {
            return 28;
        }
    } elsif ($mon == 3 || $mon == 5 || $mon == 8 || $mon == 10) {
        return 30;
    } else {
        return 31;
    }
}

sub restrictionDate {
    # calculate the "restrict until ..." date.
    # now argument s/d be time().
    # returns the standard time list; i.e.: ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)
    my ($now) = @_;
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);
    my $restrictedYear = $year;
    my $restrictedMon = $mon + $restrictedMonths;
    if($restrictedMon >= 12) {
        $restrictedYear++;
        $restrictedMon = ($mon + $restrictedMonths) % 12;
    }
    if($mday > daysInMonth($restrictedMon,$restrictedYear)) {
        # wrap to first when to avoid, for example, 2008-05-31 + 9mo = 2009-02-31 => 2009-03-03
        $mday = $mday - daysInMonth($restrictedMon,$restrictedYear);
        $restrictedMon++;
        if($restrictedMon >= 12) {
            $restrictedYear++;
            $restrictedMon = 0;
        }
    }
    return ($sec,$min,$hour,$mday,$restrictedMon,$restrictedYear,$wday,$yday,$isdst);
}

sub wigMinMaxPlaceHolder {
    # This is a placeholder used to let the loader fixup the min/max ranges for wig's
    my ($tableName) = @_;
    return uc("${tableName}_MinMaxRange");
}

sub isTarZipped {
    # true if file(1) says the file is a gzipped tar (e.g. .tar.gz or .tgz)
    my ($filePath) = @_;
    my $fileinfo = `file -iz $filePath`;
    if ($fileinfo =~ m/tar.*zip/) {
        return 1;
    } else {
        return 0;
    }
}

sub isTar {
    # true if file(1) says the file is a tar archive (e.g. .tar)
    my ($filePath) = @_;
    my $fileinfo = `file -i $filePath`;
    if ($fileinfo =~ m/tar/) {
        return 1;
    } else {
        return 0;
    }
}

sub isZipped {
    # true if file(1) says the file is compressed
    my ($filePath) = @_;
    my $fileinfo = `file -i $filePath`;
    if ($fileinfo =~ m/zip/) {
        return 1;
    } else {
        return 0;
    }
}

sub isControlInput {
    # true if the string names the control/input pseudo-experiment.
    my ($str) = @_;
    return lc($str) eq 'control' || lc($str) eq 'input';
}

sub openUtil {
    # Handles opening gzipped, tar gzipped, tar, as well as plain files.
    # $path is the optional path of $file.  Returns an IO::File handle.
    my ($file, $path) = @_;
    my $fh = IO::File->new();
    my $filePath = defined($path) ? "$path/$file" : $file;
    # NOTE(review): 2-arg open is intentional here -- the "cmd |" form opens a
    # pipe from the decompressor; $filePath is interpolated into a shell
    # command, so it must not contain untrusted shell metacharacters.
    open($fh, Encode::isTar($filePath) ? "/bin/tar -Oxf $filePath |" :
        ( Encode::isTarZipped($filePath) ? "/bin/tar -Ozxf $filePath |" :
        ( Encode::isZipped($filePath) ? "/bin/gunzip -c $filePath |" : $filePath )) )
        or die "Couldn't open file '$file'; error: $!\n";
    return $fh;
}

sub metadataLineToArrays {
    # Creates pair of arrays that contain the settings in a metadata setting line (retains order).
    # Values may be bare words or double-quoted strings with \" escapes.
    my ($line) = @_;
    my @tags;
    my @vals;
    my $tix = 0;
    while($line && length($line)>0) {
        my $tag;
        my $val;
        ( $tag,$line ) = split(/=/, $line,2);
        $tag =~ s/\s+//g;
        my @chars = split(//,$line);
        if($chars[0] ne "\"") {
            # bare value: runs to the next whitespace
            ( $val,$line ) = split(/\s+/, $line,2);
        } else {
            # quoted value: scan for the closing quote, skipping escaped \"
            my $ix=1;
            while($ix < length($line) && ($chars[$ix] ne '"' || $chars[$ix - 1] eq '\\')) {
                $ix++;
            }
            if($ix < length($line)) {
                $val = substr($line,1,$ix - 1);
                $line = substr($line, $ix + 1);
                $val =~ s/\\"/\"/g;   # unescape embedded quotes
            } else {
                # unterminated quote: take the rest of the line as-is
                $val = $line;
                $line = "";
            }
        }
        $tags[$tix ] = $tag;
        $vals[$tix++] = $val;
    }
    return ( \@tags, \@vals );
}

sub metadataLineToHash {
    # Create a hash of settings from metadata line (order is lost).
    my ($line) = @_;
    my ($tags, $vals) = Encode::metadataLineToArrays($line);
    my %hash = ();
    my $i = 0;
    while($tags->[$i]) {
        $hash{$tags->[$i]} = $vals->[$i];
        $i++
    }
    return \%hash;
}

# Ordering of pipeline statuses, earliest (1) to latest (5).
my %pipelineStatuses = (
    'loaded'    => 1,
    'displayed' => 2,
    'approved'  => 3,
    'reviewing' => 4,
    'released'  => 5
);

sub latestPipelineStatus {
    # Return status that is farther along in the pipeline.
    # Any status not in the list is assigned 0.
    my ($status1, $status2) = @_;
    my $first = defined($pipelineStatuses{$status1}) ? $pipelineStatuses{$status1} : 0;
    my $second = defined($pipelineStatuses{$status2}) ? $pipelineStatuses{$status2} : 0;
    return ($first > $second ? $status1 : $status2);
}

sub laterPipelineStatus {
    # Return true if first status is later than second status, else false.
    my ($status1, $status2) = @_;
    my $latest = latestPipelineStatus($status1, $status2);
    return ($latest eq $status1 ? 1 : 0);
}

1;