#!/bin/csh exit; ############################################################ # get metadata.txt file and non-accessioned sequences # from Elliott Margulies # /cluster/bluearc/encode/metadata.txt ############################################################ # Check to make sure there are no redundant lines in the metadata file # These should return the same number cut -f1-2,16 metadata.txt | sort | wc -l cut -f1-2,16 metadata.txt | sort -u | wc -l # make sure that the metadata.txt file has a header: head -2 metadata.txt #niscOrganism encodeRegion freezeDate ncbiTaxonId assemblyProvider assemblyDate assemblyId chromosome chrom_start chrom_end chromLength strand accessionVersion numBases numN thisContigNum totalNumContigs comment #armadillo ENm001 JAN-2006 9361 NISC 08-NOV-2005 . . 0 0 0 + NT_086518.3 1930245 100000 1 1 . # Check to verify that coverage is as expected, that no species or # targets were omitted: cut -f1,2,5 metadata.txt | tail +2 | sort -u | cut -f1,3 | sort | uniq -c # 1 armadillo Broad Assisted Assembly v 1.0 # 43 armadillo NISC # 44 baboon NISC # 41 cat Mullikin Phusion Assembly of Agencourt Traces # 3 cat NISC # 43 chicken CGSC Feb. 2004 # 1 chicken NCBI # 1 chimp NCBI # 43 chimp NCBI Build 1 v1 # 1 colobus_monkey NISC # 40 cow BCM # 1 cow NCBI # 43 dog Broad Institute v. 2.0 # 1 dog NCBI # 1 dusky_titi NISC # 44 eehedgehog Mullikin Phusion Assembly of Broad Traces # 2 elephant Broad Assisted Assembly v 1.0 # 42 elephant NISC # 36 fugu IMCB/JGI # 44 galago NISC # 8 hedgehog NISC # 44 human NCBI Build 35 # 44 macaque BCM # 44 marmoset NISC # 43 monodelphis Broad Institute # 1 monodelphis NCBI # 44 mouse NCBI # 3 mouse_lemur NISC # 1 owl_monkey NISC # 29 platypus Mullikin Phusion Assembly of WUGSC Traces # 15 platypus NISC # 16 rabbit Broad Assisted Assembly v 1.0 # 28 rabbit NISC # 43 rat Baylor HGSC v3.1 # 1 rat NCBI # 44 rfbat NISC # 24 shrew Mullikin Phusion Assembly of Broad Traces # 20 shrew NISC # 44 tenrec Mullikin Phusion Assembly of Broad Traces # 42 tetraodon Genoscope V7 # 1 tetraodon NCBI # 39 xenopus JGI # 43 zebrafish Sanger Zv4 cd /cluster/bluearc/encode cat >! makeLists.pl << '_EOF_' #!/usr/bin/perl -W %speciesOptions = ( "armadillo" => "mammal ", "baboon" => "human ", "cat" => "cat ", "chicken" => "chicken ", "chimp" => "human ", "cow" => "cow ", "colobus_monkey" => "human ", "dog" => "dog ", "dusky_titi" => "cow ", "elephant" => "mammal ", "fugu" => "fugu ", "galago" => "human ", "eehedgehog" => "mammal ", "hedgehog" => "mammal ", "human" => "human ", "macaque" => "macaca ", "marmoset" => "human ", "monodelphis" => "mammal ", "mouse" => "mouse ", "mouse_lemur" => "human ", "owl_monkey" => "human ", "pig" => "pig ", "platypus" => "mammal ", "rat" => "rattus ", "rabbit" => "mammal ", "rfbat" => "mammal ", "shrew" => "mammal ", "tenrec" => "mammal ", "tetraodon" => "fugu ", "xenopus" => "xenops ", "zebrafish" => "danio ", ); $encodeLibDir = "/cluster/data/encode/ortho/reconLibs"; $reconLibrary = "supplemental.lib"; %libOptions = ( "armadillo" => "$encodeLibDir/armadillo/$reconLibrary ", "baboon" => "$encodeLibDir/baboon/$reconLibrary ", "elephant" => "$encodeLibDir/elephant/$reconLibrary ", "galago" => "$encodeLibDir/galago/$reconLibrary ", "marmoset" => "$encodeLibDir/marmoset/$reconLibrary ", "mouse_lemur" => "$encodeLibDir/mouse_lemur/$reconLibrary ", "platypus" => "$encodeLibDir/platypus/$reconLibrary ", "rabbit" => "$encodeLibDir/rabbit/$reconLibrary ", "rfbat" => "$encodeLibDir/rfbat/$reconLibrary ", "tenrec" => "$encodeLibDir/tenrec/$reconLibrary ", ); $efetchURL = "http://www.ncbi.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&id="; $bluearc = "/cluster/bluearc/encode"; $metadata = "$bluearc/metadata.txt"; $wgetList = "$bluearc/wgetList"; $maskList = "$bluearc/para/maskList"; system ("mkdir -p $bluearc/trf $bluearc/RepeatMasker $bluearc/masked $bluearc/para"); open(METADATA, " < $metadata") or die "can't open $metadata for reading.\n"; open(WGETLIST, " > $wgetList") or die "can't open $wgetList"; open(MASKLIST, " > $maskList") or die "can't open $maskList"; print(WGETLIST "cd $bluearc\n"); while () { if (/niscOrganism/) { next; } @fields = split /\t/; $species = $fields[0]; $region = $fields[1]; $acc = $fields[12]; $index = $fields[15]; $how = "none"; if (defined $speciesOptions{lc($species)}) { $spcOpt="$speciesOptions{lc($species)}"; $how="speciesDefault"; } else { $spcOpt=""; } if (defined $libOptions{lc($species)}) { $libOpt= "$libOptions{lc($species)}"; if ($spcOpt eq "") { $how="speciesDefault"; } else { $how="defaultAndCustom"; } } else { $libOpt=""; } if ($acc =~ /^\w{2}\_\d+/) { print(WGETLIST "wget -O $bluearc/original/$acc \"${efetchURL}$acc\"\n"); } if ($acc !~ /NT_/) { $acc="$species.$region.$index.fa"; } print(MASKLIST "/cluster/bin/scripts/encodeMask.csh $how $acc $spcOpt $libOpt\n"); } '_EOF_' # # set freezeDate = "JAN-2006" cd /cluster/bluearc/encode mkdir -p original para masked trf RepeatMasker chmod ug+x makeLists.pl ./makeLists.pl chmod ug+x wgetList ./wgetList # check for network error grep -i error original/NT_* # should return nothing ############################################################ # mask sequences cat >! /cluster/bin/scripts/encodeMask.csh << '_EOF_' #!/bin/csh if ( $#argv < 2 ) then echo " usage: $0 [] [] " echo " $0 none " echo " $0 speciesDefault " echo " $0 custom " echo " $0 defaultAndCustom " exit 1 endif set rm = /cluster/bluearc/RepeatMasker/RepeatMasker set trf = /cluster/bin/i386/trfBig set mof = /cluster/bin/i386/maskOutFa set opt = $1 set seqFileName = $2 cd /tmp /bin/rm -f $seqFileName* /bin/cp -f /cluster/bluearc/encode/original/$seqFileName . $trf $seqFileName $seqFileName.TRF -bed -trf=/cluster/bin/i386/trf $mof $seqFileName $seqFileName.bed $seqFileName.trf -soft if ( $opt == "none" ) then $rm -nocut $seqFileName $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd else if ( $opt == "speciesDefault" ) then $rm -nocut -species $3 $seqFileName $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd else if ( $opt == "custom" ) then $rm -nocut -lib $3 $seqFileName $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd else if ( $opt == "defaultAndCustom" ) then $rm -nocut -species $3 $seqFileName $mof $seqFileName.trf $seqFileName.out $seqFileName.default -softAdd mv $seqFileName.out $seqFileName.$3.out $rm -nocut -lib $4 $seqFileName $mof $seqFileName.default $seqFileName.out $seqFileName.fin -softAdd mv $seqFileName.out $seqFileName.custom.out else echo " usage: $0 [] [] " echo " $0 none " echo " $0 speciesDefault " echo " $0 custom " echo " $0 defaultAndCustom " rm -rf $acc* exit 1 endif /bin/mv -f $seqFileName.default /cluster/bluearc/encode/default/$seqFileName /bin/mv -f $seqFileName.fin /cluster/bluearc/encode/masked/$seqFileName /bin/mv -f $seqFileName.bed /cluster/bluearc/encode/trf/ /bin/mv -f $seqFileName.*out /cluster/bluearc/encode/RepeatMasker/ /bin/rm -rf $seqFileName* '_EOF_' # chmod ug+x /cluster/bin/scripts/encodeMask.csh ## check that all of the sequences are present in the original ## directory to avoid crashed jobs > wc -l /cluster/bluearc/encode/para/maskList # 10709 /cluster/bluearc/encode/para/maskList > ls -1 /cluster/bluearc/encode/original/| wc -l # 10709 ssh kk cd /cluster/bluearc/encode/para para create maskList para push para check para time #10709 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 10709 of 10709 jobs #CPU time in finished jobs: 3111337s 51855.62m 864.26h 36.01d 0.099 y #IO & Wait Time: 41641s 694.01m 11.57h 0.48d 0.001 y #Average job time: 294s 4.91m 0.08h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 16313s 271.88m 4.53h 0.19d #Submission to last job: 17875s 297.92m 4.97h 0.21d ############################################################ # rewrite files with new headers cat >! writeEncodeFasta.pl << '_EOF_' #!/usr/bin/perl -W $releaseDate = shift || die "usage: writeEncodeFasta.pl releaseDate [metadata.txt]"; $metadata = shift || "metadata.txt"; ($sec,$min,$hour,$mday,$mon,$year)=localtime(time); printf "%4d-%02d-%02d %02d:%02d:%02d Started\n",$year+1900,$mon+1,$mday,$hour,$min,$sec; $outDir = "/cluster/bluearc/encode/$releaseDate"; system "mkdir -p $outDir"; if (-e "$outDir") { system "mv -i $outDir/ $outDir.old/"; } system "mkdir -p $outDir/"; open(METADATA," < $metadata") or die "Can't open $metadata"; while () { chomp(); $org=$region=$acc=$source=$outFile="."; if ( /niscOrganism/ ) { next; } # ignore header @row = split /\t/; if ( $#row != 17 ) { die "Bad input format: $_"; } $header = ">"; $header .= join "|", @row; $org = $row[0]; $region = $row[1]; $acc = $row[12]; $thisContigNum = $row[15]; $outFile = "$outDir/$region/$org.$region.fa"; system ("mkdir -p $outDir/$region"); system ("touch $outFile"); open (DEST, " >> $outFile") or die "Can't open $outFile"; print DEST "$header\n"; $source = "/cluster/bluearc/encode/masked/$acc"; unless ( -e $source && $acc ne '.') { $source = "/cluster/bluearc/encode/masked/$org.$region.$thisContigNum.fa";} open (SOURCE, " < $source") or die ("Can't open $source"); ; # ignore header while () { print DEST; } close(SOURCE); close(DEST); } ($sec,$min,$hour,$mday,$mon,$year)=localtime(time); printf "%4d-%02d-%02d %02d:%02d:%02d Finished writing files\n",$year+1900,$mon+1,$mday,$hour,$min,$sec; system "cd $outDir/..; chmod o+x $metadata; cp -L $metadata $outDir/; tar cfz $releaseDate.tgz $metadata $releaseDate/EN*/*"; ($sec,$min,$hour,$mday,$mon,$year)=localtime(time); printf "%4d-%02d-%02d %02d:%02d:%02d Finished creating tarball\n",$year+1900,$mon+1,$mday,$hour,$min,$sec; system "mkdir -p /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate"; system "chmod o+rx /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate"; system "chmod -R o+r $metadata $outDir/ $outDir/../$releaseDate.tgz"; system "chmod o+rx $outDir $outDir/EN*"; system "rsync -a $outDir/ /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/sequences"; system "rsync -a $outDir/../$metadata /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/"; system "rsync -a $outDir/../$releaseDate.tgz /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/"; ($sec,$min,$hour,$mday,$mon,$year)=localtime(time); printf "%4d-%02d-%02d %02d:%02d:%02d End\n",$year+1900,$mon+1,$mday,$hour,$min,$sec; '_EOF_' # chmod ug+x writeEncodeFasta.pl set releaseDate = JAN-2006 ./writeEncodeFasta.pl $releaseDate metadata.txt # 2006-01-21 08:11:36 Started # 2006-01-21 08:16:06 Finished writing files # 2006-01-21 08:25:06 Finished creating tarball # 2006-01-21 08:26:12 End