# for emacs: -*- mode: sh; -*- # Creating the assembly for Monodelphis domestica # South American, Short-tailed Opossum # http://www.genome.gov/11510687 # http://www.genome.gov/12512285 # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | ensGene | genePred ensPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-05-11 - Hiram) ssh kkstore04 mkdir -p /cluster/store8/monDom5/broad ln -s /cluster/store8/monDom5 /cluster/data/monDom5 cd /cluster/data/monDom4/broad.mit.edu time nice -n +19 wget --timestamping \ 'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom5/*' # real 52m48.859s # user 0m2.122s # sys 0m22.031s # This was a symlink to ../monDom2/ # lrwxrwxrwx 1 36 May 10 16:00 BACread_2_BACclone.txt.gz -> ../monDom2/BACread_2_BACclone.txt.gz # so actually fetch the file here rm BACread_2_BACclone.txt.gz time nice -n +19 wget --timestamping \ 'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom2/BACread_2_BACclone.txt.gz' # it looks like they've been using our tools, their agp file has # the extra gap at the end of chrUn: # chrUn 103240612 103241611 16995 N 1000 clone no # Their fasta file is in 5,000,000 chunks # fixup the split fasta files time nice -n +19 gunzip Monodelphis5.0.agp.chromosome.qual.gz \ Monodelphis5.0.agp.chromosome.fasta.gz # real 5m24.942s # user 1m31.895s # sys 0m51.726s mkdir splitFa time nice -n +19 faSplit -verbose=2 byname \ Monodelphis5.0.agp.chromosome.fasta splitFa/ # 2m44s mkdir chrFa # combine the Broad split files into single chrom fasta files time for C in 1 2 3 4 5 6 7 8 X Un do rm -f chrFa/chr${C}.fa echo ">chr${C}" > chrFa/chr${C}.fa echo -n "chrFa/chr${C}.fa working ... " ls splitFa/${C}.*-*.fa | sort -t"." -k2,2n | while read F do grep -v "^>" ${F} >> chrFa/chr${C}.fa done echo "done" done # verify nothing was lost, should be the same totals here faSize chrFa/chr*.fa # 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0 # lower) in 10 sequences in 10 files faSize Monodelphis5.0.agp.chromosome.fasta # 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0 # lower) in 726 sequences in 1 files # put them together into a single file: cat chrFa/chr*.fa > ucscChroms.fa # create a lift file from the information in the fasta headers cat << '_EOF_' > liftBroadToChroms.pl #!/usr/bin/env perl use strict; use warnings; open (FH, 'grep "^>" Monodelphis5.0.agp.chromosome.fasta|') or die "can not grep Monodelphis5.0.agp.chromosome.fasta"; my %liftSpec; # key is chrom_start, value is end my %chrSize; # key is chrom, value is size my @liftLines; # index is line number, value is line to output $chrSize{'1'} = 0; $chrSize{'2'} = 0; $chrSize{'3'} = 0; $chrSize{'4'} = 0; $chrSize{'5'} = 0; $chrSize{'6'} = 0; $chrSize{'7'} = 0; $chrSize{'8'} = 0; $chrSize{'X'} = 0; $chrSize{'Un'} = 0; my $lineCount = 0; my $prevChr = ""; my $chrStart = 0; while (my $line = ) { my ($chr_pos, $name) = split('\s+',$line); my ($chr, $range) = split('\.',$chr_pos); $chr =~ s/>//; if ($chr ne $prevChr) { $chrStart = 0; $prevChr = $chr; print STDERR "chr$chr starting\n"; } my ($start,$end) = split('-',$range); my $key = sprintf("%s_%d", $chr, $start); $liftSpec{$key} = $end; $chrSize{$chr} = $end if ($end > $chrSize{$chr}); my $fragSize = $end - $start + 1; $liftLines[$lineCount++] = sprintf "%d\t%s.%d-%d\t%d\tchr%s", $chrStart, $chr, $start, $end, $fragSize, $chr; $chrStart += $fragSize; } close (FH); for (my $i = 0; $i < $lineCount; ++$i) { my ($chrStart, $fragName, $fragSize, $chr) = split('\t',$liftLines[$i]); $chr =~ s/chr//; printf "%s\t%d\n", $liftLines[$i], $chrSize{$chr}; } '_EOF_' # << happy emacs chmod +x liftBroadToChroms.pl ./liftBroadToChroms.pl liftBroad.lft # split up the quality file to get it put together into a single # chrom based file cat << '_EOF_' > splitQual.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "$fileName") or die "can not write to $fileName"; while (my $line = ) { if ($line =~ m/^>/) { close(OUT); $line =~ s/>//; $line =~ s/-.*//; $fileName = "splitQual/$line"; open (OUT,">$fileName") or die "can not write to $fileName"; printf STDERR "writing to $fileName"; } else { print OUT $line; } } close (FH); close (OUT); '_EOF_' # << happy emacs chmod +x splitQual.pl mkdir splitQual ./splitQual.pl # put them back together in order as full chroms for C in 1 2 3 4 5 6 7 8 X Un do echo ">chr${C}" > chrQual/chr${C}.qual.fa ls splitQual/${C}.* | sort -t'.' -k2,2n | while read F do cat $F >> chrQual/chr${C}.qual.fa done done # real 6m7.079s # and as a single file for C in 1 2 3 4 5 6 7 8 X Un do cat chrQual/chr${C}.qual.fa done | gzip -c > ucscChroms.qual.fa.gz # real 4m41.229s # and turn it into a qac file qaToQac ucscChroms.qual.fa.gz ucscChroms.qac # real 3m49.380s ######################################################################### # create genome assembly database (DONE - 2008-11-25 - Hiram) cd /hive/data/genomes/monDom5/ cat << '_EOF_' > monDom5.config.ra # Config parameters for makeGenomeDb.pl: db monDom5 clade mammal scientificName Monodelphis domestica commonName Opossum assemblyDate Oct. 2006 assemblyLabel Broad Institute monDom5 (NCBI project 12561, accession AAFR03000000) orderKey 354 mitoAcc NC_006299 fastaFiles /hive/data/genomes/monDom5/broad/ucscChroms.fa agpFiles /hive/data/genomes/monDom5/broad/Monodelphis5.0.agp qualFiles /hive/data/genomes/monDom5/broad/ucscChroms.qac dbDbSpeciesDir opossum '_EOF_' # << happy emacs time makeGenomeDb.pl -verbose=2 -stop=seq monDom5.config.ra > seq.log 2>&1 # real 3m3.414s time makeGenomeDb.pl -verbose=2 -continue=agp -stop=agp monDom5.config.ra \ > agp.log 2>&1 # real 0m36.723s time makeGenomeDb.pl -verbose=2 -continue=db -stop=db monDom5.config.ra \ > db.log 2>&1 # real 36m30.041 time makeGenomeDb.pl -verbose=2 -continue=dbDb -stop=dbDb \ monDom5.config.ra > dbDb.log 2>&1 # real 0m1.074s time makeGenomeDb.pl -verbose=2 -continue=trackDb -stop=trackDb \ monDom5.config.ra > trackDb.log 2>&1 # check in the trackDb files and the browser should be up and running ######################################################################### # monDom5 - Opossum - Ensembl Genes version 50 (DONE - 2008-12-02 - hiram) ssh hgwdev cd /hive/data/genomes/monDom5 cat << '_EOF_' > monDom5.ensGene.ra # required db variable db monDom5 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=50 monDom5.ensGene.ra ssh hgwdev cd /hive/data/genomes/monDom5/bed/ensGene.50 featureBits monDom5 ensGene # 32970520 bases of 3501660299 (0.942%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.50 ############################################################################ # monDom5 - Opossum - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram) ssh hgwdev cd /hive/data/genomes/monDom5 cat << '_EOF_' > monDom5.ensGene.ra # required db variable db monDom5 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 monDom5.ensGene.ra ssh hgwdev cd /hive/data/genomes/monDom5/bed/ensGene.51 featureBits monDom5 ensGene # 32959755 bases of 3501660299 (0.941%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.51 ######################################################################### ## REPEATMASKER (DONE 2008-11-26 Andy) screen -S monDom5RepeatMasker # to manage this several day job mkdir /hive/data/genomes/monDom5/bed/repeatMasker cd /hive/data/genomes/monDom5/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -workhorse=hgwdev \ -bigClusterHub=swarm -buildDir=`pwd` monDom5 > do.log ## (detach screen Ctrl+A+D) ## ... (reattach on hgwdev: screen -r monDom5RepeatMasker) # #Checking finished jobs #cat run.time #HgStepManager: executing step 'cat' Wed Nov 26 07:29:04 2008. #Use of uninitialized value in substitution (s///) at /cluster/home/aamp/kent/src/hg/utils/automation/HgAutomate.pm line 262. #Could not extract server from output of "df /hive/data/genomes/monDom5/bed/repeatMasker": #/dev/hivedev 171434397696 55883525376 115550872320 33% /hive #... #user 0m0.099s #sys 0m0.070s ## checking the problem, it's that I don't have a $HOST in my env. ## ... so, tcsh it is. tcsh time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -continue=cat \ -workhorse=hgwdev -bigClusterHub=swarm -buildDir=`pwd` \ monDom5 > doAfterCat.log #0.389u 0.348s 1:12:48.52 0.0% 0+0k 0+0io 1pf+0w ######################################################################### ## SIMPLE REPEATS TRF (DONE 2008-11-26 - Andy) screen mkdir /hive/data/genomes/monDom5/bed/simpleRepeat cd /hive/data/genomes/monDom5/bed/simpleRepeat time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/monDom5/bed/simpleRepeat monDom5 > do.log #0.233u 0.153s 2:52:17.94 0.0% 0+0k 0+0io 0pf+0w cat fb.simpleRepeat # 83450133 bases of 3501660299 (2.383%) in intersection ## after RM run is done, add this mask: cd /hive/data/genomes/monDom5 rm monDom5.2bit twoBitMask monDom5.rmsk.2bit -add bed/simpleRepeat/trfMask.bed monDom5.2bit ## can safely ignore warning about >=13 fields in bed file twoBitToFa monDom5.2bit stdout | faSize stdin > monDom5.2bit.faSize.txt # 3605631728 bases (103971429 N's 3501660299 real 1542619938 upper 1959040361 lower) # %54.33 masked total, %55.95 masked real ## link to gbdb ln -s `pwd`/monDom5.2bit /gbdb/monDom5 ########################################################################### # WINDOWMASKER (DONE 2008-12-02 Andy) ssh kolossus mkdir /hive/data/genomes/monDom5/bed/WindowMasker cd /hive/data/genomes/monDom5/bed/WindowMasker screen -S monDom5_WindowMasker tcsh ~/kent/src/hg/utils/automation/doWindowMasker.pl monDom5 -buildDir=`pwd` -workhorse=kolossus >& wm.log ## oops forgot to time it (or nice it). It took less than 8 hrs though ## load result to prep for cleaning ssh hgwdev cd /hive/data/genomes/monDom5/bed/WindowMasker hgLoadBed monDom5 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 1346187 elements of size 3 featureBits monDom5 windowmaskerSdust # 1619987578 bases of 3501660299 (46.263%) in intersection # eliminate the gaps from the masking (WM bug) featureBits monDom5 -not gap -bed=notGap.bed # 170473138 bases of 170473138 (100.000%) in intersection screen -S monDom5_WindowMasker time nice featureBits monDom5 windowmasker.sdust.bed.gz notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz #1516026449 bases of 3501660299 (43.295%) in intersection # #real 12m17.912s #user 3m56.801s #sys 0m13.704s ## reload track to get it clean hgLoadBed monDom5 windowmaskerSdust cleanWMask.bed.gz ## Loaded 22241063 elements of size 4 ########################################################################## ## CPG ISLANDS (DONE 2008-12-03, Andy) ssh hgwdev screen -S monDom5_CpG ~/kent/src/hg/utils/automation/doCpgIslands.pl monDom5 ## [detach screen] ## took an hour or two featureBits monDom5 cpgIslandExt # 14553072 bases of 3501660299 (0.416%) in intersection ########################################################################## ## LIFTOVER CHAIN MONDOM4->MONDOM5 (DONE 2008-12-06, Andy) ssh hgwdev screen -S monDom5_liftOver tcsh ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl monDom4 monDom5 ## [detach] ## ... 8 hippos ssh swarm cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04/run.blat ## edit job.csh and add -maxIntron=1 -mask=lower ## to blat command and up minScore to 5000 and minIdentity to 99 para stop para push ## Took several hours after hippos were dealt with and -continue net ## had to be run as well after it crashed running out of tmp disk space cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04 rm -rf run.chain/ run.blat/ mv monDom4ToMonDom5.over.chain.gz ../liftOver/ cd /gbdb/monDom4/liftOver ####################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 2008-12-11, Andy) ssh hgwdev screen -S monDom5_11.ooc cd /hive/data/genomes/monDom5 ## use repMatch of 1228 as this genome is ~ %20 larger than human ## 1024 + (1024 * 0.2) = 1228 time nice blat monDom5.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228 # Wrote 43992 overused 11-mers to 11.ooc # real 3m24.793s ####################################################################### # GENBANK AUTOUPDATE SETUP (DONE 2008-12-15 Andy) ssh hgwdev screen -S monDom5_genbank cd ~/kent/src/hg/makeDb/genbank/conf/ ## edit genbank.conf: search monDom4, copy and paste that whole entry ## and replace monDom4 with monDom5: ## cvs ci genbank.conf make etc-update ssh genbank cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial monDom5 & # logFile: var/build/logs/2008.12.11-16:37:37.monDom5.initalign.log #real 633m58.880s ##user 96m10.868s #sys 35m42.207s exit ## we're back on hgwdev nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad monDom5 & #logFile: var/dbload/genbank/logs/2008.12.15-10:44:36.dbload.log cd ~/kent/src/hg/makeDb/genbank cvsup ## add monDom5 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added monDom5" etc/align.dbs etc/hgwdev.dbs make etc-update ########################################################################## ## BLAT (DONE 2008-01-20 Andy) # e-mail cluster-admin and ask for a server and tell them about # /gbdb/monDom5/monDom5.2bit # then ssh hgwdev hgsql -N hgcentraltest mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17778', '1', '0'); Query OK, 1 row affected (0.01 sec) mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17779', '0', '1'); Query OK, 1 row affected (0.00 sec) ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-05 braney ) # bash if not using bash shell already cd /cluster/data/monDom5 grep chrUn monDom5.agp | /cluster/bin/scripts/agpToLift > jkStuff/Un.lft mkdir /cluster/data/monDom5/blastDb cat broad/Monodelphis5.0.agp | awk '/^chrUn/ {if ($5 == "W") { printf "echo \\>%s\n",$6 ; printf "twoBitToFa monDom5.2bit:chrUn:%d-%d stdout | tail -n +2\n", $2,$3 }}' > script sh script > chrUn.fa grep -v chrUn chrom.sizes | awk '{ printf "twoBitToFa monDom5.2bit:%s stdout \n", $1}' > script sh script > chrNotUn.fa cat chrUn.fa chrNotUn.fa | faToTwoBit stdin monDom5.chrUnAsContigs.2bit twoBitInfo monDom5.chrUnAsContigs.2bit monDom5.chrUnAsContigs.sizes awk '{if ($2 > 1000000) print $1}' monDom5.chrUnAsContigs.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst monDom5.chrUnAsContigs.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 4366 pieces of 4366 written rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' monDom5.chrUnAsContigs.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst monDom5.chrUnAsContigs.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y ls blastDb/y* | wc -l # 88 rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa & ls *.nsq | wc -l # 4454 mkdir -p /cluster/data/monDom5/bed/tblastn.hg18KG cd /cluster/data/monDom5/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 4454 query.lst # we want around 1000000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(1000000/`wc query.lst | awk '{print $1}'`\) # 36727/(1000000/4454) = 163.582058 mkdir -p kgfa split -l 164 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/monDom5/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/monDom5/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/monDom5/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 997696 of 997696 jobs # CPU time in finished jobs: 21259114s 354318.56m 5905.31h 246.05d 0.674 y # IO & Wait Time: 3461687s 57694.79m 961.58h 40.07d 0.110 y # Average job time: 25s 0.41m 0.01h 0.00d # Longest finished job: 76s 1.27m 0.02h 0.00d # Submission to last job: 31794s 529.90m 8.83h 0.37d ssh swarm cd /cluster/data/monDom5/bed/tblastn.hg18KG cat << _EOF_ > splitOne (cd \$1; grep -h -v contig q.*.psl | pslSplitOnTarget stdin chroms grep -h contig q.*.psl > chroms/contigs.psl ) _EOF_ chmod +x splitOne for i in `pwd`/blastOut/kg??; do echo ./splitOne $i done > split.jobs para create split.jobs para maxJob 10 para shove # Completed: 224 of 224 jobs # CPU time in finished jobs: 910s 15.17m 0.25h 0.01d 0.000 y # IO & Wait Time: 128754s 2145.89m 35.76h 1.49d 0.004 y # Average job time: 579s 9.65m 0.16h 0.01d # Longest finished job: 683s 11.38m 0.19h 0.01d # Submission to last job: 13166s 219.43m 3.66h 0.15d ssh swarm cd /cluster/data/monDom5/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne ( f=`basename $1`; d=`dirname $1`; if simpleChain -prot -outPsl -maxGap=150000 $1 $d/c.$f.tmp then mv $d/c.$f.tmp $d/c.$f else rm $d/c.$f.tmp fi ) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg??/chroms/*.psl > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 2270 of 2270 jobs # CPU time in finished jobs: 10821559s 180359.32m 3005.99h 125.25d 0.343 y # IO & Wait Time: 249403s 4156.72m 69.28h 2.89d 0.008 y # Average job time: 4877s 81.28m 1.35h 0.06d # Longest finished job: 205913s 3431.88m 57.20h 2.38d # Submission to last job: 205932s 3432.20m 57.20h 2.38d cd /cluster/data/monDom5/bed/tblastn.hg18KG/blastOut for i in kg?? do cat $i/chroms/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../unLift.psl cd .. pslCheck unLift.psl # checked: 71490 failed: 0 errors: 0 liftUp blastHg18KG.psl ../../jkStuff/Un.lft carry unLift.psl pslCheck -prot blastHg18KG.psl # checked: 71490 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/monDom5/bed/tblastn.hg18KG hgLoadPsl monDom5 blastHg18KG.psl # check coverage featureBits monDom5 blastHg18KG # 33631308 bases of 3501660299 (0.960%) in intersection featureBits monDom4 blastHg18KG # 19424941 bases of 3501643220 (0.555%) in intersection featureBits monDom5 all_mrna blastHg18KG -enrichment # all_mrna 0.006%, blastHg18KG 0.960%, both 0.003%, cover 43.79%, enrich 45.59x rm -rf blastOut #end tblastn ######################################################################### ## NSCAN LIFT (FROM MONDOM4) (DONE 2008-12-08 Andy) ssh hgwdev mkdir /hive/data/genomes/monDom5/bed/nscanGene.lifted cd /hive/data/genomes/monDom5/bed/nscanGene.lifted echo "select * from nscanGene" | hgsql monDom4 | sed '1d' > nscanGene.monDom4.gp liftOver nscanGene.monDom4.gp /gbdb/monDom4/liftOver/monDom4ToMonDom5.over.chain.gz \ | nscanGene.monDom5.gp unmapped.gp ldHgGene -predTab monDom5 nscanGene nscanGene.monDom5.gp ########################################################################## ## GENSCAN (DONE 2008-01-22 Andy) ssh hgwdev mkdir -p /hive/data/genomes/monDom5/bed/genscan/{fasta,gtf,pep,subopt} cd /hive/data/genomes/monDom5/bed/genscan twoBitToFa ../../monDom5.2bit stdout | maskOutFa stdin hard stdout \ | faSplit -maxN=5000000 -lift=chunks.lft gap stdin 8000000 fasta/chunk cvs co hg3rdParty/genscanlinux cat << '_EOF_' > gsub #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs ls -1 fasta/* > chunk.lst ssh swarm cd /hive/data/genomes/monDom5/bed/genscan gensub2 chunk.lst single gsub spec para create spec para push para time #Completed: 421 of 421 jobs #CPU time in finished jobs: 60788s 1013.13m 16.89h 0.70d 0.002 y #IO & Wait Time: 5097s 84.96m 1.42h 0.06d 0.000 y #Average job time: 156s 2.61m 0.04h 0.00d #Longest finished job: 367s 6.12m 0.10h 0.00d #Submission to last job: 378s 6.30m 0.10h 0.00d catDir subopt | liftUp my.bed chunks.lft error stdin sed 's/chunk/opossum/' my.bed > genscan.subopt.bed catDir gtf | liftUp my.gtf chunks.lft error stdin sed 's/chunk/opossum/g' my.gtf > genscan.gtf catDir pep | sed 's/chunk/opossum/' > genscan.pep.fa ldHgGene -gtf monDom5 genscan genscan.gtf hgPepPred monDom5 generic genscanPep genscan.pep.fa hgLoadBed monDom5 genscanSubopt genscan.subopt.bed ######################################################################### # BIGZIPS/DOWNLOADS (DONE, 2009-06-08 Andy) ssh hgwdev mkdir /hive/data/genomes/monDom5/downloads mkdir /hive/data/genomes/monDom5/downloads/bigZips mkdir /hive/data/genomes/monDom5/downloads/chromosomes cd /hive/data/genomes/monDom5/downloads/chromosomes cp /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes/README.txt . # edit that readme to provide correct references and details ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom5/chromosomes cd /hive/data/genomes/monDom5/downloads/chromosomes for c in `cut -f1 ../../chrom.sizes`; do echo $c; twoBitToFa ../../monDom5.unmasked.2bit:$c stdout \ | gzip -c > $c.fa.gz; done md5sum *.fa.gz R*.txt > md5sum.txt cd ../../ for c in `cut -f1 chrom.sizes`; do echo ${c#chr}; pushd ${c#chr}; cp ../downloads/chromosomes/${c}.fa.gz . gunzip ${c}.fa.gz; popd; done tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out for c in `cut -f1 chrom.sizes`; do echo $c; twoBitToFa monDom5.2bit:$c ${c#chr}/${c}.fa.masked; done tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \ Un/chrUn.fa.masked #real 17m19.439s cd bed/simpleRepeat/ mkdir trfMask cd trfMask ln -s ../trfMaskChrom/* . cd ../ tar -hcvzf chromTrf.tar.gz ./trfMask mv chromTrf.tar.gz ../../downloads/bigZips/ cd ../../downloads/bigZips/ cp ../../broad/Monodelphis5.0.agp . gzip Monodelphis5.0.agp cp /usr/local/apache/htdocs/goldenPath/monDom4/bigZips/README.txt . # [edit] ln -s /hive/data/genomes/monDom5/downloads/bigZips \ /usr/local/apache/htdocs/goldenPath/monDom5/bigZips cd /usr/local/apache/htdocs/goldenPath/monDom5/bigZips ln -s /hive/data/genomes/monDom5/monDom5.2bit . md5sum *.gz *.2bit README.txt > md5sum.txt cd ../chromosomes/ md5sum *.gz R*.txt > md5sum.txt cd ../ mkdir database cd database/ cp ../../monDom4/database/README.txt . # [edit README.txt] ######################################################################### #nscanGene (2009-06-23 markd) # nscanGene track from WUSTL cd /hive/data/genomes/monDom5/bed/nscan wget http://mblab.wustl.edu/predictions/possum/monDom5/README wget http://mblab.wustl.edu/predictions/possum/monDom5/monDom5.gtf wget -r -np -e robots=off -l 1 http://mblab.wustl.edu/predictions/possum/monDom5/chr_ptx/ nice bzip2 chr_ptx/* monDom5.gtf # load track gtfToGenePred -genePredExt monDom5.gtf.bz2 stdout| hgLoadGenePred -genePredExt monDom5 nscanGene stdin bzcat chr_ptx/*.fa.bz2 | hgPepPred monDom5 generic nscanPep stdin rm *.tab # validate same number of transcripts and peptides are loaded hgsql -Ne 'select count(*) from nscanGene' monDom5 hgsql -Ne 'select count(*) from nscanPep' monDom5 # validate search expression hgc-sql -Ne 'select name from nscanGene' monDom5 | egrep -v -e '^chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9a-z]+)?$' |wc -l ######################################################################### # lastz Horse equCab2 (DONE - 2009-06-29,07-02 - Hiram) mkdir /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29 cd /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29 cat << '_EOF_' > DEF # opossum vs. Horse # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Opossum (monDom5) SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse equCab2 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1353m4.161s # had some trouble with the first kluster run due to various factors # not the least of which was a major power outage time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -continue=cat -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 957m25.164s cat fb.monDom5.chainEquCab2Link.txt # 355004426 bases of 3501660299 (10.138%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.monDom5.swap cd /hive/data/genomes/equCab2/bed/blastz.monDom5.swap time doBlastzChainNet.pl \ /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 320m3.573s cat fb.equCab2.chainMonDom5Link.txt # 351787662 bases of 2428790173 (14.484%) in intersection ######################################################################### # Set default position to OPN1LW gene 2009-07-20, Brooke hgsql -e \ "update dbDb set defaultPos='chrX:14658820-14669558' where name='monDom5'" \ hgcentraltest ######################################################################### ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ########################################################################### # ALIGNMENTS/CHAINS/NETS (DONE Dec 2008, Andy) # # To be honest I didn't really concentrate on getting the whole enchilada of # make-notes into record, because the whole process is so robotic. # # I'll start with the DEF files. These have varying parameters based on # the query species. So here's the various DEF parameters: # DB H Y L K T M A_R* Q 1CHUNK 1LAP 2CHUNK 2LAP 2LIMIT bosTau4 - 3400 6000 2200 - - - HoxD55 1M 10K 20M 0 100 canFam2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 - danRer5 2000 3400 6000 2200 - - - HoxD55 5M 10K 10M 0 - galGal3 2000 3400 10000 2200 - - - HoxD55 5M 10K 20M 0 - hg18 2000 3400 10000 2200 - 50 0 HoxD55 5M 10K 10M 0 - macEug1 - 3400 - - 2 10 - - 10M 320K 500K 0 100 mm9 - 3400 6000 2200 - - - HoxD55 5M 10K 20M 0 - ornAna1 2000 3400 6000 2200 - 50 - HoxD55 5M 10K 20M 0 300 panTro2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 - ponAbe2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 - rheMac2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 - rn4 - 3400 6000 2200 - - - HoxD55 2M 10K 10M 0 - rn5 - 3400 6000 2200 - - - - 5M 10K 20M 0 - xenTro2 2000 3400 8000 2200 - 50 - HoxD55 5M 10K 20M 0 100 * BLASTZ_ABRIDGE_REPEATS # In most of those cases I was looking to the DEF variables in the monDom4 version # of the alignment. In the case of macEug1, the variable settings were given by # Webb Miller. # # After all the DEF files are created, it's time to run doBlastzChainNet.pl # In the case of monDom5, they all have the profile of this: cd /hive/data/genomes/monDom5/bed mkdir blastz.otherDb cd blastz.otherDb screen -S otherDb_monDom5 doBlastzChainNet.pl -bigClusterHub swarm -stop cat DEF >& doUntilCat.log #[detach screen, come back when it's done] # The reason I'd stop after the cat step is that I was having better results # using swarm for chaining instead of memk. At the time, memk only had a dozen # nodes, and usually only 8 would be online. Because of the large chromosomes, # the chaining step would be bottlenecked by the hippo jobs on the big chroms. # It was going much more quickly when the cluster could accommodate more # concurrent jobs. On memk the jobs were allocated 8GB of RAM and they normally # only get 2GB on swarm, so in case that mattered, I did this #[re-attach screen] doBlastzChainNet.pl -bigClusterHub swarm -smallClusterHub swarm -minChainScore 3000 DEF >& doAfterCat.log #[detach screen] ssh swarm cd /hive/data/genomes/monDom5/bed/blastz.otherDb/axtChain/run para check # check to see the jobs have started, once they have: para stop; para resetCounts; para -ram=8g -cpu=4 create jobList; para push # check to see things are all fine, and log out of swarm # after that all should have completed fine. Stopping the cluster run manually # while the doBlastzChainNet.pl script is running doesn't typically confuse it. # I suppose it's possible that the script while check on the run in the small # period while it's stopped, but it didn't happen to me. # # In retrospect I think the memk trick is possibly avoidable now that there are # more memk nodes. However the oppossum is a particularly difficult species to # chain, so I'm just mentioning it anyway. Later when doing alignments on hg19, # memk was up to the task just fine. # # Also of note is the inconsistency of DEF parameter settings. When monDom6 # happens, someone will probably look here to see what was done with monDom5. # If I have any advice it's to take an approach like the one with hg19 and # use consistent parameters as much as possible and set them according to # different tiers of evolutionary distance. #################################################################### # RELOAD CHAINS/NETS AS NON-SPLIT (2009-06-09, Andy) for d in blastz.*; do if [ $d != "blastz.bosTau4" ]; then db=${d#blastz.}; Db=`echo $db | sed 's/^./\u&/'`; echo Loading $db chains into monDom5...; time nice -n +19 hgLoadChain -tIndex monDom5 chain$Db \ blastz.$db/axtChain/monDom5.$db.all.chain.gz; fi; done >& unsplit/chainReloads.log # problem with macEug1 cd blastz.macEug1/axtChain/ time nice -n +19 hgLoadChain -tIndex monDom5 chainMacEug1 monDom5.macEug1.all.chain.gz #Loading 19668859 chains into monDom5.chainMacEug1 #Can't start query: #load data local infile 'link.tab' into table chainMacEug1Link # #mySQL error 1114: The table 'chainMacEug1Link' is full #real 146m54.273s # wc -l link.tab #200440062 link.tab #19668859 chain.tab randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c # 92 chain, 42 link sed "s/hgLoadChain.*/hgsqldump monDom5 chainRn4Link --no-data --skip-comments\n\ | sed \'s\/Rn4\/MacEug1\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=201000000\n\ avg_row_length=42 pack_keys=1 CHARSET=latin1\/\' | hgsql monDom5 \n\ /" loadUp.csh > manualLoadUp.csh hgsqldump monDom5 chainRn4 --no-data --skip-comments | sed \'s\/Rn4\/MacEug1\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=20000000 avg_row_length=92 pack_keys=1 CHARSET=latin1\/\' | hgsql monDom5 \n\ hgsql monDom5 -e \"load data local infile \'chain.tab\' into table chainMacEug1\"\n\ hgsql monDom5 -e \"load data local infile \'link.tab\' into table chainMacEug1Link\"\n\ hgsql monDom5 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 19668859 chains into macEug1 chain table manually\', NOW(), NULL)\"\ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ ssh hgwdev mkdir /hive/data/genomes/monDom5/bed/multiz9way cd /hive/data/genomes/monDom5/bed/multiz9way ## Build the tree by pruning out the 8 species from the recent 44way, ## changing monDom4 to monDom5, and adding macEug1 ## (Wallaby placement deduced from (Margulies et al.,PNAS 2004) ). /cluster/bin/phast/tree_doctor \ --prune-all-but=hg18,mm9,canFam2,ornAna1,galGal3,xenTro2,danRer5,monDom4 \ /hive/data/genomes/hg18/bed/multiz44way/44way.nh | \ sed 's/:0\.[[:digit:]]\+/ /g; s/,//g; s/;//; s/\ )/)/g; s/monDom4/(monDom5 macEug1)/' \ > tree.nh sed 's/(//g; s/)//g' tree.nh > species.lst ## Make per-chrom maf links mkdir mafLinks cd mafLinks bash for db in hg18 mm9 canFam2 macEug1 ornAna1 galGal3 xenTro2 danRer5; do mkdir $db; pushd $db; if [ -e /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet ]; then ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet/* . else ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafNet/* . fi popd done cd ../ ## Split mafs up mkdir mafSplit mafSplitPos -minGap=50000 monDom5 10 mafSplit.bed for db in `ls -1 mafLinks`; do mkdir mafSplit/$db pushd mafSplit/$db mafSplit ../../mafSplit.bed monDom5_ ../../mafLinks/${db}/chr*.maf.gz \ -verbose=2 popd done ## Make file list for cluster run cd mafSplit/canFam2/ find . -type f | sort -u | sed -e "s#./##" > ../../9-way.split.lst ## Double-check the file sets are all the same in each database subdir ## Cluster run on split mafs ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way mkdir -p multizRun/run/penn multizRun/maf cd multizRun/run/ cp -p /cluster/bin/penn/multiz.2008-11-25/{multiz,maf_project,autoMZ} penn/ cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = monDom5 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/monDom5/bed/multiz9way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.lst $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << emacs chmod +x autoMultiz.csh cat << '_EOF' > gsub #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/monDom5/bed/multiz9way/multizRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs para -ram=4g -cpu=2 create jobList para push para time #Completed: 332 of 332 jobs #CPU time in finished jobs: 17639s 293.98m 4.90h 0.20d 0.001 y #IO & Wait Time: 46958s 782.64m 13.04h 0.54d 0.001 y #Average job time: 195s 3.24m 0.05h 0.00d #Longest finished job: 1693s 28.22m 0.47h 0.02d #Submission to last job: 1706s 28.43m 0.47h 0.02d ## Sew up the mafs into one file ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/multizRun mkdir ../maf ls -1 maf | sed 's/monDom5_//; s/\..*//' | sort -u | while read chrom; do head -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u > ../maf/${chrom}.maf grep -h "^#" maf/monDom5_${chrom}.*.maf | egrep -v "maf version=1|eof maf" | sed -e "s#${chrom}.[0-9][0-9]*#${chrom}#g; s#_MZ_[^ ]* # #g;" | sort -u >> ../maf/${chrom}.maf grep -h -v "^#" `ls maf/monDom5_${chrom}.*.maf | sort -t. -k2,2n` >> ../maf/${chrom}.maf tail -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u >> ../maf/${chrom}.maf echo done with $chrom done cd ../ rm -rf multizRun/maf/ mafSplit/ ## gap annotation mkdir -p anno/{maf,run} cd anno/ for db in `cat ../species.lst`; do dbDir="/hive/data/genomes/${db}" if [ ! -f ${dbDir}/${db}.N.bed ]; then echo "creating ${db}.N.bed" twoBitInfo -nBed ${dbDir}/${db}.2bit ${dbDir}/${db}.N.bed else ls -og ${dbDir}/${db}.N.bed fi done cd run/ for db in `sed -e "s/monDom5 //" ../../species.lst`; do echo "${db} " ln -s /hive/data/genomes/${db}/${db}.N.bed ${db}.bed echo ${db}.bed >> nBeds ln -s /hive/data/genomes/${db}/chrom.sizes ${db}.len echo ${db}.len >> sizes done ssh hgwdevnew cd /hive/data/genomes/monDom5/bed/multiz9way/run for c in `ls -1 ../../maf | sed -e "s/.maf//"`; do echo starting $c time nice mafAddIRows -nBeds=nBeds ../../maf/${c}.maf /hive/data/genomes/monDom5/monDom5.2bit ../maf/${c}.maf done exit # hgwdevnew cd ../../ ## quality information (only available for 6 of 9 species including monDom5) mkdir -p quals/{maf,qa} cd quals/qa/ ln -s /hive/data/genomes/monDom5/monDom5.{qac,qdx} . ln -s /hive/data/genomes/hg18/bed/multiz44way/quals/allInOnePlace/{galGal3,canFam2,ornAna1}.* . qaToQac /hive/data/genomes/rn5/baylor/Rnor4.chroms.fa.qual.gz \ rn5.noIdx.qac qacAddGapIdx /hive/data/genomes/rn5/baylor/Rnor4.scaffold_chr.agp \ rn5.noIdx.qac rn5.qac rn5.qdx rm rn5.noIdx.qac qaToQac /hive/data/genomes/macEug1/baylor/macEug1.contigs.onlyInAgp.qa.gz \ macEug1.noIdx.qac qacAddGapIdx /hive/data/genomes/macEug1/baylor/macEug1.agp macEug1.noIdx.qac \ macEug1.qac macEug1.qdx for db in canFam2 galGal3 macEug1 monDom5 ornAna1 rn5; do echo $db | awk 'BEGIN{OFS="\t"}{print $1, "/hive/data/genomes/monDom5/bed/multiz9way/quals/qa";}'; done > quals.lst cat << '_EOF_' > gsub #LOOP mafAddQRows quals.lst $(path1) {check out line+ maf/$(file1) } #ENDLOOP '_EOF_' # << emacs ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way/quals mkdir inMaf cd inMaf/ ln -s ../../anno/maf/* . cd ../ ls -1 inMaf/* > in.lst gensub2 in.lst single gsub jobList para -ram=8g -cpu=4 create jobList para push para time #Completed: 11 of 11 jobs #CPU time in finished jobs: 5419s 90.32m 1.51h 0.06d 0.000 y #IO & Wait Time: 19745s 329.08m 5.48h 0.23d 0.001 y #Average job time: 2288s 38.13m 0.64h 0.03d #Longest finished job: 5196s 86.60m 1.44h 0.06d #Submission to last job: 5206s 86.77m 1.45h 0.06d cd ../ ## Gene frames mkdir -p genes/{genePreds,inMaf,chrFrames} cd genes/genePreds/ ## human/mouse use known genes for db in hg18 mm9; do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${db} | \ genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz done ## monDom5/canFam2/ornAna1/galGal3/xenTro2/danRer5 use ensGene for db in monDom5 canFam2 ornAna1 galGal3 xenTro2 danRer5; do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" $db | \ genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz done cd ../inMaf/ ln -s ../../quals/maf/* . cd ../ ls -1 inMaf/ | sed 's/.maf//' > chr.lst ls -1 genePreds/ | sed 's/.gp.gz//' > gp.lst cat << '_EOF_' > mafFrames.csh #!/bin/csh -fe set c = $1 set g = $2 cat inMaf/${c}.maf | genePredToMafFrames monDom5 stdin stdout \ ${g} genePreds/${g}.gp.gz | gzip -c > chrFrames/${c}.${g}.mafFrames.gz '_EOF_' cat << '_EOF_' > gsub #LOOP ./mafFrames.csh $(root1) $(root2) {check out exists+ chrFrames/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way/genes gensub2 chr.lst gp.lst gsub jobList para -ram=8g -cpu=4 create jobList para push para time #Completed: 88 of 88 jobs #CPU time in finished jobs: 1610s 26.83m 0.45h 0.02d 0.000 y ##IO & Wait Time: 3101s 51.69m 0.86h 0.04d 0.000 y #Average job time: 54s 0.89m 0.01h 0.00d #Longest finished job: 95s 1.58m 0.03h 0.00d #Submission to last job: 211s 3.52m 0.06h 0.00d exit # [exit swarm] find chrFrames -type f | while read frameFile; do zcat ${frameFile} done | sort -k1,1 -k2,2n > multiz9wayFrames.bed hgLoadMafFrames monDom5 multiz9wayFrames{,.bed} ## make downloads mkdir -p download/maf cd download/maf cp -p ../../quals/maf/chr*.maf . gzip --rsyncable *.maf md5sum *.gz > md5sum.txt mkdir -p /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf ln -s `pwd -P`/* /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf cd ../../ ## load mafs into database cd anno/maf mkdir -p /gbdb/monDom5/multiz9way/maf ln -s `pwd -P`/*.maf /gbdb/monDom5/multiz9way/maf pushd /data/tmp nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom5/multiz9way/maf monDom5 multiz9way nice -n +19 cat /gbdb/monDom5/multiz9way/maf/*.maf | hgLoadMafSummary monDom5 multiz9waySummary stdin popd cd ../../ ## phastCons ## 1. collect all mafs with all nine species in components. mkdir cons cd cons/ for chrom in `ls -1 ../initialMafs/`; do for db in `cat ../species.lst`; do maf=../initialMafs/$chrom if [ -e withAllNine.$chrom ]; then maf=withAllNine.$chrom fi mafFilter -needComp=$db $maf > tmp.maf mv tmp.maf withAllNine.$chrom done echo Done with withAllNine.$chrom done ## 2. split into 10 Mbase pieces and generate .ss files mkdir msa.split ss cd msa.split/ ln -s ../../../../monDom5.2bit cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set maf = /hive/data/genomes/monDom5/bed/multiz9way/cons/maf/withAllNine.${c}.maf set chromDir = /hive/data/genomes/monDom5/bed/multiz9way/cons/ss/$c rm -fr $chromDir mkdir $chromDir pushd $chromDir > /dev/null twoBitToFa -seq=$c monDom5.2bit monDom5.${c}.fa msa_split $maf -i MAF \ -M monDom5.${c}.fa -o SS -r $chromDir/$c -w 20000000,0 -I 100 -B 5000 rm -f monDom5.${c}.fa popd > /dev/null date >> ${c}.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP ./doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa.split ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst gensub2 chr.lst single gsub jobList para -ram=8g -cpu=4 create jobList para push para time #Completed: 11 of 11 jobs #CPU time in finished jobs: 805s 13.42m 0.22h 0.01d 0.000 y #IO & Wait Time: 367s 6.11m 0.10h 0.00d 0.000 y #Average job time: 107s 1.78m 0.03h 0.00d #Longest finished job: 211s 3.52m 0.06h 0.00d #Submission to last job: 220s 3.67m 0.06h 0.00d cd ../ ## another run, full chroms mkdir ss.chrom msa_view.run cd msa_view.run ln -s ../../../../monDom5.2bit cat << '_EOF_' > doChrom.csh #!/bin/csh -ef set c = $1 twoBitToFa -seq=$c monDom5.2bit ${c}.fa msa_view ../maf/withAllNine.${c}.maf -i MAF -M ${c}.fa -o SS > ../ss.chrom/${c}.ss rm ${c}.fa '_EOF_' # << emacs cat << '_EOF_' > gsub #LOOP ./doChrom.csh $(root1) {check out line+ ../ss.chrom/$(root1).ss } #ENDLOOP '_EOF_' ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst gensub2 chr.lst single gsub jobList para -ram=8g -cpu=4 create jobList para push para time #Completed: 11 of 11 jobs #CPU time in finished jobs: 769s 12.81m 0.21h 0.01d 0.000 y #IO & Wait Time: 426s 7.10m 0.12h 0.00d 0.000 y #Average job time: 109s 1.81m 0.03h 0.00d #Longest finished job: 211s 3.52m 0.06h 0.00d #Submission to last job: 217s 3.62m 0.06h 0.00d mkdir phyloFit.run cd phyloFit.run/ ls -1 ../ss.chrom/* > chr.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/phast/x86_64/phyloFit --msa-format SS --tree start.tree $(path1) { check out line+ ../trees/$(root1).mod } #ENDLOOP '_EOF_' gensub2 chr.lst single gsub jobList para -ram=8g -cpu=4 create jobList para push para time cd ../trees/ phyloBoot --read-mods chr{1,2,3,4,5,6,7,8}.mod --output-average chroms1-8.mod ######## Let's back up a minute exit # back to hgwdev mkdir msa_split.chr1 cd msa_split.chr1/ twoBitToFa ../../../../monDom5.2bit:chr1 chr1.fa time msa_split ../maf/withAllNine.chr1.maf --refseq chr1.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root ssChr1 ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way/cons/phastCons.estimateTrees.run ls -1 ../msa_split.chr1/* > ss.lst grep BACKGROUND ../trees/chroms1-8.mod | awk '{printf "%0.3f\n", $3 + $4;}' #0.383 cat << '_EOF_' > gsub #LOOP /cluster/bin/phast/x86_64/phastCons --gc 0.383 --target-coverage 0.17 --expected-length 12 --estimate-trees --no-post-probs trees/$(root1) $(path1) ../trees/chroms1-8.mod #ENDLOOP '_EOF_' gensub2 ss.lst single gsub jobList para -ram=8g -cpu=4 create jobList para push para time #Completed: 8 of 8 jobs #CPU time in finished jobs: 51655s 860.91m 14.35h 0.60d 0.002 y #IO & Wait Time: 950s 15.84m 0.26h 0.01d 0.000 y #Average job time: 6576s 109.59m 1.83h 0.08d #Longest finished job: 8998s 149.97m 2.50h 0.10d #Submission to last job: 9217s 153.62m 2.56h 0.11d ## OK now average these cd ../ ls trees/*.cons.mod > cons.txt phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod ls trees/*.noncons.mod > noncons.txt phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod ## Now break the big mafs into .ss files cd ../ mkdir msa_split.bigMafs ss.splits cd msa_split.bigMafs/ ls -1 ../../initialMafs/* > mafs.lst cat << '_EOF_' > msa.sh #!/bin/bash chr=`basename $1 .maf` mkdir -p $chr twoBitToFa ../../../../monDom5.2bit:$chr ${chr}.fa /cluster/bin/phast/x86_64/msa_split $1 -M ${chr}.fa -i MAF -o SS -w 10000000,1000 -B 5000 -r ${chr}/ss rm ${chr}.fa '_EOF_' chmod +x msa.sh cat << '_EOF_' > gsub #LOOP ./msa.sh $(path1) { check exists $(root1) } #ENDLOOP '_EOF_' ssh swarm cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa_split.bigMafs gensub2 mafs.lst single gsub spec para create spec para push para time ## Run phastCons on the many .ss files mkdir ../phast.run cd ../phast.run/ find ../msa_split.bigMafs/ -type f -name '*.ss' > ss.lst ln -s ../phastCons.estimateTrees.run/*.mod . cat << '_EOF_' > phast.sh #!/bin/bash ss=$1 wig=$2 bed=$3 mkdir -p `dirname $wig` mkdir -p `dirname $bed` /cluster/bin/phast/x86_64/phastCons --target-coverage 0.17 --expected-length 12 --most-conserved $bed --score $ss ave.cons.mod,ave.noncons.mod > $wig '_EOF_' chmod +x phast.sh cat << '_EOF_' > gsub #LOOP ./phast.sh $(path1) wig/$(lastDir1)/$(root1).wig bed/$(lastDir1)/$(root1).bed #ENDLOOP '_EOF_' gensub2 ss.lst single gsub spec para create spec para push para time #Completed: 367 of 367 jobs #CPU time in finished jobs: 9131s 152.18m 2.54h 0.11d 0.000 y #IO & Wait Time: 6169s 102.82m 1.71h 0.07d 0.000 y #Average job time: 42s 0.69m 0.01h 0.00d #Longest finished job: 70s 1.17m 0.02h 0.00d #Submission to last job: 343s 5.72m 0.10h 0.00d ## Stitch up wig files, make .wibs, load in DB ############################################################################ # susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21,22 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21 cd /hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21 cat << '_EOF_' > DEF # Pig vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr1 SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit SEQ2_LEN=/scratch/data/susScr1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 992m51.260s cat fb.monDom5.chainSusScr1Link.txt # 179898391 bases of 3501660299 (5.138%) in intersection mkdir /hive/data/genomes/susScr1/bed/blastz.monDom5.swap cd /hive/data/genomes/susScr1/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 97m35.156s cat fb.susScr1.chainMonDom5Link.txt # 182834626 bases of 2231332019 (8.194%) in intersection ######################################################################### # ailMel1 Panda BLASTZ/CHAIN/NET (DONE - 2010-02-04 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04 cd /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04 cat << '_EOF_' > DEF # Panda vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Panda AilMel1 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 492m43.088s cat fb.monDom5.chainAilMel1Link.txt # 223510659 bases of 3501660299 (6.383%) in intersection mkdir /hive/data/genomes/ailMel1/bed/blastz.monDom5.swap cd /hive/data/genomes/ailMel1/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 69m35.464s cat fb.ailMel1.chainMonDom5Link.txt # 211209682 bases of 2245312831 (9.407%) in intersection ############################################################################## # calJac3 Marmoset LASTZ/CHAIN/NET (DONE - 2010-02-12 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12 cd /hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12 cat << '_EOF_' > DEF # Marmoset vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset CalJac3 SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=memk \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=loose \ > do.log 2>&1 & # real 2772m44.886s cat fb.monDom5.chainCalJac3Link.txt # 216197506 bases of 3501660299 (6.174%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.monDom5.swap cd /hive/data/genomes/calJac3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzCalJac3.2010-02-12/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 110m13.435s cat fb.calJac3.chainMonDom5Link.txt # 217614612 bases of 2752505800 (7.906%) in intersection ####################################################################### # felCat4 Cat BLASTZ/CHAIN/NET (working - 2010-06-07 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07 cd /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07 cat << '_EOF_' > DEF # opossum vs. cat # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Opossum monDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Cat (felCat4) SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noDbNameCheck \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # *** All done ! Elapsed time: 2287m6s # *** Make sure that goldenPath/monDom5/vsFelCat4/README.txt is accurate. # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary. cat fb.monDom5.chainFelCat4Link.txt # 178616721 bases of 3501660299 (5.101%) in intersection # Swap mkdir /hive/data/genomes/felCat4/bed/blastz.monDom5.swap cd /hive/data/genomes/felCat4/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=menk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 181m0.411s # step 'syntenicNet' failed, continue time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzFelCat4.2010-06-07/DEF \ -swap -syntenicNet -noDbNameCheck \ -continue syntenicNet \ -workhorse=hgwdev -smallClusterHub=menk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > syntenicNet.log 2>&1 & # real 9m46.383s cat fb.felCat4.chainMonDom5Link.txt # 166499264 bases of 1990635005 (8.364%) in intersection ############################################################################## # susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-26,27 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzSusScr2.2010-03-26 cd /hive/data/genomes/monDom5/bed/lastzSusScr2.2010-03-26 cat << '_EOF_' > DEF # Pig vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzSusScr2.2010-03-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # Elapsed time: 1132m5s cat fb.monDom5.chainSusScr2Link.txt # 179898307 bases of 3501660299 (5.138%) in intersection mkdir /hive/data/genomes/susScr2/bed/blastz.monDom5.swap cd /hive/data/genomes/susScr2/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzSusScr2.2010-03-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # Elapsed time: 82m55s cat fb.susScr2.chainMonDom5Link.txt # 182834643 bases of 2231298548 (8.194%) in intersection ############################################################################## # oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16 cd /hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16 cat << '_EOF_' > DEF # Sheep vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 756m34.317s cat fb.monDom5.chainOviAri1Link.txt # 133534458 bases of 3501660299 (3.813%) in intersection # and the swap mkdir /hive/data/genomes/oviAri1/bed/blastz.monDom5.swap cd /hive/data/genomes/oviAri1/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memn -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 39m13.430s cat fb.oviAri1.chainMonDom5Link.txt # 117493519 bases of 1201271277 (9.781%) in intersection ######################################################################### # lastz Rabbit oryCun2 swap (DONE - 2010-01-25 - Hiram) # original alignment cd /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21 cat fb.oryCun2.chainMonDom5Link.txt # 176460344 bases of 2604023284 (6.776%) in intersection # and the swap mkdir /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap cd /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21/DEF \ -swap -noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \ -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 74m20.206s cat fb.monDom5.chainOryCun2Link.txt # 178331930 bases of 3501660299 (5.093%) in intersection ######################################################################### # LASTZ Turkey MelGal1 ( Done 2011-03-30 - Chin) mkdir /hive/data/genomes/monDom5/bed/lastzMelGal1.2011-03-30 cd /hive/data/genomes/monDom5/bed/lastzMelGal1.2011-03-30 cat << '_EOF_' > DEF # Turkey vs Opossum # TARGET: Opossum monDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Turkey melGal1 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzMelGal1.2011-03-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 # real 129m24.914s cat fb.monDom5.chainMelGal1Link.txt # 79077534 bases of 3501660299 (2.258%) in intersection cd /hive/data/genomes/monDom5/bed ln -s lastzMelGal1.2011-03-30 lastz.melGal1 # running the swap mkdir /hive/data/genomes/melGal1/bed/blastz.monDom5.swap cd /hive/data/genomes/melGal1/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzMelGal1.2011-03-30/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # Elapsed time: 10m15s cat fb.melGal1.chainMonDom5Link.txt # 64933013 bases of 935922386 (6.938%) in intersection cd /hive/data/genomes/melGal1/bed ln -s lastz.monDom5.swap lastz.monDom5 ############################################################################## # LASTZ Cow bosTau6 (DONE - 2011-05-31- Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzBosTau6.2011-05-31 cd /hive/data/genomes/monDom5/bed/lastzBosTau6.2011-05-31 cat << '_EOF_' > DEF # Cow vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzBosTau6.2011-05-31 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real real 1188m23.907s cat fb.monDom5.chainBosTau6Link.txt # 207703303 bases of 3501660299 (5.932%) in intersection # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.monDom5.swap cd /hive/data/genomes/bosTau6/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzBosTau6.2011-05-31/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 52m10.229s cat fb.bosTau6.chainMonDom5Link.txt # 199980646 bases of 2649682029 (7.547%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.monDom5.swap lastz.monDom5 ######################################################################### # LASTZ Frog xenTro3 (DONE - 2011-09-22 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzXenTro3.2011-09-22 cd /hive/data/genomes/monDom5/bed/lastzXenTro3.2011-09-22 cat << '_EOF_' > DEF # Opossum vs. Frog BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Frog XenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzXenTro3.2011-09-22 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # Elapsed time: 1329m50s cat fb.monDom5.chainXenTro3Link.txt # 75079446 bases of 3501660299 (2.144%) in intersection # and the swap mkdir /hive/data/genomes/xenTro3/bed/blastz.monDom5.swap cd /hive/data/genomes/xenTro3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzXenTro3.2011-09-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 66m38.348s cat fb.xenTro3.chainMonDom5Link.txt # 74593612 bases of 1358334882 (5.492%) in intersection cd /hive/data/genomes/xenTro3/bed ln -s blastz.monDom5.swap lastz.monDom5 ############################################################################ # Blastz Elephant loxAfr3 (DONE - 2011-11-10 - Chin) mkdir /hive/data/genomes/monDom5/bed/lastzLoxAfr3.2011-11-10 cd /hive/data/genomes/monDom5/bed/lastzLoxAfr3.2011-11-10 cat << '_EOF_' > DEF # Opossum vs. Elephant BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Elephant loxAfr3 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/monDom5/bed/lastzLoxAfr3.2011-11-10 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 528m3.128s # cat step failed due to "memkk" error, continue to run rm -r /cluster/data/monDom5/bed/lastzLoxAfr3.2011-11-10/run.cat/ time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue cat \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > cat.log 2>&1 & # real 1222m21.262s cat fb.monDom5.chainLoxAfr3Link.txt # 228949705 bases of 3501660299 (6.538%) in intersection cd /hive/data/genomes/monDom5/bed ln -s lastzLoxAfr3.2011-11-10 lastz.loxAfr3 # trying syntenic nets time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 & # real 6m7.179s mkdir /hive/data/genomes/loxAfr3/bed/blastz.monDom5.swap cd /hive/data/genomes/loxAfr3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzLoxAfr3.2011-11-10/DEF \ -swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -syntenicNet > swap.log 2>&1 & # real 100m41.251s cat fb.loxAfr3.chainMonDom5Link.txt # 230673812 bases of 3118565340 (7.397%) in intersection cd /hive/data/genomes/loxAfr3/bed ln -s blastz.monDom5.swap lastz.monDom5 ############################################################################## # LASTZ Cow bosTau7 (DONE - 2012-01-26- Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26 cd /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26 cat << '_EOF_' > DEF # Cow vs. Opossum BLASTZ_M=50 # TARGET: Opossum MonDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1102m12.996s cat fb.monDom5.chainBosTau7Link.txt # 206928725 bases of 3501660299 (5.909%) in intersection cd /hive/data/genomes/monDom5/bed ln -s lastzBosTau7.2012-01-26 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap cd /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 59m33.272s cat fb.bosTau7.chainMonDom5Link.txt # 207518037 bases of 2804673174 (7.399%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.monDom5.swap lastz.monDom5 ############################################################################ # LASTZ Rhesus rheMac3 SWAP (DONE - 2012-07-19 - Chin) cd /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19 cat fb.monDom5.chainRheMac3Link.txt # 221659579 bases of 3501660299 (6.330%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s lastzRheMac3.2012-07-19 lastz.rheMac3 mkdir /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap cd /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 106m24.321s cat fb.rheMac3.chainMonDom5Link.txt # 214801189 bases of 2639145830 (8.139%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.monDom5.swap lastz.monDom5 ######################################################################### # LASTZ petMar2 Lamprey (DONE - 2012-10-23 - Hiram) screen -S monDom5PetMar2 # use a screen to control this multi-day job mkdir /cluster/data/monDom5/bed/lastzPetMar2.2012-10-23 cd /cluster/data/monDom5/bed/lastzPetMar2.2012-10-23 cat << '_EOF_' > DEF # Opossum vs Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom5 SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Lamprey petMar2 SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/monDom5/bed/lastzPetMar2.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=swarm > do.log 2>&1 & # real 483m8.591s cat fb.monDom5.chainPetMar2Link.txt # 25404425 bases of 3501660299 (0.725%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s lastzPetMar2.2012-10-23 lastz.petMar2 # and the swap mkdir /cluster/data/petMar2/bed/blastz.monDom5.swap cd /cluster/data/petMar2/bed/blastz.monDom5.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/monDom5/bed/lastzPetMar2.2012-10-23/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 & # real 11m54.144s cat fb.petMar2.chainMonDom5Link.txt # 16944977 bases of 647368134 (2.618%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/petMar2/bed ln -s blastz.monDom5.swap lastz.monDom5 ############################################################################ # lastz White Rhino cerSim1 (DONE - 2012-10-23 - Hiram) screen -S monDomr5CerSim1 mkdir /hive/data/genomes/monDom5/bed/lastzCerSim1.2012-10-23 cd /hive/data/genomes/monDom5/bed/lastzCerSim1.2012-10-23 cat << '_EOF_' > DEF # opossum vs. White Rhino # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Opossum (monDom5) SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit SEQ1_LEN=/scratch/data/monDom5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/monDom5/bed/lastzCerSim1.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1598m7.927s # problem chaining, running out of memory, running these three on hgwdev # manually: export maxMem=188743680 ulimit -S -m $maxMem -v $maxMem ./chain.csh monDom5.2bit:chr3: chain/monDom5.2bit:chr3:.chain & ./chain.csh monDom5.2bit:chr4: chain/monDom5.2bit:chr4:.chain & ./chain.csh monDom5.2bit:chr1: chain/monDom5.2bit:chr1:.chain wait # real 422m31.359s time doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > chainMerge.log 2>&1 & # real 63m24.561s cat fb.monDom5.chainCerSim1Link.txt # 370181399 bases of 3501660299 (10.572%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s lastzCerSim1.2012-10-23 lastz.cerSim1 mkdir /hive/data/genomes/cerSim1/bed/blastz.monDom5.swap cd /hive/data/genomes/cerSim1/bed/blastz.monDom5.swap time doBlastzChainNet.pl \ /hive/data/genomes/monDom5/bed/lastzCerSim1.2012-10-23/DEF \ -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 69m55.609s cat fb.cerSim1.chainMonDom5Link.txt # 362581231 bases of 2366858012 (15.319%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cerSim1/bed ln -s blastz.monDom5.swap lastz.monDom5 #########################################################################