# for emacs: -*- mode: sh; -*- # Takifugu rubripes # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Takifugu_rubripes/FUGU5/ # # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAAB02 # WGS: CAAB02000001:CAAB02030857 # http://www.ncbi.nlm.nih.gov/nuccore/CAAB00000000 # http://www.fugu-sg.org/ # ########################################################################## # Download sequence (DONE - 2011-11-09 - Hiram) mkdir -p /hive/data/genomes/fr3/genbank cd /hive/data/genomes/fr3/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Takifugu_rubripes/FUGU5/*" faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 281572362 bases (12721879 N's 268850483 real 268850483 upper 0 lower) in 22 sequences in 22 files cat << '_EOF_' > mkUcsc.pl #!/usr/bin/env perl use strict; use warnings; my %chrFrNameToChrN; open (FH, "< Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $frName) = split('\s+', $line); $chrFrNameToChrN{$frName} = $chrN; } close (FH); my $firstHeader = 1; open (UC, "|gzip -c > ucsc.agp.gz") or die "can not write to ucsc.agp.gz"; open (FA, "|gzip -c > ucsc.fa.gz") or die "can not write to uscs.fa.gz"; foreach my $key (sort keys %chrFrNameToChrN) { my $chrN = $chrFrNameToChrN{$key}; printf "%s\tchr%s\n", $key, $chrN; my $fastaFile = "Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz"; my $agpFile = "Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = ) { if ($line =~ m/^#/) { if ($firstHeader) { printf UC "%s", $line; } next; } $firstHeader = 0; $line =~ s/$key/chr${chrN}/; printf UC "%s", $line; } close (FH); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; while (my $line = ) { if ($line =~ m/^>/) { die "can not match fasta name $key in $fastaFile\n$line" if ($line !~ m/$key/); printf FA ">chr${chrN}\n"; next; } printf FA "%s", $line; } close (FH); } close (UC); close (FA); '_EOF_' # << happy emacs chmod +x mkUcsc.pl time ./mkUcsc.pl # real 18m3.244s zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | \ sed -e 's/\.1//' | gzip -c > ucsc.unplaced.agp.gz zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e 's/^>.*emb|\([A-Z0-9]*\).*/>\1/' | gzip -c \ > ucsc.unplaced.fa.gz ########################################################################### # Initial genome build (DONE - 2011-11-16 - Hiram) cd /hive/data/genomes/fr3 cat << '_EOF_' > fr3.config.ra # Config parameters for makeGenomeDb.pl: db fr3 clade vertebrate genomeCladePriority 110 scientificName Takifugu Rubripes commonName Fugu assemblyDate Oct. 2011 assemblyLabel Fugu Genome Sequencing Consortium - FUGU5 (NCBI project 1434, GCA_000180615.2) assemblyShortLabel fr3.1 # orderKey = fr2.orderKey - 1 orderKey 463 # GI: 23397366 mitoAcc NC_004299 fastaFiles /hive/data/genomes/fr3/genbank/ucsc.*fa.gz agpFiles /hive/data/genomes/fr3/genbank/ucsc.*agp.gz dbDbSpeciesDir fugu # qualFiles none taxId 31033 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp -workhorse=hgwdev -fileServer=hgwdev \ fr3.config.ra > step.agp.log 2>&1 & # real 0m29.258s # take a look at the constructed AGP file to see if it has the names # desired, then continuing: time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ fr3.config.ra > step.db.log 2>&1 & # about 3 minutes ################################################ ## WINDOWMASKER (DONE - 2011-11-16 - Hiram) mkdir /hive/data/genomes/fr3/bed/windowMasker cd /hive/data/genomes/fr3/bed/windowMasker time doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev fr3 > do.log 2>&1 & # real 21m47.182s # Masking statistics twoBitToFa fr3.wmsk.sdust.2bit stdout | faSize stdin # 391484715 bases (40522884 N's 350961831 real 284458798 upper # 66503033 lower) in 6835 sequences in 1 files # %16.99 masked total, %18.95 masked real hgLoadBed fr3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 1742235 elements of size 3 featureBits -countGaps fr3 windowmaskerSdust # 107012497 bases of 391484715 (27.335%) in intersection # eliminate the gaps from the masking featureBits fr3 -not gap -bed=notGap.bed # 350961831 bases of 350961831 (100.000%) in intersection time nice -n +19 featureBits fr3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 66503033 bases of 350961831 (18.949%) in intersection # reload track to get it clean hgLoadBed fr3 windowmaskerSdust cleanWMask.bed.gz # Loaded 1743810 elements of size 4 featureBits -countGaps fr3 windowmaskerSdust # 66503033 bases of 391484715 (16.987%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../fr3.unmasked.2bit stdin \ -type=.bed fr3.cleanWMSdust.2bit twoBitToFa fr3.cleanWMSdust.2bit stdout | faSize stdin \ > fr3.cleanWMSdust.faSize.txt cat fr3.cleanWMSdust.faSize.txt # 391484715 bases (40522884 N's 350961831 real 284458798 upper 66503033 lower) in 6835 sequences in 1 files # %16.99 masked total, %18.95 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps fr3 rmsk windowmaskerSdust # 21610292 bases of 391484715 (5.520%) in intersection ########################################################################## # running repeat masker (DONE - 2011-11-16 - Hiram) mkdir /hive/data/genomes/fr3/bed/repeatMasker cd /hive/data/genomes/fr3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk fr3 > do.log 2>&1 & # real 46m48.252s cat faSize.rmsk.txt # 391484715 bases (40522884 N's 350961831 real 322500133 upper # 28461698 lower) in 6835 sequences in 1 files # %7.27 masked total, %8.11 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps fr3 rmsk # 28570382 bases of 391484715 (7.298%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2011-11-16 - Hiram) mkdir /hive/data/genomes/fr3/bed/simpleRepeat cd /hive/data/genomes/fr3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ fr3 > do.log 2>&1 & # real 14m11.20 cat fb.simpleRepeat # 10415803 bases of 350961831 (2.968%) in intersection # We are not going to use the RepeatMasker business since Window Masker # covers much more. It it was just TRF and rmsk, the following would # take place. # when repeatMasker above is completed, add this mask: cd /hive/data/genomes/fr3 twoBitMask fr3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed fr3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa fr3.2bit stdout | faSize stdin > faSize.fr3.2bit.txt cat faSize.fr3.2bit.txt # 391484715 bases (40522884 N's 350961831 real 321862602 upper # 29099229 lower) in 6835 sequences in 1 files # %7.43 masked total, %8.29 masked real # *** REMEMBER *** to reset they symLink in gbdb: rm /gbdb/fr3/fr3.2bit ln -s `pwd`/fr3.2bit /gbdb/fr3/fr3.2bit ######################################################################### ## Add TRF mask to WindowMasker masked sequence cd /cluster/data/fr3 twoBitMask bed/windowMasker/fr3.cleanWMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed fr3.2bit # you can safely ignore the warnings about >= 13 fields # check the total masking twoBitToFa fr3.2bit stdout | faSize stdin # 391484715 bases (40522884 N's 350961831 real 284231237 upper # 66730594 lower) in 6835 sequences in 1 files # %17.05 masked total, %19.01 masked real # as an experiment, see what rmsk adds: twoBitMask fr3.2bit \ -add bed/repeatMasker/fr3.sorted.fa.out fr3.trf.wm.rmsk.2bit twoBitToFa fr3.trf.wm.rmsk.2bit stdout | faSize stdin # 391484715 bases (40522884 N's 350961831 real 277411829 upper # 73550002 lower) in 6835 sequences in 1 files # %18.79 masked total, %20.96 masked real # which is an extra 6.8 million bases: 73550002-66730594 = 6819408 # Make this be the actual file for the genome browser rm /gbdb/fr3/fr3.2bit ln -s /hive/data/genomes/fr3/fr3.2bit /gbdb/fr3/fr3.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2011-11-16 - Hiram) mkdir /hive/data/genomes/fr3/bed/gap cd /hive/data/genomes/fr3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../fr3.unmasked.2bit > findMotif.txt 2>&1 # real 1m8.647s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits fr3 -not gap -bed=notGap.bed # 363677527 bases of 363677527 (100.000%) in intersection featureBits fr3 allGaps.bed notGap.bed -bed=new.gaps.bed # 12715696 bases of 363677527 (3.496%) in intersection # Wow, that's a lot of unmarked gap # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" fr3 | sort -n | tail -1 # 110 # that number is used below in the script to mark all the new gaps # with a higher ix than that cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" fr3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps fr3 other.bed # 12715696 bases of 363677527 (3.248%) in intersection wc -l other.bed # 24141 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad fr3 otherGap other.bed # starting with this many hgsql -e "select count(*) from gap;" fr3 # 14303 hgsql fr3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" fr3 # 38444 # == 24141 + 14303 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 fr3 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are non-bridged gaps here: hgsql -N -e "select bridge from gap;" fr3 | sort | uniq -c # 256 no # 38188 yes ########################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2011-11-17 - Hiram) # Use -repMatch=128 (based on size -- for human we use 1024, and # fugu size is ~12% of human judging by gapless fr3 vs. hg18 # genome sizes from featureBits. Use the "real" number from # the faSize measurements # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 350961831 / 2897316137 \) \* 1024 # ( 350961831 / 2897316137 ) * 1024 = 124.040629 # round up to 128 cd /hive/data/genomes/fr3 blat fr3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/fr3.11.ooc -repMatch=128 # Wrote 8853 overused 11-mers to jkStuff/fr3.11.ooc # copy all of this stuff to the klusters: # there are some non-bridged gaps hgsql -N -e "select bridge from gap;" fr3 | sort | uniq -c # 256 no # 38188 yes cd /hive/data/genomes/fr3/jkStuff gapToLift fr3 fr3.nonBridged.lift -bedFile=fr3.nonBridged.bed cd /hive/data/genomes/fr3 mkdir /hive/data/staging/data/fr3 cp -p jkStuff/fr3.11.ooc chrom.sizes fr3.2bit jkStuff/fr3.nonBridged.lift \ /hive/data/staging/data/fr3/ # request rsync copy from cluster admin ######################################################################### # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr3", "blat1", "17814", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("fr3", "blat1", "17815", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## Default position set same as fr2 (DONE - 2011-11-17 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr21:5807962-5832802" where name="fr3";' hgcentraltest ######################################################################### # AUTO UPDATE GENBANK (DONE - 2011-11-17,28 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Takifugu rubripes 1271 26069 429 # to decide which "native" mrna or ests you want to specify in genbank.conf # this appears that fr3 has plenty of native est's ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add fr3 just after fr2 and commit to GIT # fr3 fr3.serverGenome = /hive/data/genomes/fr3/fr3.2bit fr3.clusterGenome = /scratch/data/fr3/fr3.2bit fr3.ooc = /scratch/data/fr3/fr3.11.ooc fr3.align.unplacedChroms = HE* fr3.lift = /scratch/data/fr3/fr3.nonBridged.lift fr3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} fr3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} fr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} fr3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} fr3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} fr3.genbank.mrna.xeno.loadDesc = yes fr3.refseq.mrna.xeno.load = no # fr3.upstreamGeneTbl = ensGene # fr3.upstreamMaf = multiz5way # end of section added to etc/genbank.conf git commit -m "adding fr3 Takifugu rubripes" genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # fr genome information, if this is a new species, need to add stuff there ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial fr3 & # var/build/logs/2011.11.28-09:01:11.fr3.initalign.log # real 111m30.317s # tail the log to make sure it is successful: # hgwdev 2011.11.28-10:52:34 fr3.initalign: Succeeded: fr3 # hgwdev 2011.11.28-10:52:41 fr3.initalign: finish # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr3 & # logFile: var/dbload/hgwdev/logs/2011.11.28-18:56:26.dbload.log # real 56m28.463s # enable daily alignment and update of hgwdev (DONE - 2011-04-14 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add fr3 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added fr3." etc/align.dbs etc/hgwdev.dbs git push make etc-update ########################################################################### # lastz Zebrafish danRer7 (DONE - 2011-12-01 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01 cd /hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01 cat << '_EOF_' > DEF # Fugu vs. Zebrafish # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 764m6.061s cat fb.fr3.chainDanRer7Link.txt # 80101694 bases of 350961831 (22.823%) in intersection mkdir /hive/data/genomes/danRer7/bed/blastz.fr3.swap cd /hive/data/genomes/danRer7/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 9m54.286s cat fb.danRer7.chainFr3Link.txt # 103831209 bases of 1409770109 (7.365%) in intersection ############################################################################ # lastz Nile tilapia oreNil1 (DONE - 2011-12-01,02 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01 cd /hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01 cat << '_EOF_' > DEF # Fugu vs. Nile tilapia # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Nile tilapia oreNil1 SEQ2_DIR=/scratch/data/oreNil1/oreNil1.2bit SEQ2_LEN=/scratch/data/oreNil1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 1023m19.470s cat fb.fr3.chainOreNil1Link.txt # 200890618 bases of 350961831 (57.240%) in intersection mkdir /hive/data/genomes/oreNil1/bed/blastz.fr3.swap cd /hive/data/genomes/oreNil1/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 17m29.215s cat fb.oreNil1.chainFr3Link.txt # 250828022 bases of 816084674 (30.736%) in intersection ############################################################################ # lastz Tetraodon tetNig2 (DONE - 2011-12-01,02 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01 cd /hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01 cat << '_EOF_' > DEF # Fugu vs. Tetraodon # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Tetraodon tetNig2 SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=1 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 468m6.556s cat fb.fr3.chainTetNig2Link.txt # 248874754 bases of 350961831 (70.912%) in intersection mkdir /hive/data/genomes/tetNig2/bed/blastz.fr3.swap cd /hive/data/genomes/tetNig2/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 10m59.521s cat fb.tetNig2.chainFr3Link.txt # 243890006 bases of 302314788 (80.674%) in intersection ############################################################################ # lastz Stickleback gasAcu1 (DONE - 2011-12-01 - Hiram) # experiment here, two runs on Stickleback, one with gasAcu1 # as it stands with little masking, and a second with it masked # with WindowMasker. Result: we get more coverage with the less masked # sequence twoBitToFa /scratch/data/gasAcu1/gasAcu1.2bit stdout | faSize stdin # 463354448 bases (16726587 N's 446627861 real 435123380 upper # 11504481 lower) in 23 sequences in 1 files # %2.48 masked total, %2.58 masked real twoBitToFa /hive/data/genomes/gasAcu1/bed/windowMasker/gasAcu1.TRF.WMSdust.2bit stdout | faSize stdin mkdir /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02 cd /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02 cat << '_EOF_' > DEF # Fugu vs. Stickleback # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Stickleback gasAcu1 SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=1 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 267m15.622s cat fb.fr3.chainGasAcu1Link.txt # 194876988 bases of 350961831 (55.527%) in intersection # vs. the more masked result: # 188878416 bases of 350961831 (53.817%) in intersection mkdir /hive/data/genomes/gasAcu1/bed/blastz.fr3.swap cd /hive/data/genomes/gasAcu1/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 15m25.221s cat fb.gasAcu1.chainFr3Link.txt # 214549957 bases of 446627861 (48.038%) in intersection ############################################################################## # experiment lastz test on Stickleback with window masker sequence # (DONE - Hiram - 2011-12-02) mkdir /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest cd /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest cat << '_EOF_' > DEF # Fugu vs. Stickleback # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Stickleback gasAcu1 SEQ2_DIR=/hive/data/genomes/gasAcu1/bed/windowMasker/gasAcu1.TRF.WMSdust.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=1 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 253m13.215s cat fb.fr3.chainGasAcu1Link.txt # 188878416 bases of 350961831 (53.817%) in intersection # vs. the less masked result: # 194876988 bases of 350961831 (55.527%) in intersection ############################################################################ # lastz Medaka oryLat2 (DONE - 2011-12-01,02 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01 cd /hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01 cat << '_EOF_' > DEF # Fugu vs. Medaka # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Medaka oryLat2 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 725m19.856s cat fb.fr3.chainOryLat2Link.txt # 153943192 bases of 350961831 (43.863%) in intersection mkdir /hive/data/genomes/oryLat2/bed/blastz.fr3.swap cd /hive/data/genomes/oryLat2/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 15m6.162s cat fb.oryLat2.chainFr3Link.txt # 181436751 bases of 700386597 (25.905%) in intersection ############################################################################ # lastz Atlantic Cod gadMor1 (DONE - 2012-01-11,17 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11 cd /hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11 cat << '_EOF_' > DEF # Fugu vs. Atlantic Cod # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Atlantic Cod gadMor1 SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # Elapsed time: 5107m6s == 3 days 13 hours 7m cat fb.fr3.chainGadMor1Link.txt # 106436551 bases of 350961831 (30.327%) in intersection mkdir /hive/data/genomes/gadMor1/bed/blastz.fr3.swap cd /hive/data/genomes/gadMor1/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 160m13.224s cat fb.gadMor1.chainFr3Link.txt # 115839168 bases of 608038597 (19.051%) in intersection ############################################################################ # lastz Coelacanth latCha1 (DONE - 2011-12-12,17 - Hiram) mkdir /hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12 cd /hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12 cat << '_EOF_' > DEF # Fugu vs. Coelacanth # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Fugu fr3 # CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go SEQ1_DIR=/scratch/data/fr3/fr3.2bit SEQ1_LEN=/scratch/data/fr3/chrom.sizes SEQ1_CHUNK=24000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Coelacanth latCha1 SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 1626m11.140s cat fb.fr3.chainLatCha1Link.txt # 48206149 bases of 350961831 (13.735%) in intersection mkdir /hive/data/genomes/latCha1/bed/blastz.fr3.swap cd /hive/data/genomes/latCha1/bed/blastz.fr3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 8m47.103s cat fb.latCha1.chainFr3Link.txt # 62854403 bases of 2183592768 (2.878%) in intersection ############################################################################ ## 8-Way Multiz (WORKING - 2012-01-18 - Hiram) ssh hgwdev mkdir /hive/data/genomes/fr3/bed/multiz8way cd /hive/data/genomes/fr3/bed/multiz8way # species list: cd /hive/data/genomes/fr3/bed ls -d lastz.* | sed -e "s/lastz.//" | sort | xargs echo # danRer7 gadMor1 gasAcu1 latCha1 oreNil1 oryLat2 tetNig2 # plus this one # fr3 # All distances remain as specified in the 60way.nh /cluster/bin/phast/tree_doctor --prune-all-but \ fr3,danRer7,gadMor1,gasAcu1,latCha1,oreNil1,oryLat2,tetNig2 \ $HOME/kent/src/hg/utils/phyloTrees/60way.nh > 8way.nh # what that looks like: cat 8way.nh # (latCha1:1.166927,((((((tetNig2:0.224159,fr3:0.203847):0.097590, # oreNil1:0.200000):0.097590,gasAcu1:0.316413):0.030000, # oryLat2:0.511970):0.030000,gadMor1:0.350000):0.225640, # danRer7:0.730752):0.147949); # rearrange to get fr3 on top: cat << '_EOF_' > fr3.8way.nh (((((((fr3:0.203847,tetNig2:0.224159):0.097590,oreNil1:0.200000):0.097590, gasAcu1:0.316413):0.030000,oryLat2:0.511970):0.030000, gadMor1:0.350000):0.225640,danRer7:0.730752):0.147949,latCha1:1.166927); '_EOF_' # << happy emacs # convert to species names /cluster/bin/phast/tree_doctor --rename \ "fr3->Fugu; tetNig2->Tetraodon; oreNil1->Nile_Tilapia; gasAcu1->Stickleback; oryLat2->Medaka; gadMor1->Atlantic_cod; danRer7->Zebrafish; latCha1->Coelacanth;" \ fr3.8way.nh > fr3.commonNames.8way.nh # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/fr3_8way.png /cluster/bin/phast/all_dists fr3.8way.nh > 8way.distances.txt # Use this output to create the table below grep -i fr3 8way.distances.txt | sort -k3,3n # fr3 tetNig2 0.428006 # fr3 oreNil1 0.501437 # fr3 gasAcu1 0.715440 # fr3 gadMor1 0.809027 # fr3 oryLat2 0.940997 # fr3 danRer7 1.415419 # fr3 latCha1 1.999543 # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainFr3Link chain linearGap # distance on fr3 on other minScore # 1 0.428 - tetraodon tetNig2 (% 70.91) (% 80.67) 2000 medium # 2 0.501 - nile tilapia oreNil1 (% 57.24) (% 30.74) 2000 medium # 3 0.715 - stickleback gasAcu1 (% 55.53) (% 48.04) 2000 medium # 4 0.809 - altantic cod gadMor1 (% 30.33) (% 19.05) 2000 medium # 5 0.941 - medaka oryLat2 (% 43.86) (% 25.91) 2000 medium # 6 1.415 - zebrafish danRer7 (% 22.82) (% 7.37) 2000 medium # 7 2.000 - coelacanth latCha1 (% 13.74) (% 2.88) 2000 medium # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ fr3.8way.nh > tmp.nh echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # break up maf files into smaller pieces: mkdir /hive/data/genomes/fr3/bed/multiz8way/mafInputs cd /hive/data/genomes/fr3/bed/multiz8way/mafInputs for S in `cat ../species.list | sed -e "s/fr3 //"` do echo $S rm -fr ./${S}/ mkdir -p ./${S}/ netMaf="../../lastz.${S}/mafNet/fr3.${S}.net.maf.gz" ls -l "${netMaf}" mafSplit -byTarget -useHashedName=5 /dev/null ./${S}/ ${netMaf} done # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 255 find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 32 maf.list mkdir /hive/data/genomes/fr3/bed/multiz8way/splitRun cd /hive/data/genomes/fr3/bed/multiz8way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = fr3 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/fr3/bed/multiz8way/mafInputs /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/fr3/bed/multiz8way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafInputs/maf.list maf.list ssh swarm cd /hive/data/genomes/fr3/bed/multiz8way/splitRun/run # the tac reverses the list to get the small jobs first gensub2 maf.list single template stdout | tac > jobList # the ram=8g will allow only one job on a node to slow them down para -ram=8g create jobList # Completed: 32 of 32 jobs # CPU time in finished jobs: 22781s 379.69m 6.33h 0.26d 0.001 y # IO & Wait Time: 967s 16.11m 0.27h 0.01d 0.000 y # Average job time: 742s 12.37m 0.21h 0.01d # Longest finished job: 2113s 35.22m 0.59h 0.02d # Submission to last job: 17092s 284.87m 4.75h 0.20d # assemble into a single maf file cd /hive/data/genomes/fr3/bed/multiz8way head -1 splitRun/maf/001.maf > multiz8way.maf for F in splitRun/maf/*.maf do egrep -v "^#" ${F} done >> multiz8way.maf tail -1 splitRun/maf/001.maf >> multiz8way.maf # -rw-rw-r-- 1 2491426291 Feb 21 12:06 multiz8way.maf # Load into database ssh hgwdev cd /hive/data/genomes/fr3/bed/multiz8way mkdir /gbdb/fr3/multiz8way ln -s `pwd`/multiz8way.maf /gbdb/fr3/multiz8way cd /scratch/tmp time nice -n +19 hgLoadMaf fr3 multiz8way # Indexing and tabulating /gbdb/fr3/multiz8way/multiz8way.maf # Loaded 2735007 mafs in 1 files from /gbdb/fr3/multiz8way # real 1m48.011s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 fr3 multiz8waySummary \ /gbdb/fr3/multiz8way/multiz8way.maf # Created 179258 summary blocks from 8283246 components and 2735007 mafs # from /gbdb/fr3/multiz8way/multiz8way.maf # real 1m51.378s wc -l multiz8way*.tab # 2735007 multiz8way.tab # 179258 multiz8waySummary.tab # 2914265 total rm multiz8way*.tab ####################################################################### # GAP ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE - 2012-02-21 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/fr3/bed/multiz8way/anno/mafSplit cd /hive/data/genomes/fr3/bed/multiz8way/anno/mafSplit time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../multiz8way.maf # real 1m58.310s find . -type f | wc # 6214 6214 117987 cd /hive/data/genomes/fr3/bed/multiz8way/anno # a couple of these do not yet have an N.bed file for DB in `cat ../species.list` do cd /hive/data/genomes/${DB} if [ ! -s "${DB}.N.bed" ]; then twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed ls -og ${DB}.2bit ${DB}.N.bed fi done for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL ssh swarm cd /hive/data/genomes/fr3/bed/multiz8way/anno mkdir result # NEXT TIME: this template should have a check out exists+ statement cat << '_EOF_' > template #LOOP runOne $(path1) {check out exists+ result/$(path1)} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runOne #!/bin/sh set -beEu -o pipefail D=`dirname $2` mkdir -p "${D}" mafAddIRows -nBeds=nBeds $1 /hive/data/genomes/fr3/fr3.2bit $2 '_EOF_' # << happy emacs find ./mafSplit -type f | sed -e 's#^./##' > maf.list gensub2 maf.list single template jobList # limit jobs to one per node with the ram=8g requirement para -ram=8g create jobList para try ... check ... push ... # Completed: 6214 of 6214 jobs # CPU time in finished jobs: 1542s 25.70m 0.43h 0.02d 0.000 y # IO & Wait Time: 16187s 269.78m 4.50h 0.19d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 16s 0.27m 0.00h 0.00d # Submission to last job: 331s 5.52m 0.09h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # combine into one file head -q -n 1 result/mafSplit/0/0/HE591688.maf > fr3.8way.maf find ./result -type f | while read F do grep -h -v "^#" ${F} done >> fr3.8way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/GL172637.maf >> fr3.8way.maf # How about an official end marker: echo "##eof maf" >> fr3.8way.maf # Load into database rm /gbdb/fr3/multiz8way/multiz8way.maf # remove old symlink ln -s `pwd`/fr3.8way.maf /gbdb/fr3/multiz8way/multiz8way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf fr3 multiz8way # Loaded 2844956 mafs in 1 files from /gbdb/fr3/multiz8way # real 1m58.565s time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 fr3 multiz8waySummary \ /gbdb/fr3/multiz8way/multiz8way.maf # Created 179258 summary blocks from 8283246 components and 2844956 mafs # from /gbdb/fr3/multiz8way/multiz8way.maf # real 2m26.270s wc -l multiz8way*.tab # 2844956 multiz8way.tab # 179258 multiz8waySummary.tab # 3024214 total rm multiz8way*.tab ######################################################################### # MULTIZ9WAY MAF FRAMES (DONE - 2012-02-21 - Hiram) ssh hgwdev mkdir /hive/data/genomes/fr3/bed/multiz8way/frames cd /hive/data/genomes/fr3/bed/multiz8way/frames mkdir genes # instead of knownGene, so, using ensGene for hg19 and mm9 fr3 - the lifted ensGene from fr2 tetNig2 - ensGene oreNil1 - none gasAcu1 - ensGene oryLat2 - ensGene gadMor1 - none danRer7 - ensGene latCha1 - none #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for: for qDB in tetNig2 gasAcu1 oryLat2 danRer7 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # xenoMrna hgsql -N -e "select * from xenoMrna" fr3 | cut -f2- \ | pslToBed stdin stdout | bedToGenePred stdin stdout \ | genePredSingleCover stdin stdout | gzip -2c > genes/fr3.gp.gz # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/danRer7.gp.gz: 25919 # genes/fr3.gp.gz: 29280 # genes/gasAcu1.gp.gz: 20631 # genes/oryLat2.gp.gz: 19576 # genes/tetNig2.gp.gz: 19539 ssh hgwdev cd /hive/data/genomes/fr3/bed/multiz8way/frames time (cat ../anno/fr3.8way.maf \ | nice -n +19 genePredToMafFrames fr3 stdin stdout \ `echo "fr3 tetNig2 gasAcu1 oryLat2 danRer7" | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g"` \ | gzip > multiz8way.mafFrames.gz) # real 2m22.546s # -rw-rw-r-- 1 17609757 Feb 21 14:29 multiz8way.mafFrames.gz # verify there are frames on everything: zcat multiz8way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 352686 danRer7 # 233699 fr3 # 298187 gasAcu1 # 278448 oryLat2 # 273743 tetNig2 ssh hgwdev cd /hive/data/genomes/fr3/bed/multiz8way/frames time hgLoadMafFrames fr3 multiz8wayFrames multiz8way.mafFrames.gz # real 0m12.716s ############################################################################ # creating upstream mafs (STALLED - 2012-02-21 - Hiram) # will need to wait until there are some ensGenes here ssh hgwdev mkdir -p /hive/data/genomes/fr3/goldenPath/multiz8way cd /hive/data/genomes/fr3/goldenPath/multiz8way # run this bash script cat << '_EOF_' > mkUpstream.sh DB=fr3 GENE=ensGene NWAY=multiz8way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.${GENE}.maf" echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout" featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} stdin stdout \ -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.${GENE}.maf.gz echo "done upstream${S}.${GENE}.maf.gz" done '_EOF_' # << happy emacs chmod +x mkUpstream.sh time ./mkUpstream.sh # real 17m4.037s # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the # upstream automatic generation ######################################################################### # Phylogenetic tree from 8-way (DONE - 2012-02-21 - Hiram) mkdir /hive/data/genomes/fr3/bed/multiz8way/4d cd /hive/data/genomes/fr3/bed/multiz8way/4d # the split annotated maf's are in: ../anno/result/mafSplit/*/*/*.maf cd /hive/data/genomes/fr3/bed/multiz8way/4d # no gene tracks yet on fr3. liftUp fr2 ensGenes to fr3 for this: hgsql -N -e "select * from ensGene;" fr2 | cut -f2- > fr2.ensGene.gp liftOver -genePred fr2.ensGene.gp \ /gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \ fr3.lifted.ensGene.gp unmapped.ensGene.gp wc -l fr2.ensGene.gp fr3.lifted.ensGene.gp unmapped.ensGene.gp # 48706 fr2.ensGene.gp # 48484 fr3.lifted.ensGene.gp # 444 unmapped.ensGene.gp # 97634 total genePredSingleCover fr3.lifted.ensGene.gp stdout | sort > fr3.ensGene.NR.gp wc -l fr3.ensGene.NR.gp # 18248 fr3.ensGene.NR.gp # (this was attempted with all_est which give a much different estimate) # (the distances were much smaller) ssh swarm mkdir /hive/data/genomes/fr3/bed/multiz8way/4d/run cd /hive/data/genomes/fr3/bed/multiz8way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/fr3/bed/multiz8way" set c = $1 set infile = $r/anno/result/mafSplit/$2 set resultDir = $3 mkdir -p ../mfa/$resultDir set outfile = $r/4d/mfa/$resultDir/$1.mfa cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/fr3.ensGene.NR.gp | sed -e "s/\t$c\t/\tfr3.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh find ../../anno/result/mafSplit -type f \ | sed -e "s#../../anno/result/mafSplit/##" > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) $(dir1) {check out exists ../mfa/$(dir1)/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList para create jobList para try ... check para -maxJob=5 push para time # some of the jobs can not complete # Completed: 6180 of 6214 jobs # Crashed: 24 jobs # Other count: 10 jobs # CPU time in finished jobs: 565s 9.42m 0.16h 0.01d 0.000 y # IO & Wait Time: 15923s 265.38m 4.42h 0.18d 0.001 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 21s 0.35m 0.01h 0.00d # Submission to last job: 861s 14.35m 0.24h 0.01d # combine mfa files ssh hgwdev cd /hive/data/genomes/fr3/bed/multiz8way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \ > empty.list cat empty.list | xargs rm #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" \ `find ./mfa -type f | xargs echo` | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >fr3 # >tetNig2 # >oreNil1 # >gasAcu1 # >oryLat2 # >gadMor1 # >danRer7 # >latCha1 # tree-commas.nh: # (((((((fr3,tetNig2),oreNil1),gasAcu1),oryLat2),gadMor1),danRer7),latCha1) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree-commas.nh 4d.all.mfa # real 0m51.111s mv phyloFit.mod all.mod grep TREE all.mod # TREE: # (((((((fr3:0.156632,tetNig2:0.195181):0.256538,oreNil1:0.28154):0.00848527,gasAcu1:0.305909):0.0519573,oryLat2:0.38542):0.140656,gadMor1:0.618515):0.26134,danRer7:0.587935):0.460809,latCha1:0.460809); /cluster/bin/phast/all_dists new8way.nh | grep "^fr3" fr3 tetNig2 0.351813 fr3 oreNil1 0.694710 fr3 gasAcu1 0.727564 fr3 oryLat2 0.859033 fr3 gadMor1 1.232784 fr3 danRer7 1.463544 fr3 latCha1 1.797227 # existing distances from the 60-way prototype tree, with # several branch lengths being pure guesses: (((((((fr3:0.203847,tetNig2:0.224159):0.09759,oreNil1:0.2):0.09759, gasAcu1:0.316413):0.03,oryLat2:0.511970):0.03,gadMor1:0.35):0.22564, danRer7:0.730752):0.147949,latCha1:0.855573); fr3 tetNig2 0.428006 fr3 oreNil1 0.501437 fr3 gasAcu1 0.715440 fr3 gadMor1 0.809027 fr3 oryLat2 0.940997 fr3 danRer7 1.415419 fr3 latCha1 1.688189 ######################################################################### # phastCons 8-way (DONE - 2012-02-22 - Hiram) # split 8way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh encodek mkdir -p /hive/data/genomes/fr3/bed/multiz8way/cons/SS cd /hive/data/genomes/fr3/bed/multiz8way/cons/SS mkdir result done cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set dirPath = $1 set mafFile = $2 set MAF = /hive/data/genomes/fr3/bed/multiz8way/anno/result/mafSplit/$dirPath/$mafFile.maf set WINDOWS = /hive/data/genomes/fr3/bed/multiz8way/cons/SS/result/$dirPath set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $3 ) then exit 0 endif mkdir -p done/$dirPath if ( -s done/$dirPath/$mafFile.running ) then exit 0 endif date >> done/$dirPath/$mafFile.running rm -f $WINDOWS/$mafFile mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$mafFile -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $3 rm -f done/$dirPath/$mafFile.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} #ENDLOOP '_EOF_' # << happy emacs # find ../../anno/result/mafSplit -type f \ | sed -e 's#../../anno/result/mafSplit/##' > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 6214 of 6214 jobs # CPU time in finished jobs: 562s 9.36m 0.16h 0.01d 0.000 y # IO & Wait Time: 16125s 268.76m 4.48h 0.19d 0.001 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 914s 15.23m 0.25h 0.01d # not all of them produce results # there isn't much aligned in these MAF files find ./result -type f | wc # 5472 5472 176757 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh encodek mkdir -p /hive/data/genomes/fr3/bed/multiz8way/cons/run.cons cd /hive/data/genomes/fr3/bed/multiz8way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set dir = $1 set c = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set resultFile = pp/$dir/$c.pp set grp = $cwd:t set cons = /hive/data/genomes/fr3/bed/multiz8way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/result/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $cons/SS/result/$dir/$f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$dir bed/$dir sleep 4 touch pp/$dir bed/$dir rm -f $resultFile rm -f bed/$dir/$f.bed mv $tmp/$f.pp pp/$dir mv $tmp/$f.bed bed/$dir rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(dir1) $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's#../SS/result/##; s/.ss$//' \ > ss.list wc -l ss.list # 3291 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/fr3/bed/multiz8way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... para push # Completed: 5472 of 5472 jobs # CPU time in finished jobs: 1187s 19.78m 0.33h 0.01d 0.000 y # IO & Wait Time: 35801s 596.69m 9.94h 0.41d 0.001 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 2238s 37.30m 0.62h 0.03d cd /hive/data/genomes/fr3/bed/multiz8way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed fr3 phastConsElements8way mostConserved.bed # Loaded 1126332 elements of size 5 # real 0m7.677s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between featureBits fr3 -enrichment ensGene:cds phastConsElements8way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # ensGene:cds 2.244%, phastConsElements8way 10.055%, both 1.669%, cover 74.39%, enrich 7.40x featureBits fr3 -enrichment mgcGenes:cds phastConsElements8way # mgcGenes:cds 0.741%, phastConsElements8way 10.055%, both 0.619%, cover 83.53%, enrich 8.31x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/fr3/bed/multiz8way/cons/all mkdir downloads find ./pp -type f | sed -e "s#/# dir #g; s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k9,9 -k11,11n | sed -e "s# dir #/#g; s# d #.#g; s# m #-#g;" \ | xargs cat \ | gzip -c > downloads/phastCons8way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons8way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/fr3/chrom.sizes \ phastCons8way.bw) > bigWig.log 2>&1 & tail bigWig.log # pid=25817: VmPeak: 3413828 kB # real 3m47.314s bigWigInfo phastCons8way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 606,493,400 # primaryIndexSize: 11,794,172 # zoomLevels: 10 # chromCount: 5452 # basesCovered: 305,346,303 # mean: 0.363248 # min: 0.000000 # max: 1.000000 # std: 0.412709 # encode those files into wiggle data time (zcat downloads/phastCons8way.wigFix.gz \ | wigEncode stdin phastCons8way.wig phastCons8way.wib) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 1m28.386s du -hsc *.wi? # 292M phastCons8way.wib # 34M phastCons8way.wig # 325M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons8way.wib /gbdb/fr3/multiz8way/phastCons8way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/fr3/multiz8way \ fr3 phastCons8way phastCons8way.wig # real 0m5.850s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh fr3 phastCons8way # db.table min max mean count sumData stdDev viewLimits # fr3.phastCons8way 0 1 0.363248 305346303 1.10917e+08 0.412709 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=fr3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons8way > histogram.data 2>&1 # real 0m46.993s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Fugu fr3 Histogram phastCons8way track" set xlabel " phastCons8way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 8-way (DONE - 2012-02-22 - Hiram) # run phyloP with score=LRT ssh encodek mkdir /cluster/data/fr3/bed/multiz8way/consPhyloP cd /cluster/data/fr3/bed/multiz8way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.613 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.613 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.193500 0.306500 0.306500 0.193500 # following the pattern from panTro3 with grp: "all" (no other grp) cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set dir = $1 set f = $2 set file1 = $f:t set out = $3 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/fr3/bed/multiz8way/consPhyloP set tmp = $cons/tmp/$grp/$cName rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/fr3/bed/multiz8way/cons/SS/result/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e 's#../../cons/SS/result/##; s/.ss$//' > ss.list # make sure the list looks good wc -l ss.list # 5472 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(dir1) $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all cd /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # the -ram=8g will allow only one job per node to slow this down since # it would run too fast otherwise. Either run on one of the small # klusters or use the -ram=8g on the para create para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 5690 of 5690 jobs # CPU time in finished jobs: 9670s 161.17m 2.69h 0.11d 0.000 y # IO & Wait Time: 38146s 635.76m 10.60h 0.44d 0.001 y # Average job time: 8s 0.14m 0.00h 0.00d # Longest finished job: 63s 1.05m 0.02h 0.00d # Submission to last job: 2448s 40.80m 0.68h 0.03d # make downloads mkdir downloads time (find ./wigFix -type f \ | sed -e "s#/# dir #g; s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k9,9 -k11,11n | sed -e "s# dir #/#g; s# d #.#g; s# m #-#g;" \ | xargs cat \ | gzip -c > downloads/phyloP8way.wigFix.gz) & # real 5m50.668s # check integrity of data with wigToBigWig time (zcat downloads/phyloP8way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/fr3/chrom.sizes \ phyloP8way.bw) > bigWig.log 2>&1 & egrep "real|VmPeak" bigWig.log # pid=32533: VmPeak: 3413824 kB # real 4m20.055s bigWigInfo phyloP8way.bw | sed -e "s/^/#\t/" # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 613,964,662 # primaryIndexSize: 11,794,172 # zoomLevels: 10 # chromCount: 5452 # basesCovered: 305,346,303 # mean: 0.340267 # min: -2.059000 # max: 2.645000 # std: 0.813855 # encode those files into wiggle data time (zcat downloads/phyloP8way.wigFix.gz \ | wigEncode stdin phyloP8way.wig phyloP8way.wib) & # Converted stdin, upper limit 2.65, lower limit -2.06 # real 1m34.170s du -hsc *.wi? # 292M phyloP8way.wib # 35M phyloP8way.wig # 326M total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP8way.wib /gbdb/fr3/multiz8way/phyloP8way.wib nice hgLoadWiggle -pathPrefix=/gbdb/fr3/multiz8way fr3 \ phyloP8way phyloP8way.wig # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh fr3 phyloP8way # db.table min max mean count sumData stdDev viewLimits # fr3.phyloP8way -2.059 2.645 0.340267 305346303 1.03899e+08 0.813855 # viewLimits=-2.059:2.645 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=fr3 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP8way > histogram.data 2>&1 # real 0m46.775s # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000392 # median 0.000683 # Q3 0.001184 # average 0.001000 # min 0.000086 # max 0.029306 # count 1000 # total 0.999978 # standard deviation 0.001531 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Fugu fr3 Histogram phyloP8way track" set xlabel " phyloP8way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.007] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # lifting ensGene track from fr2 (DONE - 2012-02-22 - Hiram) # no gene tracks yet on fr3. liftUp fr2 ensGenes to fr3 # history of fr2 ensGene indicates it has been the same there since # v60 of ensGene - thru today same as v65 mkdir /hive/data/genomes/fr3/bed/ensGene cd /hive/data/genomes/fr3/bed/ensGene hgsql -N -e "select * from ensGene;" fr2 | cut -f2- > fr2.ensGene.gp liftOver -genePred fr2.ensGene.gp \ /gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \ fr3.lifted.ensGene.gp unmapped.ensGene.gp ls -ogrt # -rw-rw-r-- 1 18680443 Feb 22 16:22 fr2.ensGene.gp # -rw-rw-r-- 1 15869141 Feb 22 16:22 fr3.lifted.ensGene.gp # -rw-rw-r-- 1 126578 Feb 22 16:22 unmapped.ensGene.gp hgLoadGenePred -skipInvalid -genePredExt fr3 ensGene fr3.lifted.ensGene.gp # Warning: skipping 782 invalid genePreds # make a list of what did get loaded: hgsql -N -e "select name from ensGene;" fr3 | sort -u > fr3.name.ensGene.txt wc -l fr3.name.ensGene.txt # 47702 fr3.name.ensGene.txt hgsql -N -e "select * from ensPep;" fr2 | sort > fr2.ensPep.tab hgsql -N -e "select * from ensGtp;" fr2 | sort -k2,2 > fr2.ensGtp.tab hgsql -N -e "select * from ensemblSource;" fr2 | sort -k1,1 \ > fr2.ensemblSource.tab hgsql -N -e "select * from ensemblToGeneName;" fr2 | sort -k1,1 \ > fr2.ensemblToGeneName.tab # select out ensGtp records that match with the names in fr3 ensGene: join -1 2 -2 1 -o "1.1,1.2,1.3" fr2.ensGtp.tab fr3.name.ensGene.txt \ | tr '[ ]' '[\t]' > fr3.ensGtp.tab wc -l *.ensGtp.tab # 48706 fr2.ensGtp.tab # 47702 fr3.ensGtp.tab # select out ensPep records that match with the names in fr3 ensGene: join -1 1 -2 2 -o "1.1,1.2" fr2.ensPep.tab fr3.ensGtp.tab \ | tr '[ ]' '[\t]' > fr3.ensPep.tab wc -l fr2.ensPep.tab fr3.ensPep.tab # 47841 fr2.ensPep.tab # 46842 fr3.ensPep.tab # select out ensemblSource records that match the fr3 ensGene names: join -1 1 -2 1 -o "1.1,1.2" fr2.ensemblSource.tab fr3.name.ensGene.txt \ | tr '[ ]' '[\t]' > fr3.ensemblSource.tab wc -l fr2.ensemblSource.tab fr3.ensemblSource.tab 48706 fr2.ensemblSource.tab 47702 fr3.ensemblSource.tab # select out ensemblToGeneName records that match the fr3 ensGene names: join -1 1 -2 1 -o "1.1,1.2" fr2.ensemblToGeneName.tab \ fr3.name.ensGene.txt | tr '[ ]' '[\t]' > fr3.ensemblToGeneName.tab wc -l fr2.ensemblToGeneName.tab fr3.ensemblToGeneName.tab # 40067 fr2.ensemblToGeneName.tab # 39200 fr3.ensemblToGeneName.tab hgPepPred fr3 tab ensPep fr3.ensPep.tab hgLoadSqlTab fr3 ensGtp ~/kent/src/hg/lib/ensGtp.sql fr3.ensGtp.tab # verify size of the names cut -f1 fr3.ensemblSource.tab | awk '{print length($0)}' \ | sort | uniq -c | head # 47702 18 sed -e "s/15/18/" ~/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql hgLoadSqlTab fr3 ensemblSource ensemblSource.sql fr3.ensemblSource.tab # find sizes for indexes NL=`awk '{print length($1)}' fr3.ensemblToGeneName.tab | sort -rn | head -1` VL=`awk '{print length($2)}' fr3.ensemblToGeneName.tab | sort -rn | head -1` # construct sql definition with appropriate index sizes sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \ ~/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql hgLoadSqlTab fr3 ensemblToGeneName ensemblToGeneName.sql \ fr3.ensemblToGeneName.tab hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("fr3", "ensGene", "hiram", "65", now(), \ "lifted from mm9 ensGene 65", \ "lifted from mm9 ensGene 65", \ "dec2011" );' hgFixed ############################################################################# # hgPal downloads (DONE - Hiram - 2011-12-08) # FASTA from 8way for ensGene ssh hgwdev rm -rf /hive/data/genomes/fr3/bed/multiz8way/pal mkdir /hive/data/genomes/fr3/bed/multiz8way/pal cd /hive/data/genomes/fr3/bed/multiz8way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list mz=multiz8way gp=ensGene db=fr3 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.list stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.list stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.list stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.list stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 49m45.740s mz=multiz8way gp=ensGene db=fr3 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz # about 12 minutes rm -rf exonAA exonNuc ppredAA ppredNuc & md5sum $gp.$mz.exonAA.fa.gz $gp.$mz.exonNuc.fa.gz > md5sum.txt # we're only distributing exons at the moment mz=multiz8way gp=ensGene db=fr3 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ # obtain a copy of a most-recent README.txt file cp -p /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/multiz11way/alignments/README.txt . # edit to bring up to date for fr3 ln -s `pwd`/README.txt $pd/ ############################################################################# # download data for 8-way (DONE - 2012-02-23 - Hiram) # use a README from the most recent multiz like this # this one is from gorGor3 cd /hive/data/genomes/fr3/bed/multiz8way/cons cp -p /hive/data/genomes/gorGor3/bed/multiz11way/cons/README.txt . # need to edit this to finish it off mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phastCons8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phastCons8way ln -s \ /hive/data/genomes/fr3/bed/multiz8way/cons/all/downloads/phastCons8way.wigFix.gz \ ./phastCons8way.wigFix.gz ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/all/phastCons8way.bw \ ./phastCons8way.bw ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/all/all.mod \ ./fish.mod ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/README.txt . md5sum phastCons8way.bw phastCons8way.wigFix.gz fish.mod \ > /hive/data/genomes/fr3/bed/multiz8way/cons/md5sum.txt ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/md5sum.txt . # use a README from the most recent multiz like this # this one is from xenTro3 cd /hive/data/genomes/fr3/bed/multiz8way/consPhyloP cp /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/README.txt . # need to edit this to finish it off mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phyloP8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phyloP8way ln -s \ /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all/downloads/phyloP8way.wigFix.gz \ ./phyloP8way.wigFix.gz ln -s \ /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all/phyloP8way.bw \ ./phyloP8way.bw ln -s \ /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/run.phyloP/all.mod \ ./fish.mod ln -s /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/README.txt . md5sum README.txt phyloP8way.bw phyloP8way.wigFix.gz fish.mod \ > /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/md5sum.txt ln -s /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/md5sum.txt . mkdir /hive/data/genomes/fr3/bed/multiz8way/downloads cd /hive/data/genomes/fr3/bed/multiz8way/downloads # use a README from the most recent multiz like this # this one is from xenTro3 cp -p /hive/data/genomes/gorGor3/bed/multiz11way/downloads/README.txt . # need to edit this README.txt to finish it off # you can find this common name translation above in the section that # began the multiz8way work: grep TREE ../cons/all/all.mod | sed -e "s/TREE: //" > fr3.8way.nh /cluster/bin/phast/tree_doctor --rename \ "fr3->Fugu; tetNig2->Tetraodon; oreNil1->Nile_Tilapia; gasAcu1->Stickleback; oryLat2->Medaka; gadMor1->Atlantic_cod; danRer7->Zebrafish; latCha1->Coelacanth;" \ fr3.8way.nh > fr3.commonNames.8way.nh time cp -p ../anno/fr3.8way.maf ./multiz8way.maf # real 1m38.305s # -rw-rw-r-- 1 3235224846 Feb 21 13:55 multiz8way.maf time gzip multiz8way.maf # real 7m24.097s # -rw-rw-r-- 1 738827448 Feb 21 13:55 multiz8way.maf.gz md5sum *.nh *.gz *.txt > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/multiz8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/multiz8way ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/fr3.8way.nh . ln -s \ /hive/data/genomes/fr3/bed/multiz8way/downloads/fr3.commonNames.8way.nh . ln -s \ /hive/data/genomes/fr3/bed/multiz8way/downloads/multiz8way.maf.gz . ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/README.txt . ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/md5sum.txt . ############################################################################# # Construct download files (DONE - 2012-02-23 - Hiram) cd /hive/data/genomes/fr3 # verify joinerCheck is good and fr3 is in all.joiner joinerCheck -keys -database=fr3 \ $HOME/kent/src/hg/makeDb/schema/all.joiner time makeDownloads.pl fr3 -workhorse=hgwdev > downloads.log 2>&1 & # real 24m7.903s # you must edit the */README files in ./goldenPath/ ############################################################################ # after downloads, construct pushQ (DONE - 2012-02-23 - Hiram) mkdir /hive/data/genomes/fr3/pushQ cd /hive/data/genomes/fr3/pushQ time makePushQSql.pl fr3 > fr3.sql 2> stderr.log # check stderr.log for anything not categorized scp -p fr3.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < fr3.sql # after loading, view the subQ in the WEB interface, go through each # item and verify it can measure the tables and files ############################################################################ # running cpgIsland business (DONE - 2012-04-19 - Hiram) mkdir /hive/data/genomes/fr3/bed/cpgIsland cd /hive/data/genomes/fr3/bed/cpgIsland # use a previous binary for this program ln -s ../../../mm9/bed/cpgIsland/hg3rdParty/cpgIslands/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../fr3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/fr3/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck. cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | egrep -v "^#seq|^total" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 6835 of 6835 jobs # CPU time in finished jobs: 45s 0.75m 0.01h 0.00d 0.000 y # IO & Wait Time: 18494s 308.23m 5.14h 0.21d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 266s 4.43m 0.07h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | awk '{print length($0)}' | sort -rn | head -1 # 8 # update the length 14 in the template to be 8: sed -e "s/14/8/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/fr3/bed/cpgIsland hgLoadBed fr3 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Read 36504 elements of size 10 from cpgIsland.bed featureBits fr3 cpgIslandExt # 18279877 bases of 350961831 (5.209%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt fr3 # cleanup, unless you want to move them to the genscan procedure below rm -fr hardMaskedFa ######################################################################### # GENSCAN GENE PREDICTIONS (DONE - 2012-04-09 - Hiram) mkdir /hive/data/genomes/fr3/bed/genscan cd /hive/data/genomes/fr3/bed/genscan # use a previously existing genscan binary ln -s ../../../mm9/bed/genscan/hg3rdParty . # create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../fr3.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 6835 genome.list # OK to run on swarm, small chroms will not need the large memory ssh swarm cd /hive/data/genomes/fr3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Completed: 6835 of 6835 jobs # CPU time in finished jobs: 9926s 165.43m 2.76h 0.11d 0.000 y # IO & Wait Time: 20476s 341.27m 5.69h 0.24d 0.001 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 786s 13.10m 0.22h 0.01d # Submission to last job: 1236s 20.60m 0.34h 0.01d # Concatenate results: cd /hive/data/genomes/fr3/bed/genscan find ./gtf -type f | xargs cat > genscan.gtf find ./pep -type f | xargs cat > genscan.pep find ./subopt -type f | xargs cat > genscanSubopt.bed # Load into the data/genomesbase (without -genePredExt because no frame info): # Don't load the Pep anymore -- redundant since it's from genomic. ssh hgwdev cd /hive/data/genomes/fr3/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp genePredCheck -db=fr3 genscan.gp # checked: 20354 failed: 0 # this produces exactly the same thing and loads the table: ldHgGene -gtf fr3 genscan genscan.gtf # Read 20354 transcripts in 159927 lines in 1 files # 20354 groups 4323 seqs 1 sources 1 feature types # 20354 gene predictions hgLoadBed fr3 genscanSubopt genscanSubopt.bed # Read 526572 elements of size 6 from genscanSubopt.bed featureBits fr3 genscan # 25082192 bases of 350961831 (7.147%) in intersection #########################################################################