# for emacs: -*- mode: sh; -*- # Danio rerio Zv9 sequence: # ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/ # Danio_rerio/Zv9/ # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CABZ00 ########################################################################## # Download sequence (DONE - 2010-11-30 - Hiram) mkdir /hive/data/genomes/danRer7 cd /hive/data/genomes/danRer7 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Danio_rerio/Zv9/*" ########################################################################### # fixup to UCSC names (DONE - 2010-11-30 - Hiram) cd /hive/data/genomes/danRer7/genbank cat << '_EOF_' > ucscNames.pl #!/bin/env perl use strict; use warnings; my %cuToChr; open (FH, ") { chomp $line; my ($chrN, $cuName) = split('\s+', $line); $cuToChr{$cuName} = $chrN; printf "%d\t%s\n", $chrN, $cuName; open (AG, "zcat Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz"; open (UC, "|gzip -c > ucsc/chr${chrN}.agp.gz") or die "can not write to ucsc/chr${chrN}.agp.gz"; while (my $agpLine = ) { if ($agpLine =~ m/^#/) { print UC $agpLine; } else { chomp $agpLine; my ($cuId, $rest) = split('\s+', $agpLine, 2); printf UC "chr%d\t%s\n", $cuToChr{$cuId}, $rest; } } close (AG); close (UC); print `zcat Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz | sed -e "s#>.*#>chr${chrN}#" | gzip -c > ucsc/chr${chrN}.fa.gz` } close (FH); '_EOF_' # << happy emacs chmod +x ucscNames.pl mkdir ucsc ./ucscNames.pl zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | sed -e "s#^FR\([0-9]*\).1#FR\1#" | gzip -c > ucsc/chr.unplaced.agp.gz zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#gi.*emb|##; s#.1| .*##" | gzip -c > ucsc/chr.unplaced.fa.gz ########################################################################### # Fixup names to match Ensembl naming scheme (DONE - 2010-02-12 - Hiram) cd /hive/data/genomes/danRer7/genbank cat << '_EOF_' > scafNames.pl #!/bin/env perl use strict; use warnings; my %accToScaf; open (FH, ") { chomp $line; my ($scafName, $acc) = split('\s+', $line); $accToScaf{$acc} = $scafName; } close (FH); open (FH, "zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz|") or die "can not read Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; open (UC, "| gzip -c > ucsc/chr.unplaced.agp.gz") or die "can not write to ucsc/chr.unplaced.agp.gz"; while (my $agpLine = ) { if ($agpLine =~ m/^#/) { print UC $agpLine; } else { chomp $agpLine; my ($acc, $rest) = split('\s+', $agpLine, 2); printf UC "%s\t%s\n", $accToScaf{$acc}, $rest; } } close (FH); close (UC); open (FH, "zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz|") or die "can not read Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (UC, "| gzip -c > ucsc/chr.unplaced.fa.gz") or die "can not write to ucsc/chr.unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/, whole genome shotgun sequence//; $line =~ s/^>.*assembly, />/; printf UC "%s\n", $line; } else { printf UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs ./scafNames.pl ########################################################################### # initial database build (DONE - 2010-12-02 - Hiram) cd /hive/data/genomes/danRer7 cat << '_EOF_' > danRer7.config.ra # Config parameters for makeGenomeDb.pl: db danRer7 clade vertebrate scientificName Danio rerio commonName Zebrafish assemblyDate Jul. 2010 assemblyLabel Wellcome Trust Sanger Institute Zv9 (GCA_000002035.2) assemblyShortLabel Zv9 orderKey 447 fastaFiles /hive/data/genomes/danRer7/genbank/ucsc/chr*.fa.gz agpFiles /hive/data/genomes/danRer7/genbank/ucsc/chr*.agp.gz # qualFiles none dbDbSpeciesDir zebrafish mitoAcc NC_002333.2 taxId 7955 '_EOF_' # << happy emacs makeGenomeDb.pl danRer7.config.ra ln -s `pwd`/danRer7.unmasked.2bit /gbdb/danRer7/danRer7.2bit ########################################################################### # Ensembl v60 genes (DONE - 2010-12-02 - Hiram) cd /hive/data/genomes/danRer7 cat << '_EOF_' > danRer7.ensGene.ra # required db variable db danRer7 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=60 \ -bigClusterHub=swarm -smallClusterHub=swarm -workhorse=hgwdev \ danRer7.ensGene.ra > ensGene.v60.log 2>&1 ########################################################################### # RepeatMasker (DONE - 2010-12-02 - Hiram) time doRepeatMasker.pl -buildDir=`pwd` -noSplit -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev danRer7 > do.log 2>&1 & # real 1283m15.593s # these jobs run about twice as slow as other genomes: # Completed: 2861 of 2861 jobs # CPU time in finished jobs: 28800301s 480005.01m 8000.08h 333.34d 0.913 y # IO & Wait Time: 1812390s 30206.50m 503.44h 20.98d 0.057 y # Average job time: 10700s 178.33m 2.97h 0.12d # Longest finished job: 11428s 190.47m 3.17h 0.13d # Submission to last job: 105184s 1753.07m 29.22h 1.22d # after resolving various swarm problems, continuing: time doRepeatMasker.pl -continue=cat -buildDir=`pwd` \ -noSplit -bigClusterHub=swarm -dbHost=hgwdev \ -workhorse=hgwdev danRer7 > cat.log 2>&1 & # real 25m56.591s cat faSize.rmsk.txt # 1412464843 bases (2694734 N's 1409770109 real 676920525 upper # 732849584 lower) in 1133 sequences in 1 files # %51.88 masked total, %51.98 masked real ########################################################################### # TRF/SimpleRepeats (DONE - 2010-12-02 - Hiram) mkdir /hive/data/genomes/danRer7/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev danRer7 \ # 1.5 hours later # cluster run failed due to finding nothing on chrM time doSimpleRepeat.pl -continue=filter -buildDir=`pwd` \ -smallClusterHub=swarm -bigClusterHub=swarm -dbHost=hgwdev \ -workhorse=hgwdev danRer7 > filter.log 2>&1 # real 0m32.422s cat fb.simpleRepeat # 101845718 bases of 1409772743 (7.224%) in intersection cd /hive/data/genomes/danRer7 twoBitMask danRer7.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed danRer7.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa danRer7.2bit stdout | faSize stdin > faSize.danRer7.2bit.txt cat faSize.danRer7.2bit.txt # 1412464843 bases (2694734 N's 1409770109 real 675828305 upper # 733941804 lower) in 1133 sequences in 1 files # %51.96 masked total, %52.06 masked real rm /gbdb/danRer7/danRer7.2bit ln -s `pwd`/danRer7.2bit /gbdb/danRer7/danRer7.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-12-09 - Hiram) mkdir /hive/data/genomes/danRer7/bed/gap cd /hive/data/genomes/danRer7/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../danRer7.2bit > findMotif.txt 2>&1 # real 1m12.153s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits danRer7 -not gap -bed=notGap.bed # 1409772743 bases of 1409772743 (100.000%) in intersection featureBits danRer7 allGaps.bed notGap.bed -bed=new.gaps.bed # 2634 bases of 1409772743 (0.000%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" danRer7 | sort -n | tail -1 # 3063 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" danRer7 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps danRer7 other.bed # 2634 bases of 1412464843 (0.000%) in intersection wc -l other.bed # 1975 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad danRer7 otherGap other.bed # starting with this many hgsql -e "select count(*) from gap;" danRer7 # 26921 hgsql danRer7 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" danRer7 # 28896 # == 26921 + 1975 # there is an odd one marked as bridged when it is actually a continuation # of a non-bridged gap. # Found when running gapToLift below, without the -minGap=100. # chr22:42178985-42178985 hgsql -e 'update gap set bridge="no" where chromStart=42178984 AND chrom="chr22";' danRer7 ########################################################################### # MAKE 11.OOC FILES FOR BLAT (DONE - 2010-12-09 - Hiram) ssh kolossus # numerator is danRer7 gapless bases "real" as reported by faSize # denominator is hg19 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 1409770109 / 2897316137 \) \* 1024 # ( 1409770109 / 2897316137 ) * 1024 = 498.255808 # ==> use -repMatch=400 according to size scaled down from 1024 for human. # and rounded down to nearest 100 cd /hive/data/genomes/danRer7 blat danRer7.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/danRer7.11.ooc -repMatch=400 # Wrote 63970 overused 11-mers to jkStuff/danRer7.11.ooc mkdir /hive/data/staging/data/danRer7 cp -p danRer7.2bit chrom.sizes jkStuff/danRer7.11.ooc \ /hive/data/staging/data/danRer7 # just to see what is going on, since all gaps here are almost all # exactly 100 bases, they aren't really "non-bridged", not really. gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed danRer7 \ jkStuff/danRer7.nonBridged.lft ########################################################################### # AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN # (DONE - 2010-12-09 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add danRer7 and commit to GIT # danRer7 (zebrafish) danRer7.serverGenome = /hive/data/genomes/danRer7/danRer7.2bit danRer7.clusterGenome = /scratch/data/danRer7/danRer7.2bit danRer7.ooc = /scratch/data/danRer7/11.ooc danRer7.lift = no danRer7.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} danRer7.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} danRer7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} danRer7.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} danRer7.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} danRer7.downloadDir = danRer7 danRer7.perChromTables = no danRer7.refseq.mrna.xeno.load = yes danRer7.upstreamGeneTbl = refGene danRer7.upstreamMaf = multiz8way /hive/data/genomes/danRer7/bed/multiz8way/species.lst # end of section added to etc/genbank.conf git commit -m "adding danRer7 definition" genbank.conf git push make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # danRer genome information ssh genbank screen # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial danRer7 & # logFile: var/build/logs/2010.12.10-10:36:31.danRer7.initalign.log # 5h33m # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad danRer7 & # logFile: var/dbload/hgwdev/logs/2010.12.10-16:27:54.dbload.log # about 30 minutes # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add danRer7 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added danRer7." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # update default position to be similar to danRer6 (DONE - 2010-12-14 - Hiram) hgsql \ -e 'update dbDb set defaultPos = "chr13:25004825-25082273" where name = "danRer7";' \ hgcentraltest ######################################################################### # construct download files (DONE - 2010-12-14 - Hiram) cd /hive/data/genomes/danRer7 time makeDownloads.pl danRer7 -workhorse=hgwdev > downloads.log 2>&1 # real 12m48.251s # edit the README files in goldenPath/*/README ######################################################################### # Construct pushQ entry (DONE - 2010-12-14 - Hiram) mkdir /hive/data/genomes/danRer7/pushQ cd /hive/data/genomes/danRer7/pushQ makePushQSql.pl danRer7 > danRer7.pushQ.sql WARNING: hgwdev does not have /gbdb/danRer7/wib/gc5Base.wib WARNING: hgwdev does not have /gbdb/danRer7/wib/quality.wib WARNING: hgwdev does not have /gbdb/danRer7/bbi/quality.bw WARNING: danRer7 does not have seq WARNING: danRer7 does not have extFile WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of supporting and genbank tables) which tracks to assign these tables to: ensGtp ensPep scp -p danRer7.pushQ.sql hgwbeta:/tmp ssh hgwbeta hgsql qapushq < /tmp/danRer7.pushQ.sql ######################################################################### # LASTZ Xenopus xenTro2 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17 cd /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17 cat << '_EOF_' > DEF # zebrafish vs. X. tropicalis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: X. tropicalis xenTro2 SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & cat fb.danRer7.chainXenTro2Link.txt # 90625809 bases of 1409770109 (6.428%) in intersection mkdir /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap cd /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 32m57.901s cat fb.xenTro2.chainDanRer7Link.txt # 89862892 bases of 1359412157 (6.610%) in intersection ######################################################################### # LASTZ Tetraodon TetNig2 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17 cd /hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17 cat << '_EOF_' > DEF # zebrafish vs tetraodon BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # Elapsed time: 144m12s cat fb.danRer7.chainTetNig2Link.txt # 88435047 bases of 1409770109 (6.273%) in intersection mkdir /hive/data/genomes/tetNig2/bed/blastz.danRer7.swap cd /hive/data/genomes/tetNig2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17/DEF \ -qRepeats=windowmaskerSdust \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 8m43.275s cat fb.tetNig2.chainDanRer7Link.txt # 71259923 bases of 302314788 (23.571%) in intersection ############################################################################## # LASTZ Stickleback gasAcu1 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17 cd /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17 cat << '_EOF_' > DEF # zebrafish vs. stickleback # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # TARGET: Stickleback gasAcu1, chunk large enough to run largest piece SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CTGDIR=/scratch/data/gasAcu1/gasAcu1.contigs.2bit SEQ2_CTGLEN=/scratch/data/gasAcu1/gasAcu1.contigs.sizes SEQ2_LIFT=/scratch/data/gasAcu1/gasAcu1.contigs.lift SEQ2_CHUNK=22000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 247m45.584s cat fb.danRer7.chainGasAcu1Link.txt # 144782335 bases of 1409770109 (10.270%) in intersection mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap cd /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 25m35.024s cat fb.gasAcu1.chainDanRer7Link.txt # 122918766 bases of 446627861 (27.522%) in intersection ######################################################################### # lastz Medaka oryLat2 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17 cd /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17 cat << '_EOF_' > DEF # Zebrafish vs. Medaka # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # Query: Medaka oryLat2 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -noLoadChainSplit -qRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 258m58.096s cat fb.danRer7.chainOryLat2Link.txt # 141428502 bases of 1409770109 (10.032%) in intersection mkdir /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap cd /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -noLoadChainSplit -qRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 20m56.926s cat fb.oryLat2.chainDanRer7Link.txt # 124527579 bases of 700386597 (17.780%) in intersection ############################################################################ # lastz swap mm9 (DONE - 2010-12-17 - Hiram # original alignment cd /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17 cat fb.danRer7.chainMm9Link.txt # 68190354 bases of 2620346127 (2.602%) in intersection # and the swap mkdir /hive/data/genomes/danRer7/bed/blastz.mm9.swap cd /hive/data/genomes/danRer7/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 16m8.672s cat fb.danRer7.chainMm9Link.txt # 71960602 bases of 1409770109 (5.104%) in intersection ######################################################################### # lastz swap from hg19 (DONE - 2010-12-20 - Hiram) # original alignment cd /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17 cat fb.hg19.chainDanRer7Link.txt # 80849592 bases of 2897316137 (2.790%) in intersection # running the swap mkdir /hive/data/genomes/danRer7/bed/blastz.hg19.swap cd /hive/data/genomes/danRer7/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 42m52.920s cat fb.danRer7.chainHg19Link.txt # 86716552 bases of 1409770109 (6.151%) in intersection ######################################################################### # lastz Fugu fr2 (DONE - 2010-12-21 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21 cd /hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21 cat << '_EOF_' > DEF # Zebrafish vs. Fugu # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/scratch/data/fr2/fr2.2bit SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs screen # use screen to manage this long running job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -noLoadChainSplit -qRepeats=windowmaskerSdust \ -smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 & # real 144m0.459s cat fb.danRer7.chainFr2Link.txt # 103798841 bases of 1409770109 (7.363%) in intersection mkdir /hive/data/genomes/fr2/bed/blastz.danRer7.swap cd /hive/data/genomes/fr2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -noLoadChainSplit -qRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 10m26.244s cat fb.fr2.chainDanRer7Link.txt # 80078057 bases of 393312790 (20.360%) in intersection ############################################################################ ## 8-Way Multiz (DONE - 2011-01-18 - Hiram) ssh hgwdev mkdir /hive/data/genomes/danRer7/bed/multiz8way cd /hive/data/genomes/danRer7/bed/multiz8way wget -O 46way.nh \ http://hgdownload.cse.ucsc.edu/goldenPath/hg19/multiz46way/46way.corrected.nh # ((Zebrafish, # (Stickleback, # Tetraodon)), # (X_tropicalis, # (Mouse, # Human))) # All distances remain as specified in the 46way.nh /cluster/bin/phast/tree_doctor --prune-all-but danRer6,fr2,oryLat2,gasAcu1,tetNig2,xenTro2,mm9,hg19 46way.nh | sed -e "s/danRer6/danRer7/" > 8way.nh # (((hg19:0.131183,mm9:0.352605):0.525173,xenTro2:0.852482):0.300396,(((tetNig2:0.224774,fr2:0.205294):0.191836,(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,danRer7:0.731166):0.155214); # Mark reports three new reports for fish phylogeny: # (zebrafish, (medaka, (stickleback, (fugu, tetraodon)))) # http://www.ncbi.nlm.nih.gov/pubmed/18771739 analyzed mitochondrial data. # http://www.ncbi.nlm.nih.gov/pubmed/20546903 analyzed data from 11 nuclear genes. # http://www.ncbi.nlm.nih.gov/pubmed/17709262 analyzed mitochrondrial data. # So, rearrange to get Zebrafish on top and Medaka out on its own: cat << '_EOF_' > zFishOnTop.nh ((danRer7:0.731166,(oryLat2:0.478451,(gasAcu1:0.313967,(tetNig2:0.224774,fr2:0.205294):0.191836):0.058404):0.322824):0.155214,(xenTro2:0.852482,(mm9:0.352605,hg19:0.131183):0.525173):0.300396); '_EOF_' # << happy emacs # convert to species names /cluster/bin/phast/tree_doctor --rename \ "danRer7->Zebrafish; gasAcu1->Stickleback; oryLat2->Medaka; tetNig2->Tetraodon; fr2->Fugu; xenTro2->X_Tropicalis; mm9->Mouse; hg19->Human" \ zFishOnTop.nh ((Zebrafish:0.731166,(Medaka:0.478451,(Stickleback:0.313967,(Tetraodon:0.224774,Fugu:0.205294):0.191836):0.058404):0.322824):0.155214,(X_Tropicalis:0.852482,(Mouse:0.352605,Human:0.131183):0.525173):0.300396); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for /usr/local/apache/htdocs/images/phylo/danRer7_8way.gif /cluster/bin/phast/all_dists zFishOnTop.nh > 8way.distances.txt # Use this output to create the table below grep -i danRer7 8way.distances.txt | sort -k3,3n # danRer7 gasAcu1 1.426361 # danRer7 fr2 1.509524 # danRer7 tetNig2 1.529004 # danRer7 oryLat2 1.532441 # danRer7 hg19 1.843132 # danRer7 xenTro2 2.039258 # danRer7 mm9 2.064554 # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainDanRer6Link chain linearGap # distance on danRer7 on other minScore # 1 1.43 - stickleback gasAcu1 (% 10.27) (% 27.52) 2000 medium # 2 1.51 - Fugu fr2 (% 7.36) (% 20.36) 2000 medium # 3 1.53 - tetraodon tetNig2 (% 6.27) (% 23.57) 2000 medium # 4 1.53 - medaka oryLat2 (% 10.03) (% 17.78) 2000 medium # 5 1.84 - human hg19 (% 2.79) (% 6.15) 5000 loose # 6 2.04 - xenopus xenTro2 (% 6.43) (% 6.61) 5000 loose # 7 2.06 - mouse mm9 (% 2.60) (% 5.10) 5000 loose # None of this concern for distances matters in building the first step, the # maf files. # bash shell syntax here ... export H=/hive/data/genomes/danRer7/bed mkdir mafLinks for G in gasAcu1 fr2 tetNig2 oryLat2 hg19 xenTro2 mm9 do mkdir mafLinks/$G if [ ! -d ${H}/lastz.${G}/mafNet ]; then echo "missing directory lastz.${G}/mafNet" exit 255 fi ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ zFishOnTop.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst # location for cluster run #/hive/data/genomes/danRer7/bed/multiz8way/mafLinks/ mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # this one is simple enough that it doesn't need a cluster run. cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = danRer7 set binDir = /hive/data/genomes/danRer7/bed/multiz8way/penn rm -fr tmp mkdir -p tmp cd tmp foreach s (`cat ../species.lst`) if ($s != $db) then zcat ../mafLinks/$s/$db.$s.net.maf > $db.$s.sing.maf endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=. E=$db "`cat ../tree.nh`" $db.*.sing.maf ../multiz8way.maf cd .. rm -fr tmp '_EOF_' # << happy emacs chmod +x autoMultiz screen time ./autoMultiz > autoMultiz.log 2>&1 & # real 242m15.543s # log shows no errors. # -rw-rw-r-- 1 1785003021 Jan 18 18:49 multiz8way.maf # Load into database ssh hgwdev cd /hive/data/genomes/danRer7/bed/multiz8way mkdir /gbdb/danRer7/multiz8way ln -s `pwd`/multiz8way.maf /gbdb/danRer7/multiz8way cd /scratch/tmp time nice -n +19 hgLoadMaf danRer7 multiz8way # Indexing and tabulating /gbdb/danRer7/multiz8way/multiz8way.maf # Loaded 2997216 mafs in 1 files from /gbdb/danRer7/multiz8way # real 0m32.612s time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 danRer7 multiz8waySummary \ /gbdb/danRer7/multiz8way/multiz8way.maf # Created 1160923 summary blocks from 9245726 components # and 2997216 mafs from /gbdb/danRer7/multiz8way/multiz8way.maf # real 0m34.656s wc -l multiz8way*.tab # 2997216 multiz8way.tab # 1160923 multiz8waySummary.tab # 4158139 total rm multiz8way*.tab ######################################################################### # GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2011-01-19 - Hiram) # redone properly 2011-05-12 # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. ssh hgwdev mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/anno/mafSplit cd /hive/data/genomes/danRer7/bed/multiz8way/anno/mafSplit time mafSplit -byTarget -useFullSequenceName \ /dev/null . ../../multiz8way.maf # real 1m0.519s mkdir /hive/data/genomes/danRer7/bed/multiz8way/anno cd /hive/data/genomes/danRer7/bed/multiz8way/anno for DB in `sed -e "s/ danRer7//" ../species.lst` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done mkdir annoSplit for F in mafSplit/*.maf do B=`basename ${F}` mafAddIRows -nBeds=nBeds ${F} /hive/data/genomes/danRer7/danRer7.2bit \ annoSplit/${B} echo $B done # about 10 minutes # combine into one file head -q -n 1 annoSplit/chr1.maf > danRer7.8way.maf for F in annoSplit/*.maf do grep -h -v "^#" ${F} done >> danRer7.8way.maf # about 2 minutes # these maf files do not have the end marker, this does nothing: # tail -q -n 1 mafSplit/chr1.maf >> panTro3.12way.maf # How about an official end marker: echo "##eof maf" >> danRer7.8way.maf # Load into database rm /gbdb/danRer7/multiz8way/multiz8way.maf # remove old symlink ln -s `pwd`/danRer7.8way.maf /gbdb/danRer7/multiz8way/multiz8way.maf cd /scratch/tmp time nice -n +19 hgLoadMaf danRer7 multiz8way # Loaded 3365771 mafs in 1 files from /gbdb/danRer7/multiz8way # real 1m22.783s time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 danRer7 multiz8waySummary \ /gbdb/danRer7/multiz8way/multiz8way.maf # Created 1160923 summary blocks from 9245726 components # and 3365771 mafs from /gbdb/danRer7/multiz8way/multiz8way.maf # real 1m37.885s wc -l multiz8way*.tab # 3365771 multiz8way.tab # 1160923 multiz8waySummary.tab # 4526694 total rm multiz8way*.tab ############################################################################ # Adding automatic generation of upstream files (DONE - 2010-12-23 - Hiram) # edit src/hg/makeDb/genbank/etc/genbank.conf to add: danRer7.upstreamGeneTbl = refGene danRer7.upstreamMaf = multiz8way /hive/data/genomes/danRer7/bed/multiz8way/species.lst ######################################################################### # MULTIZ8WAY MAF FRAMES (DONE - 2011-01-19 - Hiram) ssh hgwdev mkdir /hive/data/genomes/danRer7/bed/multiz8way/frames cd /hive/data/genomes/danRer7/bed/multiz8way/frames mkdir genes #danRer7 ensGene #gasAcu1 ensGene #tetNig2 ensGene #fr2 ensGene #oryLat2 ensGene #xenTro2 ensGene #hg19 knownGene #mm9 knownGene #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for danRer7 gasAcu1 tetNig2 fr2 oryLat2 xenTro2 for qDB in danRer7 gasAcu1 tetNig2 fr2 oryLat2 xenTro2 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # verify counts of genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/danRer7.gp.gz:24068 # genes/fr2.gp.gz:18336 # genes/gasAcu1.gp.gz:20631 # genes/oryLat2.gp.gz:19576 # genes/tetNig2.gp.gz:19539 # genes/xenTro2.gp.gz:17888 # using knownGene for mm9 hg19 # genePreds; (must keep only the first 10 columns for knownGene) for qDB in mm9 hg19 do echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB} hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/danRer7.gp.gz: 24068 # genes/fr2.gp.gz: 18336 # genes/gasAcu1.gp.gz: 20631 # genes/hg19.gp.gz: 20597 # genes/mm9.gp.gz: 20630 # genes/oryLat2.gp.gz: 19576 # genes/tetNig2.gp.gz: 19539 # genes/xenTro2.gp.gz: 17888 # reloaded corrected maf 2011-05-12 ssh hgwdev cd /hive/data/genomes/danRer7/bed/multiz8way/frames cat ../anno/danRer7.8way.maf \ | nice -n +19 genePredToMafFrames danRer7 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.lst` \ | gzip > multiz8way.mafFrames.gz # verify there are frames on everything: zcat multiz8way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 234538 danRer7 # 418660 fr2 # 437610 gasAcu1 # 436329 hg19 # 422074 mm9 # 409432 oryLat2 # 392409 tetNig2 # 346443 xenTro2 ssh hgwdev cd /hive/data/genomes/danRer7/bed/multiz8way/frames nice hgLoadMafFrames danRer7 multiz8wayFrames multiz8way.mafFrames.gz ############################################################################ # creating upstream mafs (DONE - 2011-10-04 - Hiram) ssh hgwdev mkdir /hive/data/genomes/danRer7/goldenPath/multiz8way cd /hive/data/genomes/danRer7/goldenPath/multiz8way # run this bash script cat << '_EOF_' > mkUpstream.sh DB=danRer7 GENE=ensGene NWAY=multiz8way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.${GENE}.maf" echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout" featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} stdin stdout \ -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.lst \ | gzip -c > upstream${S}.${GENE}.maf.gz echo "done upstream${S}.${GENE}.maf.gz" done '_EOF_' # << happy emacs chmod +x mkUpstream.sh time ./mkUpstream.sh # real 61m16.030s # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the # upstream automatic generation ####################################################################### # Phylogenetic tree from 8-way (DONE - 2011-01-19 - Hiram) # We need one tree for all chroms cd /hive/data/genomes/danRer7/bed/multiz8way/ mkdir mafSplit cd mafSplit mafSplit -byTarget -useFullSequenceName /dev/null . ../anno/multiz8way.maf # got 1072 mafs named after their chrom/scaff .maf # although there are over 1133 chroms and scaffolds, # some are too small or have nothing aligning. mkdir /hive/data/genomes/danRer7/bed/multiz8way/4d cd /hive/data/genomes/danRer7/bed/multiz8way/4d # danRer7 cannot be too picky, dropping this clause: # refSeqStatus.status='Reviewed' hgsql danRer7 -Ne \ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and mol='mRNA'" \ | cut -f 2-20 | egrep -E -v "chrM" \ > refSeqReviewed.gp wc -l refSeqReviewed.gp # 14431 refSeqReviewed.gp genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp wc -l refSeqReviewedNR.gp # 13835 refSeqReviewedNR.gp ssh memk mkdir /hive/data/genomes/danRer7/bed/multiz8way/4d/run cd /hive/data/genomes/danRer7/bed/multiz8way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/danRer7/bed/multiz8way" set c = $1 set infile = $r/mafSplit/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp | sed -e "s/\t$c\t/\tdanRer7.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/danRer7/bed/multiz8way/mafSplit/*.maf | \ egrep -E -v "chrM" \ | sed -e "s#.*multiz8way/mafSplit/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check para -maxJob=30 push para time # three of them have no result: # -rw-rw-r-- 1 0 Jan 19 10:13 Zv9_NA684.mfa # -rw-rw-r-- 1 0 Jan 19 10:13 Zv9_NA269.mfa # -rw-rw-r-- 1 0 Jan 19 10:13 Zv9_NA975.mfa # Completed: 1068 of 1071 jobs # Crashed: 3 jobs # CPU time in finished jobs: 793s 13.21m 0.22h 0.01d 0.000 y # IO & Wait Time: 2779s 46.32m 0.77h 0.03d 0.000 y # Average job time: 3s 0.06m 0.00h 0.00d # Longest finished job: 44s 0.73m 0.01h 0.00d # Submission to last job: 343s 5.72m 0.10h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/danRer7/bed/multiz8way/4d # but first clean out junk 1-byte leftovers from above process. cd mfa find -type f -size 1c | xargs rm find -type f -size 0c | xargs rm cd .. #want comma-less species.lst /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.lst`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >danRer7 # >oryLat2 # >gasAcu1 # >tetNig2 # >fr2 # >xenTro2 # >mm9 # >hg19 # tree-commas.nh: # ((danRer7,(oryLat2,(gasAcu1,(tetNig2,fr2)))),(xenTro2,(mm9,hg19))) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 1m21.248s mv phyloFit.mod all.mod grep TREE all.mod # TREE: ((danRer7:0.580406,(oryLat2:0.364485,(gasAcu1:0.30842,(tetNig2:0.201742,fr2:0.182553):0.217051):0.106429):0.308334):0.15191,(xenTro2:0.747875,(mm9:0.228088,hg19:0.2107):0.397886):0.15191); # create subset of fish-only echo "tetNig2 fr2 gasAcu1 oryLat2 danRer7" > fish.lst # use only the mfa files that have all five fish sequences: mkdir fishMfa for F in mfa/*.mfa do C=`egrep "^> tetNig2|^> fr2|^> gasAcu1|^> oryLat2|^> danRer7" "${F}" | wc -l` if [ "${C}" -eq "5" ]; then echo ${F} fi done | while read OK do B=`basename ${OK}` awk ' BEGIN { fish = 0 } { if (match($0, "^>")) { fish = 1 if (match($0, "^> mm9")) { fish = 0 } if (match($0, "^> hg19")) { fish = 0 } if (match($0, "^> xenTro2")) { fish = 0 } } if (fish) {print} } ' "${OK}" > fishMfa/${B} done # verify only fish are present grep -h "^>" fishMfa/*.mfa | sort | uniq -c # 168 > danRer7 # 168 > fr2 # 168 > gasAcu1 # 168 > oryLat2 # 168 > tetNig2 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat fish.lst`" fishMfa/*.mfa | sed s/"> "/">"/ \ > 4d.fish.mfa /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \ --no-branchlen --prune-all-but="`cat fish.lst`" ../tree-commas.nh \ > tree-commas.fish.nh # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ./tree-commas.fish.nh 4d.fish.mfa mv phyloFit.mod fish.mod # real 0m1.495s grep TREE fish.mod | sed 's/TREE\:\ //' > fish.8way.nh # (danRer7:0.441576,(oryLat2:0.358316,(gasAcu1:0.302322,(tetNig2:0.201705,fr2:0.179742):0.221704):0.106667):0.441576); ######################################################################### # phastCons 6-way (DONE - 2011-01-19 - Hiram) # split 8way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh swarm mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/cons/SS cd /hive/data/genomes/danRer7/bed/multiz8way/cons/SS cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/danRer7/bed/multiz8way/mafSplit/$c.maf set WINDOWS = /hive/data/genomes/danRer7/bed/multiz8way/cons/SS/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../mafSplit | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para -ram=8g create jobList para try ... check ... etc # Completed: 1072 of 1072 jobs # CPU time in finished jobs: 1029s 17.14m 0.29h 0.01d 0.000 y # IO & Wait Time: 3083s 51.39m 0.86h 0.04d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 63s 1.05m 0.02h 0.00d # Submission to last job: 227s 3.78m 0.06h 0.00d find . -type f | grep ".ss$" | wc # 1082 1082 36982 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/cons/run.cons cd /hive/data/genomes/danRer7/bed/multiz8way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/danRer7/bed/multiz8way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list wc -l ss.list # 1082 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/danRer7/bed/multiz8way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... para -maxJob=64 push # Completed: 1082 of 1082 jobs # CPU time in finished jobs: 3539s 58.99m 0.98h 0.04d 0.000 y # IO & Wait Time: 7255s 120.91m 2.02h 0.08d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 35s 0.58m 0.01h 0.00d # Submission to last job: 432s 7.20m 0.12h 0.01d cd /hive/data/genomes/danRer7/bed/multiz8way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed danRer7 phastConsElements8way mostConserved.bed # Loaded 1110857 elements of size 5 # real 0m7.114s # not sure what the targets should be for Zebrafish, on human we often # try for 5% overall cov, and 70% CDS cov featureBits danRer7 -enrichment refGene:cds phastConsElements8way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.320%, phastConsElements8way 13.722%, both 1.101% # cover 83.38%, enrich 6.08x featureBits danRer7 -enrichment ensGene:cds phastConsElements8way # ensGene:cds 2.783%, phastConsElements8way 13.722%, both 2.170% # cover 77.98%, enrich 5.68x # hg19 for comparison # refGene:cds 1.196%, phastConsElements46way 5.065%, # both 0.888%, cover 74.22%, enrich 14.65x # ensGene:cds 1.278%, phastConsElements46way 5.065%, # both 0.910%, cover 71.23%, enrich 14.06x # knownGene:cds 1.252%, phastConsElements46way 5.065%, # both 0.905%, cover 72.29%, enrich 14.27x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/danRer7/bed/multiz8way/cons/all mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons8way.wigFix.gz # check integrity of data with wigToBigWig zcat downloads/phastCons8way.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \ phastCons8way.bw bigWigInfo phastCons8way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 631,292,485 # primaryIndexSize: 16,520,056 # zoomLevels: 10 # chromCount: 959 # basesCovered: 264,308,757 # mean: 0.680379 # min: 0.000000 # max: 1.000000 # std: 0.360578 # encode those files into wiggle data zcat downloads/phastCons8way.wigFix.gz \ | wigEncode stdin phastCons8way.wig phastCons8way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc *.wi? # 253M phastCons8way.wib # 49M phastCons8way.wig # 301M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons8way.wib /gbdb/danRer7/multiz8way/phastCons8way.wib nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \ phastCons8way phastCons8way.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh danRer7 phastCons8way # db.table min max mean count sumData stdDev viewLimits #danRer7.phastCons8way 0 1 0.680379 264308757 1.7983e+08 0.360578 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=danRer7 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons8way > histogram.data 2>&1 # real 0m22.444s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer7 Histogram phastCons8way track" set xlabel " phastCons8way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # run for fish only species mkdir /hive/data/genomes/danRer7/bed/multiz8way/cons/fish cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish # Using the fish.mod tree cp -p ../../4d/fish.mod ./fish.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... para -maxJob=32 push # Completed: 1082 of 1082 jobs # CPU time in finished jobs: 3479s 57.99m 0.97h 0.04d 0.000 y # IO & Wait Time: 7257s 120.95m 2.02h 0.08d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 35s 0.58m 0.01h 0.00d # Submission to last job: 423s 7.05m 0.12h 0.00d cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time nice -n +19 hgLoadBed danRer7 \ phastConsElements8wayFish mostConserved.bed # Loaded 842567 elements of size 5 # real 0m5.941s # not sure what the targets should be for Zebrafish, on human we often # try for 5% overall cov, and 70% CDS cov featureBits danRer7 -enrichment refGene:cds phastConsElements8wayFish # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.320%, phastConsElements8wayFish 10.576%, both 1.093% # cover 82.82%, enrich 7.83x featureBits danRer7 -enrichment ensGene:cds phastConsElements8wayFish # ensGene:cds 2.783%, phastConsElements8wayFish 10.576%, both 2.141% # cover 76.93%, enrich 7.27x # hg19 for comparison # refGene:cds 1.196%, phastConsElements46wayPrimates 3.636%, # both 0.738%, cover 61.69%, enrich 16.97x # ensGene:cds 1.278%, phastConsElements46wayPrimates 3.636%, # both 0.749%, cover 58.62%, enrich 16.12x # knownGene:cds 1.252%, phastConsElements46wayPrimates 3.636%, # both 0.747%, cover 59.66%, enrich 16.41x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish mkdir downloads find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons8wayFish.wigFix.gz # check integrity of data with wigToBigWig zcat downloads/phastCons8wayFish.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \ phastCons8wayFish.bw bigWigInfo phastCons8wayFish.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 589,598,860 # primaryIndexSize: 16,520,056 # zoomLevels: 10 # chromCount: 959 # basesCovered: 264,308,757 # mean: 0.584684 # min: 0.000000 # max: 1.000000 # std: 0.363150 # encode that ascii data into wiggle data zcat downloads/phastCons8wayFish.wigFix.gz \ | wigEncode stdin phastCons8wayFish.wig phastCons8wayFish.wib # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc *.wi? # 253M phastCons8wayFish.wib # 51M phastCons8wayFish.wig # 303M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons8wayFish.wib \ /gbdb/danRer7/multiz8way/phastCons8wayFish.wib nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \ phastCons8wayFish phastCons8wayFish.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh danRer7 phastCons8wayFish # db.table min max mean count sumData stdDev viewLimits #danRer7.phastCons8wayFish 0 1 0.584684 264308757 1.54537e+08 0.36315 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=danRer7 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons8wayFish > histogram.data 2>&1 # real 0m22.444s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer7 Histogram phastCons8wayFish track" set xlabel " phastCons8wayFish score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 8-way (DONE - 2011-01-19 - Hiram) # run phyloP with score=LRT ssh swarm mkdir /cluster/data/danRer7/bed/multiz8way/consPhyloP cd /cluster/data/danRer7/bed/multiz8way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.583 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.583 > all.mod # following the pattern from hg19 with only one grp: "all" cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set file1 = $f:t set out = $2 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/danRer7/bed/multiz8way/cons/SS/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs # Create list of chunks find ../../cons/SS -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/##" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 1082 of 1082 jobs # CPU time in finished jobs: 4804s 80.06m 1.33h 0.06d 0.000 y # IO & Wait Time: 7456s 124.27m 2.07h 0.09d 0.000 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest finished job: 61s 1.02m 0.02h 0.00d # Submission to last job: 211s 3.52m 0.06h 0.00d # make downloads mkdir downloads find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP8way.wigFix.gz # check integrity of data with wigToBigWig zcat downloads/phyloP8way.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes phyloP8way.bw bigWigInfo phyloP8way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 463,742,565 # primaryIndexSize: 16,520,056 # zoomLevels: 10 # chromCount: 959 # basesCovered: 264,308,757 # mean: 0.475521 # min: -2.143000 # max: 2.572000 # std: 0.752223 # encode those files into wiggle data zcat downloads/phyloP8way.wigFix.gz \ | wigEncode stdin phyloP8way.wig phyloP8way.wib # Converted stdin, upper limit 2.57, lower limit -2.14 du -hsc *.wi? # 253M phyloP8way.wib # 49M phyloP8way.wig # 301M total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP8way.wib /gbdb/danRer7/multiz8way/phyloP8way.wib nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \ phyloP8way phyloP8way.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh danRer7 phyloP8way # db.table min max mean count sumData stdDev viewLimits # danRer7.phyloP8way -2.143 2.572 0.475521 264308757 1.25684e+08 0.752223 # viewLimits=-2.143:2.572 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=danRer7 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP8way > histogram.data 2>&1 # real 0m22.461s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer7 Histogram phyloP8way track" set xlabel " phyloP8way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.002] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running Fish species subset ##################### cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/fish/fish.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.613 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/fish/fish.mod 0.613 > fish.mod # setup run fish species subset mkdir /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... etc ... para -maxJob=32 push para time > run.time # Completed: 1082 of 1082 jobs # CPU time in finished jobs: 2724s 45.40m 0.76h 0.03d 0.000 y # IO & Wait Time: 7369s 122.81m 2.05h 0.09d 0.000 y # Average job time: 9s 0.16m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 356s 5.93m 0.10h 0.00d # make downloads mkdir downloads find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP8wayFish.wigFix.gz # check integrity of data with wigToBigWig zcat downloads/phyloP8wayFish.wigFix.gz \ | wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \ phyloP8wayFish.bw bigWigInfo phyloP8wayFish.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 351,513,155 # primaryIndexSize: 16,520,056 # zoomLevels: 10 # chromCount: 959 # basesCovered: 264,308,757 # mean: 0.340395 # min: -1.499000 # max: 1.587000 # std: 0.617638 # encode those files into wiggle data zcat downloads/phyloP8wayFish.wigFix.gz \ | wigEncode stdin phyloP8wayFish.wig phyloP8wayFish.wib # Converted stdin, upper limit 1.59, lower limit -1.50 du -hsc *.wi? # 253M phyloP8wayFish.wib # 50M phyloP8wayFish.wig # 302M total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP8wayFish.wib /gbdb/danRer7/multiz8way/phyloP8wayFish.wib nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \ phyloP8wayFish phyloP8wayFish.wig # use to set trackDb.ra entries for wiggle min and max wigTableStats.sh danRer7 phyloP8wayFish # db.table min max mean count sumData stdDev viewLimits # danRer7.phyloP8wayFish -1.499 1.587 0.340395 264308757 8.99693e+07 0.617638 viewLimits=-1.499:1.587 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=danRer7 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP8wayFish > histogram.data 2>&1 # real 0m18.184s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer7 Histogram phyloP8wayFish track" set xlabel " phyloP8wayFish score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.002] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # download data for 8-way (DONE - 2011-01-19 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phastCons8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phastCons8way ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/cons/all/downloads/phastCons8way.wigFix.gz \ ./vertebrate.phastCons8way.wigFix.gz ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/cons/fish/downloads/phastCons8wayFish.wigFix.gz \ ./fish.phastCons8way.wigFix.gz ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/all/phastCons8way.bw \ ./vertebrate.phastCons8way.bw ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/cons/fish/phastCons8wayFish.bw \ ./fish.phastCons8way.bw ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/fish/fish.mod \ ./fish.mod ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/all/all.mod \ ./vertebrate.mod ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/README.txt . md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/cons/md5sum.txt ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/md5sum.txt . mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phyloP8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phyloP8way ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all/downloads/phyloP8way.wigFix.gz \ ./vertebrate.phyloP8way.wigFix.gz ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish/downloads/phyloP8wayFish.wigFix.gz \ ./fish.phastCons8way.wigFix.gz ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish/phyloP8wayFish.bw ./fish.phastCons8way.bw ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all/phyloP8way.bw ./vertebrate.phyloP8way.bw ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP/fish.mod \ ./fish.mod ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP/all.mod \ ./vertebrate.mod ln -s /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/README.txt . md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/md5sum.txt ln -s /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/md5sum.txt . cd /hive/data/genomes/danRer7/bed/multiz8way /cluster/bin/phast/tree_doctor --rename \ "danRer7->Zebrafish; gasAcu1->Stickleback; oryLat2->Medaka; tetNig2->Tetraodon; fr2->Fugu; xenTro2->X_Tropicalis; mm9->Mouse; hg19->Human" \ zFishOnTop.nh > commonNames.8way.nh mkdir /hive/data/genomes/danRer7/bed/multiz8way/downloads cd /hive/data/genomes/danRer7/bed/multiz8way/downloads cp -p ../anno/danRer7.8way.maf multiz8way.maf gzip multiz8way.maf # real 8m35.334s mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/multiz8way cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/multiz8way ln -s /hive/data/genomes/danRer7/bed/multiz8way/8way.nh . ln -s /hive/data/genomes/danRer7/bed/multiz8way/commonNames.8way.nh . ln -s \ /hive/data/genomes/danRer7/bed/multiz8way/downloads/multiz8way.maf.gz \ . ln -s /hive/data/genomes/danRer7/bed/multiz8way/downloads/README.txt . md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/downloads/md5sum.txt ln -s /hive/data/genomes/danRer7/bed/multiz8way/downloads/md5sum.txt . ############################################################################# # HUMAN (hg18) PROTEINS TRACK (DONE braney 2010-12-29) # bash if not using bash shell already ) cd /cluster/data/danRer7 mkdir /cluster/data/danRer7/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst danRer7.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 1371 pieces of 1371 written rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst danRer7.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 1421 mkdir -p /cluster/data/danRer7/bed/tblastn.hg18KG cd /cluster/data/danRer7/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 1421 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\) # 36727/(250000/1421) = 208.756268 mkdir -p kgfa split -l 209 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 176 176 2288 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/danRer7/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer7/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/danRer7/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 250096 of 250096 jobs # CPU time in finished jobs: 7759615s 129326.92m 2155.45h 89.81d 0.246 y # IO & Wait Time: 1198357s 19972.62m 332.88h 13.87d 0.038 y # Average job time: 36s 0.60m 0.01h 0.00d # Longest finished job: 1267s 21.12m 0.35h 0.01d # Submission to last job: 18191s 303.18m 5.05h 0.21d ssh swarm cd /cluster/data/danRer7/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 176 of 176 jobs # CPU time in finished jobs: 70348s 1172.47m 19.54h 0.81d 0.002 y # IO & Wait Time: 36253s 604.21m 10.07h 0.42d 0.001 y # Average job time: 606s 10.09m 0.17h 0.01d # Longest finished job: 15700s 261.67m 4.36h 0.18d # Submission to last job: 15708s 261.80m 4.36h 0.18d cd /cluster/data/danRer7/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 45598 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/danRer7/bed/tblastn.hg18KG hgLoadPsl danRer7 blastHg18KG.psl # check coverage featureBits danRer7 blastHg18KG # 21889813 bases of 1409770109 (1.553%) in intersection featureBits danRer7 blastHg18KG refGene -enrichment # blastHg18KG 1.553%, refGene 1.960%, both 0.816%, cover 52.58%, enrich 26.83x featureBits danRer7 blastHg18KG ensGene -enrichment # blastHg18KG 1.553%, ensGene 3.961%, both 1.392%, cover 89.68%, enrich 22.64x rm -rf blastOut #end tblastn ############################################################################# # GRC Incident database (DONE - 2011-02-10 - Hiram) # used to be NCBI Incident - changed to GRC Incident 2012-04-12 # this procedure is run as a cron job in Hiram's account: # 43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # using the two scrips there: runUpdate.sh and update.sh # which are checked into the source tree as files: # src/hg/utils/automation/grcIncidentUpdate.sh # src/hg/utils/automation/grcRunIncidentUpdate.sh # they fetch the XML files from NCBI, convert them to SQL text # files, construct a bigBed file, and pushes it to genomewiki if # it is an update from previous # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/6/64/DanRer7.grcIncidentDb.bb ######################################################################### # LASTZ Turkey MelGal1 ( DONE - 2011-04-01 - Chin) mkdir /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01 cd /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01 cat << '_EOF_' > DEF # Turkey vs Zebrafish # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Turkey melGal1 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # note: No -syntenicNet option below. screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 # real 179m52.632s cat fb.danRer7.chainMelGal1Link.txt # 35772051 bases of 1409770109 (2.537%) in intersection cd /hive/data/genomes/danRer7/bed ln -s lastzMelGal1.2011-04-01 lastz.melGal1 # running the swap mkdir /hive/data/genomes/melGal1/bed/blastz.danRer7.swap cd /hive/data/genomes/melGal1/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 3m28.672s cat fb.melGal1.chainDanRer7Link.txt # 25141503 bases of 935922386 (2.686%) in intersection cd /hive/data/genomes/melGal1/bed ln -s blastz.danRer7.swap lastz.danRer7 ############################################################################# # BUILD U MASS ChiP-Seq TRACKS (DONE 4/19/11, Fan) ssh hgwdev mkdir -p /hive/data/genomes/danRer7/bed/H3K4me cd /hive/data/genomes/danRer7/bed/H3K4me # Download the following files from http://lawsonlab.umassmed.edu/ChIP-SeqSupp.html: wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_input_signal_Zv9.WIG" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_signal_Zv9.WIG" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_signal_Zv9.WIG" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_peaks_Zv9.WIG" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_hotspots_Zv9.bed" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_peaks_Zv9.WIG" wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_hotspots_Zv9.bed" # build UMassInput track cat '24h_input_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassInput.wig.input wigEncode UMassInput.wig.input UMassInput.wig UMassInput.wib # Converted UMassInput.wig.input, upper limit 537.00, lower limit 10.00 mkdir /gbdb/danRer7/wib/ ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassInput.wib /gbdb/danRer7/wib/UMassInput.wib hgLoadWiggle danRer7 UMassInput UMassInput.wig #WARNING: Exceeded Zv9_NA205 size 14440 > 14336. dropping 11 data point(s) #WARNING: Exceeded Zv9_NA648 size 16200 > 16134. dropping 7 data point(s) #WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s) #WARNING: Exceeded chrM size 16730 > 16596. dropping 14 data point(s) # Build UMassME1 track cat '24h_me1_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME1.wig.input wigEncode UMassME1.wig.input UMassME1.wig UMassME1.wib # Converted UMassME1.wig.input, upper limit 406.00, lower limit 10.00 ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME1.wib /gbdb/danRer7/wib/UMassME1.wib hgLoadWiggle danRer7 UMassME1 UMassME1.wig #WARNING: Exceeded Zv9_NA205 size 14400 > 14336. dropping 7 data point(s) #WARNING: Exceeded Zv9_NA648 size 16220 > 16134. dropping 9 data point(s) #WARNING: Exceeded Zv9_NA796 size 3350 > 3298. dropping 6 data point(s) #WARNING: Exceeded Zv9_NA933 size 3040 > 3012. dropping 3 data point(s) #WARNING: Exceeded Zv9_NA949 size 3090 > 3033. dropping 6 data point(s) #WARNING: Exceeded Zv9_NA957 size 2920 > 2866. dropping 6 data point(s) #WARNING: Exceeded Zv9_NA967 size 6710 > 6697. dropping 2 data point(s) #WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s) # Build UMassME1Peak track cat '24h_me1_peaks_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME1Peak.wig.input wigEncode UMassME1Peak.wig.input UMassME1Peak.wig UMassME1Peak.wib # Converted UMassME1Peak.wig.input, upper limit 1599.00, lower limit 11.00 ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME1Peak.wib /gbdb/danRer7/wib/UMassME1Peak.wib hgLoadWiggle danRer7 UMassME1Peak UMassME1Peak.wig # Build UMassME3 track cat '24h_me3_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME3.wig.input wigEncode UMassME3.wig.input UMassME3.wig UMassME3.wib # Converted UMassME3.wig.input, upper limit 409.00, lower limit 10.00 ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME3.wib /gbdb/danRer7/wib/UMassME3.wib hgLoadWiggle danRer7 UMassME3 UMassME3.wig #WARNING: Exceeded Zv9_NA205 size 14440 > 14336. dropping 11 data point(s) #WARNING: Exceeded Zv9_NA648 size 16230 > 16134. dropping 10 data point(s) #WARNING: Exceeded Zv9_NA845 size 60950 > 60897. dropping 6 data point(s) #WARNING: Exceeded Zv9_NA850 size 26100 > 25985. dropping 12 data point(s) #WARNING: Exceeded Zv9_NA957 size 2870 > 2866. dropping 1 data point(s) #WARNING: Exceeded Zv9_NA959 size 1190 > 1142. dropping 5 data point(s) #WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s) # Build UMassME3Peak track cat '24h_me3_peaks_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME3Peak.wig.input wigEncode UMassME3Peak.wig.input UMassME3Peak.wig UMassME3Peak.wib # Converted UMassME3Peak.wig.input, upper limit 3749.00, lower limit 14.00 ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME3Peak.wib /gbdb/danRer7/wib/UMassME3Peak.wib hgLoadWiggle danRer7 UMassME3Peak UMassME3Peak.wig # Build UMassME1Hotspot track cat '24h_me1_hotspots_Zv9.bed' |grep -v 'hotspot' > UMassME1Hotspot.bed hgLoadBed danRer7 UMassME1Hotspot UMassME1Hotspot.bed # Loaded 65585 elements of size 5 # Build UMassME3Hotspot track cat '24h_me3_hotspots_Zv9.bed' |grep -v 'hotspot' > UMassME3Hotspot.bed hgLoadBed danRer7 UMassME3Hotspot UMassME3Hotspot.bed # Loaded 27546 elements of size 5 ############################################################################ # running cpgIsland business (DONE - 2011-07-22 - Hiram) mkdir /hive/data/genomes/danRer7/bed/cpgIsland cd /hive/data/genomes/danRer7/bed/cpgIsland ln -s ../../../danRer6/bed/cpgIsland/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../danRer7.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/danRer7/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | egrep "^chr|^Zv" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 1133 of 1133 jobs # CPU time in finished jobs: 102s 1.70m 0.03h 0.00d 0.000 y # IO & Wait Time: 4469s 74.49m 1.24h 0.05d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 485s 8.08m 0.13h 0.01d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | sed -e 's/_random//' \ | awk '{print length($0)}' | sort -rn | head -1 # 16 # update the length 14 in the template to be 16: sed -e "s/14/16/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/danRer7/bed/cpgIsland hgLoadBed danRer7 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Loaded 12683 elements of size 10 featureBits danRer7 cpgIslandExt # 5207568 bases of 1409770109 (0.369%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt danRer7 # cleanup rm -fr hardMaskedFa ############################################################################ # CREATE LIFTOVER CHAINS FROM danRer7 TO danRer6 (DONE - 2011-09-09 - Hiram) doSameSpeciesLiftOver.pl -debug \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/danRer7/jkStuff/danRer7.11.ooc \ danRer7 danRer6 cd /hive/data/genomes/danRer7/bed/blat.danRer6.2011-09-09 time doSameSpeciesLiftOver.pl \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/danRer7/jkStuff/danRer7.11.ooc \ danRer7 danRer6 > do.log 2>&1 & # real 166m45.904s # test the procedure in the genome-test browser ######################################################################### # LASTZ Xenopus xenTro3 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20 cd /hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20 cat << '_EOF_' > DEF # zebrafish vs. X. tropicalis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish danRer7 SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit SEQ1_LEN=/scratch/data/danRer7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: X. tropicalis xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 1021m38.611s cat fb.danRer7.chainXenTro3Link.txt # 91771881 bases of 1409770109 (6.510%) in intersection cd /hive/data/genomes/danRer7/bed ln -s lastzXenTro3.2011-09-20 lastz.xenTro3 mkdir /hive/data/genomes/xenTro3/bed/blastz.danRer7.swap cd /hive/data/genomes/xenTro3/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 67m46.506s cat fb.xenTro3.chainDanRer7Link.txt # 96539932 bases of 1358334882 (7.107%) in intersection ######################################################################## # lastz rn5 rat (DONE - 2012-11-14 - Hiram) # the original alignment: cd /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13 cat fb.rn5.chainDanRer7Link.txt # 66573209 bases of 2572853723 (2.588%) in intersection # and this swap: mkdir /hive/data/genomes/danRer7/bed/blastz.rn5.swap cd /hive/data/genomes/danRer7/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 15m5.664s cat fb.danRer7.chainRn5Link.txt # 68007020 bases of 1409770109 (4.824%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/danRer7/bed ln -s blastz.rn5.swap lastz.rn5 #########################################################################