# for emacs: -*- mode: sh; -*-

# Cavia porcellus -- Broad Institute Release 3.0 (Feb 12 2008)
#   Template from bosTau4.txt

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-04-02 - Tim and Kate)
    ssh kkstore05
    mkdir /cluster/store12/cavPor3
    ln -s /cluster/store12/cavPor3 /cluster/data
    mkdir /cluster/data/cavPor3/broad
    cd /cluster/data/cavPor3/broad
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.quals.gz
    # Not helpful:
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly_supers.qual.gz

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin cavPor3.qual.qac

########## From broad inst. cavPor3/BasicStats.out
# --------------------------------------------------------------------------------
# Sat Feb 02 03:52:11 2008 run (pid=21060), using Tue Jan 22 11:07:31 EST 2008 make
# BasicStats PRE=/wga/dev/WGAdata DATA=projects/Guineapig RUN=run/work \
#     SUBDIR=post5 QUAL_STATS=True OUTFILE=BasicStats.out
# --------------------------------------------------------------------------------
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 8 gaps <= -1000; 0 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.0222%
# 667 gaps > 10kb, 60 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 93.8% of reads were used in the assembly (95.55% of bases, 96.3% of Q20 bases)
# 0.00512% of reads were used multiply in the assembly
# 61603 contigs, having N50 length 80583
# total contig length: 2663352932, spanning 2722377657 bases (with 2.17% in gaps)
# 3143 supercontigs, having N50 length 27408292 (not including gaps)
# 2.48% of assembly in supers of size < 200000 (67399955 bases)
# Assembly base coverage: 6.79X.  Assembly Q20 coverage: 5.99X.
# 100% of bases have q >= 1
# 99.57% of bases have q >= 20
# 99.02% of bases have q >= 30
# 98.4% of bases have q >= 40
# 97.72% of bases have q >= 50

    cut -f 1 assembly.agp | uniq -c | wc -l
    # Number of scaffolds: 3143
    cut -f 1 assembly.agp | uniq -c | sort -b -n -r | cut -c 1-8 \
        | uniq -c | sort -n -r -b | head -3
    # Number of scaffolds with single contig: 2176, thus 967 multi-contig scaffolds

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kkstore05
    cd /cluster/data/cavPor3

    cat << _EOF_ >cavPor3.config.ra
# Config parameters for makeGenomeDb.pl:
db cavPor3
clade mammal
genomeCladePriority 35
scientificName Cavia porcellus
commonName Guinea Pig
assemblyDate Feb. 2008
assemblyLabel Broad Institute cavPor3
orderKey 99
#mitoAcc AJ222767
mitoAcc 5679797
fastaFiles /cluster/data/cavPor3/broad/assembly.bases.gz
agpFiles /cluster/data/cavPor3/broad/assembly.agp
qualFiles /cluster/data/cavPor3/broad/cavPor3.qual.qac
dbDbSpeciesDir guineaPig
_EOF_

    # use 'screen'; make sure on kkstore05
    makeGenomeDb.pl -verbose=2 cavPor3.config.ra > makeGenomeDb.out 2>&1 &
    # 'ctl-a ctl-d' returns to previous shell

    cut -f 2 chrom.sizes | ave stdin
    # Q1 7169.500000
    # median 13298.000000
    # Q3 55788.500000
    # average 866164.007952
    # min 3002.000000
    # max 88675666.000000
    # count 3144
    # total 2723219641.000000
    # standard deviation 5316243.688364
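    # A quick cross-check (hypothetical sketch, not run at the time): recompute the
    # scaffold N50 from chrom.sizes with awk.  This counts gap bases, so it will not
    # exactly match the Broad "supercontig N50 (not including gaps)" figure above.
    sort -k2,2nr chrom.sizes | awk \
        '{tot += $2; len[NR] = $2}
         END {half = tot/2; run = 0;
              for (i = 1; i <= NR; i++) {
                  run += len[i];
                  if (run >= half) {print "scaffold N50:", len[i]; exit}
              }}'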
# NOTES -- STUFF THAT YOU WILL HAVE TO DO --
#
# Template trackDb.ra and .html's have been created, but they all need editing!
#
#   cd /cluster/data/cavPor3/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/guineaPig/cavPor3
#
# Search for '***' notes in each file and make corrections (sometimes the
# files used for a previous assembly might make a better template):
#   description.html /cluster/data/cavPor3/html/{trackDb.ra,gap.html,gold.html}
#
# Then cd ../.. (to trackDb/) and
#  - edit makefile to add cavPor3 to DBS.
#  - (if necessary) cvs add guineaPig
#  - cvs add guineaPig/cavPor3
#  - cvs add guineaPig/cavPor3/*.{ra,html}
#  - cvs ci -m "Added cavPor3 to DBS." makefile
#  - cvs ci -m "Initial descriptions for cavPor3." guineaPig/cavPor3
#  - (if necessary) cvs ci guineaPig
    cvs ci -m "Initial description page for browser gateway of Cavia porcellus (guinea pig)" description.html
    cvs ci -m "Initial description page of gap location track for Cavia porcellus (guinea pig)" gap.html
    cvs ci -m "Initial description page of assembly track for Cavia porcellus (guinea pig)" gold.html
#  - Run make update DBS=cavPor3 and make alpha when done.
#  - (optional) Clean up /cluster/data/cavPor3/TemporaryTrackDbCheckout
#  - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.

#########################################################################
# REPEATMASKER (DONE - 2008-04-05 - Tim)
    ssh kkstore06
    screen   # use a screen to manage this job
    mkdir /cluster/data/cavPor3/bed/repeatMasker
    cd /cluster/data/cavPor3/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/cavPor3/bed/repeatMasker \
        cavPor3 > do.log 2>&1 &
    # Note: can run simpleRepeats simultaneously

#### When done with RM:
    ssh pk
    para time
    # CPU time in finished jobs: 26236391s 437273.18m 7287.89h 303.66d 0.832 y
    # IO & Wait Time: 548808s 9146.80m 152.45h 6.35d 0.017 y
    # Average job time: 4735s 78.91m 1.32h 0.05d
    # Longest running job: 0s 0.00m 0.00h 0.00d
    # Longest finished job: 5871s 97.85m 1.63h 0.07d
    # Submission to last job: 130217s 2170.28m 36.17h 1.51d

    time nice -n +19 featureBits cavPor3 rmsk > fb.cavPor3.rmsk.txt 2>&1 &
    # 732765485 bases of 2663369733 (27.513%) in intersection

    # RepeatMasker and lib version from do.log:
    # RepeatMasker version development-$Id: cavPor3.txt,v 1.22 2010/04/21 19:22:27 hiram Exp $
    # Jan 11 2008 (open-3-1-9) version of RepeatMasker
    # CC RELEASE 20071204;

    # Compare coverage to previous assembly:
    # skip this since cavPor2 was never added to browser
    #featureBits cavPor2 rmsk

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-04-04 - Tim)
    ssh kkstore05
    screen   # use a screen to manage this job
    mkdir /cluster/data/cavPor3/bed/simpleRepeat
    cd /cluster/data/cavPor3/bed/simpleRepeat
    doSimpleRepeat.pl -buildDir=/cluster/data/cavPor3/bed/simpleRepeat \
        cavPor3 > do.log 2>&1 &

#### When done
    ssh pk
    para time
    # Completed: 73 of 73 jobs
    # CPU time in finished jobs: 13220s 220.33m 3.67h 0.15d 0.000 y
    # IO & Wait Time: 679s 11.32m 0.19h 0.01d 0.000 y
    # Average job time: 190s 3.17m 0.05h 0.00d
    # Longest running job: 0s 0.00m 0.00h 0.00d
    # Longest finished job: 397s 6.62m 0.11h 0.00d
    # Submission to last job: 1320s 22.00m 0.37h 0.02d

    featureBits cavPor3 simpleRepeat
    # 33641656 bases of 2663369733 (1.263%) in intersection

    # after RM run is done, add this mask:
    cd /cluster/data/cavPor3
    twoBitMask cavPor3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed cavPor3.2bit

    twoBitToFa cavPor3.2bit stdout | faSize stdin
    # 2723219641 bases (59849908 N's 2663369733 real 1930234123 upper 733135610 lower)
    #     in 3144 sequences in 1 files
    # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    # N count: mean 19036.2 sd 100029.3
    # U count: mean 613942.2 sd 3764766.9
    # L count: mean 233185.6 sd 1462738.3
    # %26.92 masked total, %27.53 masked real
    # >>> NOTE: 2723219641 bases matches chrom.sizes above, but not the featureBits
    # >>> denominator of 2663369733, because featureBits counts only non-gap (real)
    # >>> bases and excludes the 59849908 N's.

    twoBitToFa cavPor3.rmsk.2bit stdout | faSize stdin
    # 2723219641 bases (59849908 N's 2663369733 real 1931053208 upper 732316525 lower)
    #     in 3144 sequences in 1 files
    # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    # N count: mean 19036.2 sd 100029.3
    # U count: mean 614202.7 sd 3766351.7
    # L count: mean 232925.1 sd 1461161.8
    # %26.89 masked total, %27.50 masked real

#########################################################################
# Link to it from /gbdb: (DONE - 2008-04-10 - Tim)
    ln -s /cluster/data/cavPor3/cavPor3.2bit /gbdb/cavPor3/cavPor3.2bit

#########################################################################
# Create OOC file for genbank runs (DONE - 2008-04-07 - Tim)
    # use same repMatch value as bosTau2
    ssh kkstore05
    cd /cluster/data/cavPor3
    blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=cavPor3.11.1005.ooc -repMatch=1005
    # Wrote 26905 overused 11-mers to cavPor3.11.1005.ooc
    # -rw-rw-r-- 1 tdreszer protein 107628 Apr  7 13:22 cavPor3.11.1005.ooc
    blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=cavPor3.11.ooc -repMatch=1024
    # Wrote 25815 overused 11-mers to cavPor3.11.ooc
    # -rw-rw-r-- 1 tdreszer protein 103268 Apr  7 13:17 cavPor3.11.ooc

    ssh kkr1u00
    mkdir /iscratch/i/cavPor3
    cd /iscratch/i/cavPor3
    cp -p /cluster/data/cavPor3/cavPor3.2bit .
    for R in 2 3 4 5 6 7 8
    do
        rsync -a --progress ./ kkr${R}u00:/iscratch/i/cavPor3/
    done

#########################################################################
# Run WindowMasker because RepeatMasker coverage is low (DONE - 2008-04-08 - Tim)
    screen
    mkdir /cluster/data/cavPor3/bed/windowMasker
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/windowMasker
    nice doWindowMasker.pl -workhorse=kolossus \
        -buildDir=/cluster/data/cavPor3/bed/windowMasker cavPor3 > wmRun.log 2>&1 &

    # load this initial data to get ready to clean it
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/windowMasker
    hgLoadBed cavPor3 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 14554107 elements of size 3

    # eliminate the gaps from the masking
    featureBits cavPor3 -not gap -bed=notGap.bed
    # 2663369733 bases of 2663369733 (100.000%) in intersection
    featureBits cavPor3 windowmaskerSdust
    # 990884347 bases of 2663369733 (37.204%) in intersection
    time nice -n +19 featureBits cavPor3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    # 931034439 bases of 2663369733 (34.957%) in intersection

    # reload track to get it clean
    hgLoadBed cavPor3 windowmaskerSdust cleanWMask.bed.gz
    # Loaded 14547726 elements of size 4
    featureBits cavPor3 windowmaskerSdust
    # 931034439 bases of 2663369733 (34.957%) in intersection

    # mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../cavPor3.unmasked.2bit stdin \
            -type=.bed cavPor3.cleanWMSdust.2bit
    twoBitToFa cavPor3.cleanWMSdust.2bit stdout | faSize stdin \
        > cavPor3.cleanWMSdust.faSize.txt
    cat cavPor3.cleanWMSdust.faSize.txt
    # 2723219641 bases (59849908 N's 2663369733 real 1732335294 upper 931034439 lower)
    #     in 3144 sequences in 1 files
    # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    # N count: mean 19036.2 sd 100029.3
    # U count: mean 550997.2 sd 3547541.7
    # L count: mean 296130.5 sd 1686844.5
    # %34.19 masked total, %34.96 masked real
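    # A hedged follow-up (not run at the time): featureBits with two tables reports
    # their intersection, which would show how much of the WindowMasker coverage is
    # already covered by RepeatMasker, and hence what WM adds on this low-RM genome:
    featureBits cavPor3 rmsk windowmaskerSdust
    featureBits cavPor3 rmsk
    featureBits cavPor3 windowmaskerSdust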
#########################################################################
# Starting Genbank (DONE - 2008-04-10 - Tim)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    # edit etc/genbank.conf to add the following entry:

# cavPor3 (C. porcellus) 3144 scaffolds: 967 are multi-contig and 2176 are single contig
cavPor3.serverGenome = /cluster/data/cavPor3/cavPor3.2bit
cavPor3.clusterGenome = /iscratch/i/cavPor3/cavPor3.2bit
cavPor3.ooc = /cluster/data/cavPor3/cavPor3.11.ooc
cavPor3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
cavPor3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
cavPor3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
cavPor3.lift = no
cavPor3.downloadDir = cavPor3
cavPor3.perChromTables = no
cavPor3.refseq.mrna.native.load = no
cavPor3.refseq.mrna.xeno.load = yes
cavPor3.genbank.mrna.xeno.load = yes
cavPor3.genbank.est.native.load = yes

    cvs ci -m "Added cavPor3." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update
    cvs ci -m "Turned off native and on xeno." etc/genbank.conf

    # Edit src/lib/gbGenome.c to add the new species:
    #   static char *cavPorNames[] = {"Cavia porcellus", NULL};
    #   static struct dbToSpecies dbToSpeciesMap[] = { ...
    # >>>   {"cavPor", cavPorNames},
    cvs ci -m "Added guinea pig." src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen   # control this business with a screen since it takes a while
    cd /cluster/data/genbank
    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.
    time nice -n +19 bin/gbAlignStep -initial cavPor3 &
    # logFile: var/build/logs/2008.03.10-14:14:43.cavPor3.initalign.log
    # real    567m7.431s
    # > For comparison: logFile: var/build/logs/2008.04.08-14:37:23.bosTau4.initalign.log
    # > real    45m6.595s
    # Batch failed after 4 tries on /cluster/genbank/genbank/bin/gbBlat
    #   genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.job
    #   genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.psl
    # command failed:
    #   ssh -x kk cd /cluster/data/genbank/build/work/initial.cavPor3/align\; \
    #     para make -maxPush=200000 align.jobs

    # >>    {"cavPor", cavPorNames},
    #cvs ci -m "Added guinea pig." src/lib/gbGenome.c
    make install-server

### Mark Diekhans recommends:
    # You are not really realigning, but aligning additional.  If you
    # just kick off an initial alignment, it *should* align the
    # missing partitions of the data.  However, sometimes weird things
    # happen when a new, full release is made, and a new refseq just
    # came out.  So kick off the alignment and let me know when it's
    # done and I can take a look.
    #
    # Also, to just reload everything, include the -drop flag on your
    # command line.  That is usually easiest; why waste your time
    # figuring out weird cases when the computer can just do the work.

    ssh genbank
    screen   # control this business with a screen since it takes a while
    cd /cluster/data/genbank
    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.
    time nice -n +19 bin/gbAlignStep -initial cavPor3 &
    # logFile: var/build/logs/2008.05.22-15:44:34.cavPor3.initalign.log
    # real    182m21.821s
    # gbAlignInstall: complete: real=2.35
    # genbank 2008.05.22-18:46:56 cavPor3.initalign: finish
    # Mark Diekhans: Looks good, load it.
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad cavPor3 &
    # logFile: var/dbload/hgwdev/logs/2008.05.23-11:39:52.dbload.log
    # real    17m53.123s

############################################################################
# Set default position (DONE - 2008-04-10 - Tim)
    # Reset default position to Math1=ATOH1=NM_005172=scaffold_15:19047435-19048046
    hgsql -e \
        'update dbDb set defaultPos="scaffold_15:19047434-19048046" where name="cavPor3";' \
        hgcentraltest
    # And there was a mistake in the description date entry
    hgsql -e \
        'update dbDb set description="Feb. 2008" where name="cavPor3";' \
        hgcentraltest

#########################################################################
## genscan run (DONE - 2008-04-10 - Tim)
    ## create hard-masked sequence
    ssh kkstore05
    cd /cluster/data/cavPor3
    mkdir hardMasked
    for C in `cut -f1 chrom.sizes`
    do
        echo "hardMasked/${C}.hard.fa"
        twoBitToFa -seq=${C} cavPor3.2bit stdout \
            | maskOutFa stdin hard hardMasked/${C}.hard.fa
        ls -ld "hardMasked/${C}.hard.fa"
    done

    # And, make sure there aren't any sequences in this lot that have
    # become all N's with no sequence left in them.  This drives genscan nuts.
    echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt
    # the lowest three are:
    egrep -v "^#|^total" faCount.hard.txt \
        | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
    # scaffold_2949 425
    # scaffold_3101 357
    # scaffold_3130 0
    rm hardMasked/scaffold_3130.hard.fa

##### Saving this junk from bosTau4.txt because it will be useful next time
##  # There are a whole bunch of these, and many with just a few bases.
##  # Actually, before removing these for genscan, run the cpgIsland
##  # business first since it can work on them all.
##  # So, remove any with less than 100 bases of sequence
##  egrep -v "^#|^total" faCount.hard.txt \
##      | awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' \
##      | xargs rm

    # now get them over to a kluster location
    mkdir /san/sanvol1/scratch/cavPor3/hardChunks
    cd /san/sanvol1/scratch/cavPor3/hardChunks
    # creating 4,000,000 sized chunks; the chroms stay together as
    # single pieces.  The contigs get grouped together into 4,000,000
    # sized fasta files.  You don't want to break these things up
    # because genscan will be doing its own internal 2.4 million
    # window on these pieces, and the gene names are going to be
    # constructed from the sequence name in these fasta files.
    echo /cluster/data/cavPor3/hardMasked/*.hard.fa | xargs cat \
        | faSplit about stdin 4000000 c_
    ls c_*

    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/genscan
    cd /cluster/data/cavPor3/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh memk
    cd /cluster/data/cavPor3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt

    # Generate a list file, genome.list, of all the hard-masked contigs that
    # *do not* consist of all-N's (which would cause genscan to blow up).
    # Since we split on gaps, we have no chunks like that.  You can
    # verify with faCount on the chunks.
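    # As suggested above, a minimal faCount check (chunk names assumed to be the
    # c_*.fa files written by faSplit) that no chunk ended up all N's; faCount's
    # column 2 is total length and column 7 is the N count:
    faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_*.fa \
        | egrep -v "^#|^total" | awk '$2 == $7 {print $1, "is all Ns"}'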
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_*.fa > genome.list

    # Create run-time script to operate gsBig in a cluster-safe manner
    cat << \_EOF_ > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
_EOF_
    # << happy emacs
    chmod +x runGsBig

    cat << \_EOF_ > template
#LOOP
runGsBig /san/sanvol1/scratch/cavPor3/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
_EOF_
    # << happy emacs

    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] para check
## 172 jobs in batch
## 13 jobs (including everybody's) in Parasol queue.
## Checking finished jobs
## crashed: 7
## ranOk: 165
## total jobs in batch: 172
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] para status | grep -v done
## 172 jobs in batch
## 13 jobs (including everybody's) in Parasol queue.
## Checking finished jobs
## #state tries real cpu host jobid cmd
## crash 1 60.00 55.30 mkr0u0 825047 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_19.fa gtf/c_19.gtf pep/c_19.pep subopt/c_19.bed
## crash 1 215.00 213.48 mkr0u6 825117 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_152.fa gtf/c_152.gtf pep/c_152.pep subopt/c_152.bed
## crash 1 97.00 96.01 mkr0u0 825153 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_111.fa gtf/c_111.gtf pep/c_111.pep subopt/c_111.bed
## crash 1 149.00 146.78 mkr0u0 825158 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_107.fa gtf/c_107.gtf pep/c_107.pep subopt/c_107.bed
## crash 1 638.00 636.81 mkr0u3 825178 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_71.fa gtf/c_71.gtf pep/c_71.pep subopt/c_71.bed
## crash 1 904.00 901.75 mkr0u6 825185 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_32.fa gtf/c_32.gtf pep/c_32.pep subopt/c_32.bed
## crash 1 1238.00 1232.05 mkr0u6 825196 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_01.fa gtf/c_01.gtf pep/c_01.pep subopt/c_01.bed
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] para problems | grep Insuff
## Insufficient memory error: results may be unreliable.
##   (repeated 7 times, once for each crashed job)
####### However, these are not simply the largest files:
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa
## -rw-rw-r-- 1 tdreszer protein  4408036 Apr 8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa
## -rw-rw-r-- 1 tdreszer protein  8304826 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa
## -rw-rw-r-- 1 tdreszer protein 18155094 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa
## -rw-rw-r-- 1 tdreszer protein 20080719 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa
## -rw-rw-r-- 1 tdreszer protein 36860211 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa
## -rw-rw-r-- 1 tdreszer protein 49549701 Apr 8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa
## [tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa
## -rw-rw-r-- 1 tdreszer protein 82754438 Apr 8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa

    # Okay, try splitting the failed c_*.fa files into individual sequences (rt=retry)
    ssh memk
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit sequence c_19.fa 1000 c_rt19_
    ## Wrong one! (c_152 was the crashed chunk; it gets split below under "missed one")
    faSplit sequence c_151.fa 1000 c_rt151_
    faSplit sequence c_111.fa 1000 c_rt111_
    faSplit sequence c_107.fa 1000 c_rt107_
    faSplit sequence c_71.fa 1000 c_rt71_
    faSplit sequence c_32.fa 1000 c_rt32_
    faSplit sequence c_01.fa 1000 c_rt01_
    popd   # cd /cluster/data/cavPor3/bed/genscan

    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*.fa > genome_rt.list
    gensub2 genome_rt.list single template jobList_rt
    para create jobList_rt
    para try, check, push, check, ...
    # Checking finished jobs
    # crashed: 6
    # ranOk: 71
    # total jobs in batch: 77

    ## missed one:
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit sequence c_152.fa 1000 c_rt152_
    popd   # cd /cluster/data/cavPor3/bed/genscan
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt152*.fa > genome_rt2.list
    gensub2 genome_rt2.list single template jobList_rt2
    para create jobList_rt2
    para try, check, push, check, ...
    # Checking finished jobs
    # crashed: 1
    # ranOk: 10
    # total jobs in batch: 11

######## After breaking failed chunks by sequence, the 7 culprit scaffolds are known:
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa
    #seq          len       A         C         G         T         N         cpg
    #scaffold_115 4065651   736071    696850    700707    740890    1191133   60230
    #scaffold_81  7649460   1502224   1297711   1285486   1482474   2081565   87359
    #scaffold_45  15715546  2926604   2362405   2351615   2933678   5141244   126807
    #scaffold_41  16993173  3764581   2421095   2432646   3805601   4569250   103547
    #scaffold_21  33939536  7356221   4629107   4649855   7388452   9915901   190985
    #scaffold_13  48363610  10226185  7754898   7730208   10253556  12398763  432844
    #scaffold_1   81131790  17393288  12255795  12237937  17366862  21877908  561145

    ### Punting.  Hiram suggests that we just wait till the alignments fill us in.
    # At this point, all scaffolds have been successfully genscanned EXCEPT the 7 listed above.

    # cat and lift the results into single files
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan.gtf
    # Read 64818 transcripts in 416957 lines in 1 files
    #   64818 groups 2179 seqs 1 sources 1 feature types
    # 64818 gene predictions
    hgPepPred cavPor3 generic genscanPep genscan.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt.bed
    # Loaded 537376 elements of size 6

    # let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    # 76489707 bases of 2663369733 (2.872%) in intersection
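    # A hedged sanity check that was not done at this point (QA caught the gap later,
    # next section): compare scaffold names in the loaded gtf against chrom.sizes to
    # list scaffolds with no genscan predictions at all (very small scaffolds may
    # legitimately be absent):
    cut -f1 genscan.gtf | sort -u > gotPreds.lst
    cut -f1 /cluster/data/cavPor3/chrom.sizes | sort > allSeqs.lst
    comm -13 gotPreds.lst allSeqs.lst    # sequences missing from genscan.gtf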
#########################################################################
## genscan RErun (BEG: 2008-05-22  DONE: 2008-05-22 - Tim)
###
### QA found missing genscan data, so retrying the unlucky 7 by splitting on gaps
    ssh memk
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit gap c_rt01_0000.fa 1000000 c_rt_01_0000_gap_
    faSplit gap c_rt32_0005.fa 1000000 c_rt_32_0005_gap_
    faSplit gap c_rt71_0025.fa 1000000 c_rt_71_0025_gap_
    faSplit gap c_rt107_0010.fa 1000000 c_rt_107_0010_gap_
    faSplit gap c_rt111_0010.fa 1000000 c_rt_111_0010_gap_
    faSplit gap c_rt151_0010.fa 1000000 c_rt_151_0010_gap_
    faSplit gap c_rt19_0010.fa 1000000 c_rt_19_0010_gap_
    popd   # cd /cluster/data/cavPor3/bed/genscan

    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*_gap_*.fa > genome_gap.list
    gensub2 genome_gap.list single template jobList_gap
    para create jobList_gap
    para try, check, push, check, ...
    # 219 jobs in batch
    # 52852 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # .......
    # ranOk: 219
    # total jobs in batch: 219

    # cat and lift the results into single files
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4.4n gtf/c_*.gtf > genscan_full.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed
    cat pep/c_*.pep > genscan_full.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan_full.gtf
    # Read 70478 transcripts in 454271 lines in 1 files
    #   70478 groups 2398 seqs 1 sources 1 feature types
    # 70478 gene predictions
    hgPepPred cavPor3 generic genscanPep genscan_full.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed
    # Loaded 605836 elements of size 6

    # let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    # 76489707 bases of 2663369733 (2.872%) in intersection

    ### Total defeat.  faSplit on gaps results in new sequence names that do not
    ### match the known scaffold_nnnn sequences.

    # Clean up the mess:
    cd /cluster/data/cavPor3/bed/genscan
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    rm c_rt_*_gap_*.fa
    popd   # cd /cluster/data/cavPor3/bed/genscan
    rm gtf/c_rt_*_gap_*.gtf
    rm pep/c_rt_*_gap_*.pep
    rm subopt/c_rt_*_gap_*.bed

    # Try the seven culprits one more time.
    cat << \_EOF_ > genome_oneMoreTime.list
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa
_EOF_

    ssh memk
    cd /cluster/data/cavPor3/bed/genscan
    gensub2 genome_oneMoreTime.list single template jobList_oneMoreTime
    para create jobList_oneMoreTime
    para push, check, ...
    # para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state tries real cpu host jobid cmd
    # crash 1 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # crash 1 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # crash 1 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # crash 1 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # crash 1 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # crash 1 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed
    # One down, 6 to go.
    # Try lowering window size on gsBig:
    mv runGsBig runGsBig.24
    cat << \_EOF_ > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=1200000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
_EOF_
    # << happy emacs
    chmod +x runGsBig

    para push, check, ...
    # para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state tries real cpu host jobid cmd
    # done 2 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # crash 2 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # done 2 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # done 2 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # crash 2 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # done 2 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed
    # All but two!

    para push, check, ...
    # para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state tries real cpu host jobid cmd
    # done 2 58.00 54.08 mkr0u5 1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done 1 228.00 224.16 mkr0u6 1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # done 3 57.00 52.47 mkr0u3 1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # done 2 106.00 99.41 mkr0u7 1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # done 2 617.00 602.73 mkr0u5 1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # done 3 922.00 896.75 mkr0u3 1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # done 2 1307.00 1258.05 mkr0u7 1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed
    ### All completed!
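    # For the record, a hedged alternative that might have rescued the gap-split
    # attempt above (assumes faSplit's -lift option and liftUp's gtf/bed handling
    # behave as in other make docs; file names are illustrative; this was not run):
    #   faSplit gap c_rt01_0000.fa 1000000 c_rt_01_0000_gap_ -lift=c_rt_01_0000_gap.lft
    #   ... run runGsBig on the gap pieces, then lift back to scaffold coordinates:
    #   liftUp lifted.gtf c_rt_01_0000_gap.lft warn gtf/c_rt_01_0000_gap_*.gtf
    #   liftUp lifted.bed c_rt_01_0000_gap.lft warn subopt/c_rt_01_0000_gap_*.bed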
    # cat and lift the results into single files
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4.4n gtf/c_*.gtf > genscan_full.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed
    cat pep/c_*.pep > genscan_full.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan_full.gtf
    # Read 70161 transcripts in 453022 lines in 1 files
    #   70161 groups 2185 seqs 1 sources 1 feature types
    # 70161 gene predictions
    hgPepPred cavPor3 generic genscanPep genscan_full.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed
    # Loaded 603786 elements of size 6

    # let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    # 82960328 bases of 2663369733 (3.115%) in intersection

#########################################################################
# CPGISLANDS (DONE - 2008-04-10 - Tim)
    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/cpgIsland
    cd /cluster/data/cavPor3/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    # There was a problem in here, in both cpg.c and cpg_lh.c:
    #   cpg_lh.c:74: warning: conflicting types for built-in function 'malloc'
    #   warning: conflicting types for built-in function 'malloc'
    # commented out line 74 ONLY to get this to build
    # gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .

    # There may be warnings about "bad character" for IUPAC ambiguous
    # characters like R, S, etc.  Ignore the warnings.
    mkdir results
    echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F
    do
        FA=${F/*\/}
        C=${FA/.hard.fa/}
        echo "./cpglh.exe ${FA} > results/${C}.cpg"
        nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg
    done > cpglh.out 2>&1 &
    # about 5 minutes

    # Transform cpglh output to bed +
    # (the awk below emits the cpgIslandExt columns: chrom, chromStart, chromEnd,
    #  name, length, cpgNum, gcNum, perCpg, perGc, obsExp)
    cat << \_EOF_ > filter.awk
{
    $2 = $2 - 1;
    width = $3 - $2;
    printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5, $6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
_EOF_
    # << happy emacs
    catDir results | awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/cavPor3/bed/cpgIsland
    hgLoadBed cavPor3 cpgIslandExt -tab \
        -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Loaded 36373 elements of size 10
    featureBits cavPor3 cpgIslandExt
    # 21746514 bases of 2663369733 (0.817%) in intersection

#########################################################################
# READ before BLASTZ: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment

#########################################################################
# BLASTZ/CHAIN/NET Hg18-cavPor3 (START - 2008-04-10 - Tim; DONE 2008-04-15)
    ssh kkstore02   # store11->kkstore02-10
    screen          # use a screen to manage this multi-day job
    mkdir /cluster/data/hg18/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/hg18/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy
    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -syntenicNet > do.log 2>&1
    # ps -ef | grep blastzCavPor3
    # real    1320m35.523s
    # HOWEVER, doBlastzChainNet failed because the run.blastz/para.results were corrupt.
    cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/run.blastz
    para recover jobList jobListRecovered
    # wc -l jobListRecovered -> 9
    para create jobListRecovered
    para push
    # completed all jobs.  Now:
    ssh kkstore02
    screen -r -d
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -continue=cat -syntenicNet > do.log 2>&1 &
    # real    210m38.768s   # low because of the restart that had to occur
    tail do.log
    # *** All done!
    # *** Make sure that goldenPath/hg18/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
    cat fb.hg18.chainCavPor3Link.txt
    # 1267036494 bases of 2881515245 (43.971%) in intersection

######### Change locations in DEF due to Hiram's new methods
    cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10
    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.hg18.swap
    ssh kkstore05   # where cavPor3 is located
    screen
    cd /cluster/data/cavPor3/bed/blastz.hg18.swap
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -syntenicNet > do.log 2>&1
    # real    216m13.808s
    # chainSplit chain cavPor3.hg18.all.chain.gz
    # sh: pipe error: Too many open files
    # Can't open chain/scaffold_621.chain to append: Too many open files
    # broke down during netSynteny.csh due to too many open files on a chainSplit.
    # However, there is no need to split when we have scaffolds.
    # Kate fixes doBlastzChainNet.pl and retry:
    cd /cluster/data/cavPor3/bed/blastz.hg18.swap
    time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=syntenicNet -syntenicNet > do.log 2>&1 &
    # real    29m49.617s
    # *** All done!
    # *** Make sure that (/usr/local/apache/htdocs/goldenPath/)cavPor3/vsHg18/README.txt is accurate.
    # *** Add {chain,net}Hg18 tracks to trackDb.ra if necessary.
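    # A hedged illustration of the sort of trackDb.ra stanzas the note above refers
    # to (labels, grouping and exact settings are guesses here -- copy the conventions
    # from an existing chain/net pair in another assembly's trackDb instead):
    #   track chainHg18
    #   shortLabel Human Chain
    #   longLabel Human (Mar. 2006/hg18) Chained Alignments
    #   group compGeno
    #   visibility hide
    #   spectrum on
    #   type chain hg18
    #   otherDb hg18
    #
    #   track netHg18
    #   shortLabel Human Net
    #   longLabel Human (Mar. 2006/hg18) Alignment Net
    #   group compGeno
    #   visibility hide
    #   type netAlign hg18 chainHg18
    #   otherDb hg18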
    cat fb.cavPor3.chainHg18Link.txt
    # 1281925834 bases of 2663369733 (48.132%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET mm9-cavPor3 (START - 2008-04-10 - DONE 2008-04-14 - Tim)
    ssh kkstore06   # store4->kkstore04-10
    screen          # use a screen to manage this multi-day job
    mkdir /cluster/data/mm9/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/mm9/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy
    # NOTE: be sure to ls the data in above script on workhorse machine before starting script

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -syntenicNet > do.log 2>&1
    # ps -ef | grep blastzCavPor3
    # real    1764m55.155s
    tail do.log
    # *** All done!
    # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
    # blastz: ranOk: 52984
    cat fb.mm9.chainCavPor3Link.txt
    # 757283793 bases of 2620346127 (28.900%) in intersection

    cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10
######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap
    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -syntenicNet > do.log 2>&1 &
    # real    166m53.671s
    # Exit 25   time nice -n +19 doBlastzChainNet.pl /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC
    # Can't open chain/scaffold_795.chain to append: Too many open files
    # gzip: stdout: Broken pipe
    # Command failed:
    #   ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh
    # broke down during netSynteny.csh due to too many open files on a chainSplit.
    # However, there is no need to split when we have scaffolds.
    # Kate fixes doBlastzChainNet.pl and retry:
    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 &
    # real    24m37.561s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate.
    # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary.
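    # A general note on the "Too many open files" failures seen in these swap runs
    # (the actual fix here was Kate's doBlastzChainNet.pl change, which skips the
    # split for scaffold assemblies): chainSplit opens one output file per target
    # sequence, so a 3144-scaffold target can exceed the per-process file
    # descriptor limit.  Raising that limit before netSynteny.csh is another way
    # around it, if the hard limit allows:
    #   limit descriptors 4096      # csh
    #   ulimit -n 4096              # sh/bash equivalent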
    cat fb.cavPor3.chainMm9Link.txt
    # 781173609 bases of 2663369733 (29.330%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET galGal3-cavPor3 (START - 2008-04-10; DONE: 2008-04-15 - Tim)
    ssh kkstore03   # store6->kkstore03-10
    screen          # use a screen to manage this multi-day job
    mkdir /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/galGal3/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/cluster/bluearc/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy
    # NOTE: be sure to ls the data in above script on workhorse machine before starting script

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -syntenicNet > do.log 2>&1
    # ps -ef | grep blastzCavPor3
    # real    1507m5.132s
    tail do.log
    # *** All done!
    # *** Make sure that goldenPath/galGal3/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
    cat fb.galGal3.chainCavPor3Link.txt
    # 106239838 bases of 1042591351 (10.190%) in intersection

    cd /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.galGal3.swap
    cd /cluster/data/cavPor3/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10/DEF \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -swap > do.log 2>&1
    # real    23m19.970s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsGalGal3/README.txt is accurate.
    # *** Add {chain,net}GalGal3 tracks to trackDb.ra if necessary.
    cat fb.cavPor3.chainGalGal3Link.txt
    # 144795360 bases of 2663369733 (5.437%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET cavPor3-oryCun1 (RESTART - 2008-04-11 - DONE - 2008-04-22 Tim)
    ssh kkstore05   # store12->kkstore05
    screen          # use a screen to manage this multi-day job
    mkdir /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    cd /cluster/data/cavPor3/bed/
    ln -s blastzOryCun1.2008-04-11 blastz.oryCun1
    cd blastzOryCun1.2008-04-11

    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: GuineaPig cavPor3
SEQ1_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Rabbit oryCun1
SEQ2_DIR=/cluster/bluearc/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/oryCun1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy
    # NOTE: be sure to ls the data in above script on workhorse machine before starting script

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -stop=partition > do.log 2>&1 &
    ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.blastz/oryCun1.lst`
    ## to find out the batch size based upon DEF SEQn_CHUNK and SEQn_LIMIT, then:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -continue=blastz > do.log 2>&1 &
    # ps -ef | grep blastzCavPor3
    # real    2818m19.825s
    tail do.log
    # Loading 22976113 chains into cavPor3.chainOryCun1
    # Can't start query:
    #   load data local infile 'link.tab' into table chainOryCun1Link
    # mySQL error 1114: The table 'chainOryCun1Link' is full
    # Command failed:
    #   ssh -x hgwdev nice /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain/loadUp.csh

    # mysql> select count(*) from chainOryCun1Link;  -> 119,306,828
    # (for comparison, mm9-oryCun1 has ~40m rows spread out among ~20 per-chrom tables)
    # featureBits cavPor3 chainOryCun1Link
    # 439282179 bases of 2663369733 (16.493%) in intersection

    # Options:
    # 1) combine small scaffolds into a scaffoldUn.  Would have to restart all blastzs;
    #    awkward since there is no clear dividing line between scaffold/superscaffold.
    # 2) Do not keep the raw chains available at all.  Just have reciprocal best.
    # 3) Go back to the idea of greater masking.
## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=40000000 chrom.sizes
##        0 ************************************************************ 3126
## 40000000 16
## 80000000 2
## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=10000000 chrom.sizes
##        0 ************************************************************ 3079
## 10000000 * 26
## 20000000 14
## 30000000 7
## 40000000 5
## 50000000 4
## 60000000 5
## 70000000 2
## 80000000 2

##################### Hiram recommends loading all 210 mil rows anyway, with a special
##################### table create to override the row limit
###################################################################
    # this failed during the load because the chainLink table became
    # too large.  These were loaded manually with an sql statement to
    # start the table definition, then a load data local infile using
    # the .tab files left over from the failed load.
    # Note the extra definitions on the chainOryCun1Link table:
    time nice -n +19 hgsql -e \
        "DROP TABLE chainOryCun1Link;" cavPor3 &

    CREATE TABLE chainOryCun1Link (
        bin smallint(5) unsigned NOT NULL default 0,
        tName varchar(255) NOT NULL default '',
        tStart int(10) unsigned NOT NULL default 0,
        tEnd int(10) unsigned NOT NULL default 0,
        qStart int(10) unsigned NOT NULL default 0,
        chainId int(10) unsigned NOT NULL default 0,
        KEY tName (tName(16),bin),
        KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=220000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;

    time nice -n +19 hgsql -e \
        "load data local infile \"link.tab\" into table chainOryCun1Link;" cavPor3
    # this one took a number of hours
    # real    272m44.943s

    # finish the nets and load
    # Add gap/repeat stats to the net file using database tables:
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain
    netClass -verbose=0 -noAr noClass.net cavPor3 oryCun1 cavPor3.oryCun1.net

    # Load nets:
    time nice -n +19 netFilter -minGap=10 cavPor3.oryCun1.net \
        | hgLoadNet -verbose=0 cavPor3 netOryCun1 stdin > netFilter.out 2>&1 &
    # real    6m27.050s

    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    time nice -n +19 featureBits cavPor3 chainOryCun1Link >&fb.cavPor3.chainOryCun1Link.txt &
    # real    61m59.509s
    cat fb.cavPor3.chainOryCun1Link.txt
    # 752079320 bases of 2663369733 (28.238%) in intersection

###################################################################
### doRecipBest.pl -- look in mm9.txt with oryCun1
    # create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
        > rbest.log 2>&1 &
    # failed due to experiments dir being in the way
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain
    mv experiments old.experiments
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
        > rbestResume.log 2>&1 &
    # hung for some unknown reason.  Starting again:
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
        >> rbestAgain.log 2>&1 &
    # real    37m33.221s
    # HgStepManager: executing step 'download' Fri Apr 18 10:33:31 2008.
    # download: output of previous step recipBest,
    #   /usr/local/apache/htdocs/goldenPath/cavPor3/vsOryCun1 ,
    #   is required but does not appear to exist.
    # If it actually does exist, then this error is probably due to network/filesystem
    #   delays -- wait a minute and restart with -continue download.
    # If it really doesn't exist, either fix things manually or try -continue recipBest
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
        -continue=recipBest >> rbestAgain.log 2>&1 &

    # Okay, the part that I am missing is that I didn't finish doBlastz with downloads!
    ssh kkstore05
    screen
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -continue=download > doResumeAtDownload.log 2>&1 &
    # real    3m21.480s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsOryCun1/README.txt is accurate.
    # *** Add {chain,net}OryCun1 tracks to trackDb.ra if necessary.

    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
        -continue=download >> rbestAgain.log 2>&1 &
    # real    0m0.418s
    # *** All done!
    # *** Steps were performed in /cluster/data/cavPor3/bed/blastz.oryCun1
    # real    611m17.901s
    cat fb.cavPor3.chainOryCun1Link.txt
    # 752079320 bases of 2663369733 (28.238%) in intersection

######### Change locations in DEF due to Hiram's new methods
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: GuineaPig cavPor3
SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ1_LEN=/scratch/data/cavPor3/chrom.sizes
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Rabbit oryCun1
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    screen
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap > do.log 2>&1
    # real    492m11.337s
    # load data local infile 'link.tab' into table chainCavPor3Link
    # mySQL error 1114: The table 'chainCavPor3Link' is full
    # Command failed:
    #   ssh -x hgwdev nice /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUp.csh
    # wc -l link.tab -> 210745564

###################################################################
    # Manual load again
    ssh hgwdev
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain
    hgsql oryCun1

    DROP TABLE chainCavPor3Link;
    CREATE TABLE chainCavPor3Link (
        bin smallint(5) unsigned NOT NULL default 0,
        tName varchar(255) NOT NULL default '',
        tStart int(10) unsigned NOT NULL default 0,
        tEnd int(10) unsigned NOT NULL default 0,
        qStart int(10) unsigned NOT NULL default 0,
        chainId int(10) unsigned NOT NULL default 0,
        KEY tName (tName(16),bin),
        KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=212000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;
    # exit

    screen
    time nice -n +19 hgsql -e \
        "load data local infile \"link.tab\" into table chainCavPor3Link;" oryCun1
    # real    360m19.577s
    # mysql> select count(*) from chainCavPor3Link;  | 210745564 |

    # finish the nets and load
    # Still on hgwdev because of sql
    screen
    time nice -n +19 /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUpResume.csh \
        >> doFinishLoad.log 2>&1
    # real    991m52.532s
    # cat fb.oryCun1.chainCavPor3Link.txt
    # 729628282 bases of 2076044328 (35.145%) in intersection

    # Now continue -swap at download
    ssh kkstore04   # oryCun1 -> store09 -> kkstore04
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    screen
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=download >> doResumeAtDownload.log 2>&1
    # real    2m2.757s
    # *** All done!
    # *** Make sure that goldenPath/oryCun1/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
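    # A hedged verification step for these oversized chainLink loads (standard MySQL,
    # not in the original log): confirm the row count and that max_rows actually
    # raised the table's capacity:
    hgsql -e 'SHOW TABLE STATUS LIKE "chainCavPor3Link"\G' oryCun1 \
        | egrep 'Rows|Max_data_length'
    hgsql -e 'SELECT COUNT(*) FROM chainCavPor3Link' oryCun1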
#########################################################################
# BLASTZ/CHAIN/NET rn4-cavPor3 (STARTED - 2008-04-14; DONE 04-15-2008 - Tim)
    ssh kkstore06   # rat on store3->kkstore06
    screen          # use a screen to manage this multi-day job
    mkdir /cluster/data/rn4/bed/blastzCavPor3.2008-04-14
    cd /cluster/data/rn4/bed/
    ln -s blastzCavPor3.2008-04-14 blastz.cavPor3
    cd blastzCavPor3.2008-04-14

    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/cluster/bluearc/scratch/data/rn4/nib
SEQ1_LEN=/cluster/bluearc/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy
    # NOTE: be sure to ls the data in above script on workhorse machine before starting script

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -syntenicNet > do.log 2>&1 &
    # ps -ef | grep blastzCavPor3
    # real    611m17.901s
    cat fb.rn4.chainCavPor3Link.txt
    # 716379861 bases of 2571531505 (27.858%) in intersection

######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.rn4.swap
    cd /cluster/data/cavPor3/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -syntenicNet > do.log 2>&1
    # real    103m34.523s
    # broke down during netSynteny.csh due to too many open files on a chainSplit.
    # However, there is no need to split when we have scaffolds.
    # Kate fixes doBlastzChainNet.pl and retry:
    cd /cluster/data/cavPor3/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \
        -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=syntenicNet -syntenicNet > doSyn.log 2>&1 &
    # real    14m49.396s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsRn4/README.txt is accurate.
    # *** Add {chain,net}Rn4 tracks to trackDb.ra if necessary.
    cat fb.cavPor3.chainRn4Link.txt
    # 735147548 bases of 2663369733 (27.602%) in intersection

###########################################################################
# BLASTZ/CHAIN/NET monDom4-cavPor3 (START - 2008-04-11 DONE 2008-04-16 - Tim)
    ssh kkstore04   # monDom on store9 -> kkstore04
    screen          # use screen to control this job
    mkdir /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11
    cd /cluster/data/monDom4/bed/
    ln -s blastzCavPor3.2008-04-11 blastz.cavPor3
    cd blastzCavPor3.2008-04-11

    cat << \_EOF_ > DEF
# opossum vs guineaPig
# Use "mammal-fish" params even though this is mammal-mammal...
# pretty distant, hopefully not too many shared undermasked repeats.
#BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -stop=partition \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.balstz/monDom4.lst` ## to find out size based upon DEF SEQn_CHUNK and SEQn_LIMIT then: time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -continue=blastz \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & # ps -ef | grep blastzCavPor3 # real 1715m0.868s # tail do.log # updated job database on disk # Batch failed after 4 tries on /cluster/bin/scripts/blastz-run-ucsc -outFormat psl /san/sanvol1/scratch/monDom4/nib/chr5.nib:chr5:20000000-30010000 qParts/part042.lst ../DEF ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl # Command failed: # ssh -x pk nice /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/run.blastz/doClusterRun.csh para-eta time # Completed: 65513 of 65514 jobs # Crashed: 1 jobs # CPU time in finished jobs: 9807564s 163459.40m 2724.32h 113.51d 0.311 y # IO & Wait Time: 373385s 6223.08m 103.72h 4.32d 0.012 y # Average job time: 155s 2.59m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2413s 40.22m 0.67h 0.03d # Submission to last job: 103236s 1720.60m 28.68h 1.19d #### One for automation ???????????: para problems ## start time: Mon Apr 14 14:01:19 2008 ## return: 9 ## ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl does not exist ## stderr: ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl already exists ## ls ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ## ls: ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl: No such file or directory ## ls ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl ########### Case sensitive path name!!! "Psl" vs "psl" ssh kkstore04 screen # use screen to control this job cd /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF # opossum vs guineaPigs # Use "mammal-fish" params even though this is mammal-mammal... # pretty distant, hopefully not too many shared undermasked repeats. 
#BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Opossum monDom4 SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib #SEQ1_DIR=/scratch/data/monDom4/monDom.2bit SEQ1_LEN=/scratch/data/monDom4/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11 TMPDIR=/scratch/tmp _EOF_ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -continue=cat \ -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & # ps -ef | grep blastzCavPor3 # real 232m24.348s # Done time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 -verbose=2 -continue=cat - cat fb.monDom4.chainCavPor3Link.txt # 334067222 bases of 3501643220 (9.540%) in intersection mkdir /cluster/data/cavPor3/bed/blastz.monDom4.swap cd /cluster/data/cavPor3/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 # real 238m31.843s # Loading 18041915 chains into cavPor3.chainMonDom4 # Can't start query: # load data local infile 'link.tab' into table chainMonDom4Link # mySQL error 1114: The table 'chainMonDom4Link' is full # select count(*) from chainMonDom4Link; => 119,311,098 # wc -l link.tab => 202,324,406 link.tab ################################################################### # this failed during the load because the chainLink table became # too large. These were loaded manually with an sql statement to # start the table definition, then a load data local infile using # the .tab files left over from the failed load. Note the extra # definitions on the chainMonDom4Link table time nice -n +19 hgsql -e \ "DROP TABLE chainMonDom4Link;" cavPor3 & CREATE TABLE chainMonDom4Link ( bin smallint(5) unsigned NOT NULL default 0, tName varchar(255) NOT NULL default '', tStart int(10) unsigned NOT NULL default 0, tEnd int(10) unsigned NOT NULL default 0, qStart int(10) unsigned NOT NULL default 0, chainId int(10) unsigned NOT NULL default 0, KEY tName (tName(16),bin), KEY chainId (chainId) ) ENGINE=MyISAM max_rows=210000000 avg_row_length=55 pack_keys=1 CHARSET=latin1; time nice -n +19 hgsql -e \ "load data local infile \"link.tab\" into table chainMonDom4Link;" cavPor3 # this one took a number of hours # real 272m44.943s # finish the nets and load # Add gap/repeat stats to the net file using database tables: cd /cluster/data/cavPor3/bed/blastz.monDom4.swap/axtChain # copied loadUp.csh to loadUpResume.csh and commented out lines already done time nice -n +19 ./loadUpResume.csh > loadUpResume.out 2>&1 #real 30m47.809s # real 106m17.385s cat fb.cavPor3.chainMonDom4Link.txt # 382771802 bases of 2663369733 (14.372%) in intersection ssh kkstore05 screen cd /cluster/data/cavPor3/bed/blastz.monDom4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -chainLinearGap=loose -bigClusterHub=pk -continue=download > doDowload.log 2>&1 # real 0m36.604s # *** All done! # *** Make sure that goldenPath/cavPor3/vsMonDom4/README.txt is accurate. # *** Add {chain,net}MonDom4 tracks to trackDb.ra if necessary. 
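# NOTE: before resuming with -continue=download it is worth confirming that
# the manual chainMonDom4Link load really picked up every row.  A small
# sanity check (a sketch only, not part of the recorded build):
#   wc -l link.tab
#   hgsql -N -e "select count(*) from chainMonDom4Link;" cavPor3
# The two counts should agree (202,324,406 here); only then trust the
# loadUpResume.csh results and the download step.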
############################################################################ # FINAL BLASTZS STEPS: # Be sure to add chainCavPor3.html,netCavPor3.html to makeDb/trackDb and update makeDb/trackDb/trackDb.ra # Any orgs with non-standard priority update {org}/priority.ra # Any orgs with differnent BLASTZ_Q=HoxD55.q will need chain{DB}.html and trackDb.ra at assembly level #Modified (possibly merged) files: # trackDb/trackDb.ra #Files/directories not checked in to CVS that look like source: # doc/cavPor3.txt # trackDb/chainOryCun1.html # trackDb/netOryCun1.html ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-04-17 - Tim) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("cavPor3", "blat7", "17780", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("cavPor3", "blat7", "17781", "0", "1");' \ hgcentraltest # test it with some sequence # Can't open /gbdb/cavPor3/choHof1.2bit to read: No such file or directory ## Doh! I entered in the wrong blat server the first time! ############################################################################# # Downloads (DONE - 2008-04-23 - Tim) # Let's see if the downloads will work ssh hgwdev /cluster/data/cavPor3 # expecting to find repeat masker .out file here: ln -s bed/repeatMasker/cavPor3.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1 # failed due to link to RepeatMasker instead of repeatMasker. Fixed and restarted. # *** All done! # *** Please take a look at the downloads for cavPor3 using a web browser. # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/cavPor3/goldenPath/database/README.txt # /cluster/data/cavPor3/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). # the downloads are located at: http://hgwdev.cse.ucsc.edu/goldenPath/cavPor3/ mv goldenPath old.goldenPath mv jkStuff/downloads.log jkStuff/old.downloads.log time nice -n +19 /cluster/home/tdreszer/kent/src/hg/utils/automation/makeDownloads.pl \ -workhorse=hgwdev cavPor3 >> jkStuff/downloads.log 2>&1 # real 24m23.427s # *** Please take a look at the downloads for cavPor3 using a web browser. # *** The downloads directory is: /usr/local/apache/htdocs/goldenPath/cavPor3. # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/cavPor3/goldenPath/database/README.txt # /cluster/data/cavPor3/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help). ############################################################################# # PushQ entries (DONE - 2008-04-24 - Tim) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToCavPor* ### is it worth it? NO: cavPor2 was never released. 
# cavPor3 does not have seq # cavPor3 does not have extFile ### Not needed because output is from multiz, visiGene, affyProbes, cloneend, stsMarker, nibbImageProbes ### Kate suggests updating makePushQsql to require gbSeq and gbExtFile instead, but it already does. # # *** All done! # *** Please edit the output to ensure correctness before using. # *** 1. Resolve any warnings output by this script. # *** 2. Remove any entries which should not be pushed. ### All accounted for but: genscanSubopt which is not in pushQ.sql but is in # *** 3. Add tables associated with the main track table (e.g. *Pep tables # for gene prediction tracks). # *** 4. Add files associated with tracks. First, look at the results # of this query: # hgsql cavPor3 -e 'select distinct(path) from extFile' ### Doesn't exist # Then, look at file(s) named in each of the following wiggle tables: # hgsql cavPor3 -e 'select distinct(file) from gc5Base' ### | /gbdb/cavPor3/wib/gc5Base.wib | # hgsql cavPor3 -e 'select distinct(file) from quality' ### | /gbdb/cavPor3/wib/qual.wib | # Files go in the second field after tables (it's tables, cgis, files). # *** 5. This script currently does not recognize composite tracks. If cavPor3 # has any composite tracks, you should manually merge the separate # per-table entries into one entry. ### Doen't apply to cavPor3 at this time. # *** 6. Just before executing the sql, note the ID of the most recent entry # in the Main Push Queue. If the ID (first column) of the last # INSERT statement is not 1 greater than the most recent entry's, # make it so to avoid an ID clash with an existing entry. ### Updated to 4283 # *** When everything looks complete and correct, use hgsql on the qapushq # machine (currently hgwbeta) to execute the sql file. (Make sure that # qapushq does not already have a table named cavPor3.) Then use the Push # Queue web interface to check the contents of all entries. ssh hgwbeta cd /cluster/data/cavPor3/jkStuff hgsql qapushq < pushQ.sql ### All is there # *** If you haven't already, please add cavPor3 to makeDb/schema/all.joiner ! # It should be in both $gbd and $chainDest. ### Already there # *** When cavPor3 is on the RR (congrats!), please doBlastz -swap if you haven't # already. 
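# A quick cross-check for tables present in cavPor3 but absent from the
# generated sql (this is how a stray table like genscanSubopt above gets
# noticed).  A sketch only, run from /cluster/data/cavPor3 so that
# jkStuff/pushQ.sql resolves; nothing beyond hgsql and grep is assumed:
#   for T in `hgsql -N -e "show tables;" cavPor3`
#   do
#     grep -q -w "${T}" jkStuff/pushQ.sql || echo "${T} missing from pushQ.sql"
#   done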
########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-09) ssh kkstore05 # bash if not using bash shell already mkdir /cluster/data/cavPor3/blastDb cd /cluster/data/cavPor3 awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst cavPor3.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst cavPor3.unmasked.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3401 mkdir -p /san/sanvol1/scratch/cavPor3/blastDb cd /cluster/data/cavPor3/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/cavPor3/blastDb done mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG cd /cluster/data/cavPor3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3401 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3401) = 356.881506 mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa split -l 357 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/cavPor3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
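# The split arithmetic above works out to the batch size seen in para time
# below: 36727 hg18KG psl lines split 357 per chunk gives
# ceil(36727/357) = 103 kg?? query files, and gensub2 pairs each of those
# with all 3401 blastDb pieces, so 103 * 3401 = 350,303 cluster jobs --
# right at the "around 350000 jobs" target.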
para time # Completed: 350303 of 350303 jobs # CPU time in finished jobs: 25825855s 430430.91m 7173.85h 298.91d 0.819 y # IO & Wait Time: 2396715s 39945.25m 665.75h 27.74d 0.076 y # Average job time: 81s 1.34m 0.02h 0.00d # Longest finished job: 339s 5.65m 0.09h 0.00d # Submission to last job: 80595s 1343.25m 22.39h 0.93d ssh kkstore05 cd /cluster/data/cavPor3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 99 of 103 jobs # Crashed: 4 jobs # CPU time in finished jobs: 15940s 265.67m 4.43h 0.18d 0.001 y # IO & Wait Time: 397633s 6627.21m 110.45h 4.60d 0.013 y # Average job time: 4178s 69.63m 1.16h 0.05d # Longest finished job: 5502s 91.70m 1.53h 0.06d # Submission to last job: 7032s 117.20m 1.95h 0.08d # ran 4 crashed jobs on memk. Completed: 4 of 4 jobs CPU time in finished jobs: 1673s 27.88m 0.46h 0.02d 0.000 y IO & Wait Time: 529s 8.81m 0.15h 0.01d 0.000 y Average job time: 550s 9.17m 0.15h 0.01d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 712s 11.87m 0.20h 0.01d Submission to last job: 1299s 21.65m 0.36h 0.02d ssh kkstore05 cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/cavPor3/bed/tblastn.hg18KG hgLoadPsl cavPor3 blastHg18KG.psl # check coverage featureBits cavPor3 blastHg18KG # 35569442 bases of 2663369733 (1.336%) in intersection featureBits cavPor3 all_mrna blastHg18KG -enrichment # all_mrna 0.023%, blastHg18KG 1.336%, both 0.015%, cover 66.85%, enrich 50.05x ssh kkstore05 rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut #end tblastn ############################### Make editor happy: _EOF_ ######################################################################### ## 7-Way Multiz (STARTED - 2008-04-25 - Tim) ## # From Jim: A minimal set would be human/mouse/rabbit/chicken. # Add rat and possum if you feel like turning the crank a little more. # the all.chain.gz files were split up via kluster jobs on memk # in order to get mafSynNet files. 
Example above in ornAna1 blastz ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz7way cd /cluster/data/cavPor3/bed/multiz7way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get cavPor3 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 7 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ | sed -e "s/cavPor2/cavPor3/g" > 7-way.fullNames.nh # looks something like this: # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587 # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); # rearrange to get guineaPig at the top: # this leaves us with: cat << _EOF_ > cavPor3.7-way.nh (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs # verify all blastz's exists cat << \_EOF_ > listMafs.csh #!/bin/csh -fe cd /cluster/data/cavPor3/bed/multiz7way foreach db (`grep -v cavPor3 species.list`) set bdir = /cluster/data/cavPor3/bed/blastz.$db if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end _EOF_ # << happy emacs chmod +x ./listMafs.csh # see what it says: ./listMafs.csh # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet /cluster/bin/phast/all_dists cavPor3.7-way.nh > 7way.distances.txt # ERROR: Can't parse distance in tree ("0.014587 #"). 
#[hgwdev:tdreszer multiz7way> m cavPor3.7-way.nh
# (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587
# ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);
# Problem is that the tree file must not contain an embedded newline ('\n'),
# so re-echo the tree onto a single line and retry:
m 7way.distances.txt
# GuineaPig_cavPor3    Mouse_mm9          0.479871
# GuineaPig_cavPor3    Rat_rn4            0.487980
# GuineaPig_cavPor3    Rabbit_oryCun1     0.445888
# GuineaPig_cavPor3    Human_hg18         0.378828
# GuineaPig_cavPor3    Opossum_monDom4    0.835961
# GuineaPig_cavPor3    Chicken_galGal3    1.211508
# Mouse_mm9            Rat_rn4            0.160657
# Mouse_mm9            Rabbit_oryCun1     0.519779
# Mouse_mm9            Human_hg18         0.452719
# Mouse_mm9            Opossum_monDom4    0.909852
# Mouse_mm9            Chicken_galGal3    1.285399
# Rat_rn4              Rabbit_oryCun1     0.527888
# Rat_rn4              Human_hg18         0.460828
# Rat_rn4              Opossum_monDom4    0.917961
# Rat_rn4              Chicken_galGal3    1.293508
# Rabbit_oryCun1       Human_hg18         0.350036
# Rabbit_oryCun1       Opossum_monDom4    0.807169
# Rabbit_oryCun1       Chicken_galGal3    1.182716
# Human_hg18           Opossum_monDom4    0.710935
# Human_hg18           Chicken_galGal3    1.086482
# Opossum_monDom4      Chicken_galGal3    1.016989
# (total) - 2.228942
grep -i cavPor 7way.distances.txt | sort -k3,3n
# GuineaPig_cavPor3    Human_hg18         0.378828
# GuineaPig_cavPor3    Rabbit_oryCun1     0.445888
# GuineaPig_cavPor3    Mouse_mm9          0.479871
# GuineaPig_cavPor3    Rat_rn4            0.487980
# GuineaPig_cavPor3    Opossum_monDom4    0.835961
# GuineaPig_cavPor3    Chicken_galGal3    1.211508
# Note that guinea pig is closer to human than to the other rodents.
# This may be reasonable, since the speed of evolution (length of tree limbs)
# in rodents is greater than for primates.  For instance, mm9 is closer to
# hg18 than it is to either cavPor2 or oryCun1.
# Use the calculated distances in the table below to order the organisms
# and check the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
# # featureBits chainLink measures # chainCavPor3Link chain linearGap # distance on cavPor3 on other minScore # 0.479871 Mouse_mm9 (28.900%) (29.330%) 3000 medium # 0.487980 Rat_rn4 (27.858%) (27.602%) 3000 medium # 0.445888 Rabbit_oryCun1 (35.145%) (28.238%) 3000 medium # 0.378828 Human_hg18 (43.971%) (48.132%) 3000 medium # 0.835961 Opossum_monDom4 (9.540%) (14.372%) 5000 loose # 1.211508 Chicken_galGal3 (10.190%) (5.437%) 5000 loose ### ### ### ### Be sure to use calJac as an example because of scaffolds ### ### ### ### will require making maf files by scaffold name # create a coherent set of all the mafs involved in this run mkdir mafLinks cd mafLinks ln -s ../../blastz.hg18/mafNet ./hg18 ln -s ../../blastz.mm9/mafNet ./mm9 ln -s ../../blastz.rn4/mafNet ./rn4 ln -s ../../blastz.monDom4/mafNet ./monDom4 ln -s ../../blastz.galGal3/mafNet ./galGal3 ln -s ../../blastz.oryCun1/mafRBestNet ./oryCun1 # check data size: du -hscL * # 100M galGal3 # 930M hg18 # 577M mm9 # 268M monDom4 # 495M oryCun1 # 543M rn4 # 2.9G total # need to split these things up by Contig number for efficient kluster run ssh kkstore06 mkdir -p /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf cd /scratch/tmp # the 16201 is from petMar/chrom.sizes echo "chrM 0 16801" > chrM.bed for D in `grep -v cavPor3 /cluster/data/cavPor3/bed/multiz7way/species.list` do echo ${D} zcat \ /cluster/data/cavPor3/bed/multiz7way/mafLinks/cavPor3.${D}.*.maf.gz \ > ${D}.maf mkdir /scratch/tmp/${D} cd /scratch/tmp/${D} mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \ ../${D}.maf -outDirDepth=2 mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf rsync -a --progress ./ \ /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf/${D} cd /scratch/tmp rm -fr ${D} ${D}.maf done # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used in any # alignment: ssh kkstore05 # cavPor3 -> store12 -> kkstore05 cd /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf for D in * do cd "${D}" find . -type f cd .. 
done | sort -u > /tmp/7-way.contig.list wc -l /tmp/7-way.contig.list mkdir /cluster/data/cavPor3/bed/multiz7way/splitRun cp -p /tmp/7-way.contig.list \ /cluster/data/cavPor3/bed/multiz7way/splitRun # 296 /tmp/7-way.contig.list # ready for the multiz run ssh pk cd /cluster/data/cavPor3/bed/multiz7way/splitRun mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << \_EOF_ #!/bin/csh -ef set db = cavPor3 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz7way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.7.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.7.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db _EOF_ # << happy emacs chmod +x autoMultiz.csh cat << \_EOF_ > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz7way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP _EOF_ # << emacs sed -e "s/^\.\///" ../7-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... # Checking finished jobs # Completed: 10 of 296 jobs # CPU time in finished jobs: 4355s 72.59m 1.21h 0.05d 0.000 y # IO & Wait Time: 104s 1.73m 0.03h 0.00d 0.000 y # Average job time: 446s 7.43m 0.12h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 1681s 28.02m 0.47h 0.02d # Estimated complete: 0s 0.00m 0.00h 0.00d # [pk:tdreszer run> pc # 296 jobs in batch # 92505 jobs (including everybody's) in Parasol queue. # Checking finished jobs # unsubmitted jobs: 286 # ranOk: 10 # total jobs in batch: 296 para push ... check ... etc # Completed: 296 of 296 jobs # CPU time in finished jobs: 53659s 894.32m 14.91h 0.62d 0.002 y # IO & Wait Time: 1675s 27.92m 0.47h 0.02d 0.000 y # Average job time: 187s 3.12m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1969s 32.82m 0.55h 0.02d # Submission to last job: 4023s 67.05m 1.12h 0.05d # Estimated complete: 0s 0.00m 0.00h 0.00d # [pk:tdreszer run> pc # 296 jobs in batch # 84971 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # ranOk: 296 # total jobs in batch: 296 # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz7way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \ | sort -u > togetherMaf/cavPor3.7way.maf ##maf version=1 scoring=autoMZ.v1 ##maf version=1 scoring=maf_project.v12 ##maf version=1 scoring=multiz for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/cavPor3.7way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/cavPor3.7way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \ | sort -u >> togetherMaf/cavPor3.7way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/cavPor3/multiz7way/maf ln -s /cluster/data/cavPor3/bed/multiz7way/togetherMaf/*.maf \ /gbdb/cavPor3/multiz7way/maf/multiz7way.maf # this generates an immense multiz7way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/cavPor3/multiz7way/maf cavPor3 multiz7way # Advisory lock created # Indexing and tabulating /gbdb/cavPor3/multiz7way/maf/multiz7way.maf # Loading multiz7way into database # Loaded 6939217 mafs in 1 files from /gbdb/cavPor3/multiz7way/maf # Advisory lock has been released # # real 7m7.421s # user 1m50.625s # sys 0m34.191s ### ### ################################################################### ### ### #### Abandoning 7-Way because I am adding Squirrel 2008-05-27 ##### ### ### ################################################################### ######################################################################### ## 8-Way Multiz (STARTED - 2008-05-27 - Tim) ## 8-Way Multiz (RESTARTED - 2008-07-02 - Tim Done: 7-11-2008) ## # the all.chain.gz files were split up via kluster jobs on memk # in order to get mafSynNet files. 
Example above in ornAna1 blastz ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz8way cd /cluster/data/cavPor3/bed/multiz8way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get cavPor3 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram ### Note the ground squirrel (Spermophilus tridecemlineatus) is not in the mm9 30-way ### Add it a same distance from other rodents as rabbit # select the 7 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ | sed -e "s/cavPor2/cavPor3/g" > 7-way.fullNames.nh # looks something like this: # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587 # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); # rearrange to get guineaPig at the top: # this leaves us with: cat << _EOF_ > cavPor3.7-way.nh (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs # Add squirrel at same distance to guineaPig and Mouse as rabbits: # Alternative tree: ((((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.017175,Squirrel_speTri0:0.208548):0.017175,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); cat << _EOF_ > cavPor3.8-way.nh (((((GuineaPig_cavPor3:0.202990,((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.18602,Squirrel_speTri0:0.208548):0.014587):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824); _EOF_ # << happy emacs cd /cluster/data/cavPor3/bed/multiz8way cp ../multiz7way/species.list . 
echo speTri0 >> species.list # verify all blastz's exists cat << \_EOF_ > listMafs.csh #!/bin/csh -fe cd /cluster/data/cavPor3/bed/multiz8way foreach db (`grep -v cavPor3 species.list`) set bdir = /cluster/data/cavPor3/bed/blastz.$db if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end _EOF_ # << happy emacs chmod +x ./listMafs.csh # see what it says: ./listMafs.csh # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet # speTri0 mafs not found ### After some back and forth on my own, Kate finally got the speTri0 Mafs together: # galGal3 mafNet # hg18 mafNet # mm9 mafNet # monDom4 mafNet # oryCun1 mafRBestNet # rn4 mafNet # speTri0 mafRBestNet /cluster/bin/phast/all_dists cavPor3.8-way.nh > 8way.distances.txt # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Squirrel_speTri0 0.426125 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Mouse_mm9 Rat_rn4 0.160657 # Mouse_mm9 Squirrel_speTri0 0.470842 # Mouse_mm9 Rabbit_oryCun1 0.519779 # Mouse_mm9 Human_hg18 0.452719 # Mouse_mm9 Opossum_monDom4 0.909852 # Mouse_mm9 Chicken_galGal3 1.285399 # Rat_rn4 Squirrel_speTri0 0.478951 # Rat_rn4 Rabbit_oryCun1 0.527888 # Rat_rn4 Human_hg18 0.460828 # Rat_rn4 Opossum_monDom4 0.917961 # Rat_rn4 Chicken_galGal3 1.293508 # Squirrel_speTri0 Rabbit_oryCun1 0.466033 # Squirrel_speTri0 Human_hg18 0.398973 # Squirrel_speTri0 Opossum_monDom4 0.856106 # Squirrel_speTri0 Chicken_galGal3 1.231653 # Rabbit_oryCun1 Human_hg18 0.350036 # Rabbit_oryCun1 Opossum_monDom4 0.807169 # Rabbit_oryCun1 Chicken_galGal3 1.182716 # Human_hg18 Opossum_monDom4 0.710935 # Human_hg18 Chicken_galGal3 1.086482 # Opossum_monDom4 Chicken_galGal3 1.016989 # (total) - 2.437490 grep -i cavPor 8way.distances.txt | sort -k3,3n # GuineaPig_cavPor3 Human_hg18 0.378828 # GuineaPig_cavPor3 Squirrel_speTri0 0.426125 # GuineaPig_cavPor3 Rabbit_oryCun1 0.445888 # GuineaPig_cavPor3 Mouse_mm9 0.479871 # GuineaPig_cavPor3 Rat_rn4 0.487980 # GuineaPig_cavPor3 Opossum_monDom4 0.835961 # GuineaPig_cavPor3 Chicken_galGal3 1.211508 # Note that guinneaPig is closer to human than to other rodents. # This may be reasonable, since the speed of evolution (length of tree limbs) # in rodents is greater than for primates. For instance mm9 is closer to hg18 # than it is to either cavPor2 or oryCun1 ## ??? time nice -n +19 featureBits cavPor3 axtChain/cavPor3.speTri0.rbest.chain.psl >& fb.cavPor3.speTri0.rbest.chain.psl.txt & # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. 
#
#                             featureBits chainLink measures
#                                  chainCavPor3Link   chain   linearGap
#    distance                    on cavPor3  on other   minScore
#    0.479871 Mouse_mm9          (28.900%)  (29.330%)     3000     medium
#    0.487980 Rat_rn4            (27.858%)  (27.602%)     3000     medium
#    0.426125 Squirrel_speTri0                            3000     medium
#    0.445888 Rabbit_oryCun1     (35.145%)  (28.238%)     3000     medium
#    0.378828 Human_hg18         (43.971%)  (48.132%)     3000     medium
#    0.835961 Opossum_monDom4     (9.540%)  (14.372%)     5000     loose
#    1.211508 Chicken_galGal3    (10.190%)   (5.437%)     5000     loose
### ### ### ### Be sure to use calJac as an example because of scaffolds
### ### ### ### will require making maf files by scaffold name
# create a coherent set of all the mafs involved in this run
mkdir mafLinks
cd mafLinks
ln -s ../../blastz.hg18/mafNet ./hg18
ln -s ../../blastz.mm9/mafNet ./mm9
ln -s ../../blastz.rn4/mafNet ./rn4
ln -s ../../blastz.monDom4/mafNet ./monDom4
ln -s ../../blastz.galGal3/mafNet ./galGal3
ln -s ../../blastz.oryCun1/mafRBestNet ./oryCun1
ln -s ../../blastz.speTri0/mafRBestNet ./speTri0
# check data size:
du -hscL *
# 100M    galGal3
# 930M    hg18
# 577M    mm9
# 268M    monDom4
# 495M    oryCun1
# 543M    rn4
# 680M    speTri0
# 3.6G    total
# need to split these things up by Contig number for efficient kluster run
ssh kkstore05
mkdir -p /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf
cd /iscratch/tmp
# the 16801 is chrM's size from the cavPor3 chrom.sizes
echo "chrM 0 16801" > chrM.bed
for D in `grep -v cavPor3 /cluster/data/cavPor3/bed/multiz8way/species.list`
do
    echo ${D}
    zcat \
    /cluster/data/cavPor3/bed/multiz8way/mafLinks/${D}/cavPor3.${D}.*.maf.gz \
        > ${D}.maf
    mkdir /iscratch/tmp/${D}
    cd /iscratch/tmp/${D}
    mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \
        ../${D}.maf -outDirDepth=2
    mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf
    rsync -a --progress ./ \
        /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf/${D}
    cd /iscratch/tmp
    rm -fr ${D} ${D}.maf
done
# create a run-time list of contigs to operate on, not all contigs
# exist in all alignments, but we want all contig names used in any
# alignment:
ssh kkstore05	# cavPor3 -> store12 -> kkstore05
cd /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf
for D in *
do
    cd "${D}"
    find . -type f
    cd ..
done | sort -u > /tmp/8-way.contig.list wc -l /tmp/8-way.contig.list mkdir /cluster/data/cavPor3/bed/multiz8way/splitRun cp -p /tmp/8-way.contig.list \ /cluster/data/cavPor3/bed/multiz8way/splitRun # wc -l 8-way.contig.list # 296 8-way.contig.list # ready for the multiz run ssh pk cd /cluster/data/cavPor3/bed/multiz8way/splitRun mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << \_EOF_ #!/bin/csh -ef set db = cavPor3 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz8way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.8.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db _EOF_ # << happy emacs chmod +x autoMultiz.csh cat << \_EOF_ > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz8way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP _EOF_ # << emacs sed -e "s/^\.\///" ../8-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... ## first 20 failed because of tree.8.nh: ### Hand edited tree.8.nh from cavPor3.8-way.nh to: # (((((cavPor3 ((mm9 rn4) speTri0)) oryCun1) hg18) monDom4) galGal3) # Next 10 succeeded # 296 jobs in batch # 15118 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 10 of 296 jobs # Crashed: 20 jobs # CPU time in finished jobs: 5742s 95.69m 1.59h 0.07d 0.000 y # IO & Wait Time: 76s 1.27m 0.02h 0.00d 0.000 y # Average job time: 582s 9.70m 0.16h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1482s 24.70m 0.41h 0.02d # Submission to last job: 2712s 45.20m 0.75h 0.03d # Estimated complete: 0s 0.00m 0.00h 0.00d para push; para check # 296 jobs in batch # 13908 jobs (including everybody's) in Parasol queue. # Checking finished jobs # running: 25 # ranOk: 271 # total jobs in batch: 296 # [pk:tdreszer run> pc # 296 jobs in batch # 10574 jobs (including everybody's) in Parasol queue. # Checking finished jobs # . # ranOk: 296 # total jobs in batch: 296 para -eta time # 296 jobs in batch # 10542 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 296 of 296 jobs # CPU time in finished jobs: 81235s 1353.92m 22.57h 0.94d 0.003 y # IO & Wait Time: 11960s 199.33m 3.32h 0.14d 0.000 y # Average job time: 315s 5.25m 0.09h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3262s 54.37m 0.91h 0.04d # Submission to last job: 59090s 984.83m 16.41h 0.68d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz8way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \ | sort -u > togetherMaf/cavPor3.8way.maf ##maf version=1 scoring=autoMZ.v1 ##maf version=1 scoring=maf_project.v12 ##maf version=1 scoring=multiz for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/cavPor3.8way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/cavPor3.8way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \ | sort -u >> togetherMaf/cavPor3.8way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/cavPor3/multiz8way/maf ln -s /cluster/data/cavPor3/bed/multiz8way/togetherMaf/*.maf \ /gbdb/cavPor3/multiz8way/maf/multiz8way.maf # this generates an immense multiz8way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/cavPor3/multiz8way/maf cavPor3 multiz8way & # Advisory lock created # Indexing and tabulating /gbdb/cavPor3/multiz8way/maf/multiz8way.maf # Loading multiz8way into database # Loaded 8976211 mafs in 1 files from /gbdb/cavPor3/multiz8way/maf # Advisory lock has been released # real 4m26.559s # user 2m35.837s # sys 0m47.660s # load summary table time nice -n +19 cat /gbdb/cavPor3/multiz8way/maf/*.maf \ | hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary stdin # Indexing and tabulating stdin # Created 1650940 summary blocks from 26384188 components and 8747391 mafs from stdin # Loading into cavPor3 table multiz8waySummary... # Loading completeAdvisory lock has been released # real 5m31.133s # user 4m55.729s # sys 0m21.743s # Gap Annotation # prepare bed files with gap info ssh kkstore05 mkdir /cluster/data/cavPor3/bed/multiz8way/anno cd /cluster/data/cavPor3/bed/multiz8way/anno mkdir maf run # these actually already all exist from previous multiple alignments # remove the echo in front of the twoBitInfo to actually make it work for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! 
-f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done # creating cavPor3.N.bed # twoBitInfo -nBed /cluster/data/cavPor3/cavPor3.2bit /cluster/data/cavPor3/cavPor3.N.bed # -rw-rw-r-- 1 2164385 Jul 18 2006 /cluster/data/galGal3/galGal3.N.bed # -rw-rw-r-- 1 232970 Feb 6 2006 /cluster/data/hg18/hg18.N.bed # -rw-rw-r-- 1 27838 Oct 15 2007 /cluster/data/mm9/mm9.N.bed # -rw-rw-r-- 1 1788138 Feb 28 2006 /cluster/data/monDom4/monDom4.N.bed # -rw-rw-r-- 1 13782261 Nov 13 2005 /cluster/data/oryCun1/oryCun1.N.bed # -rw-rw-r-- 1 20910683 Feb 28 2006 /cluster/data/rn4/rn4.N.bed # creating speTri0.N.bed # twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed cd run rm -f nBeds sizes for DB in `grep -v cavPor3 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # galGal3 # hg18 # mm9 # monDom4 # oryCun1 # rn4 # speTri0 ssh memk # temporarily copy the cavPor3.8way.maf file onto the memk # nodes /scratch/data/cavPor3/maf/ directory for R in 0 1 2 3 4 5 6 7 do ssh mkr0u${R} rsync -a --progress \ /cluster/data/cavPor3/bed/multiz8way/togetherMaf/cavPor3.8way.maf \ /scratch/data/cavPor3/maf/ done mkdir /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf # need to split up the single maf file into individual # per-scaffold maf files to run annotation on cd /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf # create bed files to list approximately 394 scaffolds in # a single list, approximately 8 lists cat << \_EOF_ > mkBedLists.pl #!/usr/bin/env perl use strict; use warnings; my $bedCount = 0; my $i = 0; my $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; open (FH,") { chomp $line; if ( (($i + 1) % 394) == 0 ) { printf "%s\n", $line; close (BF); ++$bedCount; $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; } ++$i; my ($chr, $size) = split('\s+',$line); printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr; } close (FH); # close (BH); _EOF_ # << happy emacs chmod +x mkBedLists.pl ./mkBedLists.pl # -rw-rw-r-- 1 tdreszer protein 13805 Jul 10 16:23 file_0.bed # -rw-rw-r-- 1 tdreszer protein 13602 Jul 10 16:23 file_1.bed # -rw-rw-r-- 1 tdreszer protein 13756 Jul 10 16:23 file_2.bed # -rw-rw-r-- 1 tdreszer protein 14166 Jul 10 16:23 file_3.bed # -rw-rw-r-- 1 tdreszer protein 14090 Jul 10 16:23 file_4.bed # -rw-rw-r-- 1 tdreszer protein 13790 Jul 10 16:23 file_5.bed # -rw-rw-r-- 1 tdreszer protein 13790 Jul 10 16:23 file_6.bed # -rw-rw-r-- 1 tdreszer protein 13545 Jul 10 16:23 file_7.bed # now, run a mafsInRegion on each one of those lists cat << \_EOF_ > runOne #!/bin/csh -fe set runDir = "/cluster/data/cavPor3/bed/multiz8way/anno/splitMaf" set resultDir = $1 set bedFile = $resultDir.bed mkdir -p $resultDir mkdir -p /scratch/tmp/cavPor3/$resultDir pushd /scratch/tmp/cavPor3/$resultDir mafsInRegion $runDir/$bedFile -outDir . \ /scratch/data/cavPor3/maf/cavPor3.8way.maf popd rsync -q -a /scratch/tmp/cavPor3/$resultDir/ ./$resultDir/ rm -fr /scratch/tmp/cavPor3/$resultDir rmdir --ignore-fail-on-non-empty /scratch/tmp/cavPor3 _EOF_ # << happy emacs chmod +x runOne cat << \_EOF_ > template #LOOP ./runOne $(root1) #ENDLOOP _EOF_ # << happy emacs ls file*.bed > runList gensub2 runList single template jobList para create jobList para try ... check ... 
push ... etc # 8 jobs in batch # 0 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 8 of 8 jobs # CPU time in finished jobs: 1810s 30.16m 0.50h 0.02d 0.000 y # IO & Wait Time: 1218s 20.30m 0.34h 0.01d 0.000 y # Average job time: 379s 6.31m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 614s 10.23m 0.17h 0.01d # Submission to last job: 614s 10.23m 0.17h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d cd /cluster/data/cavPor3/bed/multiz8way/anno/run cat << \_EOF_ > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /scratch/data/cavPor3/cavPor3.2bit $result _EOF_ # << happy emacs chmod +x doAnno.csh cat << \_EOF_ > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP _EOF_ # << happy emacs find ../splitMaf -type f -name "*.maf" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc. 1849 jobs in batch # 0 jobs (including everybody's) in Parasol queue. # Checking finished jobs # unsubmitted jobs: 1839 # crashed: 10 # total jobs in batch: 1849 # Couldn't open speTri0.bed , No such file or directory # lrwxrwxrwx 1 tdreszer protein 35 Jul 10 14:13 speTri0.bed -> /cluster/data/speTri0/speTri0.N.bed ### /cluster/data/speTri0/speTri0.N.bed Does not exist... twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed & para try; para push... # 1849 jobs in batch # 0 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 1849 of 1849 jobs # CPU time in finished jobs: 6313s 105.22m 1.75h 0.07d 0.000 y # IO & Wait Time: 5708s 95.13m 1.59h 0.07d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 171s 2.85m 0.05h 0.00d # Submission to last job: 651s 10.85m 0.18h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the results back together into a single file ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz8way/anno grep "^##maf version" maf/file_0//scaffold_0.maf \ | sort -u > cavPor3.anno.8way.maf ##maf version=1 scoring=autoMZ.v1 find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> cavPor3.anno.8way.maf echo "##eof maf" >> cavPor3.anno.8way.maf # -rw-rw-r-- 1 tdreszer protein 15080110322 Jul 11 14:14 cavPor3.anno.8way.maf ssh hgwdev cd /cluster/data/cavPor3/bed/multiz8way/anno mkdir -p /gbdb/cavPor3/multiz8way/anno ln -s `pwd`/cavPor3.anno.8way.maf \ /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # by loading this into the table multiz8way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz8way/anno \ cavPor3 multiz8way # Loaded 10486949 mafs in 1 files from /gbdb/cavPor3/multiz8way/anno # Advisory lock has been released # real 6m19.376s # normally filter this for chrom size > 1,000,000 and only load # those chroms. 
But this is a scaffold assembly, load everything: time nice -n +19 hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz8waySummary \ /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # Created 1650940 summary blocks from 26384188 components # and 10209337 mafs from /gbdb/cavPor3/multiz8way/anno/multiz8way.maf # real 6m29.261s # by loading this into the table multiz8waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz8way*.tab files in this /scratch/tmp directory rm multiz8way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/cavPor3/multiz8way/maf/multiz8way.maf rmdir /gbdb/cavPor3/multiz8way/maf # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far # ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far ################## # Effort to move to hive ################## ssh hgwdev mv /hive/archive/store12/cavPor3 /hive/data/genomes/cavPor3 ln -sf /hive/data/genomes/cavPor3 /cluster/data/cavPor3 find /gbdb/cavPor3 /usr/local/apache/htdocs/goldenPath/cavPor3 -type l -ls | grep /cluster/store ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ###################### ########################################################################### ## Annotate 5-way multiple alignment with gene annotations ## (START - 2008-07-14 - Tim) # Gene frames ## given previous survey done for 9-way alignment on Marmoset ## and the appearance of new ensGene tables on everything # use knownGene for hg18, mm9 # use ensGene for canFam2, ornAna1 # and refGene for cavPor3 ssh hgwdev mkdir /cluster/data/cavPor3/bed/multiz8way/frames cd /cluster/data/cavPor3/bed/multiz8way/frames mkdir genes # knownGene for DB in hg18 mm9 rn4 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in oryCun1 ornAna1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # refGene for DB in cavPor3 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 861210 Mar 18 15:40 cavPor3.gp.gz # -rw-rw-r-- 1 
1865308 Mar 18 15:40 canFam2.gp.gz # -rw-rw-r-- 1 2008806 Mar 18 15:39 hg18.gp.gz # -rw-rw-r-- 1 1965274 Mar 18 15:39 mm9.gp.gz # -rw-rw-r-- 1 1347532 Mar 18 15:40 ornAna1.gp.gz ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/frames # anything to annotate is in a pair, e.g.: cavPor3 genes/cavPor3.gp.gz time (cat ../anno/cavPor3.anno.5way.maf | nice -n +19 genePredToMafFrames cavPor3 stdin stdout cavPor3 genes/cavPor3.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 77526 cavPor3 # 110768 ornAna1 # 221524 mm9 # 230010 hg18 # 243396 canFam2 # load the resulting file ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames cavPor3 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m21.968s # enable the trackDb entries: # frames multiz5wayFrames # irows on ############################################################################# # phastCons 5-way (DONE - 2008-03-19 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/cavPor3/bed/multiz5way/msa.split cd /cluster/data/cavPor3/bed/multiz5way/msa.split mkdir -p /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/cavPor3/bed/multiz5way/anno/maf set WINDOWS = /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /scratch/data/cavPor3/cavPor3.2bit /scratch/tmp/cavPor3.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set TMP = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \ -M /scratch/tmp/cavPor3.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/cavPor3.$c.fa rm -f $TMP popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 2320 of 2320 jobs # CPU time in finished jobs: 1710s 28.50m 0.47h 0.02d 0.000 y # IO & Wait Time: 6951s 115.85m 1.93h 0.08d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 128s 2.13m 0.04h 0.00d # Submission to last job: 1048s 17.47m 0.29h 0.01d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/cavPor3/bed/multiz5way/cons/run.cons cd /cluster/data/cavPor3/bed/multiz5way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/cavPor3/bed/multiz5way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/cavPor3/multiz5way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it pushd /san/sanvol1/scratch/cavPor3/multiz5way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/cavPor3/bed/multiz5way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \ --prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \ | sed -e "s/bosTau3/cavPor3/" > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/cavPor3/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 2569 of 2569 jobs # CPU time in finished jobs: 8636s 143.93m 2.40h 0.10d 0.000 y # IO & Wait Time: 17371s 289.52m 4.83h 0.20d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 44s 0.73m 0.01h 0.00d # Submission to last job: 1008s 16.80m 0.28h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all find ./bed -type f -name "chr*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 3 minutes cp -p mostConserved.bed /cluster/data/cavPor3/bed/multiz5way/cons/all # load into database ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all time nice -n +19 hgLoadBed cavPor3 phastConsElements5way mostConserved.bed # Loaded 1005876 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits cavPor3 phastConsElements5way # 132010504 bases of 2731830700 (4.832%) in intersection # Create merged posterior probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all mkdir -p phastCons5wayScores for D in `ls -1d pp/file* | sort -t_ -k2n` do TOP=`pwd` F=${D/pp\/} out=${TOP}/phastCons5wayScores/${F}.data.gz echo "${D} > ${F}.data.gz" cd ${D} find . -name "*.pp" -type f \ | sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \ | sort -t '.' -k1,1 -k3.3n \ | sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \ | gzip > ${out} cd "${TOP}" done # copy those files to the downloads area: # /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way/phastConsScores # for hgdownload downloads # Create merged posterior probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \ | wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/cavPor3/bed/multiz5way/cons/all # real 0m40.875s # Load gbdb and database with wiggle.
ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all ln -s `pwd`/phastCons5way.wib /gbdb/cavPor3/multiz5way/phastCons5way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 \ phastCons5way phastCons5way.wig # real 1m5.667s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=cavPor3 phastCons5way > histogram.data 2>&1 # real 3m37.316s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Guinea Pig CavPor3 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons5way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-01-11 - Hiram) # Let's see if the downloads will work ssh hgwdev /cluster/data/cavPor3 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/cavPor3.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1 # real 24m3.210s # failed making upstream sequences: # featureBits cavPor3 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. # the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-01-11 - Hiram) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # cavPor3 does not have seq # cavPor3 does not have gbMiscDiff # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep ############################################################################# # create download files (DONE - 2008-03-19 - Hiram) ssh hgwdev cd /cluster/data/cavPor3 ln -s /cluster/data/cavPor3/bed/repeatMasker/cavPor3.fa.out .
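# (optional, hedged check -- not in the original log) confirm the symlink just
# created actually resolves before running makeDownloads.pl; the earlier
# Downloads attempt above linked bed/RepeatMasker, which differs in case from
# the bed/repeatMasker directory used for the RepeatMasker run
ls -lL cavPor3.fa.out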
makeDownloads.pl cavPor3 > makeDownloads.log 2>&1 # *EDIT* the README files and ensure they are correct ############################################################################# # PushQ entries (DONE - 2008-03-19 - Hiram) ssh hgwdev /cluster/data/cavPor3 /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql # output warnings: # hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToBosTau* # cavPor3 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep # looks like there should be a bosTau3 to cavPor3 liftOver run ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28) ssh kkstore06 # bash if not using bash shell already mkdir /cluster/data/cavPor3/blastDb cd /cluster/data/cavPor3 grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa cavPor3.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft grep chrUn chrom.sizes | awk '{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa cavPor3.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit sequence temp.fa 150 blastDb/y rm temp.fa chr.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3440 mkdir -p /san/sanvol1/scratch/cavPor3/blastDb cd /cluster/data/cavPor3/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/cavPor3/blastDb done mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG cd /cluster/data/cavPor3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3440 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3440) = 360.973943 mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
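# (optional sanity check, hedged -- not part of the original run) with the
# 3440 blastDb pieces counted above, the batch size is 3440 times the number
# of kg FASTA chunks; 36727/361 rounds up to 102 chunks, giving 3440 * 102 =
# 350880 jobs, which matches the completed job count reported below
ls kgfa/*.fa | wc -l
# 102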
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/cavPor3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time Completed: 350880 of 350880 jobs CPU time in finished jobs: 27082816s 451380.27m 7523.00h 313.46d 0.859 y IO & Wait Time: 2334990s 38916.50m 648.61h 27.03d 0.074 y Average job time: 84s 1.40m 0.02h 0.00d Longest finished job: 578s 9.63m 0.16h 0.01d Submission to last job: 96125s 1602.08m 26.70h 1.11d ssh kkstore06 cd /cluster/data/cavPor3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 99 of 102 jobs # Crashed: 3 jobs # CPU time in finished jobs: 113248s 1887.47m 31.46h 1.31d 0.004 y # IO & Wait Time: 86043s 1434.04m 23.90h 1.00d 0.003 y # Average job time: 2013s 33.55m 0.56h 0.02d # Longest finished job: 6139s 102.32m 1.71h 0.07d # Submission to last job: 10416s 173.60m 2.89h 0.12d # ran three crashed jobs on kolossus ssh kkstore06 cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
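# (aside, hedged -- not in the original log) in the awk filters above, psl
# column 1 is matches, 11 is qSize, 12 is qStart and 13 is qEnd, so c60 keeps
# alignments spanning more than 60% of the query protein and m60 additionally
# requires more than 60% of the query bases to match.  A quick optional check
# of how many distinct hg18 proteins (column 10, qName) made the final file:
awk '{print $10}' blastHg18KG.psl | sort -u | wc -l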
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/cavPor3/bed/tblastn.hg18KG hgLoadPsl cavPor3 blastHg18KG.psl # check coverage featureBits cavPor3 blastHg18KG # 40254923 bases of 2731830700 (1.474%) in intersection featureBits cavPor3 refGene:cds blastHg18KG -enrichment # refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x ssh kkstore06 rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # Create 5-way downloads (DONE - 2008-03-28 - Hiram) ssh hgwdev mkdir -p /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way cd /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way cp -p \ /san/sanvol1/scratch/cavPor3/multiz5way/cons/all/phastCons5wayScores/* . ln -s ../../cons/all/all.mod ./5way.mod cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt . # edit that README.txt to be correct for this 5-way alignment cd .. mkdir multiz5way cd multiz5way cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt . # edit that README.txt to be correct for this 5-way alignment ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way ln -s ../../cavPor3.5-way.nh 5way.nh time gzip -c ../../anno/cavPor3.anno.5way.maf > cavPor3.5way.maf.gz # real 34m59.295s ssh hgwdev cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way # creating upstream files from refGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=cavPor3 GENE=refGene NWAY=multiz5way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh -rw-rw-r-- 1 9883443 Mar 28 13:02 upstream1000.maf.gz -rw-rw-r-- 1 17938570 Mar 28 13:06 upstream2000.maf.gz -rw-rw-r-- 1 40384656 Mar 28 13:10 upstream5000.maf.gz # # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then refGene names, e.g.: # 8806 ornAna1 # 8806 mm9 # 8806 hg18 # 8806 canFam2 # 7 NM_001077006 # 3 NM_001113231 # 3 NM_001105381 # 3 NM_001102527 # 3 NM_001102322 # ... 
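# (optional cross-check, hedged -- not in the original log) each upstream
# region should yield one maf block from mafFrags, so these two counts should
# roughly agree:
zcat upstream1000.maf.gz | grep -c "^a "
featureBits cavPor3 refGene:upstream:1000 -fa=/dev/null -bed=stdout | wc -l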
ssh kkstore06 cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way md5sum *.maf.gz > md5sum.txt cd ../phastCons5way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way cd ../phastCons5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate) ssh kkstore05 cd /cluster/data/cavPor3/bed mkdir blastzSpeTri0.2008-05-16 cd blastzSpeTri0.2008-05-16 cat << '_EOF_' > DEF # Guinea pig vs. Ground squirrel BLASTZ_M=50 # TARGET: Guinea pig cavPor3 SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Ground squirrel speTri0 SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit SEQ2_LEN=/cluster/data/speTri0/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & # got here ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0 # failures... have to rescue manually # make axt's. Note -- too many scaffolds to do in a single pass # (exhausts mem) so we split ssh kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain netSplit noClass.net noClass ~kate/bin/x86_64/netSplit noClass.net noClass -lump=100 # can't just split to chains (exhausts open files) chainSplit chain cavPor3.speTri0.all.chain.gz -lump=100 cd noClass mkdir ../../axtNetChunks cat > netToAxt.csh << 'EOF' foreach i (*.net) set p = $i:r echo $i netToAxt $p.net ../chain/$p.chain /cluster/data/cavPor3/cavPor3.2bit /cluster/data/speTri0/speTri0.2bit ../../axtNetChunks/$i:r.axt end 'EOF' csh netToAxt.csh >&! netToAxt.log & # create unfiltered mafNet to get better squirrel coverage (2008-11-3 kate) cd ../../axtNetChunks cat *.axt | axtSort stdin stdout > ../axtChain/cavPor3.speTri0.net.axt mkdir ../mafNet axtToMaf -tPrefix=cavPor3. -qPrefix=speTri0. ../axtChain/cavPor3.speTri0.net.axt \ /cluster/data/cavPor3/chrom.sizes /cluster/data/speTri0/chrom.sizes \ ../mafNet/cavPor3.speTri0.net.maf gzip stdin > ../mafNet/cavPor3.speTri0.net.maf.gz # low cov genome, so use reciprocal best for multiple alignment # need to generate liftover chains, which failed in automation cat > over.csh << 'EOF' foreach i (*.net) set p = $i:r echo $i netChainSubset -verbose=0 $p.net ../chain/$p.chain stdout | \ chainStitchId stdin $i.over.chain end cat *.over.chain | gzip -c > cavPor3.speTri0.over.chain.gz 'EOF' csh over.csh >&! over.log & ssh hgwdev cd /cluster/data/cavPor3/bed/blastz.speTri0 mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/vsSpeTri0 /cluster/bin/scripts/doRecipBest.pl cavPor3 speTri0 >&! rbest.log & # TODO: # load chains in database and check coverage # NOTE: exhausts mem on hgwdev -- need to try this on kolossus ?
cd axtChain cat > loadChains.csh << 'EOF' hgLoadChain -tIndex cavPor3 chainSpeTri0 cavPor3.speTri0.all.chain.gz featureBits cavPor3 chainSpeTri0Link > fb.cavPor3.chainSpeTri0Link.txt cat fb.cavPor3.chainSpeTri0Link.txt 'EOF' csh loadChains.csh >&! loadChains.log & ############################################################################ # SWAP SpeTri0 Blastz (2008-11-01 kate) mkdir -p /cluster/data/speTri0/bed cd /cluster/data/speTri0/bed mkdir blastz.cavPor3.swap cd blastz.cavPor3.swap cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/chain mkdir ../chain.swap cat > swap.csh << 'EOF' foreach c (*.chain) echo $c chainSwap $c ../chain.swap/$c end 'EOF' csh swap.csh >&! swap.log & doBlastzChainNet.pl -swap -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/cavPor3/bed/blastz.speTri0/DEF >& swap.log & ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-04-03 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/cavPor3/bed/nscan/ wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.gtf wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.prot.fa wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/readme.html bzip2 cavPor3.* chmod a-w * # load track gtfToGenePred -genePredExt cavPor3.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt cavPor3 nscanGene stdin hgPepPred cavPor3 generic nscanPep cavPor3.prot.fa.bz2 rm *.tab # update trackDb; need a cavPor3-specific page to describe informants guineaPig/cavPor3/nscanGene.html (copy from readme.html) guineaPig/cavPor3/trackDb.ra # set search regex to termRegex scaffold_[0-9]+\.[0-9]+\.[0-9]+ ############################################################################ # ### Found this in Kate's unchecked-in cavPor3.txt. I need it to continue the blastz to get the maf for multiz !! # ### ############################################################################# # ### # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate) # ### # ### ssh kkstore05 # ### cd /cluster/data/cavPor3/bed # ### mkdir blastzSpeTri0.2008-05-16 # ### cd blastzSpeTri0.2008-05-16 # ### # ### cat << '_EOF_' > DEF # ### # Mouse vs.
Ground squirrel # ### # ### BLASTZ_M=50 # ### # ### # TARGET: Mouse MM9 # ### SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit # ### SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes # ### SEQ1_CHUNK=10000000 # ### SEQ1_LAP=10000 # ### # ### # QUERY: Ground squirrel speTri0 # ### SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit # ### SEQ2_LEN=/cluster/data/speTri0/chrom.sizes # ### SEQ2_CHUNK=30000000 # ### SEQ2_LIMIT=500 # ### SEQ2_LAP=0 # ### # ### BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16 # ### TMPDIR=/scratch/tmp # ### '_EOF_' # ### # << happy emacs # ### # ### doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ # ### -chainMinScore=3000 -chainLinearGap=medium >& do.log & # ### # ### # got here # ### ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0 ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastz.speTri0 # ### 1 crashed job in axtChain: # ssh mkr0u7 zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # invalid unsigned number: "2" ### divide and conquer ll ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz -rw-rw-r-- 1 kate protein 25560954 May 18 08:12 ../../pslParts/cavPor3.2bit:scaffold_7:0-10010000.psl.gz -rw-rw-r-- 1 kate protein 28831900 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:10000000-20010000.psl.gz -rw-rw-r-- 1 kate protein 25466248 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:20000000-30010000.psl.gz -rw-rw-r-- 1 kate protein 26433148 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:30000000-40010000.psl.gz -rw-rw-r-- 1 kate protein 38048425 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:40000000-50010000.psl.gz -rw-rw-r-- 1 kate protein 24145256 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:50000000-60010000.psl.gz -rw-rw-r-- 1 kate protein 7480033 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:60000000-61004451.psl.gz zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:0-*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # Error reading 4 bytes: Success zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:40*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & # invalid unsigned number: "2" mv ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.bad zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \ | axtChain -psl -verbose=0 -minScore=3000 -linearGap=medium stdin \ /scratch/data/cavPor3/cavPor3.2bit \ /scratch/data/speTri0/speTri0.2bit \ tmp.chain & #Error reading 4 bytes: Success ssh pk; para shove -retries=6 # 233 jobs in batch # 503 jobs (including everybody's) in Parasol queue. 
# Checking finished jobs # Completed: 233 of 233 jobs # CPU time in finished jobs: 116206s 1936.77m 32.28h 1.34d 0.004 y # IO & Wait Time: 3377s 56.28m 0.94h 0.04d 0.000 y # Average job time: 513s 8.55m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8500s 141.67m 2.36h 0.10d # Submission to last job: 967976s 16132.93m 268.88h 11.20d # Estimated complete: 0s 0.00m 0.00h 0.00d ### ############## Now need to restart the blastz: tail do.log # Command failed: # ssh -x kki nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/run/doChainRun.csh ssh kkstore05 # store12->kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0 screen time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # real 220m10.699s # tail do.log # writing /dev/null # memory usage 2419843072, utime 5416 s/100, stime 389 # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout # chainStitchId stdin stdout # gzip -c # Killed # # gzip: stdout: Broken pipe # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh cp netChain.csh netChain.restart.csh # edit to restart in correct place ./netChain.restart.csh ### After some searching I was able to figure out that  = octal "\031" # Now I can find the original problem file and try to rebuild it: grep -n -P "\031" cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl # 4253:62 40 0 0 1 1 1 6 + scaffold_6393.1-308827 308827 149765 149868 scaffold_' 61004451 49604348 49604456 3 2,38,35, 149765,149794,149833, 49604348,49604383,49604421, # For comparison: tail -55 cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl | head -5 # 91 37 0 0 1 8 3 16 + scaffold_6393.1-308827 308827 244021 244157 scaffold_7 61004451 48951564 48951708 5 18,25,14,15,56, 244021,244039,244072,244086,244101, 48951564,48951583,48951608,48951629,48951652, # 62 32 0 0 0 0 0 0 + scaffold_6393.1-08827 308827 54194 54288 scaffold_7 61004451 49114670 49114764 1 94, 54194, 49114670, # 62 40 0 0 1 1 1 6 + scaffold_6393.1-308827 308827 149765 149868 scaffold_' 61004451 49604348 49604456 3 2,38,35, 149765,149794,149833, 49604348,49604383,49604421, # 127 63 0 0 1 18 2 5 + scaffold_6393.1-308827 308827 159033 159241 scaffold_7 61004451 49613715 49613910 4 59,84,20,27, 159033,159092,159176,159214, 49613715,49613775,49613863,49613883, # 29 8 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 160121 160158 scaffold_7 61004451 49620825 49620862 1 37, 160121, 49620825, ### ### ### Yikes! 
the prior line as a different strange char:  = octal "\023" ## Run on same machine: ssh kkr10u47 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl ../psl/cavPor3.2bit:scaffold_7:40000000-50010000 qParts/part007.lst ../DEF ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl ll ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007* # -rw-rw-r-- 1 tdreszer protein 785511 Jun 3 13:11 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl # -rw-rw-r-- 1 kate protein 785511 May 16 19:40 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.psl grep -n -P "\031" ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl tail -55 ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl | head -5 # 91 37 0 0 1 8 3 16 + scaffold_6393.1-308827 308827 244021 244157 scaffold_7 61004451 48951564 48951708 518,25,14,15,56, 244021,244039,244072,244086,244101, 48951564,48951583,48951608,48951629,48951652, # 62 32 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 54194 54288 scaffold_7 61004451 49114670 49114764 194, 54194, 49114670, # 62 40 0 0 1 1 1 6 + scaffold_6393.1-308827 308827 149765 149868 scaffold_7 61004451 49604348 49604456 329,38,35, 149765,149794,149833, 49604348,49604383,49604421, # 127 63 0 0 1 18 2 5 + scaffold_6393.1-308827 308827 159033 159241 scaffold_7 61004451 49613715 49613910 459,84,20,27, 159033,159092,159176,159214, 49613715,49613775,49613863,49613883, # 29 8 0 0 0 0 0 0 + scaffold_6393.1-308827 308827 160121 160158 scaffold_7 61004451 49620825 49620862 137, 160121, 49620825, mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl.old mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl # Okay, we are NOW done with run.blastz so it is on to run.cat ssh kkstore05 cd ../run.cat ./cat.csh cavPor3.2bit:scaffold_7:40000000-50010000 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz ll cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.* # -rw-rw-r-- 1 tdreszer protein 38048414 Jun 3 13:28 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz # -rw-rw-r-- 1 tdreszer protein 38048475 May 18 08:13 cavPor3.2bit:scaffold_7:40000000-50010000.psl.old.gz mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.new.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.old.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.old mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz* ../pslParts/ # Okay, we are now done with run.cat. What can be done for axtChain? 
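# (aside, hedged -- not part of the original recovery) a scan like this over
# all of the psl parts would flag any other file carrying stray control bytes
# such as the \031 and \023 seen above, before any further chaining:
cd /cluster/data/cavPor3/bed/blastz.speTri0/pslParts
for f in *.psl.gz
do
    zcat $f | grep -q -P "[\x01-\x08\x0b\x0c\x0e-\x1f]" && echo "control chars in $f"
done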
# run on same cluster machine ssh kkr4u00 cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/run ./chain.csh cavPor3.2bit:scaffold_7: chain/cavPor3.2bit:scaffold_7:.chain ll chain/cavPor3.2bit\:scaffold_7\:.chain # -rw-rw-r-- 1 tdreszer protein 304948646 Jun 3 15:05 chain/cavPor3.2bit:scaffold_7:.chain # Okay, NOW we are ready to consider restarting doBlastzChainNet.pl # Though I assume that some of what was already done, will be in the way. time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # postProcessChains: looks like this was run successfully already (cavPor3.speTri0.all.chain.gz exists). # Either run with -continue net or some later stage, or move aside/remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/cavPor3.speTri0.all.chain.gz and run again. mv axtChain/cavPor3.speTri0.all.chain.gz axtChain/cavPor3.speTri0.all.chain.gz.old time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1 # real 54m1.762s tail do.log # HgStepManager: executing step 'net' Thu Jun 5 12:46:09 2008. # netChains: looks like we are not starting with a clean slate. Please move aside or remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net and run again. mv /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net.old screen time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue net >> do.log 2>&1 real 204m20.325s tail do.log # writing /dev/null # memory usage 2427514880, utime 5453 s/100, stime 405 # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout # chainStitchId stdin stdout # gzip -c # # gzip: stdout: Broken pipe # Killed # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh ### Can't seem to get ahead!!! ## Doing each step in netChains.csh manually... cd /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain # Make nets ("noClass", i.e. without rmsk/class stats which are added later): chainPreNet cavPor3.speTri0.all.chain.gz /cluster/data/cavPor3/chrom.sizes /cluster/data/ speTri0/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /cluster/data/cavPor3/chrom.sizes /cluster/data/speTri0/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Completed successfully # Make liftOver chains: netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz netChainSubset.out ll netChainSubset.out # -rw-rw-r-- 1 tdreszer protein 73728 Jun 6 17:12 netChainSubset.out chainStitchId netChainSubset.out chainStitchId.out ll chainStitchId.out # -rw-rw-r-- 1 tdreszer protein 0 Jun 9 09:13 chainStitchId.out # gzip -c chainStitchId.out > cavPor3.speTri0.over.chain.gz # Duh: gzip: stdout: Broken pipe # Okay, continuing with next step, ignoring the empty cavPor3.speTri0.over.chain.gz # Make axtNet for download: one .axt for all of cavPor3. 
mkdir ../axtNet netToAxt -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz \ /scratch/data/cavPor3/cavPor3.2bit /scratch/data/speTri0/speTri0.2bit stdout \ | axtSort stdin stdout \ | gzip -c > ../axtNet/cavPor3.speTri0.net.axt.gz ### Okay Kate rescued this by adjusting file sizes to handle memory ssh kkstore05 cd /cluster/data/cavPor3/bed/blastz.speTri0 time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium -continue net >> doAferChain.log 2>&1 # NOTE: be sure to ls the data in above script on workhorse machine before starting script time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # ps -ef | grep blastzCavPor3 # real 1764m55.155s tail do.log # *** All done! # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate. # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary. # blastz: ranOk: 52984 cat fb.mm9.chainCavPor3Link.txt # 757283793 bases of 2620346127 (28.900%) in intersection cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10 ######### Change locations in DEF due to Hiram's new methods cat << \_EOF_ > DEF BLASTZ_M=50 # TARGET: Mouse Mm9 SEQ1_DIR=/scratch/data/mm9/nib SEQ1_LEN=/scratch/data/mm9/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: GuineaPig cavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10 TMPDIR=/scratch/tmp _EOF_ mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 & # real 166m53.671s # Exit 25 time nice -n +19 doBlastzChainNet.pl /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC # Can't open chain/scaffold_795.chain to append: Too many open files # gzip: stdout: Broken pipe # Command failed: # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh # broken down during netSynteny.csh due to too many open files on # a chainSplit # However, there is no need to split when we have scaffolds. # Kate fixes doBlastzChainNet.pl and retry: cd /cluster/data/cavPor3/bed/blastz.mm9.swap time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 & # real 24m37.561s # *** All done! # *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate. # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary. cat fb.cavPor3.chainMm9Link.txt # 781173609 bases of 2663369733 (29.330%) in intersection ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # 3way Rodent Multiz (special for Jurgens (Schmitz & co.) 
in Muenster # Previously, Robert Baertsch handled these. I committed to these # many months ago... # 2008-07-26 kate # Redo with unfiltered net mafs to maximize squirrel sequence (2008-11-08 kate) ssh kkstore05 cd /cluster/data/cavPor3/bed mkdir multiz3way cd multiz3way mkdir -p /san/sanvol1/scratch/cavPor3/multiz3way/ cd /san/sanvol1/scratch/cavPor3/multiz3way # copy mafs to cluster-friendly disk # mm9 - high quality mammalian genome, so use syntenic net mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9/axtChain/cavPor3.mm9.synNet.maf.gz cd mm9 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end # speTri0 - low quality mammalian genome, so use reciprocal best net # mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafRBestNet/cavPor3.speTri0.rbest.maf.gz # redo with full net, to get more sequence mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafNet/cavPor3.speTri0.net.maf.gz cd speTri0 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end cd /san/sanvol1/scratch/cavPor3/multiz3way # get latest PSU utilities mkdir penn set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $p/{autoMZ,multiz,maf_project} penn # the autoMultiz cluster run ssh pk cd /cluster/data/cavPor3/bed/multiz3way # create species list and stripped down tree for autoMZ cat > tree.nh << 'EOF' ((cavPor3 mm9) speTri0) 'EOF' cat > species.lst << 'EOF' cavPor3 mm9 speTri0 'EOF' mkdir run maf cd run cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = cavPor3 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/$db/multiz3way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz3way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz3way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs mkdir /cluster/data/cavPor3/bed/multiz3way/maf/ awk '{print $1}' /cluster/data/cavPor3/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # ~3000 jobs para try para check # ~2 hours of run-time # remove empty alignment files and package up ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz3way/maf mkdir empty cat > finish.csh << 'EOF' foreach f (*.maf) head -11 $f | grep -q 's cavPor3' if ($status == 1) then echo "$f empty" mv $f empty else echo " $f NOT empty" endif end 'EOF' # oops, forgot to mkdir, so 'empty' files were lost ls *.maf | wc -l # 1491 tar cvfz rodent3way.tar.gz *.maf ######### # Also do a multiz with non-filtered mouse mafs: # mm9 syntenic net mafs are sparse (cover only ~300 scaffolds), so check net ssh hgwdev cd /cluster/data/cavPor3/bed/blastz.mm9/axtChain netFilter -minGap=10 cavPor3.mm9.syn.net.gz | hgLoadNet -verbose=0 cavPor3 synNetMm9 stdin # select distinct(tName) shows 302 rows mkdir -p 
/san/sanvol1/scratch/cavPor3/multiz3way.full/ cd /san/sanvol1/scratch/cavPor3/multiz3way.full mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9//cavPor3.mm9.net.maf.gz cd mm9 # rename to reflect chrom name foreach f (*.maf) set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'` echo $c mv $f $c.maf end # copy and link various files from first run ln -s ../multiz3way/{speTri0,penn} . cp ../multiz3way/{species.lst,tree.nh} . mkdir run maf cp ../multiz3way/run/chrom.lst run sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/autoMultiz > run/autoMultiz sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/template > run/template # check it chmod +x run/autoMultiz mkdir -p /cluster/data/cavPor3/bed/multiz3way.full/maf ssh pk cd /san/sanvol1/scratch/cavPor3/multiz3way.full/run gensub2 chrom.lst single template jobList para create jobList para try para check # ~2 hours of run-time ssh kkstore05 cd /cluster/data/cavPor3/bed/multiz3way.full/maf mkdir empty csh ../../multiz3way/maf/finish.csh >&! finish.log & # post to hgwdev downloads area ssh hgwdev cd /cluster/data/cavPor3/bed/multiz3way mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/syntenic cd /cluster/data/cavPor3/bed/multiz3way.full mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/full ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: cavPor3.upstreamGeneTbl = xenoRefGene cavPor3.upstreamMaf = multiz8way /hive/data/genomes/cavPor3/bed/multiz8way/species.list ############################################################################ # cavPor3 - Guinea Pig - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram) ssh hgwdev cd /hive/data/genomes/cavPor3 cat << '_EOF_' > cavPor3.ensGene.ra # required db variable db cavPor3 # do we need to translate geneScaffold coordinates # geneScaffolds yes nameTranslation "s/^MT/chrM/;" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 cavPor3.ensGene.ra ssh hgwdev cd /hive/data/genomes/cavPor3/bed/ensGene.51 featureBits cavPor3 ensGene # 30699872 bases of 2663369733 (1.153%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/cavPor3/bed/ensGene.51 ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. 
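# (optional check, hedged -- not in the original log) for the Ensembl 51 build
# above, the nameTranslation "s/^MT/chrM/;" should have renamed Ensembl's MT
# sequence to chrM; a non-zero count suggests the rename applied, while zero
# could also mean Ensembl shipped no mitochondrial annotation for this assembly
hgsql -e 'select count(*) from ensGene where chrom="chrM";' cavPor3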
############################################################################ # lastz Rabbit oryCun2 swap (DONE - 2010-01-22 - Hiram) # original alignment cd /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21 cat fb.oryCun2.chainCavPor3Link.txt # 964546600 bases of 2604023284 (37.041%) in intersection # and for the swap mkdir /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap cd /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21/DEF \ -swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 186m1.649s cat fb.cavPor3.chainOryCun2Link.txt # 1003499831 bases of 2663369733 (37.678%) in intersection ############################################################################ # enable native RefSeq Genes track (2010-06-15) # in etc/genbank.conf: cavPor3.refseq.mrna.native.load = yes ssh genbank cd /cluster/data/genbank ./bin/gbAlignStep -orgCat=native -initial -srcDb=refseq cavPor3 ssh hgwdev cd /cluster/data/genbank ./bin/gbDbLoadStep -byPassGbLoaded -reload -srcDb=refseq cavPor3 ############################################################################ # liftOver for hg19 (DONE - 2011-07-14 - Hiram) cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain ln -s `pwd`/cavPor3.hg19.over.chain.gz \ /gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz hgAddLiftOverChain -minMatch=0.1 -multiple \ -path=/gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz \ cavPor3 hg19 ln -s `pwd`/cavPor3.hg19.over.chain.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz cd /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver md5sum cavPor3ToHg19.over.chain.gz >> md5sum.txt ############################################################################ # SWAP lastz mm10 (DONE - 2012-03-19 - Hiram) # original alignment to mm10 cat /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/fb.mm10.chainCavPor3Link.txt # 754642254 bases of 2652783500 (28.447%) in intersection # and this swap mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 80m23.870s cat fb.cavPor3.chainMm10Link.txt # 775452752 bases of 2663369733 (29.115%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.mm10.swap lastz.mm10 ##############################################################################
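############################################################################
# (usage sketch, hedged -- not part of the build) belated spot check of the
# cavPor3ToHg19 liftOver chain installed above; scaffold_7 is a real cavPor3
# scaffold, the coordinates are only illustrative
echo -e "scaffold_7\t100000\t100500" > /tmp/liftTest.bed
liftOver -multiple /tmp/liftTest.bed \
    /gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz \
    /tmp/liftTest.hg19.bed /tmp/liftTest.unmapped
cat /tmp/liftTest.hg19.bed
##############################################################################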