# for emacs: -*- mode: sh; -*- # This file describes browser build for the Sea Lamprey # genome, Petromyzon marinus, March 2007, v.3.0 from WUSTL # # "$Id: petMar1.txt,v 1.23 2009/11/25 21:48:41 hiram Exp $" # # http://www.ncbi.nlm.nih.gov/genome/287 # http://www.ncbi.nlm.nih.gov/bioproject/50489 - Mich. State # http://www.ncbi.nlm.nih.gov/bioproject/12880 - WashU - SRA # http://www.ncbi.nlm.nih.gov/sra/SRX028154 - SRA records # http://www.ncbi.nlm.nih.gov/bioproject/11809 - NCBI 2004 chrMt ########################################################################## ### Fetch sequence (DONE - 2008-01-14 - Hiram) # determine estimated disk usage, from example genome like this one, # braFlo1 which used about 3.5 Gb, do not need much space, store04 # has 42 Gb, plenty of space on store9 ssh kkstore04 mkdir /cluster/store9/petMar1 ln -s /cluster/store9/petMar1 /cluster/data/petMar1 mkdir /cluster/data/petMar1/downloads cd /cluster/data/petMar1/downloads wget --timestamping \ 'ftp://genome.wustl.edu/pub/organism/Other_Vertebrates/Petromyzon_marinus/assembly/Petromyzon_marinus-3.0/output/*' ########################################################################## # Fixup the agp file to make a chrUn agp file, add a 1000 base gap # between the supercontigs ########################################################################## # Run the makeGenomeDb.pl script (DONE - 2008-01-27 - Hiram) ssh kkstore04 cd /cluster/data/petMar1/downloads # eliminate zero-length fragments from the AGP file zcat supercontigs.agp.gz | awk ' { size=$3-$2 if (size >= 0) print } ' | gzip -c > supers.agp.gz cd /cluster/data/petMar1 cat << '_EOF_' > petMar1.config.ra # Config parameters for makeGenomeDb.pl: db petMar1 clade vertebrate genomeCladePriority 130 scientificName Petromyzon marinus commonName Lamprey assemblyDate Mar. 2007 assemblyLabel WUSTL v.3.0 (March 2007) orderKey 480 mitoAcc 5835065 fastaFiles /cluster/data/petMar1/downloads/supercontigs.fa.gz agpFiles /cluster/data/petMar1/downloads/supers.agp.gz # qualFiles none dbDbSpeciesDir lamprey '_EOF_' # can stop at db since trackDb and dbDb were created in first # attempt makeGenomeDb.pl -stop=db petMar1.config.ra > makeGenome.log 2>&1 & twoBitToFa petMar1.unmasked.2bit stdout | faSize stdin # 1135497967 bases (303801529 N's 831696438 real 831696438 # upper 0 lower) in 2 sequences in 1 files # chrM has a gi:5835065 fragment name, I would rather have NC_001626 hgsql -e 'update gold set frag="NC_001626" where chrom="chrM";' petMar1 ########################################################################## # Pick up an image from the EPA, usage rights are free # http://www.epa.gov/glnpo/image/restrictions.htm mkdir /cluster/data/petMar1/photo cd /cluster/data/petMar1/photo wget --timestamping 'http://www.epa.gov/glnpo/image/vbig/229.jpg' \ -O petromyzon_marinus.epa.jpg ########################################################################## # Running WindowMasker (DONE - 2008-01-26 - Hiram) ssh kolossus mkdir /cluster/data/petMar1/bed/WindowMasker cd /cluster/data/petMar1/bed/WindowMasker /cluster/bin/scripts/doWindowMasker.pl \ -buildDir /cluster/data/petMar1/bed/WindowMasker \ -workhorse=kolossus -verbose=2 petMar1 > wm.log 2>&1 & twoBitToFa petMar1.wmsk.sdust.2bit stdout | faSize stdin # 1027258967 bases (195562529 N's 831696438 real 494032631 upper # 337663807 lower) in 108241 sequences in 1 files # %32.87 masked total, %40.60 masked real ssh hgwdev cd /cluster/data/petMar1/bed/WindowMasker hgLoadBed petMar1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 4938211 elements of size 3 featureBits petMar1 windowmaskerSdust # 533224658 bases of 831696438 (64.113%) in intersection # WM has a bug where it masks the gaps too, don't like that, # fetch the WM track without the gaps, can do this in the # table browser with an intersection of WMask AND ! gap # or via featureBits: featureBits petMar1 -not gap -bed=notGap.bed # hint: also works with the gold table instead of notGap time nice -n +19 featureBits petMar1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 337663807 bases of 831696438 (40.599%) in intersection # real 182m45.469s hgLoadBed petMar1 windowmaskerSdust cleanWMask.bed.gz # Loaded 4919722 elements of size 4 featureBits petMar1 windowmaskerSdust > fb.petMar1.cleanWMask.txt 2>&1 # 337663807 bases of 831696438 (40.599%) in intersection ########################################################################## # Running TRF simple repeats (DONE - 2008-01-26 - Hiram) ssh kkstore04 mkdir /cluster/data/petMar1/bed/simpleRepeat cd /cluster/data/petMar1/bed/simpleRepeat doSimpleRepeat.pl petMar1 -smallClusterHub=memk \ -workhorse=kolossus \ -buildDir=/cluster/data/petMar1/bed/simpleRepeat > do.log 2>&1 & # ran into a bug with the nice difficulty on the featureBits featureBits petMar1 simpleRepeat > fb.petMar1.simpleRepeat.txt 2>&1 # 75844864 bases of 831696438 (9.119%) in intersection doSimpleRepeat.pl petMar1 -smallClusterHub=memk \ -continue=cleanup -workhorse=kolossus \ -buildDir=/cluster/data/petMar1/bed/simpleRepeat > cleanup.log 2>&1 & ########################################################################## # Repeat Masker (DONE - 2008-01-26 - Hiram) ssh kkstore04 mkdir /cluster/data/petMar1/bed/RepeatMasker cd /cluster/data/petMar1/bed/RepeatMasker doRepeatMasker.pl petMar1 -bigClusterHub=kk -workhorse=kolossus \ -buildDir=/cluster/data/petMar1/bed/RepeatMasker \ -verbose=2 > do.log 2>&1 ########################################################################## # Add WindowMasker and trfMask masking to the 2bit sequenct # (DONE - 2008-01-26 - Hiram) ssh kkstore04 cd /cluster/data/petMar1 gzip bed/simpleRepeat/trfMask.bed zcat bed/WindowMasker/cleanWMask.bed.gz bed/simpleRepeat/trfMask.bed.gz \ | twoBitMask -type=.bed petMar1.unmasked.2bit stdin petMar1.2bit # ignore the warning about bed file > 13 fields, that is OK twoBitToFa petMar1.2bit stdout | faSize stdin # 1027258967 bases (195562529 N's 831696438 real 493479824 upper # 338216614 lower) in 108241 sequences in 1 files # %32.92 masked total, %40.67 masked real ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-01-15 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("petMar1", "blat15", "17788", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("petMar1", "blat15", "17789", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## Reset default position to # Where RHO1 protein is found via blat: # GEMSLWSLVV LAIERYIVIC KPMGNFRFGS THAYMGVAFT WFMALSCAAP PLVGWSR ssh hgwdev hgsql -e 'update dbDb set defaultPos="Contig11034:8429-10197" where name="petMar1";' hgcentraltest ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2008-01-29 - Hiram) # Use -repMatch=200 (based on size -- for human we use 1024, and # medaka size is ~20% of human judging by gapless oryLat1 vs. hg18 # genome sizes from featureBits. ssh kkstore04 cd /cluster/data/petMar1 blat petMar1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=petMar1.11.ooc -repMatch=200 # Wrote 49155 overused 11-mers to petMar1.11.ooc ########################################################################## # GENBANK AUTO UPDATE (DONE - 2008-01-15 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add petMar1 just after gasAcu1 # petMar1 (Branchiostoma floridae - Lancelet) petMar1.serverGenome = /cluster/data/petMar1/petMar1.2bit petMar1.clusterGenome = /cluster/bluearc/scratch/data/petMar1/petMar1.2bit petMar1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} petMar1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} petMar1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} petMar1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} petMar1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} petMar1.ooc = /cluster/data/petMar1/petMar1.11.ooc petMar1.lift = /cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft petMar1.refseq.mrna.native.load = yes petMar1.refseq.mrna.xeno.load = yes petMar1.genbank.mrna.xeno.load = yes petMar1.genbank.est.native.load = yes petMar1.downloadDir = petMar1 petMar1.perChromTables = no cvs ci -m "Added petMar1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added Petromyzon marinus (Sea Lamprey)." src/lib/gbGenome.c make install-server ssh genbank screen # control this with a screen so you can get back to it cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial petMar1 & # logFile: var/build/logs/2008.01.29-11:27:10.petMar1.initalign.log # real 332m36.025s # There was some kind of problem with this alignment, although the # kluster run actually finished: # Completed: 366520 of 366520 jobs # CPU time in finished jobs: 6854057s 114234.28m 1903.90h 79.33d 0.217 y # IO & Wait Time: 2300432s 38340.53m 639.01h 26.63d 0.073 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 1292s 21.53m 0.36h 0.01d # Submission to last job: 21121s 352.02m 5.87h 0.24d # So, continuing: time nice -n +19 bin/gbAlignStep -continue=finish -initial petMar1 & # logFile: var/build/logs/2008.01.30-09:18:56.petMar1.initalign.log # real 70m20.669s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad petMar1 # logFile: var/dbload/hgwdev/logs/2008.01.15-16:21:10.dbload.log # real 19m34.853s # enable daily alignment and update of hgwdev (2008-01-30 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add petMar1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added petMar1 - Petromyzon marinus (lamprey)" \ etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-15 - Hiram) # Try 1 failed, run again with contigs ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cat << '_EOF_' > DEF # Medaka vs. Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=30 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk > do.log 2>&1 & # abandon this run, ended up using the swap from oryLat1 in: # /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 # real 1053m9.027s # netChains failed, during chainStitchId: # q end mismatch 1073742062 vs 1111312799 line 510 of stdin ############################################################################ # Create contigs-only masked 2bit file for non-bridged gap free blastz runs # (DONE - 2008-01-15 - Hiram) ssh kkstore04 mkdir /cluster/data/petMar1/contigSequence cd /cluster/data/petMar1/contigSequence cp -p /cluster/data/priPac1/jkStuff/lft2BitToFa.pl ../jkStuff ../jkStuff/lft2BitToFa.pl ../petMar1.2bit \ ../jkStuff/petMar1.nonBridged.lft > supercontigs.fa # this took quite a long time, about 6 hours or so # this doesn't have chrM yet faSize supercontigs.fa ../M/chrM.fa # 1027258967 bases (195562529 N's 831696438 real 493405059 upper # 338291379 lower) in 108241 sequences in 2 files # %32.93 masked total, %40.67 masked real cat supercontigs.fa ../M/chrM.fa | faToTwoBit stdin petMar1.supers.2bit twoBitToFa petMar1.supers.2bit stdout | faSize stdin # 1027258967 bases (195562529 N's 831696438 real 493405059 upper # 338291379 lower) in 108241 sequences in 1 files # %32.93 masked total, %40.67 masked real twoBitInfo petMar1.supers.2bit stdout | sort -k2nr > super.sizes cp -p petMar1.supers.2bit /cluster/bluearc/scratch/data/petMar1 cd .. ln -s contigSequence/super.sizes . ######################################################################### # BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-16 - 2008-01-30 - Hiram) # Try 2 with contigs for Lamprey ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15 cat << '_EOF_' > DEF # Medaka vs. Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 # TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=30 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CTGDIR=/cluster/bluearc/scratch/data/petMar1/petMar1.supers.2bit SEQ2_CTGLEN=/cluster/data/petMar1/super.sizes SEQ2_LIFT=/cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=150 SEQ2_LAP=0 BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > do.log 2>&1 & # real 218m14.787s # Completed: 69774 of 70854 jobs # Crashed: 1080 jobs # CPU time in finished jobs: 3950133s 65835.55m 1097.26h 45.72d 0.125 y # IO & Wait Time: 1218716s 20311.93m 338.53h 14.11d 0.039 y # Average job time: 74s 1.23m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1629s 27.15m 0.45h 0.02d # Submission to last job: 12951s 215.85m 3.60h 0.15d # had some parasol bookeeping problems, it actually finished # the kluster run but got confused. Checked manually with # para recover -> nothing in new job list, and check files in # psl result directory finding the correct number of files for the # the job count. Make the run.time file and continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=cat -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > cat.log 2>&1 & # real 7m47.963s # something failed # had the specification to the lft file incorrect, failed during # chain run. Fix that, finish the run, now continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=chainMerge -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > chainMerge.log 2>&1 & # had the same failure as the first time test of this alignment. # something is wrong with netChainSubset, needs to be debugged. # ignore the step to create the "over.chain" file, finish the net # step manually, then continuing: time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=load -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > load.log 2>&1 & # real 3m53.686s cat fb.oryLat1.chainPetMar1Link.txt # 24990733 bases of 700386597 (3.568%) in intersection # And, for the swap: mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap cd /cluster/data/petMar1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > swap.log 2>&1 & # real 1m3.572s # same problem in netChainSubset trying to make the "over.chain" file. # skip that, finish the chains manually: # real 14m35.340s # then continuing: time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \ -continue=load -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=kk > load.log 2>&1 & cat fb.petMar1.chainOryLat1Link.txt # 23358156 bases of 831696438 (2.808%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/petMar1/bed/blastz.oryLat1.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 oryLat1 \ > rbest.log 2>&1 & # real 22m20.900s ############################################################################# # BLASTZ SWAP Lamprey - Lanclet (DONE - 2008-04-12 - Hiram) ssh kkstore05 screen # control this job with a screen # the original cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11 cat fb.braFlo1.chainPetMar1Link.txt # 27418217 bases of 923355587 (2.969%) in intersection # and the swap mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap cd /cluster/data/petMar1/bed/blastz.braFlo1.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & # real 194m53.514s cat fb.petMar1.chainBraFlo1Link.txt # 27373264 bases of 831696438 (3.291%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/petMar1/bed/blastz.braFlo1.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 braFlo1 \ > rbest.log 2>&1 & # real 110m25.613s ############################################################################# # BLASTZ SWAP Chicken GalGal3 (DONE - 2008-04-15 - Hiram) # the original ssh kkstore03 cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11 cat fb.galGal3.chainPetMar1Link.txt # 20134896 bases of 1042591351 (1.931%) in intersection # and the swap mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap cd /cluster/data/petMar1/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 & # real 33m41.587s cat fb.petMar1.chainGalGal3Link.txt # 22308118 bases of 831696438 (2.682%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/petMar1/bed/blastz.galGal3.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 galGal3 \ > rbest.log 2>&1 & # real 13m33.139s ############################################################################# # BLASTZ SWAP Human hg18 (DONE - 2008-01-30 - Hiram) ssh kkstore02 screen # use screen to control this job # the original cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29 cat fb.hg18.chainPetMar1Link.txt # 36042598 bases of 2881515245 (1.251%) in intersection # That is OK, now for the swap: mkdir /cluster/data/petMar1/bed/blastz.hg18.swap cd /cluster/data/petMar1/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 60m1.928s cat fb.petMar1.chainHg18Link.txt # 26751073 bases of 831696438 (3.216%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/petMar1/bed/blastz.hg18.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 hg18 \ > rbest.log 2>&1 & # real 22m34.274s ############################################################################# # BLASTZ SWAP Mouse mm9 (DONE - 2008-04-15 - Hiram) ssh kkstore06 screen # use screen to control this job # the original cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14 cat fb.mm9.chainPetMar1Link.txt # 29113438 bases of 2620346127 (1.111%) in intersection # That is OK, now for the swap: mkdir /cluster/data/petMar1/bed/blastz.mm9.swap cd /cluster/data/petMar1/bed/blastz.mm9.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 & # real 33m29.076s cat fb.petMar1.chainMm9Link.txt # 26052507 bases of 831696438 (3.132%) in intersection # create reciprocal best chains/nets for 9-way maf alignments ssh hgwdev cd /cluster/data/petMar1/bed/blastz.mm9.swap time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 mm9 \ > rbest.log 2>&1 & # real 21m9.320s ############################################################################# ## 6-Way Multiz (DONE - 2008-04-17 - Hiram) ## ssh hgwdev mkdir /cluster/data/petMar1/bed/multiz6way cd /cluster/data/petMar1/bed/multiz6way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment. Manually add in petMar1 and braFlo1 # arbitrarily somewhere outside of the fishes # rearrange to get petMar1 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 6 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Chicken_galGal3,Medaka_oryLat1 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ > 6-way.fullNames.nh # looks something like this: (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757, Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873); # Adding Lamprey and Lanclet: ((Lamprey_petMar1:1.6, (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757, Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873):0.4):0.4, Lanclet_braFlo1:2.0); ((Lamprey_petMar1:1.6, (Medaka_oryLat1:1.077873,((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757, Chicken_galGal3:0.488824):0.325954):0.4):0.4, Lanclet_braFlo1:2.0); ((Lamprey_petMar1:1.6, (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824, (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4, Lanclet_braFlo1:2.0); # rearrange to get Marmoset at the top: # this leaves us with: cat << '_EOF_' > petMar1.6-way.nh ((Lamprey_petMar1:1.6, (Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824, (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4, Lanclet_braFlo1:2.0); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' petMar1.6-way.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 6 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' petMar1.6-way.nh \ | sed -e "s/ / /g"` > tree.6.nh # that looks like, as a single line: # ((petMar1 (oryLat1 (galGal3 (mm9 hg18)))) braFlo1) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/petMar1/bed/multiz6way foreach db (`grep -v petMar1 species.list`) set bdir = /cluster/data/petMar1/bed/blastz.$db if (-e $bdir/mafRBestNet/petMar1.$db.rbest.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/petMar1.$db.net.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/petMar1.$db.net.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, the "mafs not found" should only show up on petMar1 ./listMafs.csh # braFlo1 mafRBestNet # galGal3 mafRBestNet # hg18 mafRBestNet # mm9 mafRBestNet # oryLat1 mafRBestNet /cluster/bin/phast/all_dists petMar1.6-way.nh > 6way.distances.txt grep -i petmar 6way.distances.txt | sort -k3,3n Lamprey_petMar1 Chicken_galGal3 2.814778 Lamprey_petMar1 Human_hg18 2.923612 Lamprey_petMar1 Medaka_oryLat1 3.077873 Lamprey_petMar1 Mouse_mm9 3.122529 Lamprey_petMar1 Lanclet_braFlo1 4.000000 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCalJac1Link chain linearGap # distance on PetMar1 on other minScore # 1 Chicken_galGal3 2.814778 (% 2.682) (% 1.931) 5000 loose # 2 Human_hg18 2.923612 (% 3.216) (% 1.251) 5000 loose # 3 Medaka_oryLat1 3.077873 (% 2.808) (% 3.568) 3000 medium # 4 Mouse_mm9 3.122529 (% 3.132) (% 1.111) 5000 loose # 5 Lanclet_braFlo1 4.000000 (% 3.291) (% 2.969) 5000 loose # copy net mafs to cluster-friendly storage, splitting chroms mkdir mafLinks cd mafLinks ln -s ../../blastz.*.swap/mafRBestNet/*.rbest.maf.gz . # need to split these things up by Contig number for efficient kluster run ssh kkstore04 mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/contigMaf cd /scratch/tmp # the 16201 is from petMar/chrom.sizes echo "chrM 0 16201" > chrM.bed for D in `grep -v petMar1 /cluster/data/petMar1/bed/multiz6way/species.list` do echo ${D} zcat \ /cluster/data/petMar1/bed/multiz6way/mafLinks/petMar1.${D}.rbest.maf.gz \ > ${D}.maf mkdir /scratch/tmp/${D} cd /scratch/tmp/${D} mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \ ../${D}.maf -outDirDepth=2 mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf rsync -a --progress ./ \ /san/sanvol1/scratch/petMar1/multiz6way/contigMaf/${D} cd /scratch/tmp rm -fr ${D} ${D}.maf done # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used in any # alignment: cd /san/sanvol1/scratch/petMar1/multiz6way/contigMaf for D in * do cd "${D}" find . -type f cd .. done | sort -u > /tmp/6-way.contig.list wc -l /tmp/6-way.contig.list mkdir /cluster/data/petMar1/bed/multiz6way/splitRun cp -p /tmp/6-way.contig.list \ /cluster/data/petMar1/bed/multiz6way/splitRun # 31488 /tmp/6-way.contig.list # ready for the multiz run ssh pk cd /cluster/data/petMar1/bed/multiz6way/splitRun mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = petMar1 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz6way/contigMaf rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.6.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/petMar1/bed/multiz6way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP '_EOF_' # << emacs sed -e "s/^\.\///" ../6-way.contig.list \ | gensub2 stdin single template jobList para create jobList para try ... check ... push ... etc # Completed: 31488 of 31488 jobs # CPU time in finished jobs: 5723s 95.38m 1.59h 0.07d 0.000 y # IO & Wait Time: 84949s 1415.82m 23.60h 0.98d 0.003 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 900s 15.00m 0.25h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore04 cd /cluster/data/petMar1/bed/multiz6way mkdir togetherMaf grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \ | sort -u > togetherMaf/petMar1.6way.maf for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> togetherMaf/petMar1.6way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> togetherMaf/petMar1.6way.maf grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \ | sort -u >> togetherMaf/petMar1.6way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/petMar1/multiz6way/maf ln -s /cluster/data/petMar1/bed/multiz6way/togetherMaf/*.maf \ /gbdb/petMar1/multiz6way/maf/multiz6way.maf # this generates an immense multiz6way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/petMar1/multiz6way/maf petMar1 multiz6way # real 0m6.011s # Loaded 232205 mafs in 1 files from /gbdb/petMar1/multiz6way/maf # load summary table *NOT* - no Contigs are long enough to create # anything for the summary table. This command produces an empty # table. time nice -n +19 cat /gbdb/petMar1/multiz6way/maf/*.maf \ | hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz6waySummary stdin # real 5m58.150s # Gap Annotation # prepare bed files with gap info ssh kkstore04 mkdir /cluster/data/petMar1/bed/multiz6way/anno cd /cluster/data/petMar1/bed/multiz6way/anno mkdir maf run # these actually already all exist from previous multiple alignments # if there are done you will see an ls of them # or else the twoBitInfo command will be echoed, run it if you want for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v petMar1 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh memk # temporarily copy the petMar1.6way.maf file onto the memk # nodes /scratch/data/petMar1/maf/ directory for R in 0 1 2 3 4 5 6 7 do ssh mkr0u${R} mkdir /scratch/data/petMar1/maf ssh mkr0u${R} rsync -a --progress \ /cluster/data/petMar1/bed/multiz6way/togetherMaf/petMar1.6way.maf \ /scratch/data/petMar1/maf/ done mkdir /cluster/data/petMar1/bed/multiz6way/anno/splitMaf # need to split up the single maf file into individual # per-scaffold maf files to run annotation on cd /cluster/data/petMar1/bed/multiz6way/anno/splitMaf # create bed files to list approximately 1553 scaffolds in # a single list, approximately 33 lists cat << '_EOF_' > mkBedLists.pl #!/usr/bin/env perl use strict; use warnings; my $bedCount = 0; my $i = 0; my $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; open (FH,") { chomp $line; if ( (($i + 1) % 1553) == 0 ) { printf "%s\n", $line; close (BF); ++$bedCount; $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; } ++$i; my ($chr, $size) = split('\s+',$line); printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr; } close (FH); close (BF); '_EOF_' # << happy emacs chmod +x mkBedLists.pl ./mkBedLists.pl # now, run a mafsInRegion on each one of those lists cat << '_EOF_' > runOne #!/bin/csh -fe set runDir = "/cluster/data/petMar1/bed/multiz6way/anno/splitMaf" set resultDir = $1 set bedFile = $resultDir.bed mkdir -p $resultDir mkdir -p /scratch/tmp/petMar1/$resultDir pushd /scratch/tmp/petMar1/$resultDir mafsInRegion $runDir/$bedFile -outDir . \ /scratch/data/petMar1/maf/petMar1.6way.maf popd rsync -q -a /scratch/tmp/petMar1/$resultDir/ ./$resultDir/ rm -fr /scratch/tmp/petMar1/$resultDir rmdir --ignore-fail-on-non-empty /scratch/tmp/petMar1 '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP ./runOne $(root1) #ENDLOOP '_EOF_' # << happy emacs ls file*.bed > runList gensub2 runList single template jobList para create jobList para try ... check ... push ... etc # Completed: 70 of 70 jobs # CPU time in finished jobs: 255s 4.24m 0.07h 0.00d 0.000 y # IO & Wait Time: 1904s 31.74m 0.53h 0.02d 0.000 y # Average job time: 31s 0.51m 0.01h 0.00d # Longest finished job: 72s 1.20m 0.02h 0.00d # Submission to last job: 143s 2.38m 0.04h 0.00d cd /cluster/data/petMar1/bed/multiz6way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /scratch/data/petMar1/petMar1.2bit $result '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../splitMaf -type f -name "*.maf" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 31488 of 31488 jobs # CPU time in finished jobs: 10711s 178.52m 2.98h 0.12d 0.000 y # IO & Wait Time: 79430s 1323.83m 22.06h 0.92d 0.003 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 3237s 53.95m 0.90h 0.04d ssh kkstore04 # put them all back together into a single file cd /cluster/data/petMar1/bed/multiz6way/anno grep "^##maf version" maf/file_0/Contig0.maf \ | sort -u > petMar1.anno.6way.maf find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> petMar1.anno.6way.maf echo "##eof maf" >> petMar1.anno.6way.maf XXX - running 2008-04-23 09:58 ls -og *.maf # -rw-rw-r-- 1 285783792 Apr 23 09:58 petMar1.anno.6way.maf ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/anno mkdir -p /gbdb/petMar1/multiz6way/anno ln -s `pwd`/petMar1.anno.6way.maf \ /gbdb/petMar1/multiz6way/anno/multiz6way.maf # by loading this into the table multiz6way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/petMar1/multiz6way/anno \ petMar1 multiz6way # Loaded 296783 mafs in 1 files from /gbdb/petMar1/multiz6way/anno # real 0m6.690s # normally filter this for chrom size > 1,000,000 and only load # those chroms. But this is a scaffold assembly, load everything. # *NOT* Does not work on petMar1, all contigs are too small, # this makes an empty table. hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz6waySummary \ /gbdb/petMar1/multiz6way/anno/multiz6way.maf # Created 121083 summary blocks from 3410157 components and 749940 mafs # from /gbdb/petMar1/multiz6way/anno/multiz6way.maf # by loading this into the table multiz6waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz6way*.tab files in this /scratch/tmp directory rm multiz6way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/petMar1/multiz6way/maf/multiz6way.maf rmdir /gbdb/petMar1/multiz6way/maf ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-19) # (auto) establish native host of /cluster/data/ ssh kkstore04 # bash if not using bash shell already mkdir /cluster/data/petMar1/blastDb cd /cluster/data/petMar1 awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst if test -s 1meg.lst; then twoBitToFa -seqList=1meg.lst petMar1.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa fi rm 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst if test -s less1meg.lst; then twoBitToFa -seqList=less1meg.lst petMar1.unmasked.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa fi rm less1meg.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 1018 mkdir -p /san/sanvol1/scratch/petMar1/blastDb cd /cluster/data/petMar1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/petMar1/blastDb done mkdir -p /cluster/data/petMar1/bed/tblastn.hg18KG cd /cluster/data/petMar1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/petMar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 1018 query.lst # we want around 100,000 jobs per gig (0.0001 jobs per base) numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/petMar1/chrom.sizes` # we want around numJobs jobs numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'` numQueries=`wc query.lst | awk '{print $1}'` numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' ` mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/petMar1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then if test -s /cluster/data/petMar1/blastDb.lft then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/petMar1/blastDb.lft carry $f.2 else mv $f.2 $f.3 fi liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/petMar1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 103836 of 103836 jobs # CPU time in finished jobs: 9418283s 156971.39m 2616.19h 109.01d 0.299 y # IO & Wait Time: 867856s 14464.26m 241.07h 10.04d 0.028 y # Average job time: 99s 1.65m 0.03h 0.00d # Longest finished job: 683s 11.38m 0.19h 0.01d # Submission to last job: 38789s 646.48m 10.77h 0.45d ssh kkstore05 cd /cluster/data/petMar1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh memk cd /cluster/data/petMar1/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 102 of 102 jobs # CPU time in finished jobs: 4354s 72.57m 1.21h 0.05d 0.000 y # IO & Wait Time: 70795s 1179.91m 19.67h 0.82d 0.002 y # Average job time: 737s 12.28m 0.20h 0.01d # Longest finished job: 1308s 21.80m 0.36h 0.02d # Submission to last job: 2927s 48.78m 0.81h 0.03d ssh kkstore04 cd /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/petMar1/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/petMar1/bed/tblastn.hg18KG hgLoadPsl petMar1 blastHg18KG.psl # check coverage featureBits petMar1 blastHg18KG # 7171010 bases of 831696438 (0.862%) in intersection featureBits petMar1 xenoRefGene:cds blastHg18KG -enrichment # xenoRefGene:cds 0.676%, blastHg18KG 0.862%, both 0.362%, cover 53.51%, enrich 62.06x ssh kkstore05 rm -rf /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################### ## Annotate 6-way multiple alignment with gene annotations ## (DONE - 2008-01-08 - Hiram) # Gene frames ## given previous survey done for 8-way alignment on Orangutan, ## try using the following tables for this gene annotation: # hg18 is in a bit of transition at the moment, so use ensGene # mm9 can use knownGene # xenoMrna for braFlo1, petMar1 # ensGene v49 for galGal3, oryLat1 ssh hgwdev mkdir /cluster/data/petMar1/bed/multiz6way/frames cd /cluster/data/petMar1/bed/multiz6way/frames mkdir genes # knownGene for DB in mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in hg18 galGal3 oryLat1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use xenoMrna for braFlo1, petMar1 for DB in braFlo1 petMar1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \ from xenoMrna,gbCdnaInfo,cds \ where (xenoMrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 1040642 Apr 28 14:30 braFlo1.gp.gz # -rw-rw-r-- 1 1557911 Apr 28 14:29 galGal3.gp.gz # -rw-rw-r-- 1 2151412 Apr 28 14:29 hg18.gp.gz # -rw-rw-r-- 1 1965274 Apr 28 14:28 mm9.gp.gz # -rw-rw-r-- 1 1713692 Apr 28 14:29 oryLat1.gp.gz # -rw-rw-r-- 1 596014 Apr 28 14:31 petMar1.gp.gz ssh kkstore04 cd /cluster/data/petMar1/bed/multiz6way/frames # hg18 is in a bit of transition at the moment, so use ensGene # mm9 can use knownGene # xenoMrna for braFlo1, petMar1 # ensGene v49 for galGal3, oryLat1 # anything to annotate is in a pair, e.g.: petMar1 genes/petMar1.gp.gz time (cat ../anno/petMar1.anno.6way.maf | nice -n +19 genePredToMafFrames petMar1 stdin stdout petMar1 genes/petMar1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz oryLat1 genes/oryLat1.gp.gz braFlo1 genes/braFlo1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 37711 braFlo1 # 55189 petMar1 # 75566 oryLat1 # 95715 galGal3 # 107885 mm9 # 110835 hg18 # load the resulting file ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/frames time nice -n +19 hgLoadMafFrames petMar1 multiz6wayFrames \ multiz6way.mafFrames.gz # real 0m7.652s # enable the trackDb entries: # frames multiz6wayFrames # irows on ############################################################################# # phastCons 6-way (DONE - 2008-04-28 - Hiram) # split 6way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/petMar1/bed/multiz6way/msa.split cd /cluster/data/petMar1/bed/multiz6way/msa.split mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/petMar1/bed/multiz6way/anno/maf set WINDOWS = /san/sanvol1/scratch/petMar1/multiz6way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /scratch/data/petMar1/petMar1.2bit /scratch/tmp/petMar1.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \ -M /scratch/tmp/petMar1.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/petMar1.$c.fa popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 31488 of 31488 jobs # CPU time in finished jobs: 2289s 38.15m 0.64h 0.03d 0.000 y # IO & Wait Time: 82557s 1375.95m 22.93h 0.96d 0.003 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 40s 0.67m 0.01h 0.00d # Submission to last job: 3052s 50.87m 0.85h 0.04d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/petMar1/bed/multiz6way/cons/run.cons cd /cluster/data/petMar1/bed/multiz6way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/petMar1/bed/multiz6way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/petMar1/multiz6way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/petMar1/multiz6way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/petMar1/bed/multiz6way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \ ../../mm9.30way.mod \ --prune-all-but=petMar1,hg18,galGal3,mm9,oryLat1,braFlo1 \ > all.mod # Actually, this doesn't work because petMar1 and braFlo1 are not # in that business. So, manually place them in the TREE: line in all.mod, # from the manufactured tree above, and the numbers from mm9.30way.mod: # TREE: ((petMar1:1.6,(oryLat1:1.077873,(galGal3:0.488824,(mm9:0.325818,hg18:0.136019):0.470757):0.325954):0.4):0.4,braFlo1:2.0); cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 10471 of 10471 jobs # CPU time in finished jobs: 991s 16.52m 0.28h 0.01d 0.000 y # IO & Wait Time: 68810s 1146.83m 19.11h 0.80d 0.002 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 4767s 79.45m 1.32h 0.06d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all find ./bed -type f -name "Contig*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 3 minutes cp -p mostConserved.bed /cluster/data/petMar1/bed/multiz6way/cons/all # load into database ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/cons/all time nice -n +19 hgLoadBed petMar1 phastConsElements6way mostConserved.bed # Loaded 76016 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits petMar1 phastConsElements6way # 23909522 bases of 831696438 (2.875%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all mkdir -p phastCons6wayScores for D in `ls -1d pp/file* | sort -t_ -k2n` do F=${D/pp\/} out=phastCons6wayScores/${F}.data.gz echo "${D} > ${F}.data.gz" ls -S ${D}/*.pp | xargs cat | gzip > ${out} done # copy the phastCons6wayScores to: # /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all ls -1 phastCons6wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \ | wigEncode stdin phastCons6way.wig phastCons6way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/petMar1/bed/multiz6way/cons/all # real 0m1.729s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/cons/all ln -s `pwd`/phastCons6way.wib /gbdb/petMar1/multiz6way/phastCons6way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/petMar1/multiz6way petMar1 \ phastCons6way phastCons6way.wig # real 0m2.068s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=petMar1 phastCons6way > histogram.data 2>&1 # real 5m0.608s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Lamprey PetMar1 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons6way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-04-29 - Hiram) # Let's see if the downloads will work ssh hgwdev cd /cluster/data/petMar1 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/petMar1.fa.out . gunzip bed/simpleRepeat/trfMask.bed.gz time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev petMar1 > jkStuff/downloads.log 2>&1 # real 7m13.080s # failed making upstream sequences: # featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. # the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-04-29 - Hiram) ssh hgwdev /cluster/data/petMar1 /cluster/bin/scripts/makePushQSql.pl petMar1 > jkStuff/pushQ.sql # output warnings: # petMar1 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # gbWarn # multiz6wayFrames # phastCons6way scp -p jkStuff/pushQ.sql hgwbeta:/tmp/petMar1.sql ssh hgwbeta cd /tmp hgsql qapushq < petMar1.sql ############################################################################# # Create 6-way downloads (DONE - 2008-05-01 - Hiram) ssh hgwdev mkdir -p /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way cd /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way # if these haven't been copied here from before: cp -p \ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/phastCons6wayScores/* . ln -s ../../cons/all/all.mod ./6way.mod cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt . # edit that README.txt to be correct for this 6-way alignment cd .. mkdir multiz6way cd multiz6way cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt . # edit that README.txt to be correct for this 6-way alignment ssh kkstore04 cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way ln -s ../../petMar1.6-way.nh ./6way.nh time nice -n +19 gzip -c ../../anno/petMar1.anno.6way.maf \ > petMar1.6way.maf.gz # real 0m49.861s ssh hgwdev cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way # creating upstream files from xenoRefGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=petMar1 GENE=xenoRefGene NWAY=multiz6way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh # real 2m26.254s # -rw-rw-r-- 1 61861 May 1 16:10 upstream1000.maf.gz # -rw-rw-r-- 1 104203 May 1 16:11 upstream2000.maf.gz # -rw-rw-r-- 1 195382 May 1 16:12 upstream5000.maf.gz # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then xenoRefGene names, e.g.: # 206 oryLat1 # 206 mm9 # 206 hg18 # 206 galGal3 # 206 braFlo1 # 8 NM_054045 # 7 NM_013550 # 5 NM_069752 ssh kkstore04 cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way md5sum *.maf.gz > md5sum.txt cd ../phastCons6way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way mkdir /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way rm /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way/mkUpstream.sh cd ../phastCons6way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram ssh kkstore01 # not too important since everything moved to hive screen # use screen to control this job cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26 cat fb.oryLat2.chainPetMar1Link.txt # 41568923 bases of 700386597 (5.935%) in intersection # And, for the swap: mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap cd /cluster/data/petMar1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 & # real 52m27.942s cat fb.petMar1.chainOryLat2Link.txt # 39181422 bases of 831696438 (4.711%) in intersection ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: petMar1.upstreamGeneTbl = xenoRefGene petMar1.upstreamMaf = multiz6way /hive/data/genomes/petMar1/bed/multiz6way/species.list ############################################################################ # QUALITY TRACK (DONE - 2008-11-25 - Hiram) cd /hive/data/genomes/petMar1/downloads qaToQac contigs.fa.quals.gz assembly.qac qacAgpLift ../petMar1.agp assembly.qac scaffolds.qac mkdir /hive/data/genomes/petMar1/bed/qual cd /hive/data/genomes/petMar1/bed/qual qacToWig -fixed ../../downloads/scaffolds.qac stdout \ | wigEncode stdin qual.wig qual.wib ln -s `pwd`/qual.wib /gbdb/petMar1/wib hgLoadWiggle -pathPrefix=/gbdb/petMar1/wib petMar1 quality qual.wig ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # construct liftOver to petMar2 (DONE - 2012-10-16 - Hiram) screen # manage this longish running job in a screen mkdir /hive/data/genomes/petMar1/bed/blat.petMar2.2012-10-16 cd /hive/data/genomes/petMar1/bed/blat.petMar2.2012-10-16 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/petMar1/petMar1.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev petMar1 petMar2 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/petMar1/petMar1.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev petMar1 petMar2 > do.log 2>&1 # real 87m15.687s # verify this file exists: # /gbdb/petMar1/liftOver/petMar1ToPetMar2.over.chain.gz # and try out the conversion on genome-test from petMar1 to petMar2 ############################################################################