# for emacs: -*- mode: sh; -*-

# Bos taurus -- Baylor Release 4.0 (Oct 4 2007)

# "$Id: bosTau4.txt,v 1.23 2010/04/01 17:33:10 hiram Exp $"

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    mkdir /cluster/store3/bosTau4
    ln -s /cluster/store3/bosTau4 /cluster/data/bosTau4
    mkdir /cluster/data/bosTau4/baylor
    cd /cluster/data/bosTau4/baylor
    for F in README.Btau20070913.txt Contigs/*
    do
        wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/${F}"
    done
    mkdir chroms
    cd chroms
    wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/LinearScaffolds/*"
    # Looking up chrM in Entrez nucleotide search,
    # from bosTau3 note: NC_006853 GI:60101824
    # search for Bos taurus mitochondrion complete genome
    # Note, their Contigs files are from their 2006 release.
    # Their chrom files are new to this release.

    # fixup the Chr case names in the agp file:
    sed -e "s/^Chr/chr/" Btau20070913.All.ac.agp > bosTau4.agp

    # fixup the contig names in the contigs.fa and qual files, utilize
    # the following perl scripts:
    cat << '_EOF_' > fixContigNames.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);
if (1 != $argc) {
    print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n",
        " | gzip > bosTau4.contigs.fa.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        my @a = split('\|', $line);
        $a[3] =~ s/\.[0-9]+//;
        print ">$a[3]\n";
    } else {
        print "$line";
    }
}
close (FH);
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > fixQuals.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);
if (1 != $argc) {
    print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n",
        " | gzip > bosTau4.qual.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        $line =~ s/\.[0-9]+.*//;
    }
    print "$line";
}
close (FH);
'_EOF_'
    # << happy emacs

    # There are extra records in the qual and contigs fa files.
    # Only the sequences in the agp file should be used:
    grep "^chr" bosTau4.agp | egrep -v "clone|fragment" | cut -f6 \
        | sort > agp.names
    # run those scripts, extract only the used records, and lift the
    # qual scores via the AGP file to a qac file.
    chmod +x fixContigNames.pl fixQuals.pl
    ./fixContigNames.pl Btau20060815.contigs.fa.gz \
        | faSomeRecords stdin agp.names stdout | gzip > bosTau4.contigs.fa.gz
    ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \
        | faSomeRecords stdin agp.names stdout \
        | qaToQac stdin stdout \
        | qacAgpLift bosTau4.agp stdin bosTau4.qual.qac

#############################################################################
# running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    cd /cluster/data/bosTau4
    cat << '_EOF_' > bosTau4.config.ra
db bosTau4
clade mammal
scientificName Bos taurus
assemblyDate Oct.
2007 assemblyLabel Baylor Release 4.0 orderKey 236 dbDbSpeciesDir cow mitoAcc 60101824 agpFiles /cluster/data/bosTau4/baylor/bosTau4.agp fastaFiles /cluster/data/bosTau4/baylor/bosTau4.contigs.fa.gz qualFiles /cluster/data/bosTau4/baylor/bosTau4.qual.qac commonName Cow '_EOF_' # << happy emacs makeGenomeDb.pl bosTau4.config.ra > makeGenomeDb.log 2>&1 ######################################################################### # REPEATMASKER (DONE - 2008-03-07 - Hiram) ssh kkstore06 screen # use a screen to manage this job mkdir /cluster/data/bosTau4/bed/repeatMasker cd /cluster/data/bosTau4/bed/repeatMasker # # This was going too slow on memk, on the order of 8 days. # chilled that memk batch, then switched it to finish on pk doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \ -bigClusterHub=memk bosTau4 > do.log 2>&1 & # continuing doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \ -continue=cat -bigClusterHub=pk bosTau4 > cat.log 2>&1 & time nice -n +19 featureBits bosTau4 rmsk > fb.bosTau4.rmsk.txt 2>&1 & # 1280927549 bases of 2731830700 (46.889%) in intersection # RepeatMasker and lib version from do.log: # RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $ # Jan 11 2008 (open-3-1-9) version of RepeatMasker # CC RELEASE 20071204; # Compare coverage to previous assembly: featureBits bosTau3 rmsk # 1200525422 bases of 2731807384 (43.946%) in intersection featureBits bosTau2 rmsk # 1291561904 bases of 2812203870 (45.927%) in intersection ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram) ssh kkstore06 screen # use a screen to manage this job mkdir /cluster/data/bosTau4/bed/simpleRepeat cd /cluster/data/bosTau4/bed/simpleRepeat # doSimpleRepeat.pl -buildDir=/cluster/data/bosTau4/bed/simpleRepeat \ bosTau4 > do.log 2>&1 & # after RM run is done, add this mask: cd /cluster/data/bosTau4 twoBitMask bosTau4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4.2bit twoBitToFa bosTau4.2bit stdout | faSize stdin # 2917974530 bases (186161294 N's 2731813236 real # 1450855409 upper 1280957827 lower) in 11900 sequences in 1 files # %43.90 masked total, %46.89 masked real twoBitToFa bosTau4.rmsk.2bit stdout | faSize stdin # 2917974530 bases (186161294 N's 2731813236 real # 1451369861 upper 1280443375 lower) in 11900 sequences in 1 files # %43.88 masked total, %46.87 masked real ######################################################################### # Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram) # use same repMatch value as bosTau2 ssh kkstore06 cd /cluster/data/bosTau3 blat bosTau4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=bosTau4.11.ooc -repMatch=1005 ssh kkr1u00 mkdir /iscratch/i/bosTau4 cd /iscratch/i/bosTau4 cp -p /cluster/data/bosTau4/bosTau4.2bit . for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4/ done ######################################################################### # Starting Genbank ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank # edit etc/genbank.conf to add the following entry: # bosTau4 (B. 
taurus) 31 chromosomes plus 11869 scaffolds bosTau4.serverGenome = /cluster/data/bosTau4/bosTau4.2bit bosTau4.clusterGenome = /iscratch/i/bosTau4/bosTau4.2bit bosTau4.ooc = /cluster/data/bosTau4/bosTau4.11.ooc bosTau4.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} bosTau4.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} bosTau4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} bosTau4.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} bosTau4.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} bosTau4.lift = no bosTau4.downloadDir = bosTau4 bosTau4.perChromTables = no bosTau4.mgc = yes cvs ci -m "Added bosTau4." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # This is a call to a script that will push our jobs out to the cluster # since it's a big job. time nice -n +19 bin/gbAlignStep -initial bosTau4 & # logFile: var/build/logs/2008.03.10-14:14:43.bosTau4.initalign.log # real 567m7.431s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau4 # logFile: var/dbload/hgwdev/logs/2008.03.11-08:48:17.dbload.log # real 34m55.565s # enable daily alignment and update of hgwdev (DONE - 2008-03-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add bosTau4 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added bosTau4." etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ # DONE - 2008-03-11 - Hiram # Reset default position to an area of chr3 with a number of genes hgsql -e \ 'update dbDb set defaultPos="chr3:21083267-21232422" where name="bosTau4";' \ hgcentraltest # And there was a mistake in the description date entry hgsql -e \ 'update dbDb set description="Oct. 2007" where name="bosTau4";' \ hgcentraltest ######################################################################### ## genscan run (DONE - 2008-03-11 - Hiram) ## create hard masked sequence ssh kkstore06 cd /cluster/data/bosTau4 mkdir hardMasked for C in `cut -f1 chrom.sizes` do echo "hardMasked/${C}.hard.fa" twoBitToFa -seq=${C} bosTau4.2bit stdout \ | maskOutFa stdin hard hardMasked/${C}.hard.fa ls -ld "hardMasked/${C}.hard.fa" done # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them. This drives genscan nuts echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt # the lowest three are: egrep -v "^#|^total" faCount.hard.txt \ | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3 # chrUn.004.9957 0 # chrUn.004.9961 0 # chrUn.004.9996 0 # There are a whole bunch of these, and many with just a few bases. # Actually, before removing these for genscan, run the cpgIsland # business first since it can work on them all. # So, remove any with less than 100 bases of sequence egrep -v "^#|^total" faCount.hard.txt \ | awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' \ | xargs rm # now get them over to a kluster location mkdir /san/sanvol1/scratch/bosTau4/hardChunks cd /san/sanvol1/scratch/bosTau4/hardChunks # creating 4,000,000 sized chunks, the chroms stay together as # single pieces. The contigs get grouped together into 4,000,000 # sized fasta files. 
You don't want to break these things up # because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. echo /cluster/data/bosTau4/hardMasked/*.hard.fa | xargs cat \ | faSplit about stdin 4000000 c_ ssh hgwdev mkdir /cluster/data/bosTau4/bed/genscan cd /cluster/data/bosTau4/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh memk cd /cluster/data/bosTau4/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. ls -1Sr /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa > genome.list # Create run-time script to operate gsBig in a cluster safe manner cat << '_EOF_' > runGsBig #!/bin/csh -fe set runDir = `pwd` set srcDir = $1 set inFile = $2 set fileRoot = $inFile:r mkdir /scratch/tmp/$fileRoot cp -p $srcDir/$inFile /scratch/tmp/$fileRoot pushd /scratch/tmp/$fileRoot /cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 popd cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt rm -fr /scratch/tmp/$fileRoot '_EOF_' # << happy emacs chmod +x runGsBig cat << '_EOF_' > template #LOOP runGsBig /san/sanvol1/scratch/bosTau4/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 genome.list single template jobList para create jobList para try, check, push, check, ... 
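    # As noted above, a quick faCount pass over the chunks can confirm that no
    # chunk still carries a sequence reduced to all N's by the hard-masking
    # (the condition that makes genscan blow up).  A minimal check sketch,
    # assuming the chunk files built above:
    cd /san/sanvol1/scratch/bosTau4/hardChunks
    faCount c_*.fa | egrep -v "^#|^total" \
        | awk '{if ($2 - $7 < 100) printf "nearly all N: %s (%d real bases)\n", $1, $2 - $7}'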
XXX - running 2008-03-11 11:20 # Completed: 97 of 97 jobs # CPU time in finished jobs: 93952s 1565.87m 26.10h 1.09d 0.003 y # IO & Wait Time: 372s 6.19m 0.10h 0.00d 0.000 y # Average job time: 972s 16.21m 0.27h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 4870s 81.17m 1.35h 0.06d # Submission to last job: 7902s 131.70m 2.19h 0.09d # cat and lift the results into single files ssh kkstore06 cd /cluster/data/bosTau4/bed/genscan sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/bosTau4/bed/genscan ldHgGene bosTau4 -gtf genscan genscan.gtf # Read 49598 transcripts in 346887 lines in 1 files # 49598 groups 5059 seqs 1 sources 1 feature types # 49598 gene predictions hgPepPred bosTau4 generic genscanPep genscan.pep hgLoadBed bosTau4 genscanSubopt genscanSubopt.bed # Loaded 528092 elements of size 6 # let's check the numbers time nice -n +19 featureBits bosTau4 genscan # 58224594 bases of 2731830700 (2.131%) in intersection time nice -n +19 featureBits bosTau3 genscan # 59251085 bases of 2731807384 (2.169%) in intersection ######################################################################### # CPGISLANDS (DONE - 2008-03-11 - Hiram) ssh hgwdev mkdir /cluster/data/bosTau4/bed/cpgIsland cd /cluster/data/bosTau4/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # There was a problem in here, in both cpg.c and cpg_lh.c: # warning: conflicting types for built-in function 'malloc' # comment out the lines where malloc is declared to get this to build # gcc readseq.c cpg_lh.c -o cpglh.exe cd ../.. ln -s hg3rdParty/cpgIslands/cpglh.exe . # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
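    # A hedged sketch of the malloc workaround mentioned above, assuming the
    # offending declarations look like "char *malloc();"; it simply comments
    # those lines out so gcc's builtin prototype is used, then rebuilds:
    cd hg3rdParty/cpgIslands
    sed -i.bak '/char[[:space:]]*\*[[:space:]]*malloc[[:space:]]*(/ s|^|// |' cpg.c cpg_lh.c
    make
    cd ../..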
mkdir results echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F do FA=${F/*\/} C=${FA/.hard.fa/} echo "./cpglh.exe ${FA} > results/${C}.cpg" nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg done > cpglh.out 2>&1 & # about 5 minutes # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << happy emacs catDir results \ | awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/bosTau4/bed/cpgIsland hgLoadBed bosTau4 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 37595 elements of size 10 featureBits bosTau4 cpgIslandExt # 24202824 bases of 2731830700 (0.886%) in intersection featureBits bosTau3 cpgIslandExt # 24374280 bases of 2731807384 (0.892%) in intersection ######################################################################### # BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram) ssh kkstore02 screen # use a screen to manage this multi-day job mkdir /cluster/data/hg18/bed/blastzBosTau4.2008-03-11 cd /cluster/data/hg18/bed/ ln -s blastzBosTau4.2008-03-11 blastz.bosTau4 cd blastzBosTau4.2008-03-11 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau4 SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=300 SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzBosTau4.2008-03-11 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # real 611m17.901s cat fb.hg18.chainBosTau4Link.txt # 1345348636 bases of 2881515245 (46.689%) in intersection mkdir /cluster/data/bosTau4/bed/blastz.hg18.swap cd /cluster/data/bosTau4/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastzBosTau4.2008-03-11/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > do.log 2>&1 # broken down during netSynteny.csh due to too many open files on # a chainSplit # To get them split, set up a job on memk to do the split # via chainFilter: chainFilter -t=${T} bigFile.chain > ${T}.chain # Where the targets T are from the list: # grep "^chain " bigFile.chain | awk '{print $3}' | sort -u # see full example below in Platypus Blastz ########################################################################### # Blastz Platypus ornAna1 (DONE - 2008-03-11,14 - Hiram) ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11 cd /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11 cat << '_EOF_' > DEF # Cow vs. 
platypus BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_M=50 # TARGET: Cow bosTau4 SEQ1_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit SEQ1_LEN=/cluster/data/bosTau4/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=300 SEQ1_CHUNK=20000000 SEQ1_LAP=0 # QUERY: Platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 \ -chainLinearGap=loose -bigClusterHub=kk > do.log 2>&1 & # real 1233m16.397s # the pk kluster became free as this job had a long time to go. # Unsure how long it had to go because the batch became confused # and couldn't calculate eta, it had waiting=0 jobs ? So, chill # out the batch, allow the running jobs to complete, # go to pk and get the jobs completed there. # There was a difficulty with the para.results file. It lost track # of 1158 jobs, but they were completed and results exist. # then continuing: time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -continue=cat -chainLinearGap=loose -bigClusterHub=pk > cat.log 2>&1 & # real 66m36.465s cat fb.bosTau4.chainOrnAna1Link.txt # 201799073 bases of 2731830700 (7.387%) in intersection mkdir /cluster/data/ornAna1/bed/blastz.bosTau4.swap cd /cluster/data/ornAna1/bed/blastz.bosTau4.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11/DEF \ -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \ -swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 & # real 106m17.385s cat fb.ornAna1.chainBosTau4Link.txt # 187868681 bases of 1842236818 (10.198%) in intersection # need to split up the chain file to run the reciprocal net # there are too many chroms to work with chain split # do it with chainFilter as a kluster job on memk ssh memk mkdir /scratch/tmp/ornAna1 cd /scratch/tmp/ornAna1 for R in 0 1 2 3 4 5 6 7 do echo $R ssh mkr0u${R} mkdir /scratch/tmp/ornAna1 ssh mkr0u${R} cp -p /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz /scratch/tmp/ornAna1 ssh mkr0u${R} gunzip /scratch/tmp/ornAna1/bosTau4.ornAna1.all.chain.gz done zcat /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz | grep "^chain " | awk '{print $3}' | sort -u > chain.list cat << '_EOF_' > splitOne.csh #!/bin/csh -fe set tmpDir = /scratch/tmp/ornAna1 set T=$1 chainFilter -t=${T} $tmpDir/bosTau4.ornAna1.all.chain > $tmpDir/${T}.chain '_EOF_' # << happy emacs cat << '_EOF_' > template #LOOP ./splitOne.csh $(file1) '_EOF_' # << happy emacs gensub2 chain.list single template jobList para create jobList para try ... check ... push ... time ... 
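    # For a smaller all.chain file the same per-target split can be done
    # serially instead of as a kluster job -- a sketch, run from the
    # axtChain directory:
    cd /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain
    zcat bosTau4.ornAna1.all.chain.gz > all.chain
    mkdir -p chain
    for T in `grep "^chain " all.chain | awk '{print $3}' | sort -u`
    do
        chainFilter -t=${T} all.chain > chain/${T}.chain
    done
    rm -f all.chain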
# fetch the results from tmp directories for R in 0 1 2 3 4 5 6 7 do ssh mkr0u${R} "(cd /scratch/tmp/ornAna1; cp -p -u chr*.chain /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/chain)" done ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-03-12 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau4", "blat2", "17778", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("bosTau4", "blat2", "17779", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## 5-Way Multiz (DONE - 2008-03-18 - Hiram) ## # the all.chain.gz files were split up via kluster jobs on memk # in order to get mafSynNet files. Example above in ornAna1 blastz ssh hgwdev mkdir /cluster/data/bosTau4/bed/multiz5way cd /cluster/data/bosTau4/bed/multiz5way # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get bosTau4 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram # select the 9 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Cow_bosTau3 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ | sed -e "s/bosTau3/bosTau4/g" > 5-way.fullNames.nh # looks something like this: (((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763, (Dog_canFam2:0.156637,Cow_bosTau4:0.163945):0.031326):0.332197, Platypus_ornAna1:0.488110); # rearrange to get Cow at the top: # this leaves us with: cat << '_EOF_' > bosTau4.5-way.nh (((Cow_bosTau4:0.163945,Dog_canFam2:0.156637):0.031326, (Human_hg18:0.126901,Mouse_mm9:0.325818):0.019763):0.332197, Platypus_ornAna1:0.488110); '_EOF_' # << happy emacs # create a species list from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' bosTau4.5-way.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 5 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' bosTau4.5-way.nh \ | sed -e "s/ / /g"` > tree.5.nh # that looks like, as a single line: # (((bosTau4 canFam2) (hg18 mm9)) ornAna1) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/bosTau4/bed/multiz5way foreach db (`grep -v bosTau4 species.list`) set bdir = /cluster/data/bosTau4/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says: ./listMafs.csh # canFam2 mafSynNet # hg18 mafSynNet # mm9 mafSynNet # ornAna1 mafSynNet /cluster/bin/phast/all_dists bosTau4.5-way.nh > 5way.distances.txt grep -i bostau 5way.distances.txt | sort -k3,3n # Cow_bosTau4 Dog_canFam2 0.320582 # Cow_bosTau4 Human_hg18 0.341935 # Cow_bosTau4 Mouse_mm9 0.540852 # Cow_bosTau4 Platypus_ornAna1 1.015578 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. 
# And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCalJac1Link chain linearGap # distance on CalJac1 on other minScore # 1 0.320582 Dog_canFam2 (% 82.846) (% 78.351) 3000 medium # 2 0.341935 Human_hg18 (% 74.810) (% 77.648) 3000 medium # 3 0.540852 Mouse_mm9 (% 76.925) (% 74.694) 3000 medium # 4 1.015578 Platypus_ornAna1 (% 77.296) (% 76.308) 3000 medium # create a coherent set of all the mafs involved in this run mkdir mafLinks cd mafLinks ln -s ../../blastz.canFam2/mafSynNet ./canFam2 ln -s ../../blastz.hg18/mafSynNet ./hg18 ln -s ../../blastz.mm9/mafSynNet ./mm9 ln -s ../../blastz.ornAna1/mafSynNet ./ornAna1 # check data size: du -hscL * # 982M canFam2 # 940M hg18 # 483M mm9 # 79M ornAna1 # 2.5G total # copy net mafs to cluster-friendly storage ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way/mafLinks mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/mafs rsync -a --progress --copy-links ./ \ /san/sanvol1/scratch/bosTau4/multiz5way/mafs/ # create a run-time list of contigs to operate on, not all contigs # exist in all alignments, but we want all contig names used in any # alignment: cd /san/sanvol1/scratch/bosTau4/multiz5way/mafs for D in * do cd "${D}" find . -type f cd .. done | sed -e "s#^./##; s#.maf##" | sort -u > /tmp/5-way.contig.list wc -l /tmp/5-way.contig.list # 2321 /tmp/5-way.contig.list # ready for the multiz run ssh pk mkdir /cluster/data/bosTau4/bed/multiz5way/splitRun cd /cluster/data/bosTau4/bed/multiz5way/splitRun scp -p kkstore06:/tmp/5-way.contig.list . mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # set the db and pairs directories here cat << '_EOF_' > autoMultiz.csh #!/bin/csh -ef set db = bosTau4 set c = $1 set result = $2 set resultDir = $result:h set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/multiz5way/mafs rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp ../../tree.5.nh ../../species.list $tmp pushd $tmp foreach s (`grep -v $db species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/bosTau4/bed/multiz5way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs gensub2 ../5-way.contig.list single template jobList para create jobList para try ... check ... push ... 
etc # Completed: 2321 of 2321 jobs # CPU time in finished jobs: 35680s 594.67m 9.91h 0.41d 0.001 y # IO & Wait Time: 9398s 156.63m 2.61h 0.11d 0.000 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest finished job: 1982s 33.03m 0.55h 0.02d # Submission to last job: 3983s 66.38m 1.11h 0.05d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way grep "^##maf version" splitRun/maf/chr1.maf \ | sort -u > bosTau4.5way.maf for F in `find ./splitRun/maf -type f -depth` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> bosTau4.5way.maf for F in `find ./splitRun/maf -type f -depth` do grep -v -h "^#" "${F}" done >> bosTau4.5way.maf grep "^##eof maf" splitRun/maf/chr1.maf \ | sort -u >> bosTau4.5way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/bosTau4/multiz5way/maf ln -s /cluster/data/bosTau4/bed/multiz5way/*.maf \ /gbdb/bosTau4/multiz5way/maf/multiz5way.maf # this generates an immense multiz5way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/bosTau4/multiz5way/maf bosTau4 multiz5way # real 3m31.593s # Loaded 5392296 mafs in 1 files from /gbdb/bosTau4/multiz5way/maf # load summary table time nice -n +19 cat /gbdb/bosTau4/multiz5way/maf/*.maf \ | hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary stdin # real 3m41.663s # Created 779088 summary blocks from 10697864 components and 5164426 mafs from stdin # Gap Annotation # prepare bed files with gap info ssh kkstore06 mkdir /cluster/data/bosTau4/bed/multiz5way/anno cd /cluster/data/bosTau4/bed/multiz5way/anno mkdir maf run # these actually already all exist from previous multiple alignments # remove the echo in front of the twoBitInfo to actually make it work for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! 
-f ${CDIR}/${DB}.N.bed ]; then
    echo "creating ${DB}.N.bed"
    echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
    ls -og ${CDIR}/${DB}.N.bed
fi
done

    cd run
    rm -f nBeds sizes
    for DB in `grep -v bosTau4 ../../species.list`
    do
        echo "${DB} "
        ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
        echo ${DB}.bed >> nBeds
        ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
        echo ${DB}.len >> sizes
    done

    ssh memk
    # temporarily copy the bosTau4.5way.maf file onto the memk
    # nodes /scratch/data/bosTau4/maf/ directory
    for R in 0 1 2 3 4 5 6 7
    do
        ssh mkr0u${R} rsync -a --progress \
            /cluster/data/bosTau4/bed/multiz5way/bosTau4.5way.maf \
            /scratch/data/bosTau4/maf/
    done

    mkdir /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
    # need to split up the single maf file into individual
    # per-scaffold maf files to run annotation on
    cd /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
    # create bed files to list approximately 1553 scaffolds in
    # a single list, approximately 33 lists
    cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
# the input specification was garbled in the original doc; a chrom.sizes
# file (sequence name and size per line) is assumed here
open (FH,"/cluster/data/bosTau4/chrom.sizes") or die "can not read chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    if ( (($i + 1) % 1553) == 0 ) {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);
'_EOF_'
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl

    # now, run a mafsInRegion on each one of those lists
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/bosTau4/bed/multiz5way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/bosTau4/$resultDir
pushd /scratch/tmp/bosTau4/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
    /scratch/data/bosTau4/maf/bosTau4.5way.maf
popd
rsync -q -a /scratch/tmp/bosTau4/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/bosTau4/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/bosTau4
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
# Completed: 8 of 8 jobs
# CPU time in finished jobs:    1137s    18.96m  0.32h  0.01d  0.000 y
# IO & Wait Time:                552s     9.19m  0.15h  0.01d  0.000 y
# Average job time:              211s     3.52m  0.06h  0.00d
# Longest finished job:          599s     9.98m  0.17h  0.01d
# Submission to last job:        599s     9.98m  0.17h  0.01d

    cd /cluster/data/bosTau4/bed/multiz5way/anno/run
    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
    nice mafAddIRows -nBeds=nBeds stdin /scratch/data/bosTau4/bosTau4.2bit $result
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../splitMaf -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
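    # A small pre-flight sketch for the doAnno jobs: confirm every N.bed and
    # chrom.sizes link listed in nBeds/sizes resolves to a non-empty file
    # (paths assumed from the run directory set up above):
    cd /cluster/data/bosTau4/bed/multiz5way/anno/run
    for F in `cat nBeds sizes`
    do
        if [ ! -s "${F}" ]; then echo "missing or empty: ${F}"; fi
    done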
# Completed: 2320 of 2320 jobs # CPU time in finished jobs: 8405s 140.09m 2.33h 0.10d 0.000 y # IO & Wait Time: 6934s 115.56m 1.93h 0.08d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 369s 6.15m 0.10h 0.00d # Submission to last job: 834s 13.90m 0.23h 0.01d # put the results back together into a single file ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way/anno grep "^##maf version" maf/file_0/chr1.maf \ | sort -u > bosTau4.anno.5way.maf find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> bosTau4.anno.5way.maf echo "##eof maf" >> bosTau4.anno.5way.maf ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/anno mkdir -p /gbdb/bosTau4/multiz5way/anno ln -s `pwd`/bosTau4.anno.5way.maf \ /gbdb/bosTau4/multiz5way/anno/multiz5way.maf # by loading this into the table multiz5way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/bosTau4/multiz5way/anno \ bosTau4 multiz5way # Loaded 6711605 mafs in 1 files from /gbdb/bosTau4/multiz5way/anno # real 4m54.025s # normally filter this for chrom size > 1,000,000 and only load # those chroms. But this is a scaffold assembly, load everything: time nice -n +19 hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary \ /gbdb/bosTau4/multiz5way/anno/multiz5way.maf # Created 779088 summary blocks from 10697864 components # and 6412983 mafs from /gbdb/bosTau4/multiz5way/anno/multiz5way.maf # real 4m6.782s # by loading this into the table multiz5waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz5way*.tab files in this /scratch/tmp directory rm multiz5way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/bosTau4/multiz5way/maf/bosTau4.5way.maf rmdir /gbdb/bosTau4/multiz5way/maf ########################################################################### ## Annotate 5-way multiple alignment with gene annotations ## (DONE - 2008-03-18 - Hiram) # Gene frames ## given previous survey done for 9-way alignment on Marmoset ## and the appearance of new ensGene tables on everything # use knownGene for hg18, mm9 # use ensGene for canFam2, ornAna1 # and refGene for bosTau4 ssh hgwdev mkdir /cluster/data/bosTau4/bed/multiz5way/frames cd /cluster/data/bosTau4/bed/multiz5way/frames mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in canFam2 ornAna1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # refGene for DB in bosTau4 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 861210 Mar 18 15:40 bosTau4.gp.gz # -rw-rw-r-- 1 1865308 Mar 18 15:40 canFam2.gp.gz # -rw-rw-r-- 1 2008806 Mar 18 15:39 hg18.gp.gz # -rw-rw-r-- 1 1965274 Mar 18 
15:39 mm9.gp.gz # -rw-rw-r-- 1 1347532 Mar 18 15:40 ornAna1.gp.gz ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way/frames # anything to annotate is in a pair, e.g.: bosTau4 genes/bosTau4.gp.gz time (cat ../anno/bosTau4.anno.5way.maf | nice -n +19 genePredToMafFrames bosTau4 stdin stdout bosTau4 genes/bosTau4.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 77526 bosTau4 # 110768 ornAna1 # 221524 mm9 # 230010 hg18 # 243396 canFam2 # load the resulting file ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames bosTau4 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m21.968s # enable the trackDb entries: # frames multiz5wayFrames # irows on ############################################################################# # phastCons 5-way (DONE - 2008-03-19 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/bosTau4/bed/multiz5way/msa.split cd /cluster/data/bosTau4/bed/multiz5way/msa.split mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/bosTau4/bed/multiz5way/anno/maf set WINDOWS = /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /scratch/data/bosTau4/bosTau4.2bit /scratch/tmp/bosTau4.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set TMP = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \ -M /scratch/tmp/bosTau4.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/bosTau4.$c.fa rm -f $TMP popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 2320 of 2320 jobs # CPU time in finished jobs: 1710s 28.50m 0.47h 0.02d 0.000 y # IO & Wait Time: 6951s 115.85m 1.93h 0.08d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 128s 2.13m 0.04h 0.00d # Submission to last job: 1048s 17.47m 0.29h 0.01d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/bosTau4/bed/multiz5way/cons/run.cons cd /cluster/data/bosTau4/bed/multiz5way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/bosTau4/bed/multiz5way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/bosTau4/multiz5way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it pushd /san/sanvol1/scratch/bosTau4/multiz5way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \ > /cluster/data/bosTau4/bed/multiz5way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \ --prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \ | sed -e "s/bosTau3/bosTau4/" > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/bosTau4/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. 
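    # If the element coverage needs adjusting, a hedged tuning sketch: run
    # phastCons by hand on a single SS chunk with a few rho values and compare
    # conserved-base totals before re-running the whole batch.  The chunk file
    # name below is a hypothetical example of the msa_split output naming:
    PHASTBIN=/cluster/bin/phast.build/cornellCVS/phast/bin
    MOD=/cluster/data/bosTau4/bed/multiz5way/cons/all/all.mod
    SS=/san/sanvol1/scratch/bosTau4/multiz5way/cons/ss/file_0/chr1.1-10000000.ss
    for RHO in 0.28 0.31 0.34
    do
        $PHASTBIN/phastCons $SS $MOD --rho $RHO --expected-length 45 \
            --target-coverage 0.3 --quiet \
            --most-conserved rho$RHO.bed --score > /dev/null
        awk -v r=$RHO '{t += $3-$2} END {printf "rho %s: %d conserved bases\n", r, t}' rho$RHO.bed
    done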
# Completed: 2569 of 2569 jobs # CPU time in finished jobs: 8636s 143.93m 2.40h 0.10d 0.000 y # IO & Wait Time: 17371s 289.52m 4.83h 0.20d 0.001 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 44s 0.73m 0.01h 0.00d # Submission to last job: 1008s 16.80m 0.28h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all find ./bed -type f -name "chr*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 3 minutes cp -p mostConserved.bed /cluster/data/bosTau4/bed/multiz5way/cons/all # load into database ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/cons/all time nice -n +19 hgLoadBed bosTau4 phastConsElements5way mostConserved.bed # Loaded 1005876 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits bosTau4 phastConsElements5way # 132010504 bases of 2731830700 (4.832%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all mkdir -p phastCons5wayScores for D in `ls -1d pp/file* | sort -t_ -k2n` do TOP=`pwd` F=${D/pp\/} out=${TOP}/phastCons5wayScores/${F}.data.gz echo "${D} > ${F}.data.gz" cd ${D} find . -name "*.pp" -type f \ | sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \ | sort -t '.' -k1,1 -k3.3n \ | sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \ | gzip > ${out} cd "${TOP}" done # copy those files to the downloads area: # /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \ | wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 time nice -n +19 cp -p *.wi? /cluster/data/bosTau4/bed/multiz5way/cons/all # real 0m40.875s # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/cons/all ln -s `pwd`/phastCons5way.wib /gbdb/bosTau4/multiz5way/phastCons5way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/bosTau4/multiz5way bosTau4 \ phastCons5way phastCons5way.wig # real 1m5.667s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=bosTau4 phastCons5way > histogram.data 2>&1 # real 3m37.316s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Cow BosTau4 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons5way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # Downloads (DONE - 2008-01-11 - Hiram) # Let's see if the downloads will work ssh hgwdev /cluster/data/bosTau4 # expecting to find repeat masker .out file here: ln -s bed/RepeatMasker/bosTau4.fa.out . time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \ -workhorse=hgwdev bosTau4 > jkStuff/downloads.log 2>&1 # real 24m3.210s # failed making upstream sequences: # featureBits bosTau4 mgcGenes:upstream:1000 -fa=stdout # setpriority: Permission denied. # the 'nice' from my bash shell causes trouble inside the csh # script which uses nice. Finish off the install step manually # with the mgcGenes upstreams ... ############################################################################# # PushQ entries (DONE - 2008-01-11 - Hiram) ssh hgwdev /cluster/data/bosTau4 /cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql # output warnings: # bosTau4 does not have seq # bosTau4 does not have gbMiscDiff # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep ############################################################################# # create download files (DONE - 2008-03-19 - Hiram) ssh hgwdev cd /cluster/data/bosTau4 ln -s /cluster/data/bosTau4/bed/repeatMasker/bosTau4.fa.out . 
makeDownloads.pl bosTau4 > makeDownloads.log 2>&1 # *EDIT* the README files and ensure they are correct ############################################################################# # PushQ entries (DONE - 2008-03-19 - Hiram) ssh hgwdev /cluster/data/bosTau4 /cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql # output warnings: # hgwdev does not have /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTau* # bosTau4 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # genscanPep # looks like there should be a bosTau3 to bosTau4 liftOver run ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28) ssh kkstore06 # bash if not using bash shell already mkdir /cluster/data/bosTau4/blastDb cd /cluster/data/bosTau4 grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft grep chrUn chrom.sizes | awk '{print $1}' > chr.lst for i in `cat chr.lst`; do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout; done > temp.fa faSplit sequence temp.fa 150 blastDb/y rm temp.fa chr.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3440 mkdir -p /san/sanvol1/scratch/bosTau4/blastDb cd /cluster/data/bosTau4/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/bosTau4/blastDb done mkdir -p /cluster/data/bosTau4/bed/tblastn.hg18KG cd /cluster/data/bosTau4/bed/tblastn.hg18KG echo /san/sanvol1/scratch/bosTau4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3440 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\) # 36727/(350000/3440) = 360.973943 mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
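    # The job-count arithmetic behind the calc above, spelled out: aim for
    # about 350,000 jobs = (number of kg query files) x (number of db chunks).
    # A plain awk sketch of the same computation (it reproduces the 350880
    # jobs reported in the para time output below):
    QUERIES=`wc -l < /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl`
    DBCHUNKS=`wc -l < query.lst`
    awk -v q=$QUERIES -v d=$DBCHUNKS -v target=350000 'BEGIN {
        lines = int(q / (target / d)) + 1;          # mRNAs per split file
        files = int((q + lines - 1) / lines);       # number of kg?? files
        printf "split -l %d => %d query files x %d db chunks = %d jobs\n", lines, files, d, files * d;
    }'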
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/bosTau4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau4/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/bosTau4/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time Completed: 350880 of 350880 jobs CPU time in finished jobs: 27082816s 451380.27m 7523.00h 313.46d 0.859 y IO & Wait Time: 2334990s 38916.50m 648.61h 27.03d 0.074 y Average job time: 84s 1.40m 0.02h 0.00d Longest finished job: 578s 9.63m 0.16h 0.01d Submission to last job: 96125s 1602.08m 26.70h 1.11d ssh kkstore06 cd /cluster/data/bosTau4/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/bosTau4/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 99 of 102 jobs # Crashed: 3 jobs # CPU time in finished jobs: 113248s 1887.47m 31.46h 1.31d 0.004 y # IO & Wait Time: 86043s 1434.04m 23.90h 1.00d 0.003 y # Average job time: 2013s 33.55m 0.56h 0.02d # Longest finished job: 6139s 102.32m 1.71h 0.07d # Submission to last job: 10416s 173.60m 2.89h 0.12d # ran three crashed jobs on kolossus ssh kkstore06 cd /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau4/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/bosTau4/bed/tblastn.hg18KG hgLoadPsl bosTau4 blastHg18KG.psl # check coverage featureBits bosTau4 blastHg18KG # 40254923 bases of 2731830700 (1.474%) in intersection featureBits bosTau4 refGene:cds blastHg18KG -enrichment # refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x ssh kkstore06 rm -rf /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # Create 5-way downloads (DONE - 2008-03-28 - Hiram) ssh hgwdev mkdir -p /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way cd /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way cp -p \ /san/sanvol1/scratch/bosTau4/multiz5way/cons/all/phastCons5wayScores/* . ln -s ../../cons/all/all.mod ./5way.mod cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt . # edit that README.txt to be correct for this 5-way alignment cd .. mkdir multiz5way cd multiz5way cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt . # edit that README.txt to be correct for this 5-way alignment ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way ln -s ../../bosTau4.5-way.nh 5way.nh time gzip -c ../../anno/bosTau4.anno.5way.maf > bosTau4.5way.maf.gz # real 34m59.295s ssh hgwdev cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way # creating upstream files from refGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=bosTau4 GENE=refGene NWAY=multiz5way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh -rw-rw-r-- 1 9883443 Mar 28 13:02 upstream1000.maf.gz -rw-rw-r-- 1 17938570 Mar 28 13:06 upstream2000.maf.gz -rw-rw-r-- 1 40384656 Mar 28 13:10 upstream5000.maf.gz # # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then refGene names, e.g.: # 8806 ornAna1 # 8806 mm9 # 8806 hg18 # 8806 canFam2 # 7 NM_001077006 # 3 NM_001113231 # 3 NM_001105381 # 3 NM_001102527 # 3 NM_001102322 # ... 
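    # The same species-count sanity check, looped over all three upstream
    # sizes (each of the four other species should dominate the counts):
    for S in 1000 2000 5000
    do
        echo "upstream${S}.maf.gz:"
        zcat upstream${S}.maf.gz | awk '/^s /{print $2}' \
            | sort | uniq -c | sort -rn | head -6
    done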
ssh kkstore06 cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way md5sum *.maf.gz > md5sum.txt cd ../phastCons5way md5sum *.data.gz *.mod > md5sum.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way cd ../phastCons5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-04-21 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/bosTau4/bed/nscan/ wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.gtf wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.prot.fa wget http://mblab.wustl.edu/predictions/Cow/bosTau4/readme.html bzip2 bosTau4.* chmod a-w * # load track gtfToGenePred -genePredExt bosTau4.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt bosTau4 nscanGene stdin hgPepPred bosTau4 generic nscanPep bosTau4.prot.fa.bz2 rm *.tab # update trackDb; need a bosTau4-specific page to describe informants cow/bosTau4/nscanGene.html (copy from readme.html) cow/bosTau4/trackDb.ra # set search regex to termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)* ############################################################################# ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. 
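    # An optional extra check on the N-SCAN load above: genePredCheck verifies
    # the loaded predictions against the bosTau4 chrom sizes (a sketch, not
    # part of the original procedure):
    genePredCheck -db=bosTau4 nscanGene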
############################################################################ # SELF BLASTZ (DONE - 2008-06-27 - Hiram) # do not want the noise from the contigs ssh kkstore06 screen # use a screen to manage this multi-day job mkdir /cluster/data/bosTau4/noContigs cd /cluster/data/bosTau4/noContigs cut -f1 ../chrom.sizes | grep -v chrUn | while read C do echo "twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa" twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa done faToTwoBit chr*.fa bosTau4.noContigs.2bit twoBitToFa bosTau4.noContigs.2bit stdout | faSize stdin # 2634429662 bases (167456864 N's 2466972798 real # 1326108891 upper 1140863907 lower) in 31 sequences in 1 files # %43.31 masked total, %46.25 masked real grep -v chrUn ../chrom.sizes | ave -col=2 stdin # count 31 # total 2634429662.000000 twoBitInfo bosTau4.noContigs.2bit stdout | sort -k2nr \ > bosTau4.noContigs.chrom.sizes cp -p bosTau4.noContigs.2bit /cluster/bluearc/scratch/data/bosTau4 cp -p bosTau4.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau4 mkdir /cluster/data/bosTau4/bed/blastzSelf.2008-08-27 cd /cluster/data/bosTau4/bed ln -s blastzSelf.2008-08-27 blastzSelf cd blastzSelf.2008-08-27 cat << '_EOF_' > DEF BLASTZ_M=400 # TARGET: Cow bosTau4 SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau4 SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/bosTau4/bed/blastzSelf.2008-08-27 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy cd /cluster/data/bosTau4/bed/blastzSelf.2008-08-27 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # real 3537m49.719s # had to fix a slight problem in the make downloads step, then: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ -continue=cleanup `pwd`/DEF > cleanup.out 2>&1 & ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: bosTau4.upstreamGeneTbl = mgcGenes bosTau4.upstreamMaf = multiz5way /hive/data/genomes/bosTau4/bed/multiz5way/species.list ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
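    # Referring back to the self-lastz sequence preparation above: an
    # equivalent one-pass way to build the no-contig 2bit, assuming
    # twoBitToFa's -seqList option, avoiding the per-chromosome fasta files:
    cd /cluster/data/bosTau4/noContigs
    grep -v chrUn ../chrom.sizes | cut -f1 > noContigs.lst
    twoBitToFa -seqList=noContigs.lst ../bosTau4.2bit stdout \
        | faToTwoBit stdin bosTau4.noContigs.2bit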
############################################################################

############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

Vertebrate-wide transMap alignments were built.  Tracks are created and
loaded by a single Makefile.  This is available from:

   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

See doc/builds.txt for specific details.
############################################################################

#############################################################################
# LIFTOVER TO bosTauMd3 (DONE - 2009-12-16 - galt)
ssh hgwdev
cd /hive/data/genomes/bosTau4
ln -s bosTau4.11.ooc 11.ooc
time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
    bosTau4 bosTauMd3 >& doLiftOverToBosTauMd3.log
pushd /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/
md5sum bosTau4ToBosTauMd3.over.chain.gz >> md5sum.txt
popd

#########################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-25 - Hiram)
screen # use a screen to manage this multi-day job
mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
cat << '_EOF_' > DEF
# Pig vs. Cow
BLASTZ_M=50

# TARGET: Cow BosTau4
SEQ1_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ1_LEN=/scratch/data/bosTau4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy

time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -noLoadChainSplit -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real    2377m21.473s
# never did get this to work.
# susScr2 as target, bosTau4 as query did finish, but with immense chain pileups
# failed second time too
# XXX - re-running as 01-25 (was 01-21) Mon Jan 25 11:29:32 PST 2010
# real    3811m31.428s
# failed
# XXX - running Thu Jan 25 14:49:43 PST 2010
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
    `pwd`/DEF -verbose=2 \
    -bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
    -syntenicNet > do.log 2>&1

#########################################################################
# lastz swap Rat Rn4 (DONE - 2010-01-22 - Hiram)
# original run
cd /cluster/data/rn4/bed/blastzBosTau4.2010-01-19
cat fb.rn4.chainBosTau4Link.txt
# 649931321 bases of 2571531505 (25.274%) in intersection

mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
cd /cluster/data/bosTau4/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
    -noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -swap -syntenicNet > swap.log 2>&1 &
# real    77m18.645s
cat fb.bosTau4.chainRn4Link.txt
# 664253901 bases of 2731830700 (24.315%) in intersection

#########################################################################
# lastz swap Pig SusScr2 (DONE - 2010-03-31 - Hiram)
# the primary lastz run on susScr2
cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27
cat fb.susScr2.chainBosTau4Link.txt
# 1478903080 bases of 2231298548 (66.280%) in intersection

# and the swap
mkdir /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/DEF \
    -swap -noLoadChainSplit -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# most interesting: this failed on the first step, chainSwap,
# but the failure didn't make it stop; it continued and produced
# zero results to the end.
# Running manually:
cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap/axtChain
export sizeG=188743680
ulimit -d $sizeG
ulimit -v $sizeG
chainSwap /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain/susScr2.bosTau4.all.chain.gz stdout \
    | nice chainSort stdin stdout | nice gzip -c > bosTau4.susScr2.all.chain.gz

# it also runs out of memory in the lift over file creation:
export sizeG=188743680
ulimit -d $sizeG
ulimit -v $sizeG
netChainSubset -verbose=0 noClass.net bosTau4.susScr2.all.chain.gz stdout \
    | chainStitchId stdin stdout | gzip -c > bosTau4.susScr2.over.chain.gz

# and netChains.csh is finished manually with this added:
# memory limit 160 Gb
limit datasize 163840m
limit vmemoryuse 163840m

# manually run the loadUp.csh with this added:
# memory limit 160 Gb
limit datasize 163840m
limit vmemoryuse 163840m

# real    498m5.861s
cat fb.bosTau4.chainSusScr2Link.txt
# 1383557633 bases of 2731830700 (50.646%) in intersection

#############################################################################
# LIFTOVER TO bosTau5 (Btau_4.2) (DONE - 2010-07-19 - Chin)
ssh hgwdev
cd /hive/data/genomes/bosTau4
ln -s bosTau4.11.ooc 11.ooc
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
    bosTau4 bosTau5 > doLiftOverToBosTau5.log 2>&1 &
# real    245m42.802s
pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau4/liftOver/
md5sum bosTau4ToBosTau5.over.chain.gz >> md5sum.txt
popd

#########################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/17/2011 Fan)
# Original raw data file sent by email from Arthur Ko [a5ko@uw.edu]
mkdir /hive/data/genomes/bosTau4/bed/genomicSuperDups
cd /hive/data/genomes/bosTau4/bed/genomicSuperDups
# place the data file, bosTau4-genomicSuperDups_2008Oct02.tab.gz, here
gzip -d bosTau4-genomicSuperDups_2008Oct02.tab.gz
awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    bosTau4-genomicSuperDups_2008Oct02.tab \
    | hgLoadBed bosTau4 genomicSuperDups stdin \
        -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
# Kayla found that the strand values were "+" and "_" -- fix:
hgsql bosTau4 -e 'update genomicSuperDups set strand = "-" where strand = "_";'

#############################################################################
# LIFTOVER TO bosTau6 (DONE - 2011-10-20 - Chin)
ssh hgwdev
cd /hive/data/genomes/bosTau4
ln -s bosTau4.11.ooc 11.ooc
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
    bosTau4 bosTau6 >& doLiftOverToBosTau6.log 2>&1 &
# real    119m16.934s
pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau4/liftOver/
md5sum bosTau4ToBosTau6.over.chain.gz >> md5sum.txt
popd
#########################################################################
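
# a minimal usage sketch for the over.chain files produced by the LIFTOVER
# sections above (input.bosTau4.bed and the output file names here are
# hypothetical; the chain path is the one pushed to hgdownload above):
liftOver input.bosTau4.bed \
    /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau4/liftOver/bosTau4ToBosTau6.over.chain.gz \
    output.bosTau6.bed unMapped.bosTau6.txt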
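
# a possible follow-up check on the genomicSuperDups strand fix above
# (a sketch; same table and hgsql invocation style as used in that section):
# after the update only "+" and "-" should remain
hgsql bosTau4 -e 'select strand, count(*) from genomicSuperDups group by strand;'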