# for emacs: -*- mode: sh; -*- # The UCSC build oryLat1 had a broken chrUn sequence # It had been put together incorrectly here. # This oryLat2 build is for the same sequence as was in oryLat1 # but re-worked to fix our chrUn so we can make it correspond # to the Ensembl sequence. # http://www.ncbi.nlm.nih.gov/nuccore/144600648 # This file describes browser build for the Medaka # genome, Oryzias latipes, October 2005, MEDAKA1 from # Institute of Genetics (NIG) and the University of Tokyo # NIG: http://dolphin.lab.nig.ac.jp/medaka/ # UTGB: http://medaka.utgenome.org/o # Data release policy: http://medaka.utgenome.org/#policy # Ensembl: http://www.ensembl.org/Oryzias_latipes/index.html # # "$Id: oryLat2.txt,v 1.17 2009/11/25 21:48:41 hiram Exp $" # ########################################################################### ssh kkstore04 mkdir /cluster/store9/oryLat2 ln -s /cluster/store9/oryLat2 /cluster/data/oryLat2 mkdir /cluster/data/oryLat2/download # The following note was received from Bronwen Aken at Ensembl # describing the AGP files they used in their assembly: # The medaka assembly that we've used consists of: # - contigs as the basic unit of assembly # - scaffolds that are made up of contigs # - ultracontigs that are made up of scaffolds but can't be placed on a # chromosome # - chromsomes that are made up of scaffolds # # I have attached 4 gzipped AGP files and they correspond to the above # four levels of assembly. The first three files were given to us by # University of Tokyo. The assembly shown in these files, called # HdrR_200510, was not submitted to a public repository. The next # assembly, version 1.0, is the one with modified gap sizes. Version 1.0 # is the assembly used by UCSC and it has been submitted to Genbank/DDBJ/EMBL # # # The files are: # (i) 200510.chr.agp.txt.gz # This file specifies where scaffolds are found on a chromosome and how # big the gap is between each scaffold # # (ii) 200510.ultracontig.agp.txt.gz # This file specifies where scaffolds are found on an ultracontig and how # big the gap is between each scaffold # # (iii) 200510.scaffold.agp.txt.gz # This file specifies where contigs are found on a scaffold and how big # the gap is between each contig # # (iv) contig_fake_scaffolds.agp.txt.gz # There were 92 contigs that did not occur in any of the above agp files. # We made our own agp file for them, to put them onto their own scaffolds. # This was done because Ensembl defines 'toplevel' sequence from which we # do our analyses. (In the toplevel sequence for a genome, every contig # must occur exactly once.) # # Hope that helps, # Bronwen # # These four agp files were extracted from email and dropped into: # /cluster/data/oryLat2/download wget --timestamping \ "ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.toplevel.fa.gz" \ -O ensembl.MEDAKA1.50.dna.toplevel.fa.gz wget --timestamping \ "ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.nonchromosomal.fa.gz" \ -O ensembl.MEDAKA1.50.dna.nonchromosomal.fa.gz for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 do wget --timestamping \ "ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.chromosome.$C.fa.gz" \ -O ensembl.MEDAKA1.50.dna.chromosome.$C.fa.gz echo ensembl.MEDAKA1.50.dna.chromosome.$C.fa.gz done # Taking a look at those AGP files from Bronwen and these fasta files. # Determined that all of the DNA is actually in the single file # ensembl.MEDAKA1.50.dna.toplevel.fa.gz # And a combination of bits from the four AGP files gets a complete # AGP file to match this fasta file, # all of: 200510.chr.agp.txt.gz # Then, 79 sequences from: 200510.ultracontig.agp.txt.gz # Then, 7001 sequences from: 200510.scaffold.agp.txt.gz # and the entire contents of: contig_fake_scaffolds.agp.txt.gz # The specifications from 200510.chr.agp.txt.gz had gaps at the # ends of most chromosomes but there were no N's at that location # within the fasta file, so, edit the resulting conglomerate AGP # file to remove those gaps at the ends. Can now pass the # test for checkAgpAndFa. It would be tough to create a chrUn # sequence, so, let's try it without ############################################################################## ssh hgwdev cd /cluster/data/oryLat2 # the config.ra script pretty much specifies everything cat << '_EOF_' > oryLat2.ra db oryLat2 scientificName Oryzias latipes assemblyDate Oct. 2005 assemblyLabel MEDAKA1 # Just above oryLat1 orderKey 474 # NC_004387 mitoAcc 25057156 fastaFiles /cluster/data/oryLat2/ucsc/ucsc.source.fa dbDbSpeciesDir medaka agpFiles /cluster/data/oryLat2/ucsc/chromUltraScaff.agp commonName Medaka clade Vertebrate genomeCladePriority 105 '_EOF_' # << happy emacs makeGenomeDb.pl oryLat2.ra -stop=agp -workhorse=hgwdev makeGenomeDb.pl oryLat2.ra -continue=db -workhorse=hgwdev # That worked just fine. ############################################################################### # Quick check to see if the Ensembl genes will now map properly to # this sequence: # oryLat2 - Medaka - Ensembl Genes version 50 (DONE - 2008-08-12 - hiram) ssh kkstore04 cd /cluster/data/oryLat2 cat << '_EOF_' > oryLat2.ensGene.ra # required db variable db oryLat2 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9][0-9]*\)/chr\1/; s/^MT/chrM/" '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=50 oryLat2.ensGene.ra ssh hgwdev cd /cluster/data/oryLat2/bed/ensGene.50 featureBits oryLat2 ensGene # 32418913 bases of 700386597 (4.629%) in intersection # Seems to work just fine, no lifing required, all genes worked ######################################################################### # REPEATMASKER (DONE - 2008-08-12 - Hiram) ssh kkstore04 screen # use a screen to manage this long running job mkdir /cluster/data/oryLat2/bed/repeatMasker cd /cluster/data/oryLat2/bed/repeatMasker # Run -debug to create the dir structure and preview the scripts: time nice -n +19 doRepeatMasker.pl oryLat2 -workhorse=kolossus \ -buildDir=/cluster/data/oryLat2/bed/repeatMasker \ -bigClusterHub=pk -smallClusterHub=memk > do.log 2>&1 # RM version from the log: # RepeatMasker version development-$Id: oryLat2.txt,v 1.17 2009/11/25 21:48:41 hiram Exp $ # CC RELEASE 20080801 # about 40 minutes run time # Calculate coverage to compare with other masking efforts featureBits oryLat2 rmsk # 23161223 bases of 700386597 (3.307%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 2008-08-12 - Hiram) ssh kolossus screen # use a screen to manage this long running job mkdir /cluster/data/oryLat2/bed/simpleRepeat cd /cluster/data/oryLat2/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl \ -buildDir=/cluster/data/oryLat2/bed/simpleRepeat \ oryLat2 -smallClusterHub=memk -workhorse=kolossus > do.log 2>&1 & # real 19m50.401s cat fb.simpleRepeat # 13229585 bases of 700386597 (1.889%) in intersectio ######################################################################### # windowMasker (DONE 2008-08-12,14 - Hiram) ssh kolossus screen # use a screen to manage this long running job mkdir /cluster/data/oryLat2/bed/windowMasker cd /cluster/data/oryLat2/bed/windowMasker time nice -n +19 doWindowMasker.pl \ -buildDir=/cluster/data/oryLat2/bed/windowMasker \ -workhorse=kolossus oryLat2 > do.log 2>&1 & # real 99m18.161s twoBitToFa oryLat2.wmsk.sdust.2bit stdout | faSize stdin # 869000216 bases (168613619 N's 700386597 real 468915765 upper # 231470832 lower) in 7189 sequences in 1 files # %26.64 masked total, %33.05 masked real # load this initial data to get ready to clean it ssh hgwdev cd /cluster/data/oryLat2/bed/windowMasker hgLoadBed oryLat2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 4784449 elements of size 3 # eliminate the gaps from the masking featureBits oryLat2 -not gap -bed=notGap.bed # 700386597 bases of 700386597 (100.000%) in intersection featureBits oryLat2 windowmaskerSdust # 400084451 bases of 700386597 (57.123%) in intersection time nice -n +19 featureBits oryLat2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 6m31.017s # 231470832 bases of 700386597 (33.049%) in intersection # reload track to get it clean hgLoadBed oryLat2 windowmaskerSdust cleanWMask.bed.gz # Loaded 4798190 elements of size 4 featureBits oryLat2 windowmaskerSdust # 231470832 bases of 700386597 (33.049%) in intersection # mask the sequence with this clean mask cd /cluster/data/oryLat2 zcat bed/windowMasker/cleanWMask.bed.gz \ | twoBitMask oryLat2.unmasked.2bit stdin \ -type=.bed oryLat2.WMSdust.2bit twoBitToFa oryLat2.WMSdust.2bit stdout | faSize stdin \ > oryLat2.WMSdust.faSize.txt cat oryLat2.WMSdust.faSize.txt # 869000216 bases (168613619 N's 700386597 real 468915765 upper # 231470832 lower) in 7189 sequences in 1 files # %26.64 masked total, %33.05 masked real # Add trfMask: twoBitMask oryLat2.WMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed oryLat2.2bit # You can safely ignore the warning about extra BED columns twoBitToFa oryLat2.2bit stdout | faSize stdin \ > oryLat2.faSize.txt cat oryLat2.faSize.txt # 869000216 bases (168613619 N's 700386597 real 468647738 upper # 231738859 lower) in 7189 sequences in 1 files # %26.67 masked total, %33.09 masked real # This becomes our masked genome sequence: ln -s `pwd`/oryLat2.2bit /gbdb/oryLat2 ############################################################################# # Verify all N's in sequence are marked in the gap track # (DONE - 2008-08-14 - Hiram) ssh kkstore04 mkdir /cluster/data/oryLat2/bed/gap cd /cluster/data/oryLat2/bed/gap time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \ ../../oryLat2.unmasked.2bit > oryLat2.Ns.txt 2>&1 # real 0m13.949 grep "^#GAP " oryLat2.Ns.txt | sed -e "s/#GAP //" \ | sort -k1,1 -k2,2n > oryLat2.Ns.bed # what are their sizes like: ave -col=5 oryLat2.Ns.bed Q1 10.000000 median 10.000000 Q3 285.000000 average 1324.922554 min 10.000000 max 5149430.000000 count 127263 total 168613619.000000 standard deviation 26497.729501 ssh hgwdev cd /cluster/data/oryLat2/bed/gap hgsql -N -e "select chrom,chromStart,chromEnd from gap;" oryLat2 \ | awk '{print $1,$2,$3}' | sort -k1,1 -k2,2n | sum # 00926 2904 awk '{print $1,$2,$3}' oryLat2.Ns.bed | sort -k1,1 -k2,2n | sum # 00926 2904 # Identical sums. All N's are included in the gaps. # same exact definition. If they were not, would have to figure # out where the Ns were that were not in gaps, and add them to # the gap table. This can be achieved with featureBits. ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram) # Use -repMatch=200 (based on size -- for human we use 1024, and # medaka size is ~20% of human judging by gapless oryLat2 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/oryLat2 blat oryLat2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=200 # Wrote 46667 overused 11-mers to jkStuff/11.ooc mkdir /cluster/bluearc/scratch/data/oryLat2 cp -p oryLat2.2bit chrom.sizes jkStuff/11.ooc \ /cluster/bluearc/scratch/data/oryLat2 # Get this bluearc scratch data area copied to the kluster nodes ######################################################################### # GENBANK ALIGNMENTS (DONE - 2008-08-14 - Hiram) ssh kkstore04 cd /cluster/data/oryLat2 simplePartition.pl oryLat2.2bit 5000000 /tmp/oryLat2P cat /tmp/oryLat2P/*/*.lft > jkStuff/liftAll.lft rm -r /tmp/oryLat2P # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add oryLat1 just after gasAcu1 # oryLat2 (Oryzias latipes - Medaka) oryLat2.serverGenome = /cluster/data/oryLat2/oryLat2.2bit oryLat2.clusterGenome = /scratch/data/oryLat2/oryLat2.2bit oryLat2.lift = /cluster/data/oryLat2/jkStuff/liftAll.lft oryLat2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} oryLat2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} oryLat2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} oryLat2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} oryLat2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} oryLat2.ooc = /scratch/data/oryLat2/11.ooc oryLat2.genbank.mrna.xeno.load = yes oryLat2.refseq.mrna.native.load = yes oryLat2.refseq.mrna.xeno.load = yes oryLat2.downloadDir = oryLat2 cvs ci -m "Added oryLat2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use screen to manage this long running job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial oryLat2 & # logFile: var/build/logs/2008.08.14-16:39:51.oryLat2.initalign.log # real 304m52.587s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oryLat2 # logFile: var/dbload/hgwdev/logs/2008.08.15-11:36:18.dbload.log # real 10m58.023s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add oryLat2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added oryLat2." etc/align.dbs etc/hgwdev.dbs make etc-update ########################################################################## ## BLASTZ swap from Human Hg18 to Medaka OryLat2 (DONE - 2008-08-25 - Hiram) ssh kkstore02 # not too important since everything moved to the hive screen # use a screen to control this job cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19 cat fb.hg18.chainOryLat2Link.txt # 52713428 bases of 2881515245 (1.829%) in intersection # That is OK, now for the swap: mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap cd /cluster/data/oryLat2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 17m9.675s cat fb.oryLat2.chainHg18Link.txt # 46961822 bases of 700386597 (6.705%) in intersection ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2008-08-25,27 - Hiram) ## align to fr2 scaffolds, results lifted to fr2 chrUn coordinates ssh kkstore04 # not too important since everything moved to the hive screen # use a screen to control this job mkdir /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25 cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25 cat << '_EOF_' > DEF # Medaka vs. Fugu # Using the "nearby" organism settings # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/scratch/data/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/oryLat2/bed/blastz.fr2.2008-08-25 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk > do.log 2>&1 & # real 0m42.311s # problems on pk, finish the kluster run manually, then continuing: time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -continue=cat -verbose=2 -bigClusterHub=pk \ > cat.log 2>&1 & # real 230m12.868s cat fb.oryLat2.chainFr2Link.txt # 180945351 bases of 700386597 (25.835%) in intersection mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap cd /cluster/data/fr2/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 24m32.826s cat fb.fr2.chainOryLat2Link.txt # 153621820 bases of 393312790 (39.058%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Medaka oryLat2 (DONE - 2008-08-26,27 - Hiram) # with contigs for Lamprey ssh kkstore01 # not too important since everything moved to hive screen # use screen to control this job mkdir /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26 cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26 cat << '_EOF_' > DEF # Medaka vs. Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # QUERY: Lamprey petMar1, sequence is in contigs SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/scratch/data/petMar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=150 SEQ2_LAP=0 BASE=/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 & # real 736m11.276s cat fb.oryLat2.chainPetMar1Link.txt # 41568923 bases of 700386597 (5.935%) in intersection # And, for the swap: mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap cd /cluster/data/petMar1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 & # real 52m27.942s cat fb.petMar1.chainOryLat2Link.txt # 39181422 bases of 831696438 (4.711%) in intersection ######################################################################### # BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2008-08-26,27 - Hiram) # with contigs for chrUn in Stickleback ssh kkstore02 # not too important since everything moved to hive screen # use screen to control this job mkdir /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26 cd /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26 cat << '_EOF_' > DEF # Medaka vs. Stickleback # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # QUERY: Stickleback gasAcu1, chrUn is in contigs, lifted to chrUn SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/cluster/data/gasAcu1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=150 SEQ2_LAP=0 BASE=/cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 & # real 698m10.533s cat fb.oryLat2.chainGasAcu1Link.txt # 211589769 bases of 700386597 (30.210%) in intersection # And, for the swap: mkdir /cluster/data/gasAcu1/bed/blastz.oryLat2.swap cd /cluster/data/gasAcu1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium -swap \ /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 31m44.711s cat fb.gasAcu1.chainOryLat2Link.txt # 189793612 bases of 446627861 (42.495%) in intersection ######################################################################### # BLASTZ/CHAIN/NET tetNig1 (DONE - 2008-08-26,27 - Hiram) ssh kkstore03 # not too important since everything moved to hive screen # use screen to control this job mkdir /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26 cd /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26 cat << '_EOF_' > DEF # Medaka vs. Tetraodon # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # QUERY: Tetraodon tetNig1 SEQ2_DIR=/cluster/data/tetNig1/tetNig1.sdTrf.2bit SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 & time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -continue=cat -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 & # had to finish netChains.csh manually since SEQ2_DIR was incorrect # continuing: time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -continue=load -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > load.log 2>&1 & # real 10m42.744s cat fb.oryLat2.chainTetNig1Link.txt # 163171121 bases of 700386597 (23.297%) in intersection # And, for the swap: mkdir /cluster/data/tetNig1/bed/blastz.oryLat2.swap cd /cluster/data/tetNig1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26/DEF \ -swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 22m48.350s cat fb.tetNig1.chainOryLat2Link.txt # 139059872 bases of 342403326 (40.613%) in intersection ######################################################################### # BLASTZ/CHAIN/NET danRer5 (DONE - 2008-08-27,28 - Hiram) ssh kkstore01 # not too important since everything moved to hive screen # use screen to control this job mkdir /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27 cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27 cat << '_EOF_' > DEF # Medaka vs. Zebrafish # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # Query - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/scratch/data/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -verbose=2 -tRepeats=windowmaskerSdust \ -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 & # continuing after kluster difficulties time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -continue=cat -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 & # real 744m38.748s cat fb.oryLat2.chainDanRer5Link.txt # 138070427 bases of 700386597 (19.713%) in intersection # and for the swap mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap cd /cluster/data/danRer5/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \ -swap -tRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 58m15.303s # had to finish the load nets manually, mistakenly included # -qRepeats=windowmaskerSdust when that is not valid for danRer5 cat fb.danRer5.chainOryLat2Link.txt # 158709399 bases of 1435609608 (11.055%) in intersection # Then, continuing: doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \ -continue=download -swap -tRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 & ######################################################################### ## 5-Way Multiz (DONE - 2008-09-03 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat2/bed/multiz5way cd /cluster/data/oryLat2/bed/multiz5way sed -e "s/oryLat1/oryLat2/; s/danRer4/danRer5/" \ /cluster/data/oryLat1/bed/multiz5way/5way.nh > 5way.nh # this file looks like: cat << '_EOF_' > 5way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_oryLat2:0.2):0.2):0.292961, zebrafish_danRer5:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/oryLat2_5way.gif /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt # Use this output to create the table below grep -y oryLat2 5way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on oryLat2 on other minScore # 1 0.4000 - stickleback gasAcu1 (% 30.210) (% 42.495) 3000 medium # 2 0.7994 - tetraodon tetNig1 (% 23.297) (% 40.613) 3000 medium # 3 0.8399 - fugu fr2 (% 25.835) (% 39.058) 3000 medium # 4 1.4755 - zebrafish danRer5 (% 19.713) (% 11.055) 3000 medium # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list cd /cluster/data/oryLat2/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/oryLat2/bed mkdir mafLinks for G in gasAcu1 tetNig1 fr2 danRer5 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" else ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G fi done # need to split these things up by chrom and scaffold names for # efficient kluster run. Using the new hive architecture. ssh hgwdev cd /hive/data/genomes/oryLat2/bed/multiz5way mkdir mafSplit for G in gasAcu1 tetNig1 fr2 danRer5 do echo -n "working ${G} ..." rm -fr mafSplit/${G} mkdir mafSplit/${G} cd mafSplit/${G} mafSplit /dev/null scaf ../../mafLinks/${G}/oryLat2.${G}.net.maf.gz \ -verbose=2 -byTarget -useHashedName=16 -outDirDepth=2 cd /hive/data/genomes/oryLat2/bed/multiz5way echo " done" done # create a run-time list of files to operate on, not all file names # exist for all assemblies cd mafSplit for D in * do cd "${D}" find . -type f cd .. done | sort -u | sed -e "s#./##" > ../5-way.split.list wc -l ../5-way.split.list # 792 ../5-way.split.list # the autoMultiz cluster run ssh swarm cd /cluster/data/oryLat2/bed/multiz5way/ mkdir splitRun cd splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = oryLat2 set subdir = $1 set c = $2 set result = $3 set resultDir = $result:h set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/oryLat2/bed/multiz5way/mafSplit rm -fr $tmp mkdir -p $tmp mkdir -p $resultDir cp -p ../../tree.nh ../../species.list $tmp pushd $tmp pwd ls -l cat species.list foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$subdir/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp -p $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd rm -f $result cp -p $tmp/$c.maf $result rm -fr $tmp rmdir --ignore-fail-on-non-empty $run/tmp/$db rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /hive/data/genomes/oryLat2/bed/multiz5way/splitRun/maf/$(dir1)/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../5-way.split.list single template jobList para create jobList # Completed: 792 of 792 jobs # CPU time in finished jobs: 5423s 90.38m 1.51h 0.06d 0.000 y # IO & Wait Time: 138287s 2304.79m 38.41h 1.60d 0.004 y # Average job time: 181s 3.02m 0.05h 0.00d # Longest finished job: 404s 6.73m 0.11h 0.00d # Submission to last job: 436s 7.27m 0.12h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/oryLat2/bed/multiz5way/splitRun grep "^##maf version" maf/0/0/scaf11300.maf \ | sort -u > oryLat2.5way.maf for F in `find ./maf -depth -type f` do grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \ | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" done | sort -u >> oryLat2.5way.maf for F in `find ./maf -depth -type f` do grep -v -h "^#" "${F}" done >> oryLat2.5way.maf grep "^##eof maf" maf/0/0/scaf11300.maf \ | sort -u >> oryLat2.5way.maf # load tables for a look ssh hgwdev mkdir -p /gbdb/oryLat2/multiz5way/maf ln -s /hive/data/genomes/oryLat2/bed/multiz5way/splitRun/oryLat2.5way.maf \ /gbdb/oryLat2/multiz5way/maf/multiz5way.maf # this generates an immense multiz5way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/oryLat2/multiz5way/maf oryLat2 multiz5way # real 1m10.380s # Loaded 1366931 mafs in 1 files from /gbdb/oryLat2/multiz5way/maf # load summary table time nice -n +19 cat /gbdb/oryLat2/multiz5way/maf/*.maf \ | hgLoadMafSummary oryLat2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary stdin # real 2m39.822 # Created 353577 summary blocks from 2852890 components and 1197504 mafs # from stdin # Gap Annotation # prepare bed files with gap info mkdir /cluster/data/oryLat2/bed/multiz5way/anno cd /cluster/data/oryLat2/bed/multiz5way/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `sed -e "s/ oryLat2//" ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh memk # need to split up the single maf file into individual # per-scaffold maf files to run annotation on mkdir /cluster/data/oryLat2/bed/multiz5way/anno/splitMaf cd /cluster/data/oryLat2/bed/multiz5way/anno/splitMaf # create bed files to list approximately 112 scaffolds in # a single list, approximately 64 lists cat << '_EOF_' > mkBedLists.pl #!/usr/bin/env perl use strict; use warnings; my $bedCount = 0; my $i = 0; my $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; open (FH,") { chomp $line; if ( (($i + 1) % 111) == 0 ) { printf "%s\n", $line; close (BF); ++$bedCount; $bedFile = sprintf("file_%d.bed", $bedCount); open (BF,">$bedFile") or die "can not write to $bedFile $!"; } ++$i; my ($chr, $size) = split('\s+',$line); printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr; } close (FH); '_EOF_' # << happy emacs chmod +x mkBedLists.pl ./mkBedLists.pl # now, run a mafsInRegion on each one of those lists cat << '_EOF_' > runOne #!/bin/csh -fe set runDir = `pwd` set resultDir = $1 set bedFile = $resultDir.bed mkdir -p $resultDir mkdir -p /scratch/tmp/oryLat2/$resultDir pushd /scratch/tmp/oryLat2/$resultDir mafsInRegion $runDir/$bedFile -outDir . \ /cluster/data/oryLat2/bed/multiz5way/splitRun/oryLat2.5way.maf popd rsync -q -a /scratch/tmp/oryLat2/$resultDir/ ./$resultDir/ rm -fr /scratch/tmp/oryLat2/$resultDir rmdir --ignore-fail-on-non-empty /scratch/tmp/oryLat2 '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP ./runOne $(root1) #ENDLOOP '_EOF_' # << happy emacs ls file*.bed > runList gensub2 runList single template jobList para create jobList para try ... check ... push ... etc # Completed: 65 of 65 jobs # CPU time in finished jobs: 2615s 43.58m 0.73h 0.03d 0.000 y # IO & Wait Time: 11436s 190.60m 3.18h 0.13d 0.000 y # Average job time: 216s 3.60m 0.06h 0.00d # Longest finished job: 493s 8.22m 0.14h 0.01d # Submission to last job: 3875s 64.58m 1.08h 0.04d cd /cluster/data/oryLat2/bed/multiz5way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/oryLat2/oryLat2.2bit $result '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../splitMaf -type f -name "*.maf" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 5542 of 5542 jobs # CPU time in finished jobs: 725s 12.09m 0.20h 0.01d 0.000 y # IO & Wait Time: 37945s 632.41m 10.54h 0.44d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest finished job: 53s 0.88m 0.01h 0.00d # Submission to last job: 243s 4.05m 0.07h 0.00d ssh hgwdev cd /cluster/data/oryLat2/bed/multiz5way/anno grep "^##maf version" maf/file_0/chr1.maf \ | sort -u > oryLat2.5way.maf find ./maf -type f -depth -name "*.maf" | while read F do grep -v -h "^#" "${F}" done >> oryLat2.5way.maf echo "##eof maf" >> oryLat2.5way.maf ssh hgwdev cd /cluster/data/oryLat2/bed/multiz5way/anno mkdir -p /gbdb/oryLat2/multiz5way/anno ln -s `pwd`/oryLat2.5way.maf \ /gbdb/oryLat2/multiz5way/anno/multiz5way.maf # by loading this into the table multiz5way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/oryLat2/multiz5way/anno \ oryLat2 multiz5way # Loaded 1620199 mafs in 1 files from /gbdb/oryLat2/multiz5way/anno # real 3m19.209s # normally filter this for chrom size > 1,000,000 and only load # those chroms. But this is a scaffold assembly, load everything: time nice -n +19 hgLoadMafSummary oryLat2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz5waySummary \ /gbdb/oryLat2/multiz5way/anno/multiz5way.maf # Created 353577 summary blocks from 2852890 components and 1428024 mafs # from /gbdb/oryLat2/multiz5way/anno/multiz5way.maf # real 3m49.692s # by loading this into the table multiz5waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz5way*.tab files in this /scratch/tmp directory rm multiz5way*.tab # And, you can remove the previously loaded non-annotated maf file link: rm /gbdb/oryLat2/multiz5way/maf/multiz5way.maf rmdir /gbdb/oryLat2/multiz5way/maf ########################################################################### ## Annotate 5-way multiple alignment with gene annotations ## (DONE - 2008-09-03 - Hiram) # Gene frames, using the "best" gene set on each organism: # use ensGene for oryLat2, gasAcu1, fr2, danRer5 # with Mrnas for tetNig1 ssh hgwdev mkdir /cluster/data/oryLat2/bed/multiz5way/frames cd /cluster/data/oryLat2/bed/multiz5way/frames mkdir genes # ensGene for DB in oryLat2 gasAcu1 fr2 danRer5 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use the xenoMrna data for tetNig1 for DB in tetNig1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \ from xenoMrna,gbCdnaInfo,cds \ where (xenoMrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done ls -og genes # -rw-rw-r-- 1 1866277 Sep 3 14:02 oryLat2.gp.gz # -rw-rw-r-- 1 1984758 Sep 3 14:02 gasAcu1.gp.gz # -rw-rw-r-- 1 1909042 Sep 3 14:02 fr2.gp.gz # -rw-rw-r-- 1 1845404 Sep 3 14:02 danRer5.gp.gz # -rw-rw-r-- 1 1693359 Sep 3 14:13 tetNig1.gp.gz cd /cluster/data/oryLat2/bed/multiz5way/frames # anything to annotate is in a pair, e.g.: oryLat2 genes/oryLat2.gp.gz time (cat ../anno/oryLat2.5way.maf | nice -n +19 genePredToMafFrames oryLat2 stdin stdout oryLat2 genes/oryLat2.gp.gz gasAcu1 genes/gasAcu1.gp.gz fr2 genes/fr2.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 201595 oryLat2 # 213947 tetNig1 # 280670 danRer5 # 298913 gasAcu1 # 318664 fr2 # load the resulting file cd /cluster/data/oryLat2/bed/multiz5way/frames time nice -n +19 hgLoadMafFrames oryLat2 multiz5wayFrames \ multiz5way.mafFrames.gz # real 0m53.888s # enable the trackDb entries: # frames multiz5wayFrames # irows on ############################################################################# # phastCons 5-way (DONE - 2008-09-03,08 - Hiram) # split 5way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir /cluster/data/oryLat2/bed/multiz5way/msa.split cd /cluster/data/oryLat2/bed/multiz5way/msa.split mkdir -p /cluster/data/oryLat2/bed/multiz5way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/oryLat2/bed/multiz5way/anno/maf set WINDOWS = /cluster/data/oryLat2/bed/multiz5way/cons/ss pushd $WINDOWS set resultDir = $1 set c = $2 rm -fr $resultDir/$c mkdir -p $resultDir twoBitToFa -seq=$c /cluster/data/oryLat2/oryLat2.2bit /scratch/tmp/oryLat2.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \ -M /scratch/tmp/oryLat2.$c.fa \ -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000 rm -f /scratch/tmp/oryLat2.$c.fa popd mkdir -p $resultDir date > $resultDir/$c.out '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # create list of maf files: (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 5542 of 5542 jobs # CPU time in finished jobs: 707s 11.79m 0.20h 0.01d 0.000 y # IO & Wait Time: 16217s 270.28m 4.50h 0.19d 0.001 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 100s 1.67m 0.03h 0.00d # Submission to last job: 7795s 129.92m 2.17h 0.09d # create starting-tree with the largest ss file: cd /cluster/data/oryLat2/bed/multiz5way/cons time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \ ss/file_0/chr4.10000001-19997581.ss \ --tree "(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)" \ --out-root starting-tree # Fitting tree model to s1.1-39973033.ss using REV ... # Writing model to starting-tree.mod ... # real 0m8.798s # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.417 # This 0.417 is used in the --gc argument below # Create a random list of 50 1 mb regions (do not use the _randoms) cd /cluster/data/oryLat2/bed/multiz5way/cons find ./ss -type f | grep "/chr" | xargs ls -1l \ | awk '$5 > 4000000 {print $9;}' \ | sed -e "s#./ss/##" | randomLines stdin 50 randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh memk mkdir /cluster/data/oryLat2/bed/multiz5way/cons/treeRun1 cd /cluster/data/oryLat2/bed/multiz5way/cons/treeRun1 mkdir tree log # Tuning this loop should come back to here to recalculate # Create little script that calls phastCons with right arguments # --target-coverage of 0.20 is about right for mouse, will be # tuned exactly below cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/oryLat2/bed/multiz5way/cons/starting-tree.mod \ --gc 0.417 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 10 --target-coverage 0.20 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP ./makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 7629s 127.15m 2.12h 0.09d 0.000 y # IO & Wait Time: 367s 6.11m 0.10h 0.00d 0.000 y # Average job time: 160s 2.67m 0.04h 0.00d # Longest finished job: 282s 4.70m 0.08h 0.00d # Submission to last job: 518s 8.63m 0.14h 0.01d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls -1 tree/file_0/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt ls -1 tree/file_0/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cp -p ave.*.mod .. cd .. cp -p ave.*.mod /cluster/data/oryLat1/bed/multiz5way/cons # measuring entropy, aiming for a PIT = 10 # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # --expected-length 10 --target-coverage 0.20 \ time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \ 0.20 10 ave.{cons,noncons}.mod # ( Solving for new omega: 10.000000 10.103157 10.102202 ) # Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000 # Relative entropy: H=0.861827 bits/site # Expected min. length: L_min=11.328911 sites # Expected max. length: L_max=7.054937 sites # Phylogenetic information threshold: PIT=L_min*H=9.763558 bits # Recommended expected length: omega=10.102202 sites (for L_min*H=9.783400) # --expected-length 14 --target-coverage 0.20 \ # ( Solving for new omega: 14.000000 10.288426 9.355350 9.262718 9.261710 ) # Transition parameters: gamma=0.200000, omega=14.000000, mu=0.071429, nu=0.017857 # Relative entropy: H=0.783596 bits/site # Expected min. length: L_min=13.493848 sites # Expected max. length: L_max=9.090919 sites # Phylogenetic information threshold: PIT=L_min*H=10.573722 bits # Recommended expected length: omega=9.261710 sites (for L_min*H=9.783400) # --expected-length 12 --target-coverage 0.20 \ # this one comes closest to PIT=10 # ( Solving for new omega: 12.000000 9.993483 9.668856 9.658335 9.658324 ) # Transition parameters: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833 # Relative entropy: H=0.816888 bits/site # Expected min. length: L_min=12.489452 sites # Expected max. length: L_max=8.136430 sites # Phylogenetic information threshold: PIT=L_min*H=10.202482 bits # Recommended expected length: omega=9.658324 sites (for L_min*H=9.783400) # --expected-length 13 --target-coverage 0.20 \ # ( Solving for new omega: 13.000000 10.112898 9.494883 9.455298 9.455126 ) # Transition parameters: gamma=0.200000, omega=13.000000, mu=0.076923, nu=0.019231 # Relative entropy: H=0.799098 bits/site # Expected min. length: L_min=13.008620 sites # Expected max. length: L_max=8.627917 sites # Phylogenetic information threshold: PIT=L_min*H=10.395159 bits # Recommended expected length: omega=9.455126 sites (for L_min*H=9.783400) featureBits oryLat2 *.bed # 61468061 bases of 700386597 (8.776%) in intersection # Run phastCons # This job is I/O intensive in its output files, run it on # a kluster without too many CPUs ssh memk mkdir -p /cluster/data/oryLat2/bed/multiz5way/cons/run.cons cd /cluster/data/oryLat2/bed/multiz5way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/oryLat2/bed/multiz5way/cons mkdir -p $tmp set san = /cluster/data/oryLat2/bed/multiz5way/cons cp -p $cons/$grp/ave.cons.mod $cons/$grp/ave.noncons.mod $tmp cp -p $san/ss/$subDir/${f}.ss $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons ${f}.ss ave.cons.mod,ave.noncons.mod \ --expected-length $len --target-coverage $cov \ --quiet --seqname ${c} --idpref ${c} --viterbi ${f}.bed --score > ${f}.pp popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it cd /cluster/data/oryLat2/bed/multiz5way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" > ss.list # run for all species mkdir -p all run.cons/all cd all ln -s ../ave.*cons.mod . cd run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 12 .2 {check out line+ /cluster/data/oryLat2/bed/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 4265 of 4265 jobs # CPU time in finished jobs: 2722s 45.36m 0.76h 0.03d 0.000 y # IO & Wait Time: 28535s 475.59m 7.93h 0.33d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest finished job: 42s 0.70m 0.01h 0.00d # Submission to last job: 1181s 19.68m 0.33h 0.01d # create Most Conserved track cd /cluster/data/oryLat2/bed/multiz5way/cons/all find ./bed -type f -name "*.bed" | xargs cat \ | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # load into database ssh hgwdev cd /cluster/data/oryLat2/bed/multiz5way/cons/all time nice -n +19 hgLoadBed oryLat2 phastConsElements5way mostConserved.bed # Loaded 1047037 elements of size 5 # real 0m33.582s # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits oryLat2 phastConsElements5way # 61468061 bases of 700386597 (8.776%) in intersection # creating the phastCons data files mkdir -p phastCons5wayScores # take a look at the file names to see where the chrom files # are: for D in `ls -1d pp/file* | sort -t_ -k2n` do F=${D/pp\/} out=phastCons5wayScores/${F}.data.gz ls -S ${D}/*.pp done | sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \ | sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" > fileList.txt # the chrom files are all in file_0, with chrM in file_12 # therefore: for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 M do find ./pp/file_0 ./pp/file_12 -type f | grep "/chr${C}\." \ | sed -e "s#^./##" | sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \ | sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" \ | while read F do cat "${F}" done | gzip >> phastCons5wayScores/chr${C}.data.txt.gz echo chr${C}.data.txt done # these files mistakenly got gzipped during the first run of # this script, so now they are *.pp.gz and they are zcat # And, for the rest, place them in their file_N groups for D in `ls -1d pp/file* | sort -t_ -k2n` do F=${D/pp\/} out=phastCons5wayScores/${F}.data.gz ls -S ${D}/*.pp.gz | grep -v "/chr" \ | sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \ | sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" \ | xargs zcat > phastCons5wayScores/${F}.data.txt gzip phastCons5wayScores/${F}.data.txt ls -og phastCons5wayScores/${F}.data.txt.gz done # verify nothing got lost find ./pp -type f | xargs zcat | wc -l # 300140068 zcat phastCons5wayScores/*.txt.gz | wc -l # 300140068 # can now convert to wiggle data ls -rt phastCons5wayScores/*.txt.gz | while read F do zcat "${F}" done | wigEncode stdin phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # Load gbdb and database with wiggle. ssh hgwdev cd /hive/data/genomes/oryLat2/bed/multiz5way/cons/all ln -s `pwd`/phastCons5way.wib /gbdb/oryLat2/multiz5way/phastCons5way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/oryLat2/multiz5way oryLat2 \ phastCons5way phastCons5way.wig # real 0m13.164s # remove garbage rm wiggle.tab # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/oryLat2/bed/multiz5way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=oryLat2 phastCons5way > histogram.data 2>&1 # real 5m0.608s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Medaka oryLat2 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # These trackDb entries turn on the wiggle phastCons data track: # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons5way # spanList 1 # autoScale Off # windowingFunction mean # pairwiseHeight 12 # yLineOnOff Off ############################################################################# # CREATE MICROSAT TRACK (DONE - 2008-09-08 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat2/bed/microsat cd /cluster/data/oryLat2/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed oryLat2 microsat microsat.bed ############################################################################# # Create Quality Track (DONE - 2008-09-08 - Hiram) # The quality scores obtained for oryLat1 are now correct since # we haven't smashed the scaffolds and ultracontigs into chrUn # So, using the file picked up for oryLat1: ssh hgwdev mkdir /cluster/data/oryLat2/bed/qual cd /cluster/data/oryLat2/bed/qual ln -s /cluster/data/oryLat1/bed/qual/medakaver1.0.UCSC.fasta.qual . # some of the numbers in this are incorrect, too large. Truncate them # to 90 as per advice from Budrul in email, and convert chrom numbers # to UCSC chrom names time nice -n +19 sed -e "s/ dna:.*//; s/^>\([0-9][0-9]*\)/>chr\1/;" \ medakaver1.0.UCSC.fasta.qual \ | sed -e \ "s/ 1[0-9][0-9]/ 90/g; s/ 2[0-9][0-9]/ 90/g; s/^1[0-9][0-9]/90/; s/^2[0-9][0-9]/90/;" \ | qaToQac stdin oryLat2.qual.qac # real 3m0.928s time nice -n +19 qacToWig -fixed oryLat2.qual.qac stdout \ | wigEncode stdin oryLat2.qual.wig oryLat2.qual.wib # Converted stdin, upper limit 99.00, lower limit 0.00 # real real 5m31.095s ssh hgwdev cd /cluster/data/oryLat2/bed/qual ln -s /cluster/data/oryLat2/bed/qual/oryLat2.qual.wib /gbdb/oryLat2/wib time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \ -pathPrefix=/gbdb/oryLat2/wib oryLat2 quality oryLat2.qual.wig # real 0m20.513s ############################################################################# # create downloads (DONE - 2008-09-08 - Hiram) ssh hgwdev cd /cluster/data/oryLat2 ln -s bed/repeatMasker/oryLat2.fa.out . time nice -n +19 makeDownloads.pl oryLat2 > makeDownloads.log 2>&1 # real 7m39.978s # Edit the resulting README files # add upstream sequences for ensGenes version 50 cd /cluster/data/oryLat2/goldenPath/bigZips foreach size (1000 2000 5000) echo $size featureBits oryLat2 ensGene:upstream:$size -fa=stdout \ | gzip -c > ensGene.upstream$size.fa.gz end # add those to the md5 sum file and the README file md5sum ensGene.up* >> md5sum.txt ######################################################################### # Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram) # edit src/hg/makeDb/genbank/genbank.conf to add: oryLat2.upstreamGeneTbl = ensGene oryLat2.upstreamMaf = multiz5way /hive/data/genomes/oryLat2/bed/multiz5way/species.list ######################################################################### # MULTIZ5WAY DOWNLOADABLES (DONE - 2008-09-08 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat2/bed/multiz5way/mafDownloads cd /cluster/data/oryLat2/bed/multiz5way # upstream mafs # There isn't any refGene table, using ensGene instead for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 featureBits -verbose=2 oryLat2 \ ensGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags oryLat2 multiz5way \ stdin stdout -orgs=species.list \ | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz echo "done ensGene.upstream${S}.maf.gz" done cd /cluster/data/oryLat2/bed/multiz5way/mafDownloads cp -p ../anno/oryLat2.5way.maf . gzip oryLat2.5way.maf nice -n +19 md5sum *.maf.gz > md5sum.txt # Make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way cd /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way ln -s /cluster/data/oryLat2/bed/multiz5way/mafDownloads/{*.gz,*.txt} . ############################################################################# # making the pushQ entry (DONE - 2008-09-08 - Hiram) ssh hgwdev cd /cluster/data/oryLat2/jkStuff makePushQSql.pl oryLat2 > oryLat2.pushQ.sql oryLat2 does not have seq Could not tell (from trackDb, all.joiner and hardcoded lists of supporting and genbank tables) which tracks to assign these tables to: ensGtp ensPep multiz5way multiz5wayFrames multiz5waySummary phastCons5way phastConsElements5way scp -p oryLat2.pushQ.sql hiram@hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < oryLat2.pushQ.sql # manually add the entry for the 5way business ############################################################################# # Create 5-way downloads (DONE - 2008-09-09 - Hiram) ssh hgwdev mkdir /cluster/data/oryLat2/goldenPath/multiz5way mkdir /cluster/data/oryLat2/goldenPath/phastCons5way cd /cluster/data/oryLat2/goldenPath/phastCons5way ln -s ../../bed/multiz5way/cons/all/phastCons5wayScores/*.txt.gz . ln -s ../../bed/multiz5way/cons/ave.cons.mod 5way.conserved.mod ln -s ../../bed/multiz5way/cons/ave.noncons.mod 5way.non-conserved.mod md5sum *.gz *.mod > md5sum.txt # copy a README and fix it up for here: cp /usr/local/apache/htdocs/goldenPath/oryLat1/phastCons5Scores/README.txt . mkdir /cluster/data/oryLat2/goldenPath/multiz5way cd /cluster/data/oryLat2/goldenPath/multiz5way ln -s ../../bed/multiz5way/mafDownloads/* . ln -s ../../bed/multiz5way/5way.nh ./oryLat2.5way.nh # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 4 species with a high count, # then Ensembl gene names, e.g.: # 4558 tetNig1 # 4558 gasAcu1 # 4558 fr2 # 4558 danRer5 # 1 ENSORLT00000025880 # 1 ENSORLT00000025875 # 1 ENSORLT00000025852 # ... etc ... ssh hgwdev cd /cluster/data/oryLat2/goldenPath/phastCons5way mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/phastCons5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/oryLat2/phastCons5way cd /cluster/data/oryLat2/goldenPath/multiz5way mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ############################################################################# ## Set a more interesting default position, location of IGHMBP2 gene ## DONE - 2008-10-15 - Hiram ssh hgwdev hgsql -e \ 'update dbDb set defaultPos="chr6:11,027,149-11,038,707" where name = "oryLat2";' \ -h genome-testdb hgcentraltest ## note from QA changed position because default tracks were sparse in old ## position: 'update dbDb set defaultPos='chr18:14,435,198-14,444,829' where name="oryLat2";' ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: oryLat2.upstreamGeneTbl = refGene ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="NIG and Tokyo Univ MEDAKA1 (NCBI project 16702, BAAF04000000)" where name="oryLat2";' hgcentraltest ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-02 braney ) # bash if not using bash shell already cd /cluster/data/oryLat2 mkdir /cluster/data/oryLat2/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst oryLat2.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst oryLat2.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 884 mkdir -p /cluster/data/oryLat2/bed/tblastn.hg18KG cd /cluster/data/oryLat2/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 884 # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\) # 36727/(50000/884) = 649.333360 mkdir -p kgfa split -l 649 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/oryLat2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/oryLat2/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh swarm cd /cluster/data/oryLat2/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 50388 of 50388 jobs # CPU time in finished jobs: 4705256s 78420.94m 1307.02h 54.46d 0.149 y # IO & Wait Time: 240976s 4016.26m 66.94h 2.79d 0.008 y # Average job time: 98s 1.64m 0.03h 0.00d # Longest finished job: 297s 4.95m 0.08h 0.00d # Submission to last job: 7579s 126.32m 2.11h 0.09d ssh swarm cd /cluster/data/oryLat2/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 57 of 57 jobs # CPU time in finished jobs: 775s 12.91m 0.22h 0.01d 0.000 y # IO & Wait Time: 1481s 24.69m 0.41h 0.02d 0.000 y # Average job time: 40s 0.66m 0.01h 0.00d # Longest finished job: 65s 1.08m 0.02h 0.00d # Submission to last job: 66s 1.10m 0.02h 0.00d cd /cluster/data/oryLat2/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 40021 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/oryLat2/bed/tblastn.hg18KG hgLoadPsl oryLat2 blastHg18KG.psl # check coverage featureBits oryLat2 blastHg18KG # 19888142 bases of 700386597 (2.840%) in intersection featureBits oryLat1 blastHg18KG # 18960975 bases of 700386597 (2.707%) in intersection featureBits oryLat2 all_mrna blastHg18KG -enrichment # all_mrna 0.131%, blastHg18KG 2.840%, both 0.058%, cover 44.55%, enrich 15.69x rm -rf blastOut #end tblastn ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ # lastz Zebrafish danRer6 (DONE - 2009-08-13 - Hiram) mkdir /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13 cd /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13 cat << '_EOF_' > DEF # Medaka vs. Zebrafish # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=100 SEQ1_LAP=10000 # Query - zebrafish (danRer6) SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit SEQ2_LEN=/scratch/data/danRer6/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/danRer6/contigs/danRer6.contigs.2bit SEQ2_CTGLEN=/hive/data/genomes/danRer6/contigs/danRer6.contigs.sizes SEQ2_LIFT=/hive/data/genomes/danRer6/contigs/danRer6.contigs.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -noLoadChainSplit -tRepeats=windowmaskerSdust \ -smallClusterHub=encodek -bigClusterHub=swarm > do.log 2>&1 & # real 232m4.673s # kluster difficulties, after finishing lastz manually: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -noLoadChainSplit -tRepeats=windowmaskerSdust \ -smallClusterHub=encodek -bigClusterHub=swarm \ -continue=cat > cat.log 2>&1 & # about 1 hour cat fb.oryLat2.chainDanRer6Link.txt # 123547945 bases of 700386597 (17.640%) in intersection mkdir /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap cd /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -noLoadChainSplit -tRepeats=windowmaskerSdust \ -smallClusterHub=encodek -bigClusterHub=swarm \ -swap > swap.log 2>&1 & # real 53m31.962s cat fb.danRer6.chainOryLat2Link.txt # 147597031 bases of 1506896106 (9.795%) in intersection ############################################################################ # BLASTZ/CHAIN/NET tetNig2 (DONE - 2008-08-26,27 - Hiram) screen # use screen to control this job mkdir /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14 cd /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14 cat << '_EOF_' > DEF # Medaka vs. Tetraodon # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=9400 BLASTZ_L=3000 BLASTZ_K=3000 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=200 # QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -workhorse=hgwdev \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 & # real 159m0.619s - failed batch run # recovered and continuing: time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF \ -continue=cat -workhorse=hgwdev \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 & cat fb.oryLat2.chainTetNig2Link.txt # 162783854 bases of 700386597 (23.242%) in intersection # And, for the swap: mkdir /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap cd /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14/DEF \ -swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # about 2h18m cat fb.tetNig2.chainOryLat2Link.txt # 136115939 bases of 302314788 (45.025%) in intersection ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # lastz swap danRer7 (DONE - 2010-12-17 - Hiram) # original alignment to danRer7 cd /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF cat fb.oryLat2.chainDanRer7Link.txt # 141428502 bases of 1409770109 (10.032%) in intersection # running the swap mkdir /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap cd /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF \ -chainMinScore=2000 -chainLinearGap=medium \ -noLoadChainSplit -qRepeats=windowmaskerSdust \ -swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 & # real 20m56.926s cat fb.oryLat2.chainDanRer7Link.txt # 124527579 bases of 700386597 (17.780%) in intersection ############################################################################ ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd ######################################################################### # LASTZ/CHAIN/NET Medaka oryLat2 (DONE - 2012-10-23 - Hiram) screen -S oryLat2PetMar2 # use screen to control this job mkdir /cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23 cd /cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23 cat << '_EOF_' > DEF # Medaka vs. Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp) SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=25 SEQ1_LAP=10000 # QUERY: Lamprey petMar2, sequence is in contigs SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > do.log 2>&1 & # real 654m52.829s cat fb.oryLat2.chainPetMar2Link.txt # 42137127 bases of 700386597 (6.016%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryLat2/bed ln -s lastzPetMar2.2012-10-23 lastz.petMar2 # And, for the swap: mkdir /cluster/data/petMar2/bed/blastz.oryLat2.swap cd /cluster/data/petMar2/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -verbose=2 -swap \ /hive/data/genomes/oryLat2/bed/lastzPetMar2.2012-10-23/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -smallClusterHub=encodek -bigClusterHub=swarm > swap.log 2>&1 & # real 8m42.333s cat fb.petMar2.chainOryLat2Link.txt # 31889826 bases of 647368134 (4.926%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/petMar2/bed ln -s blastz.oryLat2.swap lastz.oryLat2 #########################################################################