# for emacs: -*- mode: sh; -*- # # the above keeps emacs happy while working with this text document # This file describes how we made the browser database on the Rattus # Norvegicus genome, March 2008 update (Rnor4.0) from Baylor. ######################################################################### ## Download sequence (DONE - 2008-11-13 - Hiram) mkdir /hive/data/genomes/rn5 mkdir /hive/data/genomes/rn5/baylor cd /hive/data/genomes/rn5/baylor # assuming BASH sequence here F="ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor4.0/LinearizedChromosomes" for S in fa.gz fa.qual.gz do wget --timestamping ${F}/Rat20080227.${S} done F="ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor4.0" for S in htm txt do wget --timestamping ${F}/README_Rnor4.${S} done F="ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor4.0/Contigs" for S in .agp _contigs.fa _contigs_fa.qual \ _contig_scaffold.agp _scaffold_chr.agp _BACS.fa \ _BACS.fa.qual _accnum.agp _ctg_name.agp _contigs_accnum do wget --timestamping ${F}/Rat20080227${S}.gz done ######################################################################### # fixup names to correspond to UCSC syntax (DONE - Hiram - 2008-11-13) cd /hive/data/genomes/rn5/baylor zcat Rat20080227.fa.gz | sed -e "s/gnl.Rnor4.0.chr0/chr/; s/gnl.Rnor4.0.chr1/chr1/; s/gnl.Rnor4.0.chr2/chr2/; s/gnl.Rnor4.0.chrX/chrX/; s/gnl.Rnor4.0.chrUn/chrUn/;" \ | gzip -c > Rnor4.chroms.fa.gz zcat Rat20080227.fa.qual.gz | sed -e "s/gnl.Rnor4.0.chr0/chr/; s/gnl.Rnor4.0.chr1/chr1/; s/gnl.Rnor4.0.chr2/chr2/; s/gnl.Rnor4.0.chrX/chrX/; s/gnl.Rnor4.0.chrUn/chrUn/;" \ | gzip -c > Rnor4.chroms.fa.qual.gz zcat Rat20080227_scaffold_chr.agp.gz \ | sed -e "s/^chr0/chr/; s/chr0\([1-9]*.scaffold\)/chr\1/; s/[\t]*#.*//;" \ > Rnor4.scaffold_chr.agp ######################################################################### # Create .ra file and run makeGenomeDb.pl (DONE - Hiram - 2008-11-14) cd /hive/data/genomes/rn5 cat << '_EOF_' >rn5.config.ra # Config parameters for makeGenomeDb.pl: db rn5 clade mammal scientificName Rattus norvegicus commonName Rat assemblyDate Mar. 2008 assemblyLabel Baylor BCM-HGSC Rnor4.0 (NCBI project ID: 10621) orderKey 177 mitoAcc AC_000022 fastaFiles /hive/data/genomes/rn5/baylor/Rnor4.chroms.fa.gz agpFiles /hive/data/genomes/rn5/baylor/Rnor4.scaffold_chr.agp qualFiles /hive/data/genomes/rn5/baylor/Rnor4.chroms.fa.qual.gz dbDbSpeciesDir rat '_EOF_' # << happy emacs # this was run manually step by step, the results were not recorded makeGenomeDb.pl rn5.config.ra > makeGenomeDb.log 2>&1 # real 41m44.090s ######################################################################### ## Repeat Masker (DONE - 2008-11-14 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/rn5/bed/repeatMasker cd /hive/data/genomes/rn5/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` rn5 > do.log 2>&1 & # real 475m44.607s cat faSize.rmsk.txt # 3613304564 bases (785910941 N's 2827393623 real 1586542816 upper # 1240850807 lower) in 23 sequences in 1 files # %34.34 masked total, %43.89 masked real ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-11-14,17 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/rn5/bed/simpleRepeat cd /hive/data/genomes/rn5/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/rn5/bed/simpleRepeat rn5 > do.log 2>&1 & # real 30m4.231s # having trouble with one of them: chrM - has no result ... $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -continue=filter -buildDir=/cluster/data/rn5/bed/simpleRepeat \ rn5 > filter.log 2>&1 cat fb.simpleRepeat # 90043163 bases of 3372561689 (2.670%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/rn5 rm rn5.2bit twoBitMask rn5.rmsk.2bit -add bed/simpleRepeat/trfMask.bed rn5.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa rn5.2bit stdout | faSize stdin > rn5.2bit.faSize.txt # 3613304564 bases (785910941 N's 2827393623 real 1584580949 upper 1242812674 # lower) in 23 sequences in 1 files # %34.40 masked total, %43.96 masked real # link to gbdb ln -s `pwd`/rn5.2bit /gbdb/rn5 ########################################################################### # prepare for kluster runs (DONE _ 2008-11-17 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # rn5: 2827393623 # thus: 1024 * 2827393623/2881421696 = 1004 # rounding up to 1100 for a more conservative masking cd /hive/data/genomes/rn5 time blat rn5.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=rn5.11.ooc -repMatch=1100 # Wrote 31521 overused 11-mers to rn5.11.ooc # real 2m9.494s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/rn5 cp -p rn5.2bit chrom.sizes rn5.11.ooc \ /hive/data/staging/data/rn5 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # BLATSERVERS ENTRY (DONE - 2008-11-18 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("rn5", "blatx", "17788", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("rn5", "blatx", "17789", "0", "1");' \ hgcentraltest # test it with some sequence ########################################################################### # GENBANK operation (DONE - 2008-11-18,26 - Hiram) # need a liftUp file for the chrUn bits cd /hive/data/genomes/rn5/jkStuff grep chrUn ../rn5.agp | /cluster/bin/scripts/agpToLift > chrUn.lift # align with latest genbank process. cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add rn5 just before rn4, the entry looks like: # rn5 rn5.serverGenome = /hive/data/genomes/rn5/rn5.2bit rn5.clusterGenome = /scratch/data/rn5/rn5.2bit rn5.ooc = /scratch/data/rn5/rn5.11.ooc rn5.align.unplacedChroms = chrUn rn5.lift = /hive/data/genomes/rn5/jkStuff/chrUn.lift rn5.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} rn5.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} rn5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} rn5.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} rn5.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} rn5.downloadDir = rn5 rn5.refseq.mrna.xeno.load = yes rn5.refseq.mrna.xeno.loadDesc = yes rn5.mgc = yes rn5.upstreamGeneTbl = refGene # rn5.upstreamMaf = multiz9way # /hive/data/genomes/rn5/bed/multiz9way/species.lst cvs ci -m "Added rn5" etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank # start the alignments time nice -n +19 bin/gbAlignStep -initial rn5 & # logFile: var/build/logs/2008.11.18-14:10:02.rn5.initalign.log # real 487m2.796s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad rn5 # logFile: var/dbload/hgwdev/logs/2008.11.19-21:16:48.dbload.log # real 51m7.581s # catch up to refSeq.32 since above initial alignment was done time nice -n +19 bin/gbAlignStep rn5 & # var/build/logs/2008.11.26-10:22:43.align.log # real 43m46.613s time nice -n +19 ./bin/gbDbLoadStep rn5 # var/dbload/hgwdev/logs/2008.11.26-11:25:43.dbload.log # real 8m39.589s # enable daily alignment and update of hgwdev (DONE - 2008-11-26 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add rn5 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added rn5" etc/align.dbs etc/hgwdev.dbs make etc-update ########################################################################### # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2008-11-18 - Hiram) ssh memk mkdir /hive/data/genomes/rn5/bed/lineageSpecificRepeats cd /hive/data/genomes/rn5/bed/lineageSpecificRepeats cat << '_EOF_' > mkLSR.csh #!/bin/csh -fe rm -f ./chr$1.dateRepeats.txt rm -f /hive/data/genomes/rn5/$1/chr$1.fa.out_mus-musculus_homo-sapiens_canis-familiaris_bos-taurus_oryctolagus-cuniculus /scratch/data/RepeatMasker/DateRepeats \ /hive/data/genomes/rn5/$1/chr$1.fa.out -query rat -comp mouse \ -comp human -comp dog -comp cow -comp rabbit cp -p /hive/data/genomes/rn5/$1/chr$1.fa.out_mus-musculus_homo-sapiens_canis-familiaris_bos-taurus_oryctolagus-cuniculus ./$1.dateRepeats.txt rm -f /hive/data/genomes/rn5/$1/chr$1.fa.out_mus-musculus_homo-sapiens_canis-familiaris_bos-taurus_oryctolagus-cuniculus '_EOF_' # << happy emacs chmod +x mkLSR.csh cat << '_EOF_' > template #LOOP ./mkLSR.csh $(path1) {check out line+ chr$(path1).dateRepeats.txt} #ENDLOOP '_EOF_' # << happy emacs cut -f1 ../../chrom.sizes | sed -e "s/chr//" > chr.list gensub2 chr.list single template jobList para try ... check ... push ... etc... para time # Completed: 23 of 23 jobs # CPU time in finished jobs: 1109s 18.48m 0.31h 0.01d 0.000 y # IO & Wait Time: 116s 1.94m 0.03h 0.00d 0.000 y # Average job time: 53s 0.89m 0.01h 0.00d # Longest finished job: 97s 1.62m 0.03h 0.00d # Submission to last job: 161s 2.68m 0.04h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d mkdir notInMouse notInHuman notInDog notInCow notInRabbit for F in chr*.dateRepeats.txt do B=${F/.dateRepeats.txt/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ notInMouse/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ notInDog/${B}.out.spec /cluster/bin/scripts/extractRepeats 4 ${F} > \ notInCow/${B}.out.spec /cluster/bin/scripts/extractRepeats 5 ${F} > \ notInRabbit/${B}.out.spec done # the notInHuman, notInDog, notInCow and notInRabbit ended up being # identical. Only the notInMouse was different than them # To check identical find . -name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 # Copy to scratch staging for use in kluster runs mkdir /hive/data/staging/data/rn5/notInMouse mkdir /hive/data/staging/data/rn5/notInOthers cp -p notInMouse/*.out.spec /hive/data/staging/data/rn5/notInMouse cp -p notInHuman/*.out.spec /hive/data/staging/data/rn5/notInOthers # We also need the nibs for blastz runs with lineage specific repeats cd /hive/data/genomes/rn5 mkdir nib for C in `cut -f1 chrom.sizes` do echo $C twoBitToFa -seq=$C rn5.2bit stdout | faToNib -softMask stdin nib/$C.nib done mkdir /hive/data/staging/data/rn5/nib cp -p nib/*.nib /hive/data/staging/data/rn5/nib # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes ############################################################################# # BLASTZ Mouse Mm9 (DONE - 2008-11-18,12-01 - Hiram) screen # use screen to manage this job mkdir /hive/data/genomes/rn5/bed/blastzMm9.2008-11-18 cd /hive/data/genomes/rn5/bed/blastzMm9.2008-11-18 cat << '_EOF_' > DEF # rat vs mouse # Specially tuned blastz parameters from Webb Miller BLASTZ=lastz BLASTZ_M=254 BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/hive/data/genomes/rn5/bed/blastzMm9.2008-11-18/mouse_rat.q # TARGET: Rat Rn5 SEQ1_DIR=/scratch/data/rn5/nib SEQ1_LEN=/scratch/data/rn5/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Mouse Mm9 SEQ2_DIR=/scratch/data/mm9/nib SEQ2_LEN=/scratch/data/mm9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rn5/bed/blastzMm9.2008-11-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ `pwd`/DEF > do.log 2>&1 & time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -continue=cat `pwd`/DEF > cat.log 2>&1 & cat fb.rn5.chainMm9Link.txt # 1900898298 bases of 3372561689 (56.364%) in intersection mkdir /hive/data/genomes/mm9/bed/blastz.rn5.swap cd /hive/data/genomes/mm9/bed/blastz.rn5.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/blastzMm9.2008-11-18/DEF \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium \ -swap -stop=net > swap.log 2>&1 & # real 67m39.044s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rn5/bed/blastzMm9.2008-11-18/DEF \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium \ -debug -swap -continue=load -stop=load > load.log 2>&1 & # running the loadUp.csh manually after fixing the table names to # not clash with existing rn5 tables in mm9 # real 24m13.143s cat fb.mm9.chainRn5LastzLink.txt # 1751703421 bases of 2620346127 (66.850%) in intersection #############################################################################