# for emacs: -*- mode: sh; -*- # Tarsius syrichta # (original doc had incorrect name of Tarsier syrichta) # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRT00 ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-10-07) ssh kolossus mkdir /hive/data/genomes/tarSyr1 ln -s /cluster/store12/tarSyr1 /cluster/data mkdir /cluster/data/tarSyr1/broad cd /cluster/data/tarSyr1/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.quals.gz md5sum ass* > assembly.md5sum qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin tarSyr1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 656709 ######################################################################### # Create .ra file and run makeGenomeDb.pl (working... braney) ssh hgwdev screen cd /cluster/data/tarSyr1 cat << _EOF_ >tarSyr1.config.ra # Config parameters for makeGenomeDb.pl: db tarSyr1 clade mammal genomeCladePriority 35 scientificName Tarsier syrichta commonName Tarsier assemblyDate Aug. 
2008 assemblyLabel Broad Institute tarSyr1 orderKey 236.5 #mitoAcc AJ222767 mitoAcc none fastaFiles /cluster/data/tarSyr1/broad/assembly.bases.gz agpFiles /cluster/data/tarSyr1/broad/assembly.agp qualFiles /cluster/data/tarSyr1/broad/tarSyr1.qual.qac dbDbSpeciesDir tarsier _EOF_ # use 'screen' make sure on kkstore05 makeGenomeDb.pl -workhorse kolossus -verbose=2 tarSyr1.config.ra > makeGenomeDb.out 2>&1 & # 'ctl-a ctl -d' returns to previous shell cut -f 2 chrom.sizes | ave stdin # Q1 7169.500000 # median 13298.000000 # Q3 55788.500000 # average 866164.007952 # min 3002.000000 # max 88675666.000000 # count 3144 # total 2723219641.000000 # standard deviation 5316243.688364 ######################################################################### # REPEATMASKER (not done) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/tarSyr1/bed/repeatMasker cd /cluster/data/tarSyr1/bed/repeatMasker /cluster/bin/scripts/doRepeatMasker.pl -species "tarsier" \ -buildDir=/cluster/data/tarSyr1/bed/repeatMasker \ tarSyr1 > do.log 2>&1 & # new parasol, lots of crashes, no times /cluster/bin/scripts/doRepeatMasker.pl -species "tarsier" \ -continue cat -buildDir=/cluster/data/tarSyr1/bed/repeatMasker \ tarSyr1 > do2.log 2>&1 & time nice -n +19 featureBits tarSyr1 rmsk > fb.tarSyr1.rmsk.txt 2>&1 & # 1154651023 bases of 2771976320 (41.654%) in intersection ######################################################################### # SIMPLE REPEATS TRF (not done) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/tarSyr1/bed/simpleRepeat cd /cluster/data/tarSyr1/bed/simpleRepeat # doSimpleRepeat.pl -buildDir=/cluster/data/tarSyr1/bed/simpleRepeat \ tarSyr1 > do.log 2>&1 & #### When done ssh pk para time # Completed: 51 of 51 jobs # CPU time in finished jobs: 24985s 416.41m 6.94h 0.29d # 0.001 y # IO & Wait Time: 101s 1.69m 0.03h 0.00d # 0.000 y # Average job time: 492s 8.20m 0.14h 0.01d # Longest finished job: 3887s 64.78m 1.08h 0.04d # 
Submission to last job: 3911s 65.18m 1.09h 0.05d featureBits tarSyr1 simpleRepeat # 50851363 bases of 2771976320 (1.834%) in intersection # after RM run is done, add this mask: cd /cluster/data/tarSyr1 twoBitMask tarSyr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tarSyr1.2bit twoBitToFa tarSyr1.2bit stdout | faSize stdin # 3183347966 bases (411371646 N's 2771976320 real 1619032005 upper # 1152944315 lower) in 659020 sequences in 1 files # Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427 # (scaffold_0) median 1645 # N count: mean 624.2 sd 1398.1 # U count: mean 2456.7 sd 5229.3 # L count: mean 1749.5 sd 2909.8 # %36.22 masked total, %41.59 masked real twoBitToFa tarSyr1.rmsk.2bit stdout | faSize stdin # 3183347966 bases (411371646 N's 2771976320 real 1619660149 upper 1152316171 # lower) in 659020 sequences in 1 files # Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427 # (scaffold_0) median 1645 # N count: mean 624.2 sd 1398.1 # U count: mean 2457.7 sd 5230.7 # L count: mean 1748.5 sd 2908.4 # %36.20 masked total, %41.57 masked real # Link to it from /gbdb ln -s /cluster/data/tarSyr1/tarSyr1.2bit /gbdb/tarSyr1/tarSyr1.2bit # mkdir /san/sanvol1/scratch/tarSyr1 cp /cluster/data/tarSyr1/tarSyr1.2bit /san/sanvol1/scratch/tarSyr1 cp /cluster/data/tarSyr1/chrom.sizes /san/sanvol1/scratch/tarSyr1 ######################################################################### ## Repeat Masker (DONE - 2008-10-15 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/tarSyr1/bed/repeatMasker cd /hive/data/genomes/tarSyr1/bed/repeatMasker doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \ -species=tarsier -buildDir=`pwd` tarSyr1 > do.log 2>&1 & # real 810m18.865s # failed in doMask.csh due to no tarSyr1.unmasked.2bit file ? # which was there at the beginning, now it is gone ? 
# the broken doRepeatMasker.pl script removed it due to hive confusion # went back to the jkStuff/makeUnmasked script and re-ran part of that. # then ran the doMask.csh script to finish that, and continuing: time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -continue=install -workhorse=hgwdev -bigClusterHub=swarm \ -species=tarsier -buildDir=`pwd` tarSyr1 > install.log 2>&1 & # real 26m23.595s twoBitToFa tarSyr1.rmsk.2bit stdout | faSize stdin # 3179905132 bases (411368789 N's 2768536343 real 1613280363 upper 1155255980 # lower) in 656709 sequences in 1 files # %36.33 masked total, %41.73 masked real ########################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-16 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/tarSyr1/bed/simpleRepeat cd /hive/data/genomes/tarSyr1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/hive/data/genomes/tarSyr1/bed/simpleRepeat \ tarSyr1 > do.log 2>&1 & # real 382m17.104s cat fb.simpleRepeat # 50848145 bases of 2768536343 (1.837%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/tarSyr1 rm tarSyr1.2bit twoBitMask tarSyr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tarSyr1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa tarSyr1.2bit stdout | faSize stdin > tarSyr1.2bit.faSize.txt # 3179905132 bases (411368789 N's 2768536343 real 1612652870 upper 1155883473 # lower) in 656709 sequences in 1 files # %36.35 masked total, %41.75 masked real # link to gbdb ln -s `pwd`/tarSyr1.2bit /gbdb/tarSyr1 ########################################################################### # prepare for kluster runs (DONE - 2008-10-16 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # tarSyr1: 2768536343 # thus: 1024 * 2768536343/2881421696 = 983 # rounding up to 1000 for a bit more conservative masking cd /hive/data/genomes/tarSyr1 time blat 
tarSyr1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=tarSyr1.11.ooc -repMatch=1000 # Wrote 31947 overused 11-mers to tarSyr1.11.ooc # real 2m30.155s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/tarSyr1 cp -p tarSyr1.2bit chrom.sizes tarSyr1.11.ooc \ /hive/data/staging/data/tarSyr1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Broad Institute tarSyr1 (NCBI project 20335, ABRT000000000)" where name="tarSyr1";' hgcentraltest ############################################################################ # tarSyr1 - Tarsier - Ensembl Genes version 51 (DONE - 2008-12-04 - hiram) ssh kkr14u02 cd /hive/data/genomes/tarSyr1 cat << '_EOF_' > tarSyr1.ensGene.ra # required db variable db tarSyr1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the 2,819 genes that do not translate properly to UCSC # coordinates # out of 43,529 genes '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 tarSyr1.ensGene.ra ssh hgwdev cd /hive/data/genomes/tarSyr1/bed/ensGene.51 featureBits tarSyr1 ensGene # 20086184 bases of 2768536343 (0.726%) in intersection *** All done! 
(through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/tarSyr1/bed/ensGene.51 ############################################################################ # Swap lastz from Mm10 (DONE - 2012-03-12 - Hiram) # original alignment on Mm10 cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cat fb.mm10.chainTarSyr1Link.txt # 651517559 bases of 2652783500 (24.560%) in intersection # and this swap: mkdir /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap cd /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 746m30.852s cat fb.tarSyr1.chainMm10Link.txt # 691746721 bases of 2768536343 (24.986%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tarSyr1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # cpgIsland - (DONE - 2012-04-21 - Hiram) mkdir /hive/data/genomes/tarSyr1/bed/cpgIslands cd /hive/data/genomes/tarSyr1/bed/cpgIslands time doCpgIslands.pl tarSyr1 > do.log 2>&1 # real 664m57.705s # there is a problem with this command: #cut -f1 cpgIsland.bed | sort -u | awk '{print length($0)}' | sort -rn | head -1 # replace the 'head -1' with: sed -n -e '1,1 p' # finished manually with C set at 15: time ./doLoadCpg.csh # real 3m25.459s # continuing: time doCpgIslands.pl -continue=cleanup tarSyr1 # real 34m36.015s cat fb.tarSyr1.cpgIslandExt.txt # 9784198 bases of 2768536343 (0.353%) in intersection ############################################################################## # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/tarSyr1/bed/genscan cd /hive/data/genomes/tarSyr1/bed/genscan time doGenscan.pl tarSyr1 > do.log 2>&1 # real 1086m30.217s cat fb.tarSyr1.genscan.txt # 
49393140 bases of 2768536343 (1.784%) in intersection cat fb.tarSyr1.genscanSubopt.txt # 46556603 bases of 2768536343 (1.682%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-03 - Hiram) screen -S tarSyr1 mkdir /hive/data/genomes/tarSyr1/bed/windowMasker cd /hive/data/genomes/tarSyr1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev tarSyr1 > do.log 2>&1 # real 2741m46.799s sed -e 's/^/ #\t/' fb.tarSyr1.windowmaskerSdust.beforeClean.txt # 1510322967 bases of 3179905132 (47.496%) in intersection sed -e 's/^/ #\t/' fb.tarSyr1.windowmaskerSdust.clean.txt # 1098954178 bases of 3179905132 (34.559%) in intersection sed -e 's/^/ #\t/' fb.tarSyr1.rmsk.windowmaskerSdust.txt # 669452844 bases of 3179905132 (21.053%) in intersection ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - 2012-06-22 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Tarsius syrichta 8 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add tarSyr1 # tarSyr1 (Tarsier) tarSyr1.serverGenome = /hive/data/genomes/tarSyr1/tarSyr1.2bit tarSyr1.clusterGenome = /scratch/data/tarSyr1/tarSyr1.2bit tarSyr1.ooc = /scratch/data/tarSyr1/tarSyr1.11.ooc tarSyr1.lift = no tarSyr1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} tarSyr1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} tarSyr1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} 
tarSyr1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} tarSyr1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} tarSyr1.refseq.mrna.native.load = no tarSyr1.refseq.mrna.xeno.load = yes tarSyr1.genbank.mrna.xeno.load = yes tarSyr1.genbank.est.native.load = yes tarSyr1.downloadDir = tarSyr1 tarSyr1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding tarSyr1 tarsier" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for tarSyrNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S tarSyr1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial tarSyr1 & # var/build/logs/2012.06.08-10:01:20.tarSyr1.initalign.log # real 4309m23.845s # after finishing the kluster run manually: # Completed: 650332 of 650332 jobs # CPU time in finished jobs: 2156583s 35943.06m 599.05h 24.96d 0.068 y # IO & Wait Time: 3057671s 50961.18m 849.35h 35.39d 0.097 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest finished job: 210s 3.50m 0.06h 0.00d # Submission to last job: 207812s 3463.53m 57.73h 2.41d time nice -n +19 ./bin/gbAlignStep -initial -continue=finish tarSyr1 & # var/build/logs/2012.06.16-12:07:23.tarSyr1.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad tarSyr1 & # logFile: var/dbload/hgwdev/logs/2012.06.22-14:55:19.dbload.log # real 64m45.767s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add tarSyr1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added tarSyr1." 
etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_66368:1-2388" where name="tarSyr1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/tarSyr1/pushQ cd /hive/data/genomes/tarSyr1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl tarSyr1 2> stderr.txt | grep -v transMap > tarSyr1.sql # real 4m6.158s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/tarSyr1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/tarSyr1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/tarSyr1/bbi/quality.bw # WARNING: tarSyr1 does not have seq # WARNING: tarSyr1 does not have extFile # WARNING: tarSyr1 does not have estOrientInfo scp -p tarSyr1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/tarSyr1.sql" ############################################################################