# for emacs: -*- mode: sh; -*- # Vicugna pacos # http://www.ncbi.nlm.nih.gov/bioproject/30567 # http://www.ncbi.nlm.nih.gov/genome/905 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRR01 ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-10-07) ssh kolossus screen mkdir /hive/data/genomes/vicPac1 ln -s /hive/data/genomes/vicPac1 /cluster/data mkdir /cluster/data/vicPac1/broad cd /cluster/data/vicPac1/broad wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.agp \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.bases.gz \ ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.quals.gz qaToQac assembly.quals.gz stdout | \ qacAgpLift assembly.agp stdin vicPac1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 298413 ######################################################################### # Create .ra file and run makeGenomeDb.pl (working braney... cd /cluster/data/vicPac1 cat << _EOF_ >vicPac1.config.ra # Config parameters for makeGenomeDb.pl: db vicPac1 clade mammal genomeCladePriority 35 scientificName Vicugna vicugna commonName Alpaca assemblyDate Jul. 2008 assemblyLabel Broad Institute vicPac1 orderKey 233.5 #mitoAcc AJ222767 mitoAcc none fastaFiles /cluster/data/vicPac1/broad/assembly.bases.gz agpFiles /cluster/data/vicPac1/broad/assembly.agp qualFiles /cluster/data/vicPac1/broad/vicPac1.qual.qac dbDbSpeciesDir alpaca _EOF_ # use 'screen' make sure on kkstore05 makeGenomeDb.pl -workhorse kolossus -verbose=2 vicPac1.config.ra > makeGenomeDb.out 2>&1 & # 'ctl-a ctl -d' returns to previous shell cut -f 2 chrom.sizes | ave stdin # Q1 1042.000000 # median 1372.000000 # Q3 2292.000000 # average 9926.690888 # min 555.000000 # max 5516956.000000 # count 298413 # total 2962253608.000000 # standard deviation 67785.419160 ######################################################################### # REPEATMASKER (not done) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/vicPac1/bed/repeatMasker cd /cluster/data/vicPac1/bed/repeatMasker doRepeatMasker.pl -buildDir=/cluster/data/vicPac1/bed/repeatMasker \ vicPac1 > do.log 2>&1 & # Note: can run simpleRepeats simultaneously #### When done with RM: ssh pk para time # Completed: 7141 of 7141 jobs # CPU time in finished jobs: 19375689s 322928.14m 5382.14h 224.26d 0.614 y # IO & Wait Time: 185390s 3089.84m 51.50h 2.15d 0.006 y # Average job time: 2739s 45.65m 0.76h 0.03d # Longest finished job: 13925s 232.08m 3.87h 0.16d # Submission to last job: 175075s 2917.92m 48.63h 2.03d time nice -n +19 featureBits vicPac1 rmsk > fb.vicPac1.rmsk.txt 2>&1 & # 614774346 bases of 1923010363 (31.969%) in intersection # RepeatMasker and lib version from do.log: # Jun 13 2008 (open-3-2-5) version of RepeatMasker # CC RELEASE 20080611; ######################################################################### # SIMPLE REPEATS TRF (not done) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/vicPac1/bed/simpleRepeat cd /cluster/data/vicPac1/bed/simpleRepeat # had to change genus/species to "vicugna genus" in dummyRun doSimpleRepeat.pl -buildDir=/cluster/data/vicPac1/bed/simpleRepeat \ vicPac1 > do.log 2>&1 & #### When done ssh pk para time # Completed: 60 of 60 jobs # CPU time in finished jobs: 71877s 1197.95m 19.97h 0.83d 0.002 y # IO & Wait Time: 2811s 46.85m 0.78h 0.03d 0.000 y # Average job time: 1245s 20.75m 0.35h 0.01d # Longest finished job: 12819s 213.65m 3.56h 0.15d # Submission to last job: 13133s 218.88m 3.65h 0.15d featureBits vicPac1 simpleRepeat # 77931003 bases of 1923010363 (4.053%) in intersection # after RM run is done, add this mask: cd /cluster/data/vicPac1 twoBitMask vicPac1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicPac1.2bit twoBitToFa vicPac1.2bit stdout | faSize stdin # 2962350360 bases (1039339997 N's 1923010363 real 1305704427 upper 617305936 # lower) in 298420 sequences in 1 files # Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956 (scaffold_0) median 1372 # N count: mean 3482.8 sd 21549.2 # U count: mean 4375.4 sd 33856.2 # L count: mean 2068.6 sd 15148.2 # %20.84 masked total, %32.10 masked real twoBitToFa vicPac1.rmsk.2bit stdout | faSize stdin # 2962350360 bases (1039339997 N's 1923010363 real 1309436724 upper 613573639 # lower) in 298420 sequences in 1 files # Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956 # (scaffold_0) median 1372 # N count: mean 3482.8 sd 21549.2 # U count: mean 4387.9 sd 33863.9 # L count: mean 2056.1 sd 15140.4 # %20.71 masked total, %31.91 masked real ln -s /cluster/data/vicPac1/vicPac1.2bit /gbdb/vicPac1/vicPac1.2bit cp /cluster/data/vicPac1/vicPac1.2bit /san/sanvol1/scratch/vicPac1 cp /cluster/data/vicPac1/chrom.sizes /san/sanvol1/scratch/vicPac1 ######################################################################### ## Repeat Masker (DONE - 2008-10-16 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/vicPac1/bed/repeatMasker cd /hive/data/genomes/vicPac1/bed/repeatMasker time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` vicPac1 > do.log 2>&1 & # real 292m6.556s twoBitToFa vicPac1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 2962253608 bases (1039343173 N's 1922910435 real 1306779321 upper 616131114 # lower) in 298413 sequences in 1 files # %20.80 masked total, %32.04 masked real ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-10-17 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/vicPac1/bed/simpleRepeat cd /hive/data/genomes/vicPac1/bed/simpleRepeat # time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \ -buildDir=/cluster/data/vicPac1/bed/simpleRepeat vicPac1 > do.log 2>&1 & # real 74m17.044s cat fb.simpleRepeat # 77925712 bases of 1922910435 (4.052%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/vicPac1 rm vicPac1.2bit twoBitMask vicPac1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicPac1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa vicPac1.2bit stdout | faSize stdin > vicPac1.2bit.faSize.txt # 2962253608 bases (1039343173 N's 1922910435 real 1303047359 upper 619863076 # lower) in 298413 sequences in 1 files # %20.93 masked total, %32.24 masked rea # link to gbdb ln -s `pwd`/vicPac1.2bit /gbdb/vicPac1 ########################################################################### # prepare for kluster runs (DONE _ 2008-10-22 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # vicPac1: 1922910435 # thus: 1024 * 1922910435/2881421696 = 683 # rounding up to 700 for a more conservative masking cd /hive/data/genomes/vicPac1 time blat vicPac1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=vicPac1.11.ooc -repMatch=700 # Wrote 25830 overused 11-mers to vicPac1.11.ooc # real 2m8.728s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/vicPac1 cp -p vicPac1.2bit chrom.sizes vicPac1.11.ooc \ /hive/data/staging/data/vicPac1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # fix the scientific name (DONE - 2008-10-22 - Hiram) hgsql -e 'update dbDb set scientificName="Vicugna pacos" where name="vicPac1";' hgcentraltest ############################################################################ # vicPac1 - Alpaca - Ensembl Genes version 51 (DONE - 2008-12-04 - hiram) ssh swarm cd /hive/data/genomes/vicPac1 cat << '_EOF_' > vicPac1.ensGene.ra # required db variable db vicPac1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the 53 genes that do not translate properly to UCSC coordinates '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 vicPac1.ensGene.ra ssh hgwdev cd /hive/data/genomes/vicPac1/bed/ensGene.51 featureBits vicPac1 ensGene # 17730015 bases of 1922910435 (0.922%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/vicPac1/bed/ensGene.51 ############################################################################ # SWAP mm10 lastz (DONE - 2012-03-19 - Hiram) # original alignment to mm10: cat /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/fb.mm10.chainVicPac1Link.txt # 600477253 bases of 2652783500 (22.636%) in intersection # and this swap mkdir /hive/data/genomes/vicPac1/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 159m21.952s cat fb.vicPac1.chainMm10Link.txt # 610885692 bases of 1922910435 (31.769%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/vicPac1/bed/cpgIslands cd /hive/data/genomes/vicPac1/bed/cpgIslands time doCpgIslands.pl vicPac1 > do.log 2>&1 # real 897m19.342s # fixing broken command in the script: time ./doLoadCpg.csh # real 2m37.853s time doCpgIslands.pl -continue=cleanup vicPac1 > cleanup.log 2>&1 # real 119m41.201s cat fb.vicPac1.cpgIslandExt.txt # 9241028 bases of 1922910435 (0.481%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/vicPac1/bed/genscan cd /hive/data/genomes/vicPac1/bed/genscan time doGenscan.pl vicPac1 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 445m44.260s # continuing: time doGenscan.pl -continue=load vicPac1 > load.log 2>&1 # real 40m21.604s cat fb.vicPac1.genscan.txt # 46607076 bases of 1922910435 (2.424%) in intersection cat fb.vicPac1.genscanSubopt.txt # 54656455 bases of 1922910435 (2.842%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S vicPac1 mkdir /hive/data/genomes/vicPac1/bed/windowMasker cd /hive/data/genomes/vicPac1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev vicPac1 > do.log 2>&1 # real 1042m31.380s # fixing a broken command in the doLoad step: time ./lastLoad.csh # real 9m4.895s sed -e 's/^/ #\t/' fb.vicPac1.windowmaskerSdust.beforeClean.txt # 1672360288 bases of 2962253608 (56.456%) in intersection sed -e 's/^/ #\t/' fb.vicPac1.windowmaskerSdust.clean.txt # 633017115 bases of 2962253608 (21.369%) in intersection sed -e 's/^/ #\t/' fb.vicPac1.rmsk.windowmaskerSdust.txt # 266161799 bases of 2962253608 (8.985%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-07 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1922910435 / 2897316137 \) \* 1024 # ( 1922910435 / 2897316137 ) * 1024 = 679.615269 # round up to 700 cd /hive/data/genomes/vicPac1 time blat vicPac1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/vicPac1.11.ooc -repMatch=700 # Wrote 25830 overused 11-mers to jkStuff/vicPac1.11.ooc # Done making jkStuff/vicPac1.11.ooc # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" vicPac1 | sort | uniq -c # 422879 yes # cd /hive/data/genomes/vicPac1/jkStuff # gapToLift vicPac1 vicPac1.nonBridged.lift -bedFile=vicPac1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' vicPac1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-07 - 2012-06-22 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Vicugna pacos 164 7286 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add vicPac1 # vicPac1 (alpaca) vicPac1.serverGenome = /hive/data/genomes/vicPac1/vicPac1.2bit vicPac1.clusterGenome = /hive/data/genomes/vicPac1/vicPac1.2bit vicPac1.ooc = /hive/data/genomes/vicPac1/jkStuff/vicPac1.11.ooc vicPac1.lift = no vicPac1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} vicPac1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} vicPac1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} vicPac1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} vicPac1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} vicPac1.refseq.mrna.native.load = no vicPac1.refseq.mrna.xeno.load = yes vicPac1.genbank.mrna.native.load = no vicPac1.genbank.mrna.xeno.load = no vicPac1.genbank.est.native.load = yes vicPac1.downloadDir = vicPac1 vicPac1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding vicPac1 alpaca" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for vicPacNames" src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S vicPac1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial vicPac1 & # var/build/logs/2012.06.16-19:10:31.vicPac1.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad vicPac1 & # var/dbload/hgwdev/logs/2012.06.22-12:27:12.dbload.log # real 39m1.879s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add vicPac1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added vicPac1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ##################################################################### # SWAP bosTau7 LASTZ (DONE - 2012-06-19 - Chin) # original alignment done at bosTau7.txt cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19 cat fb.bosTau7.chainVicPac1Link.txt # 1259182233 bases of 2804673174 (44.896%) in intersection # Create link cd /hive/data/genomes/bosTau7/bed ln -s lastzVicPac1.2012-06-19 lastz.vicPac1 # and the swap mkdir /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap cd /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 717m22.295s cat fb.vicPac1.chainBosTau7Link.txt # 1309266730 bases of 1922910435 (68.088%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="scaffold_5479:34441-51753" where name="vicPac1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/vicPac1/pushQ cd /hive/data/genomes/vicPac1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl vicPac1 2> stderr.txt | grep -v transMap > vicPac1.sql # real 3m49.201s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/vicPac1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/vicPac1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/vicPac1/bbi/quality.bw # WARNING: vicPac1 does not have seq # WARNING: vicPac1 does not have extFile # WARNING: vicPac1 does not have estOrientInfo scp -p vicPac1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/vicPac1.sql" ############################################################################