# for emacs: -*- mode: sh; -*- # Pristionchus pacificus # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABKE01 ############################################################################## ## Fetch sequence (DONE - 2010-09-28 - Hiram) mkdir /cluster/data/priPac2 cd /cluster/data/priPac2 mkdir wgs cd wgs wget --timestamping -O wgs.ABKE.1.qscore.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.qscore.gz wget --timestamping -O wgs.ABKE.1.fsa_nt.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.fsa_nt.gz wget --timestamping -O wgs.ABKE.1.gbff.gz \ ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.gbff.gz zcat wgs.ABKE.1.qscore.gz | sed -e 's#^>gb|#>#; s#.1|.*##' \ | gzip -c > ABKE010.ucsc.qa.gz zcat wgs.ABKE.1.fsa_nt.gz | sed -e 's#^>gi.*ABKE#>ABKE#; s#.1|.*##' \ | gzip -c > ABKE010.ucsc.fa.gz zcat wgs.ABKE.1.fsa_nt.gz | grep "^>" | sed -e 's#^>gi.*ABKE#ABKE#; s#| Pris.*PS312##; s#, whole.*##' > accToContig.list hgFakeAgp -minContigGap=1 ABKE010.ucsc.fa.gz ABKE010.ucsc.agp cat << '_EOF_' > replaceNames.pl #!/bin/env perl use strict; use warnings; my %ableToContig; open (FH, ") { chomp $line; my ($ableName, $contigName) = split('\s+', $line); $ableName =~ s/.1$//; $ableToContig{$ableName} = $contigName; } close (FH); open (FH, ") { chomp $line; my ($acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd, $strand) = split('\s+', $line); if ($type eq "N") { printf "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd; } else { $acc =~ s/.1$//; printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $acc, $start, $end, $id, $type, $ableToContig{$acc}, $ctgStart, $ctgEnd, $strand; } } close (FH); '_EOF_' # << happy emacs chmod +x replaceNames.pl ./replaceNames.pl > contig.ucsc.agp qaToQac ABKE010.ucsc.qa.gz ABKE010.ucsc.qac ############################################################################## ## Initial browser build (DONE - 2010-09-29 - Hiram) cd /hive/data/genomes/priPac2 cat << '_EOF_' > priPac2.config.ra # Config parameters for makeGenomeDb.pl: db priPac2 clade worm scientificName Pristionchus pacificus commonName P. pacificus assemblyDate Dec. 2008 assemblyLabel Washington University School of Medicine GSC P. pacificus 5.0.1 assemblyShortLabel WUGSC 5.0.1 orderKey 884 mitoAcc none fastaFiles /hive/data/genomes/priPac2/wgs/ABKE010.ucsc.fa.gz agpFiles /hive/data/genomes/priPac2/wgs/contig.ucsc.agp qualFiles /hive/data/genomes/priPac2/wgs/ABKE010.ucsc.qac dbDbSpeciesDir worm taxId 54126 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp -workhorse=hgwdev -verbose=2 \ priPac2.config.ra > agp.log 2>&1 # real 0m6.409s time makeGenomeDb.pl -continue=db -workhorse=hgwdev -verbose=2 \ priPac2.config.ra > makeGenomeDb.log 2>&1 # real 2m21.252s ############################################################################## # REPEATMASKER (DONE - 2010-09-29 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/priPac2/bed/repeatMasker cd /hive/data/genomes/priPac2/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \ -buildDir=`pwd` priPac2 > do.log 2>&1 & # real 68m59.569s # from the do.log: # June 30 2010 (open-3-2-9) version of RepeatMasker # CC RELEASE 20090604; # RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp $ cat faSize.rmsk.txt # 133635077 bases (304 N's 133634773 real 131467395 upper # 2167378 lower) in 13962 sequences in 1 files # %1.62 masked total, %1.62 masked real ######################################################################### # SIMPLE REPEATS (DONE - 2010-09-29 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/priPac2/bed/simpleRepeat cd /hive/data/genomes/priPac2/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \ -smallClusterHub=memk -buildDir=`pwd` priPac2 > do.log 2>&1 & # real 11m23.587s cat fb.simpleRepeat # 2706905 bases of 133634773 (2.026%) in intersection ######################################################################### # run window masker (DONE - 2010-09-29 - Hiram) mkdir /hive/data/genomes/priPac2/bed/windowMasker cd /hive/data/genomes/priPac2/bed/windowMasker time doWindowMasker.pl -verbose=2 -workhorse=kolossus \ -buildDir=`pwd` priPac2 > do.log 2>&1 & # real 3m42.686s # real 4m14.628s twoBitToFa priPac2.wmsk.sdust.2bit stdout | faSize stdin # 133635077 bases (304 N's 133634773 real 103391560 upper # 30243213 lower) in 13962 sequences in 1 files # %22.63 masked total, %22.63 masked real # load this initial data to get ready to clean it hgLoadBed priPac2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 799331 elements of size 3 featureBits priPac2 windowmaskerSdust # 30243416 bases of 133634773 (22.631%) in intersection # eliminate the gaps from the masking time featureBits priPac2 -not gap -bed=notGap.bed # 133634773 bases of 133634773 (100.000%) in intersection # real 0m1.895s time nice -n +19 featureBits priPac2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 0m33.349s # 30243213 bases of 133634773 (22.631%) in intersection # reload track to get it clean hgLoadBed priPac2 windowmaskerSdust cleanWMask.bed.gz # Loaded 799488 elements of size 3 time featureBits priPac2 windowmaskerSdust > fb.priPac2.cleanWMask.txt 2>&1 # real 0m9.075s cat fb.priPac2.cleanWMask.txt # 30243213 bases of 133634773 (22.631%) in intersection cd /hive/data/genomes/priPac2 # mask the sequence with this clean mask zcat bed/windowMasker/cleanWMask.bed.gz \ | twoBitMask priPac2.unmasked.2bit stdin \ -type=.bed priPac2.2bit twoBitToFa priPac2.2bit stdout | faSize stdin > priPac2.faSize.txt cat priPac2.faSize.txt # 133635077 bases (304 N's 133634773 real 103391560 upper # 30243213 lower) in 13962 sequences in 1 files # %22.63 masked total, %22.63 masked real ln -s `pwd`/priPac2.2bit /gbdb/priPac2/priPac2.2bit ######################################################################### # Check for *all* gaps - if they are not all in the AGP file # (DONE - 2010-10-20 - Hiram) mkdir /hive/data/genomes/priPac2/bed/allGaps cd /hive/data/genomes/priPac2/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../priPac2.2bit > findMotif.txt 2>&1 # real 0m2.078s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits priPac2 -not gap -bed=notGap.bed # 133634773 bases of 133634773 (100.000%) in intersection featureBits priPac2 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 133634773 (0.000%) in intersection ### - there are no extra gaps # (see felCat4.txt for remainder of procedure if there are gaps) ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-29 - Hiram) # numerator is priPac2 gapless bases "real" as reported by faSize # denominator is hg19 gapless bases as reported by featureBits, # featureBits hg19 gap # 1024 is threshold used for human -repMatch: calc \( 133634773 / 2897310462 \) \* 1024 # ( 133634773 / 2897310462 ) * 1024 = 47.230702 # 47 is way too small, use 100 to keep the number of overused # 11-mers to a smaller number cd /hive/data/genomes/priPac2 blat priPac2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=priPac2.11.ooc -repMatch=100 # Wrote 3164 overused 11-mers to priPac2.11.ooc mkdir /hive/data/staging/data/priPac2 cp -p priPac2.2bit chrom.sizes priPac2.11.ooc \ /hive/data/staging/data/priPac2 ############################################################################## # BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("priPac2", "blat5", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("priPac2", "blat5", "17785", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset default position to ZC101.2e protein blat result ssh hgwdev hgsql -e 'update dbDb set defaultPos="ABKE01000932:1-29163" where name="priPac2";' hgcentraltest ############################################################################ # ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-07 - Hiram) cd /hive/data/genomes/priPac2 mkdir blastDb twoBitToFa priPac2.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft # 13962 pieces of 13962 written rm temp.fa cd blastDb for i in *.fa do /scratch/data/blast-2.2.11/bin/formatdb -i $i -p F done rm *.fa ## create the query protein set mkdir -p /hive/data/genomes/priPac2/bed/tblastn.ce9SG cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG echo /hive/data/genomes/priPac2/blastDb/*.nsq | xargs ls -S \ | sed "s/\.nsq//" > query.lst wc -l query.lst # 13962 query.lst # we want around 400000 jobs calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | cut -d" " -f1`/\(400000/`wc -l query.lst | cut -d" " -f1`\) # 28103/(400000/13962) = 980.935215 mkdir sgfa split -l 900 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg cd sgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S sgfa/*.fa > sg.lst mkdir blastOut for i in `cat sg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > template #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh DB=priPac2 BLASTMAT=/scratch/data/blast-2.2.11/data SCR="/scratch/tmp/${DB}" g=`basename $2` D=`basename $1` export BLASTMAT DB SCR g D mkdir -p ${SCR} cp -p $1.* ${SCR} f=${SCR}/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \ -e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/x86_64/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 \ /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \ /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.* rmdir --ignore-fail-on-non-empty ${SCR} exit 1 '_EOF_' # << happy emacs chmod +x blastSome ssh swarm cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG gensub2 query.lst sg.lst template jobList para create jobList para try, check, push, check etc. # Completed: 446784 of 446784 jobs # CPU time in finished jobs: 6100754s 101679.24m 1694.65h 70.61d 0.193 y # IO & Wait Time: 1723436s 28723.93m 478.73h 19.95d 0.055 y # Average job time: 18s 0.29m 0.00h 0.00d # Longest finished job: 52s 0.87m 0.01h 0.00d # Submission to last job: 18781s 313.02m 5.22h 0.22d # do the cluster run for chaining ssh swarm mkdir /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun cat << '_EOF_' > template #LOOP chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainOne #!/bin/csh -fe cd $1 set b = $1:t cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \ /hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut/c.$b.psl '_EOF_' # << happy emacs chmod +x chainOne ls -1dS /hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut/sg?? \ > chain.lst gensub2 chain.lst single template jobList cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun para create jobList para try, check, push, check etc. # Completed: 32 of 32 jobs # CPU time in finished jobs: 305s 5.09m 0.08h 0.00d 0.000 y # IO & Wait Time: 25350s 422.50m 7.04h 0.29d 0.001 y # Average job time: 802s 13.36m 0.22h 0.01d # Longest finished job: 833s 13.88m 0.23h 0.01d # Submission to last job: 1149s 19.15m 0.32h 0.01d cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut for i in sg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \ > ../blastCe9SG.psl cd .. pslCheck blastCe9SG.psl # checked: 20932 failed: 0 errors: 0 # load table ssh hgwdev cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG hgLoadPsl priPac2 blastCe9SG.psl # check coverage featureBits priPac2 blastCe9SG # 5436779 bases of 133634773 (4.068%) in intersection featureBits cb3 blastCe9SG # 18490367 bases of 108433446 (17.052%) in intersection featureBits caeRem3 blastCe9SG # 20302540 bases of 138406388 (14.669%) in intersection featureBits caePb2 blastCe9SG # 23730009 bases of 170473138 (13.920%) in intersection featureBits caeJap3 blastCe9SG # 12894398 bases of 154057934 (8.370%) in intersection featureBits melHap1 blastCe9SG # 4376245 bases of 53017507 (8.254%) in intersection featureBits melInc1 blastCe9SG # 3882043 bases of 82095019 (4.729%) in intersection featureBits bruMal1 blastCe9SG # 4424694 bases of 89235536 (4.958%) in intersection featureBits haeCon1 blastCe9SG # 4990746 bases of 278844984 (1.790%) in intersection featureBits ce9 sangerGene # 28689552 bases of 100286004 (28.608%) in intersection rm -rf blastOut ######################################################################### # GENBANK AUTO UPDATE (DONE - 2010-10-20 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add priPac2 just before priPac1 # priPac2 (P. pacificus) priPac2.serverGenome = /hive/data/genomes/priPac2/priPac2.2bit priPac2.clusterGenome = /scratch/data/priPac2/priPac2.2bit priPac2.ooc = /scratch/data/priPac2/priPac2.11.ooc priPac2.lift = no priPac2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} priPac2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} priPac2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} priPac2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} priPac2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} priPac2.refseq.mrna.native.load = yes priPac2.refseq.mrna.xeno.load = yes priPac2.refseq.mrna.xeno.loadDesc = yes priPac2.genbank.mrna.xeno.load = yes priPac2.genbank.est.native.load = yes priPac2.genbank.est.native.loadDesc = no priPac2.downloadDir = priPac2 priPac2.perChromTables = no git commit -m "Added priPac2." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial priPac2 & # logFile: var/build/logs/2010.10.20-11:29:48.priPac2.initalign.log # real 179m55.957s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad priPac2 # logFile: var/dbload/hgwdev/logs/2010.10.20-15:03:44.dbload.log # real 24m11.438s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add priPac2 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added priPac2 - P. pacificus" etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # WS210 WormBase genes loading (DONE - 2010-10-14 - Hiram) mkdir /hive/data/genomes/priPac2/bed/ws210Gene cd /hive/data/genomes/priPac2/bed/ws210Gene ln -s /hive/data/genomes/priPac1/bed/blat.priPac2.2010-10-14/priPac1ToPriPac2.over.chain.gz . ln -s /hive/data/genomes/priPac1/bed/ws210Gene/priPac1.ws210Gene.gp . export DB=priPac2 liftOver -genePred priPac1.ws210Gene.gp priPac1ToPriPac2.over.chain.gz \ ${DB}.ws210Gene.gp ws210gene.unmapped.gp genePredCheck -db=${DB} ${DB}.ws210Gene.gp # checked: 24319 failed: 0 hgLoadGenePred ${DB} ws210Gene ${DB}.ws210Gene.gp ######################################################################### # set this as the defaultDb (DONE - 2010-10-20 - Hiram) hgsql -e 'update defaultDb set name="priPac2" where name="priPac1";' \ hgcentraltest ######################################################################### # verify all.joiner has everything (DONE - 2010-10-20 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=priPac2 ./all.joiner joinerCheck -keys -database=priPac2 ./all.joiner joinerCheck -times -database=priPac2 ./all.joiner joinerCheck -all -database=priPac2 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ######################################################################### # construct downloads files (DONE - 2010-10-20 - Hiram) cd /hive/data/genomes/priPac2 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 priPac2 \ > downloads.log 2>&1 ######################################################################### ## Creating pushQ (DONE - 2010-10-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/priPac2/pushQ cd /hive/data/genomes/priPac2/pushQ makePushQSql.pl priPac2 > priPac2.sql 2> errorLog.out ## check the errorLog.out for anything that needs to be fixed ## copy priPac2.sql to hgwbeta:/tmp ## then on hgwbeta: hgsql qapushq < priPac2.sql #######################################################################