# for emacs: -*- mode: sh; -*- # Sus scrofa - SGSC Sscrofa8 NCBI project 10718, CM000812 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2009-10-14 - Hiram) mkdir /hive/data/genomes/susScr1 cd /hive/data/genomes/susScr1 mkdir sanger cd sanger for F in README Sus_scrofa.Sscrofa9.53.dna.chromosome.fa.bz2 \ Sus_scrofa.Sscrofa9.53_repeat_coords.txt.bz2 \ Sus_scrofa9.agp Sus_scrofa9.pgp do wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/S_scrofa/assemblies/Ensembl_Sscrofa9/${F}" done bunzip *.bz2 gzip Sus_scrofa.Sscrofa9.53.dna.chromosome.fa \ Sus_scrofa.Sscrofa9.53_repeat_coords.txt grep -v "^#" Sus_scrofa9.agp > susScr1.agp zcat Sus_scrofa.Sscrofa9.53.dna.chromosome.fa.gz \ | sed -e "s/^>/>chr/" | gzip > susScr1.fa.gz ######################################################################### # Initial makeGenomeDb.pl (DONE - 2009-11-06 - Hiram) cd /hive/data/genomes/susScr1 cat << '_EOF_' > susScr1.config.ra # Config parameters for makeGenomeDb.pl: db susScr1 clade mammal genomeCladePriority 35 scientificName Sus scrofa commonName Pig assemblyDate Apr. 2009 assemblyLabel SGSC Sscrofa8 (NCBI project 10718, CM000812) orderKey 235 mitoAcc NC_012095 fastaFiles /hive/data/genomes/susScr1/sanger/susScr1.fa.gz agpFiles /hive/data/genomes/susScr1/sanger/susScr1.agp # qualFiles none dbDbSpeciesDir pig taxId 9823 '_EOF_' makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev susScr1.config.ra \ > makeGenomeDb.log 2>&1 # add the trackDb entries to the source tree ln -s `pwd`/susScr1.unmasked.2bit /gbdb/susScr1/susScr1.2bit # browser should function now ######################################################################### # RepeatMasker (DONE - 2009-11-06 - Hiram) mkdir /hive/data/genomes/susScr1/bed/repeatMasker cd /hive/data/genomes/susScr1/bed/repeatMasker doRepeatMasker.pl -buildDir=`pwd` -workhorse=hgwdev -bigClusterHub=pk \ -noSplit susScr1 > do.log 2>&1 # about 7.5 hours cat faSize.rmsk.txt # 2262596571 bases (31264552 N's 2231332019 real 1286238193 upper # 945093826 lower) in 20 sequences in 1 files # %41.77 masked total, %42.36 masked real ######################################################################### # simpleRepeats (DONE - 2009-11-06 - Hiram) mkdir /hive/data/genomes/susScr1/bed/simpleRepeat cd /hive/data/genomes/susScr1/bed/simpleRepeat doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev -bigClusterHub=pk \ -smallClusterHub=pk susScr1 > do.log 2>&1 cat fb.simpleRepeat # 26577444 bases of 2231496571 (1.191%) in intersection # add to the repeatMasker cd /hive/data/genomes/susScr1 twoBitMask susScr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed susScr1.2bit twoBitToFa susScr1.2bit stdout | faSize stdin > susScr1.2bit.faSize.txt cat susScr1.2bit.faSize.txt # 2262596571 bases (31264552 N's 2231332019 real 1285077160 upper # 946254859 lower) in 20 sequences in 1 files # %41.82 masked total, %42.41 masked real ######################################################################## # Marking *all* gaps - they are not all in the AGP file # (DONE - 2009-11-09 - Hiram) mkdir /hive/data/genomes/susScr1/bed/allGaps cd /hive/data/genomes/susScr1/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../susScr1.unmasked.2bit > findMotif.txt 2>&1 # real 1m12.153s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits susScr1 -not gap -bed=notGap.bed featureBits susScr1 allGaps.bed notGap.bed -bed=new.gaps.bed # what is the last index in the existing gap table: hgsql -N -e "select ix from gap;" susScr1 | sort -n | tail -1 # 27297 cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" susScr1 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.gap hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad susScr1 otherGap other.gap # Loaded 96565 # adding this many: wc -l bed.tab # 96565 # starting with this many hgsql -e "select count(*) from gap;" susScr1 # 100202 hgsql susScr1 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" susScr1 # 196767 # == 100202 + 96565 ######################################################################## # Create kluster run files (DONE - 2009-11-09 - Hiram) cd /hive/data/genomes/susScr1 blat susScr1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/susScr1.11.ooc \ -repMatch=800 # Wrote 28011 overused 11-mers to jkStuff/susScr1.11.ooc mkdir /hive/data/staging/data/susScr1 cp -p susScr1.2bit jkStuff/susScr1.11.ooc /hive/data/staging/data/susScr1 cp -p chrom.sizes /hive/data/staging/data/susScr1 gapToLift susScr1 jkStuff/nonBridged.lft -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/susScr1/susScr1.nonBridged.lft ######################################################################### # GENBANK AUTO UPDATE (DONE - 2009-11-09 - Hiram) ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add the following two lines to src/lib/gbGenome.c # static char *susScrNames[] = {"Sus scrofa", "Sus scrofa coreanus", # "Sus scrofa domestica", "Sus scrofa ussuricus", NULL}; cvs ci -m "Adding susScr - Pig" src/lib/gbGenome.c make install-server # edit etc/genbank.conf to add susScr1 just before equCab2 # susScr1 (Pig) susScr1.serverGenome = /hive/data/genomes/susScr1/susScr1.2bit susScr1.clusterGenome = /scratch/data/susScr1/susScr1.2bit susScr1.ooc = /scratch/data/susScr1/susScr1.11.ooc susScr1.lift = /scratch/data/susScr1/susScr1.nonBridged.lft susScr1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} susScr1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} susScr1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} susScr1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} susScr1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} susScr1.downloadDir = susScr1 susScr1.genbank.mrna.xeno.loadDesc = yes susScr1.refseq.mrna.native.load = no cvs ci -m "Added susScr1" etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial susScr1 & # logFile: var/build/logs/2010.01.14-11:14:40.susScr1.initalign.log # real 182m41.068s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad susScr1 # logFile: var/dbload/hgwdev/logs/2010.01.15-15:54:26.dbload.log XXX - running Fri Jan 15 15:54:42 PST 2010 # real 31m29.282s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add susScr1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added susScr1 - Tetraodon Nigirividis" \ etc/align.dbs etc/hgwdev.dbs make etc-update ########################################################################## # HUMAN (hg18) PROTEINS TRACK (DONE braney 2009-11-17) # bash if not using bash shell already cd /cluster/data/susScr1 mkdir /cluster/data/susScr1/blastDb awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst susScr1.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst susScr1.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa less1meg.lst cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 2915 mkdir -p /cluster/data/susScr1/bed/tblastn.hg18KG cd /cluster/data/susScr1/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 2915 query.lst # we want around 200000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(200000/`wc query.lst | awk '{print $1}'`\) # 36727/(200000/2915) = 535.296025 mkdir -p kgfa split -l 535 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst wc kg.lst # 69 69 897 kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/susScr1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/susScr1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome exit ssh swarm cd /cluster/data/susScr1/bed/tblastn.hg18KG gensub2 query.lst kg.lst blastGsub blastSpec para create blastSpec # para try, check, push, check etc. para time # Completed: 201135 of 201135 jobs # CPU time in finished jobs: 13017817s 216963.62m 3616.06h 150.67d 0.413 y # IO & Wait Time: 1450739s 24178.98m 402.98h 16.79d 0.046 y # Average job time: 72s 1.20m 0.02h 0.00d # Longest finished job: 244s 4.07m 0.07h 0.00d # Submission to last job: 18278s 304.63m 5.08h 0.21d ssh swarm cd /cluster/data/susScr1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 69 of 69 jobs # CPU time in finished jobs: 1275820s 21263.66m 354.39h 14.77d 0.040 y # IO & Wait Time: 106286s 1771.44m 29.52h 1.23d 0.003 y # Average job time: 20031s 333.84m 5.56h 0.23d # Longest finished job: 69022s 1150.37m 19.17h 0.80d # Submission to last job: 69037s 1150.62m 19.18h 0.80d ssh hgwdev cd /cluster/data/susScr1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # checked: 77865 failed: 0 errors: 0 # load table ssh hgwdev cd /cluster/data/susScr1/bed/tblastn.hg18KG hgLoadPsl susScr1 blastHg18KG.psl # check coverage featureBits susScr1 blastHg18KG # 32676867 bases of 2231332019 (1.464%) in intersection # featureBits susScr1 blastHg18KG xenoRefGene -enrichment # blastHg18KG 1.362%, xenoRefGene 1.615%, both 0.695%, cover 51.03%, enrich 31.60x rm -rf blastOut #end tblastn ######################################################################### # lastz swap Mouse Mm9 (DONE - 2010-01-22 - Hiram) # original run cd /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21 cat fb.mm9.chainSusScr1Link.txt # 616833828 bases of 2620346127 (23.540%) in intersection mkdir /hive/data/genomes/susScr1/bed/blastz.mm9.swap cd /hive/data/genomes/susScr1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m27.221s cat fb.susScr1.chainMm9Link.txt # 656445475 bases of 2231332019 (29.419%) in intersection ############################################################################ # reset position to RHO location as found from blat of hg19 RHO gene hgsql -e \ 'update dbDb set defaultPos="chr13:57394166-57402412" where name="susScr1";' \ hgcentraltest ############################################################################