# for emacs: -*- mode: sh; -*- # zebra finch ( Taeniopygia guttata ) # http://www.ncbi.nlm.nih.gov/genome/367 # http://www.ncbi.nlm.nih.gov/bioproject/17289 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQF00 ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-08-05) ssh kkstore03 mkdir /cluster/store7/taeGut1 ln -s /cluster/store7/taeGut1 /cluster/data mkdir /cluster/data/taeGut1/fromWustl cd /cluster/data/taeGut1/fromWustl wget "http://genome.wustl.edu/pub/user/lhillier/private/T_guttata/Taeniopygia_guttata-3.2.4.tar.gz" gunzip * mkdir tmp cd tmp tar xvf ../*.tar cat *.fa | sed 's/Chr/chr/' | sed 's/LG/chrLG/' | gzip -c > ../assembly.bases.gz cat *.agp | sed 's/Chr/chr/' | sed 's/LG/chrLG/' > ../assembly.agp # some scaffolds missing quality values XXX - this is broken. These *qual files are already in chrom coordinates, XXX - they do not need a lift. The lift had the effect of applying -mScore=98 XXX - to all values. --Hiram 2008-12-04 cat *qual | sed 's/Chr/chr/' | sed 's/LG/chrLG/' | qaToQac stdin stdout | qacAgpLift -mScore=98 ../assembly.agp stdin ../taeGut1.qual.qac cd .. cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 69 ######################################################################### # Create .ra file and run makeGenomeDb.pl (DONE braney 2008-08-05) ssh kkstore03 cd /cluster/data/taeGut1 cat << _EOF_ >taeGut1.config.ra # Config parameters for makeGenomeDb.pl: db taeGut1 clade vertebrate genomeCladePriority 68 scientificName Taeniopygia guttata commonName Zebra finch assemblyDate Jul. 2008 assemblyLabel Washington University taeGut1 orderKey 437 mitoAcc NC_007897 fastaFiles /cluster/data/taeGut1/fromWustl/assembly.bases.gz agpFiles /cluster/data/taeGut1/fromWustl/assembly.agp qualFiles /cluster/data/taeGut1/fromWustl/taeGut1.qual.qac dbDbSpeciesDir zebraFinch _EOF_ # use 'screen' make sure on kkstore03 makeGenomeDb.pl -verbose=2 taeGut1.config.ra > makeGenomeDb.out 2>&1 & # 'ctl-a ctl -d' returns to previous shell cut -f 2 chrom.sizes | ave stdin # Q1 432944.500000 # median 2517995.000000 # Q3 17897912.000000 # average 17616947.728571 # min 9909.000000 # max 175225315.000000 # count 70 # total 1233186341.000000 # standard deviation 35378770.640295 ######################################################################### # SIMPLE REPEATS TRF (DONE braney 2008-08-05) ssh kkstore03 screen # use a screen to manage this job mkdir /cluster/data/taeGut1/bed/simpleRepeat cd /cluster/data/taeGut1/bed/simpleRepeat # doSimpleRepeat.pl -buildDir=/cluster/data/taeGut1/bed/simpleRepeat \ taeGut1 > do.log 2>&1 & #### When done ssh pk para time featureBits taeGut1 simpleRepeat # 25363702 bases of 1222864691 (2.074%) in intersection # Link to it from /gbdb: ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1/taeGut1.2bit ######################################################################### # Run WindowMasker (DONE braney 2008-08-05) ssh kkstore03 screen mkdir /cluster/data/taeGut1/bed/windowMasker cd /cluster/data/taeGut1/bed/windowMasker nice doWindowMasker.pl -workhorse=kolossus \ -buildDir=/cluster/data/taeGut1/bed/windowMasker taeGut1 > wmRun.log 2>&1 & # load this initial data to get ready to clean it ssh hgwdev cd /cluster/data/taeGut1/bed/windowMasker hgLoadBed taeGut1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 7487462 elements of size 3 # eliminate the gaps from the masking featureBits taeGut1 -not gap -bed=notGap.bed featureBits taeGut1 windowmaskerSdust # 260083327 bases of 1222864691 (21.268%) in intersection time nice -n +19 featureBits taeGut1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 249761677 bases of 1222864691 (20.424%) in intersection # reload track to get it clean hgLoadBed taeGut1 windowmaskerSdust cleanWMask.bed.gz featureBits taeGut1 windowmaskerSdust # 249761677 bases of 1222864691 (20.424%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../taeGut1.unmasked.2bit stdin \ -type=.bed taeGut1.cleanWMSdust.2bit twoBitToFa taeGut1.cleanWMSdust.2bit stdout | faSize stdin \ > taeGut1.cleanWMSdust.faSize.txt cat taeGut1.cleanWMSdust.faSize.txt # 1233186341 bases (10321650 N's 1222864691 real 973103014 upper 249761677 # lower) in 70 sequences in 1 files # Total size: mean 17616947.7 sd 35634216.3 min 9909 (chr16) max 175225315 # (chrUn) median 2517995 # N count: mean 147452.1 sd 400992.9 # U count: mean 13901471.6 sd 27798925.9 # L count: mean 3568024.0 sd 7705836.1 # %20.25 masked total, %20.42 masked real cp taeGut1.cleanWMSdust.2bit /cluster/data/taeGut1/taeGut1.2bit ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1 ######################################################################### # Create OOC file for genbank runs (DONE braney 2008-08-05) # use same repMatch value as bosTau2 ssh kkstore03 cd /cluster/data/taeGut1 blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=taeGut1.11.1005.ooc -repMatch=1005 # -rw-rw-r-- 1 braney protein 6368 Aug 5 14:31 taeGut1.11.1005.ooc blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=taeGut1.11.ooc -repMatch=1024 # -rw-rw-r-- 1 braney protein 6044 Jul 30 16:03 taeGut1.11.ooc mkdir -p /cluster/bluearc/scratch/data/taeGut1 cp taeGut1.2bit taeGut1.11.ooc chrom.sizes /cluster/bluearc/scratch/data/taeGut1 # request admins to sync these to pk ######################################################################### ## Create a lift file for genbank work(DONE braney 2008-08-05) ssh kkstore03 cd /cluster/data/taeGut1 cat fromWustl/assembly.agp | /cluster/bin/scripts/agpToLift -revStrand > jkStuff/liftAll.lft ######################################################################### # GENBANK AUTO UPDATE (DONE braney 2008-08-06) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add taeGut1 # taeGut1 taeGut1.serverGenome = /cluster/data/taeGut1/taeGut1.2bit taeGut1.clusterGenome = /scratch/data/taeGut1/taeGut1.2bit taeGut1.ooc = /scratch/data/taeGut1/11.ooc taeGut1.align.unplacedChroms = chrUn,chr*_random taeGut1.lift = /cluster/data/taeGut1/jkStuff/liftAll.lft taeGut1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} taeGut1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} taeGut1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} taeGut1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} taeGut1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} taeGut1.refseq.mrna.native.load = yes taeGut1.refseq.mrna.xeno.load = yes taeGut1.genbank.mrna.xeno.load = yes taeGut1.downloadDir = taeGut1 cvs ci -m "Added taeGut1." etc/genbank.conf # add taeGut to src/lib/gbGenome.c cvs ci -m "Added taeGut" src/lib/gbGenome.c # update /cluster/data/genbank/: make etc-update ssh genbank cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial taeGut1 & # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad taeGut1 cd ~/kent/src/hg/makeDb/genbank cvsup # add taeGut1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added taeGut1 - zebra finch" etc/align.dbs etc/hgwdev.dbs make etc-update ############################################################################ ## Default position (DONE braney 2008-08-06) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrZ:56,293,325-56,333,199" where name="taeGut1";' hgcentraltest ############################################################################ # MAKE DOWNLOADABLE FILES (restarted braney 2008-08-02) ssh kkstore03 screen cd /cluster/data/taeGut1 # edited makeDownloads.pl to take out repeat masker files # makeDownloads.pl taeGut1 > downloads.log 2>&1 & /cluster/bin/scripts/fripple taeGut1 > downloads.log 2>&1 & ############################################################################ ## Adding a Photograph, from Art Arnold in email (restarted braney 2008-08-03) mkdir /cluster/data/taeGut1/photograph cd /cluster/data/taeGut1/photograph ls -l Taeniopygia_guttata.jpg # -rw-rw-r-- 1 braney protein 16160 Aug 4 09:43 Taeniopygia_guttata.jpg # check this .jpg file into the browser doc source tree cvs add -kb Taeniopygia_guttata.jpg cvs commit Taeniopygia_guttata.jpg cp -p Taeniopygia_guttata.jpg /usr/local/apache/htdocs/images ############################################################################ ########################################################################## ## Creating pushQ (DONE - braney - 2008-08-06 ) ssh hgwdev mkdir /cluster/data/taeGut1/pushQ cd /cluster/data/taeGut1/pushQ /cluster/bin/scripts/makePushQSql.pl taeGut1 > taeGut1.sql 2> stderr.out ## check the stderr.out for anything that needs to be fixed scp taeGut1.sql hgwbeta.cse.ucsc.edu:/tmp ssh hgwbeta cd /tmp hgsql qapushq < taeGut1.sql ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-08-09 - braney) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("taeGut1", "blat2", "17782", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("taeGut1", "blat2", "17783", "0", "1");' \ hgcentraltest ############################################################################ # BAC ends (working braney...) ssh kkstore03 mkdir /cluster/data/taeGut1/bed/bacEnds cd /cluster/data/taeGut1/bed/bacEnds mkdir fromWustl cd fromWustl wget "http://genome.wustl.edu/pub/organism/Other_Vertebrates/Taeniopygia_guttata/end_sequences/bac_ends.fa.gz" gunzip * ssh hgwdev mkdir -p /gbdb/taeGut1/bacEnds ln -s `pwd`/bac_ends.fa /gbdb/taeGut1/bacEnds cd /tmp hgLoadSeq taeGut1 /gbdb/taeGut1/bacEnds/bac_ends.fa ssh kkstore03 cd /cluster/data/taeGut1/bed/bacEnds rm -rf /san/sanvol1/scratch/taeGut1.splitEnds mkdir -p /san/sanvol1/scratch/taeGut1.splitEnds faSplit sequence fromWustl/bac_ends.fa 400 /san/sanvol1/scratch/taeGut1.splitEnds/x mkdir run.blat cd run.blat ls /san/sanvol1/scratch/taeGut1.splitEnds/*.fa > inSeq.lst rm -rf /cluster/bluearc/taeGut1.blatEnds mkdir -p /cluster/bluearc/taeGut1.blatEnds for i in `cat inSeq.lst` do f=`basename $i .fa`.psl echo /cluster/bin/x86_64/blat /scratch/data/taeGut1/taeGut1.2bit \ -ooc=/scratch/data/taeGut1/taeGut1.11.ooc -tileSize=11 \ $i {check out line+ /cluster/bluearc/taeGut1.blatEnds/$f} done > jobList ssh memk cd /cluster/data/taeGut1/bed/bacEnds/run.blat para create jobList para time # CPU time in finished jobs: 124366s 2072.77m 34.55h 1.44d 0.004 y # IO & Wait Time: 865s 14.41m 0.24h 0.01d 0.000 y # Average job time: 383s 6.38m 0.11h 0.00d # Longest finished job: 557s 9.28m 0.15h 0.01d # Submission to last job: 4817s 80.28m 1.34h 0.06d ssh kkstore03 cd /cluster/data/taeGut1/bed/bacEnds for i in /cluster/bluearc/taeGut1.blatEnds/*.psl; do tail +6 $i; done | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons stdin bacEnds.psl /dev/null cat fromWustl/bac_ends.fa | awk '/>/ {print $1}' | \ tr -d '>'| sed 's/\./ /' | \ awk '{clones[$1] = $2 " " clones[$1] } \ END {for (i in clones) print i, clones[i]}' | \ awk 'BEGIN {OFS="\t"} \ {first="";second=""; \ for(ii=2; ii <= NF ; ii++) \ {if (($ii == "b1") || ($ii == "b2")) \ first=$1 "." $ii "," first; \ if (($ii == "g1") || ($ii == "g2")) \ second=$1 "." $ii "," second;} print first,second,$1}' | \ awk '{if (NF == 3) print}' > allPairs.txt time /cluster/bin/x86_64/pslPairs -tInsert=10000 \ -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl allPairs.txt \ all_bacends bacEnds # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl wc -l bacEnds.* # 526720 bacEnds.load.psl # 65 bacEnds.long # 1412 bacEnds.mismatch # 33163 bacEnds.orphan # 111379 bacEnds.pairs # 625401 bacEnds.psl # 530 bacEnds.short # 312 bacEnds.slop # 1298982 total # load into database ssh hgwdev cd /cluster/data/taeGut1/bed/bacEnds hgLoadBed -notItemRgb taeGut1 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # note - the "Bad" track isn't pushed to RR, just used for assembly QA. hgLoadBed -notItemRgb taeGut1 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql hgLoadPsl taeGut1 -table=all_bacends bacEnds.load.psl featureBits taeGut1 all_bacends # 154544748 bases of 1222864691 (12.638%) in intersection featureBits taeGut1 bacEndPairs # 118749166 bases of 1222864691 (9.711%) in intersection featureBits taeGut1 bacEndPairsBad # 21778830 bases of 1222864691 (1.781%) in intersection ######################################################################### # PRODUCING GENSCAN PREDICTIONS (DONE 2008-08-29 braney) ssh hgwdev mkdir /cluster/data/taeGut1/bed/genscan cd /cluster/data/taeGut1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux ssh swarm cd /cluster/data/taeGut1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt fasta for i in ../../goldenPath/chromosomes/*.gz do echo $i maskOutFa $i hard fasta/`basename $i .gz` done ls fasta/*.fa > genome.lst wc -l genome.lst # 70 genome.lst # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst single gsub jobList para make jobList para time #Completed: 255 of 259 jobs #Crashed: 4 jobs #CPU time in finished jobs: 82258s 1370.97m 22.85h 0.95d 0.003 y #IO & Wait Time: 1053s 17.55m 0.29h 0.01d 0.000 y #Average job time: 327s 5.45m 0.09h 0.00d #Longest finished job: 20131s 335.52m 5.59h 0.23d #Submission to last job: 23171s 386.18m 6.44h 0.27d # two jobs crashed due to genscan running out of memory, re-run it # with "-window=1200000" instead of "-window=2400000". para crashed | sed 's/2400000/1200000/' > crash.jobs para create -batch=crashBatch crash.jobs para push -batch=crashBatch # Convert these to chromosome level files as so: cat gtf/*.gtf > genscan.gtf cat subopt/*.bed > genscanSubopt.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/taeGut1/bed/genscan ldHgGene taeGut1 genscan genscan.gtf hgPepPred taeGut1 generic genscanPep genscan.pep hgLoadBed taeGut1 genscanSubopt genscanSubopt.bed # cleanup rm -rf fasta ##################################################### # build 2bit with contigs from random chroms and chrUn (reDONE braney 2008-09-06) cd /cluster/data/taeGut1 fgrep 'random Un' taeGut1.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6, $9}' \ | while read chr start stop contig strand; do echo ">$contig"; if test $strand == '+' then twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout \ | tail -n +2; else twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout | \ faRc -keepCase stdin stdout | tail -n +2; fi done > unRandom.fa fgrep -v 'random Un' chrom.sizes | awk '{print $1}' | \ twoBitToFa taeGut1.2bit -seqList=stdin noUn.fa cat noUn.fa unRandom.fa | faToTwoBit stdin taeGut1.blastz.2bit twoBitInfo taeGut1.blastz.2bit taeGut1.blastz.sizes ######################################################################### ## BLASTZ galGal3 swap (reDONE braney 2008-09-06) ## the original blastz to galGal3 measured ssh swarm mkdir /cluster/data/taeGut1/bed/blastz.galGal3.swap cd /cluster/data/taeGut1/bed/blastz.galGal3.swap screen doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=swarm -smallCluster=swarm \ -workhorse=kolossus -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/taeGut1/bed/blastz.galGal3.swap nice -n +19 featureBits taeGut1 chainGalGal3Link \ > fb.taeGut1l.chainGalGal3Link.txt 2>&1 # 802731012 bases of 1222864691 (65.643%) in intersection ######################################################################### ## BLASTZ hg18 swap (reDONE braney 2008-09-10) ssh swarm mkdir /cluster/data/taeGut1/bed/blastz.hg18.swap cd /cluster/data/taeGut1/bed/blastz.hg18.swap screen doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=swarm -smallCluster=swarm \ -workhorse=kolossus -swap > swap.log 2>&1 & ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-09-07) # bash if not using bash shell already ssh kolossus mkdir /cluster/data/taeGut1/blastDb cd /cluster/data/taeGut1 awk '{if ($2 > 1000000) print $1}' taeGut1.blastz.sizes > 1meg.lst twoBitToFa -seqList=1meg.lst taeGut1.blastz.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa 1meg.lst awk '{if ($2 <= 1000000) print $1}' taeGut1.blastz.sizes > less1meg.lst twoBitToFa -seqList=less1meg.lst taeGut1.blastz.2bit temp.fa faSplit about temp.fa 1000000 blastDb/y cd blastDb for i in *.fa do /hive/data/outside/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 1243 mkdir -p /cluster/data/taeGut1/bed/tblastn.hg18KG cd /cluster/data/taeGut1/bed/tblastn.hg18KG echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 1243 query.lst # we want around 150000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(150000/`wc query.lst | awk '{print $1}'`\) # 36727/(150000/1243) = 304.344407 mkdir -p kgfa split -l 304 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/taeGut1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/hive/data/outside/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/taeGut1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh swarm cd /cluster/data/taeGut1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 150403 of 150403 jobs # CPU time in finished jobs: 7194206s 119903.43m 1998.39h 83.27d 0.228 y # IO & Wait Time: 576761s 9612.69m 160.21h 6.68d 0.018 y # Average job time: 52s 0.86m 0.01h 0.00d # Longest finished job: 120s 2.00m 0.03h 0.00d # Submission to last job: 12759s 212.65m 3.54h 0.15d ssh swarm cd /cluster/data/taeGut1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS ../blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining para create chainSpec para try, check, push, check etc. # Completed: 121 of 121 jobs # CPU time in finished jobs: 14143s 235.72m 3.93h 0.16d 0.000 y # IO & Wait Time: 54085s 901.42m 15.02h 0.63d 0.002 y # Average job time: 564s 9.40m 0.16h 0.01d # Longest finished job: 995s 16.58m 0.28h 0.01d # Submission to last job: 1047s 17.45m 0.29h 0.01d cd /cluster/data/taeGut1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl cd .. pslCheck unliftBlastHg18KG.psl liftUp -nohead temp.psl ../../jkStuff/liftAll.lft carry unliftBlastHg18KG.psl sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl rm temp.psl pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/taeGut1/bed/tblastn.hg18KG hgLoadPsl taeGut1 blastHg18KG.psl # check coverage featureBits taeGut1 blastHg18KG # 19587281 bases of 1222864691 (1.602%) in intersection featureBits taeGut1 all_mrna blastHg18KG -enrichment # all_mrna 0.117%, blastHg18KG 1.602%, both 0.049%, cover 41.81%, enrich 26.10x rm -rf blastOut #end tblastn ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: taeGut1.upstreamGeneTbl = refGene ######################################### # RepeatMasker ( 2009-02-04 RMH and braney) mkdir /hive/data/genomes/taeGut1/bed/RMRunRMH cd /hive/data/genomes/taeGut1/bed/RMRunRMH doRepeatMasker.pl -buildDir `pwd` taeGut1 featureBits taeGut1 rmsk # 99288448 bases of 1222864691 (8.119%) in intersection # was earlier... # 97507442 bases of 1222864691 (7.974%) in intersection ############################################################################# # N-SCAN gene predictions (nscanGene) - (2009-04-17 markd) /mblab.wustl.edu/predictions/taeGut1 # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/taeGut1/bed/nscan/ wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.gtf wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa bzip2 taeGut1.* chmod a-w * # load track ldHgGene -bin -gtf -genePredExt taeGut1 nscanGene taeGut1.gtf.bz2 hgPepPred taeGut1 generic nscanPep taeGut1.prot.fa.bz2 rm *.tab # create zebraFinch/nscanGene.html # zebraFinch/taeGut1/trackDb.ra, add: track nscanGene override informant Zerba Finch N-SCAN uses chicken (galGal3) as the informant searchTable nscanGene searchType genePred termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)? searchPriority 50 # a mismatch was found in nscanPep; WUStL recreate wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa bzip2 taeGut1.prot.fa chmod a-w * hgPepPred taeGut1 generic nscanPep taeGut1.prot.fa.bz2 ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ######################################################################### # SWAP melGal1 lastz (DONE - 2011-03-28 - Chin) # original alignment cat fb.melGal1.chainTaeGut1Link.txt # 671105409 bases of 935922386 (71.705%) in intersection mkdir /hive/data/genomes/taeGut1/bed/blastz.melGal1.swap cd /hive/data/genomes/taeGut1/bed/blastz.melGal1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/melGal1/bed/lastzTaeGut1.2011-03-28/DEF \ -swap \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 36m7.017s cat fb.taeGut1.chainMelGal1Link.txt # 795001886 bases of 1222864691 (65.011%) in intersection cd /hive/data/genomes/taeGut1/bed ln -s blastz.melGal1.swap lastz.melgal1 ############################################################################ # LASTZ Medium Ground Finch geoFor1 (DONE - 2012-07-28 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S taeGut1 mkdir /hive/data/genomes/taeGut1/bed/lastzGeoFor1.2012-07-28 cd /hive/data/genomes/taeGut1/bed/lastzGeoFor1.2012-07-28 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Zebra finch vs. Medium Ground Finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Zebra finch taeGut1 SEQ1_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit SEQ1_LEN=/hive/data/genomes/taeGut1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/taeGut1/bed/lastzGeoFor1.2012-07-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 # real 3641m55.514s # several chain jobs went for several days: # Submission to last job: 205474s 3424.57m 57.08h 2.38d cat fb.taeGut1.chainGeoFor1Link.txt # 1126123816 bases of 1222864691 (92.089%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/taeGut1/bed ln -s lastzGeoFor1.2012-07-28 lastz.geoFor1 mkdir /hive/data/genomes/geoFor1/bed/blastz.taeGut1.swap cd /hive/data/genomes/geoFor1/bed/blastz.taeGut1.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/taeGut1/bed/lastzGeoFor1.2012-07-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 102m30.103s cat fb.geoFor1.chainTaeGut1Link.txt # 963520807 bases of 1041286029 (92.532%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.taeGut1.swap lastz.taeGut1 ##############################################################################