# for emacs: -*- mode: sh; -*-

# Tetraodon Nigroviridis from Genoscope, version v8 (released Mar 2007)
# Project website:
#    http://www.genoscope.cns.fr/externe/Download/Projets/Projet_C/genomique/goldenpath_v2/
#    http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAAE01

##########################################################################
# Download sequence (DONE - 2009-08-06 - Hiram)
mkdir /hive/data/genomes/tetNig2
cd /hive/data/genomes/tetNig2
mkdir genoscope
cd genoscope
# base URL of the Genoscope release; it already ends in "/", so the
# "${D}/..." references below contain a harmless double slash
D="http://www.genoscope.cns.fr/externe/Download/Projets/Projet_C/genomique/goldenpath_v2/"
for F in chr.agp chr.agp.info
do
    wget --timestamping "${D}/${F}"
done
for C in 1 1_random 2 2_random 3 4 5 6 7 8 9 10 11 12 13 14 15 \
    15_random 16 17 18 19 20 21 21_random Un_random
do
    wget --timestamping "${D}/unmasked/chr${C}.fa.bz2"
done
for F in Tetraodon_mRNAs_v8.2.fa Tetraodon_peptide_v8.2.fa \
    Tetraodon_peptide_v8.2.fa.compo annotation_tetraodon_v8.2.gff
do
    wget --timestamping "${D}/annotation_v8.2/${F}"
done
# recompress as gzip: the config.ra below reads chr*.fa.gz
bunzip2 *.fa.bz2
gzip *.fa

##########################################################################
# Initial browser (DONE - 2009-08-06 - Hiram)
cd /hive/data/genomes/tetNig2
cat << '_EOF_' > tetNig2.config.ra
# Config parameters for makeGenomeDb.pl:
db tetNig2
clade vertebrate
scientificName Tetraodon nigroviridis
commonName Tetraodon
assemblyDate Mar. 2007
assemblyLabel Genoscope Tetraodon v8.0 (NCBI project 12350, CAAE01000000)
orderKey 459
mitoAcc NC_007176
fastaFiles /hive/data/genomes/tetNig2/genoscope/chr*.fa.gz
agpFiles /hive/data/genomes/tetNig2/genoscope/chr.agp
# qualFiles none
dbDbSpeciesDir tetraodon
taxId 99883
'_EOF_'
# << happy emacs

# run makeGenomeDb.pl in three stages, one log file per stage
time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
    -noGoldGapSplit -stop=agp tetNig2.config.ra > agp.log 2>&1
time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
    -noGoldGapSplit -continue=db -stop=db tetNig2.config.ra > db.log 2>&1
time nice -n +19 makeGenomeDb.pl -verbose=2 -workhorse=hgwdev \
    -noGoldGapSplit -continue=dbDb tetNig2.config.ra > dbDb.log 2>&1
# add the trackDb files to the source tree and entry to trackDb/makefile

##########################################################################
# Repeat Masker (DONE - 2009-08-06 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/repeatMasker
cd /hive/data/genomes/tetNig2/bed/repeatMasker
doRepeatMasker.pl -verbose=2 -workhorse=hgwdev \
    -noSplit -buildDir=`pwd` tetNig2 > do.log 2>&1
cat faSize.rmsk.txt
# 358618246 bases (56303458 N's 302314788 real 292078336 upper 10236452 lower)
# in 27 sequences in 1 files
# %2.85 masked total, %3.39 masked real

# since this doesn't mask very much, use windowmasker instead
hgsql -e "drop table rmsk;" tetNig2
# this leaves the interrupted repeats track showing on genome-test

# from the do.log:
# Repeat masker version: June 4 2009 (open-3-2-8)
# Library release 20090604

########################################################################
# Simple Repeats (DONE - 2009-08-06 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/simpleRepeat
cd /hive/data/genomes/tetNig2/bed/simpleRepeat
doSimpleRepeat.pl -workhorse=hgwdev \
    -buildDir=`pwd` tetNig2 > do.log 2>&1 &
# fails on the job for chrM, make an empty result:
touch /hive/data/genomes/tetNig2/TrfPart/009/009.lst.bed
doSimpleRepeat.pl -workhorse=hgwdev -continue=filter \
    -buildDir=`pwd` tetNig2 > filter.log 2>&1 &
cat fb.simpleRepeat
# 11549259 bases of 332311746 (3.475%) in intersection

########################################################################
# Marking *all* gaps - they are not all in the AGP file
# (DONE - 2009-08-07 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/allGaps
# work inside the new directory so the output files land there
cd /hive/data/genomes/tetNig2/bed/allGaps
time nice -n +19 findMotif -motif=gattaca -verbose=4 \
    -strand=+ ../../tetNig2.2bit > findMotif.txt 2>&1
# real 0m7.967s
# the "#GAP " lines of the findMotif output become allGaps.bed
grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
featureBits tetNig2 -not gap -bed=notGap.bed
# gaps found in the sequence that are not already in the gap table
featureBits tetNig2 allGaps.bed notGap.bed -bed=new.gaps.bed
# what is the last index in the existing gap table:
hgsql -N -e "select ix from gap;" tetNig2 | sort -n | tail -1
# 34284

# mkGap.pl prints one gap-table row (type "other", bridged "yes") for each
# interval in new.gaps.bed, numbering the rows upward from the largest ix
# currently in the gap table
cat << '_EOF_' > mkGap.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" tetNig2 | sort -n | tail -1`;
chomp $ix;

open (my $fh, "<", "new.gaps.bed") or die "can not read new.gaps.bed: $!";
while (my $line = <$fh>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close ($fh);
'_EOF_'
# << happy emacs
chmod +x ./mkGap.pl
./mkGap.pl > other.gap
# -noLoad: only write bed.tab here, it is appended to the gap table below
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
    -noLoad tetNig2 otherGap other.gap
# Loaded 17051
# adding this many:
wc -l bed.tab
# 17051
# starting with this many
hgsql -e "select count(*) from gap;" tetNig2
# 17140
hgsql tetNig2 -e 'load data local infile "bed.tab" into table gap;'
# result count:
hgsql -e "select count(*) from gap;" tetNig2
# 34191
# == 17140 + 17051

########################################################################
# WindowMasker (DONE - 2009-08-06 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/windowMasker
cd /hive/data/genomes/tetNig2/bed/windowMasker
doWindowMasker.pl -workhorse=hgwdev -buildDir=`pwd` tetNig2 > do.log 2>&1
twoBitToFa tetNig2.wmsk.sdust.2bit stdout | faSize stdin
# 358618246 bases (56303458 N's 302314788 real 241249522 upper
# 61065266 lower) in 27 sequences in 1 files
# %17.03 masked total, %20.20 masked real

# load this initial data to get ready to clean it
ssh hgwdev
cd /hive/data/genomes/tetNig2/bed/windowMasker
hgLoadBed tetNig2 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 1649525 elements of size 3
featureBits -countGaps tetNig2 windowmaskerSdust
# 117367586 bases of 358618246 (32.728%) in intersection

# eliminate the gaps from the masking
featureBits tetNig2 -not gap -bed=notGap.bed
# 302314788 bases of 302314788 (100.000%) in intersection
time nice -n +19 featureBits tetNig2 windowmaskerSdust notGap.bed \
    -bed=stdout | gzip -c > cleanWMask.bed.gz
# 61065266 bases of 302314788 (20.199%) in intersection
# reload track to get it clean
hgLoadBed tetNig2 windowmaskerSdust cleanWMask.bed.gz
# Loaded 1644977 elements of size 4
featureBits -countGaps tetNig2 windowmaskerSdust
# 61065266 bases of 358618246 (17.028%) in intersection

# mask the sequence with this clean mask
zcat cleanWMask.bed.gz \
    | twoBitMask ../../tetNig2.unmasked.2bit stdin \
        -type=.bed tetNig2.cleanWMSdust.2bit
twoBitToFa tetNig2.cleanWMSdust.2bit stdout | faSize stdin \
    > tetNig2.cleanWMSdust.faSize.txt
cat tetNig2.cleanWMSdust.faSize.txt
# 358618246 bases (56303458 N's 302314788 real 241249522 upper 61065266
# lower) in 27 sequences in 1 files
# %17.03 masked total, %20.20 masked real

########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2009-08-06 - Hiram)
cd /hive/data/genomes/tetNig2
twoBitMask -add bed/windowMasker/tetNig2.cleanWMSdust.2bit \
    bed/simpleRepeat/trfMask.bed tetNig2.2bit
# safe to ignore the warnings about BED file with >=13 fields
twoBitToFa tetNig2.2bit stdout | faSize stdin > faSize.tetNig2.txt
cat faSize.tetNig2.txt
# 358618246 bases (56303458 N's 302314788 real 241039472 upper 61275316
# lower) in 27 sequences in 1 files
# %17.09 masked total, %20.27 masked real

# create symlink to gbdb
ssh hgwdev
rm /gbdb/tetNig2/tetNig2.2bit
ln -s `pwd`/tetNig2.2bit /gbdb/tetNig2/tetNig2.2bit
#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2009-08-06 - Hiram)
# Use -repMatch=130, scaled from the human setting by genome size: for
# human we use 1024, and the gapless tetNig2 genome is ~10.4% of the
# gapless hg18 genome, judging by the genome sizes from featureBits:
#	100*302314788/2897310462 = 10.434324
# That scales 1024 down to about 107; bump that up a bit, to 130
# (~12.7% of 1024), to be more conservative.
cd /hive/data/genomes/tetNig2
blat tetNig2.2bit /dev/null /dev/null -tileSize=11 \
    -makeOoc=jkStuff/tetNig2.11.ooc -repMatch=130
# Wrote 8132 overused 11-mers to jkStuff/tetNig2.11.ooc

# copy all of this stuff to the klusters:
cd /hive/data/genomes/tetNig2/jkStuff
gapToLift tetNig2 nonBridged.lift -bedFile=nonBridged.bed
cd /hive/data/genomes/tetNig2
mkdir /hive/data/staging/data/tetNig2
cp -p jkStuff/tetNig2.11.ooc jkStuff/nonBridged.lift \
    chrom.sizes tetNig2.2bit /hive/data/staging/data/tetNig2

#########################################################################
# Ensembl genes v55 (DONE - 2009-08-06 - Hiram)
cd /hive/data/genomes/tetNig2
cat << '_EOF_' > tetNig2.ensGene.ra
# required db variable
db tetNig2
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
'_EOF_'
# << happy emacs

# two passes: stop after the process step, then continue from load;
# both passes log to tetNig2.55.log (the second one appends)
doEnsGeneUpdate.pl -verbose=2 -workhorse=hgwdev \
    -ensVersion=55 -stop=process tetNig2.ensGene.ra > tetNig2.55.log 2>&1
doEnsGeneUpdate.pl -verbose=2 -workhorse=hgwdev \
    -ensVersion=55 -continue=load tetNig2.ensGene.ra >> tetNig2.55.log 2>&1
featureBits tetNig2 ensGene
# 31637658 bases of 332311746 (9.520%) in intersection

#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-08-06 - Hiram) Thu Aug 6 15:58:09 PDT 2009
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev

# one row per port: 17794 has isTrans=1, 17795 has canPcr=1
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("tetNig2", "blat15", "17794", "1", "0"); \
    INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("tetNig2", "blat15", "17795", "0", "1");' \
    hgcentraltest
# test it with some sequence

#########################################################################
# Set default position same as tetNig1 after blating tetNig1 to here
# (DONE - 2009-08-06 - Hiram)
hgsql -e \
'update dbDb set defaultPos="chr10:2797091-2804214" where name="tetNig2";' \
    hgcentraltest

#########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-08-07 braney )
### !!!! needs re-doing to deal with chrUn correctly !!!!
# bash if not using bash shell already

cd /cluster/data/tetNig2
mkdir /cluster/data/tetNig2/blastDb

# sequences larger than 1,000,000 bases: split at gaps, recording a lift file
awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst tetNig2.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst

# the remaining sequences: grouped into files of about 1,000,000 bases
awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst tetNig2.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa less1meg.lst

# format every piece as a blast database, then drop the fasta
cd blastDb
for i in *.fa
do
    /hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 394

mkdir -p /cluster/data/tetNig2/bed/tblastn.hg18KG
cd /cluster/data/tetNig2/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 394 query.lst

# we want around 100000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(100000/`wc query.lst | awk '{print $1}'`\)
# 36727/(100000/394) = 144.704380

# split the hg18KG psl into 145-line pieces and turn each piece into fasta
mkdir -p kgfa
split -l 145 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *
do
    nice pslxToFa $i $i.fa
    rm $i
done
cd ..
ls -1S kgfa/*.fa > kg.lst
wc kg.lst
# 254 254 3302 kg.lst

# one output directory per kg fasta piece
mkdir -p blastOut
for i in `cat kg.lst`
do
    mkdir blastOut/`basename $i .fa`
done

tcsh
cd /cluster/data/tetNig2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

# blastSome <blastDb> <query fasta> <output psl>
# Runs tblastn over a list of e-values and keeps the first run that
# succeeds, converts the result to psl, lifts both target and query
# coordinates, and installs the output only if pslCheck passes.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/tetNig2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit

ssh swarm
cd /cluster/data/tetNig2/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.

para time
# Completed: 100076 of 100076 jobs
# CPU time in finished jobs: 2219419s 36990.31m 616.51h 25.69d 0.070 y
# IO & Wait Time: 445463s 7424.39m 123.74h 5.16d 0.014 y
# Average job time: 27s 0.44m 0.01h 0.00d
# Longest finished job: 319s 5.32m 0.09h 0.00d
# Submission to last job: 2791s 46.52m 0.78h 0.03d

ssh swarm
cd /cluster/data/tetNig2/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

# chainOne <blastOut dir>: chain all q.*.psl in that directory into
# ../c.<dir name>.psl
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
# para try, check, push, check etc.
# Completed: 254 of 254 jobs
# CPU time in finished jobs: 590077s 9834.62m 163.91h 6.83d 0.019 y
# IO & Wait Time: 20749s 345.81m 5.76h 0.24d 0.001 y
# Average job time: 2405s 40.08m 0.67h 0.03d
# Longest finished job: 36592s 609.87m 10.16h 0.42d
# Submission to last job: 36604s 610.07m 10.17h 0.42d

cd /cluster/data/tetNig2/bed/tblastn.hg18KG/blastOut
# per piece: c60 keeps rows where ($13 - $12)/$11 > 0.6, u is pslUniq of
# the sorted c60 rows, m60 keeps c60 rows where $1/$11 > 0.60
for i in kg??
do
    cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# checked: 38517 failed: 0 errors: 0

# load table
ssh hgwdev
cd /cluster/data/tetNig2/bed/tblastn.hg18KG
hgLoadPsl tetNig2 blastHg18KG.psl

# check coverage
featureBits tetNig2 blastHg18KG
# 18646943 bases of 332311746 (5.611%) in intersection

featureBits tetNig2 blastHg18KG ensGene -enrichment
# blastHg18KG 5.611%, ensGene 9.520%, both 4.993%, cover 88.97%, enrich 9.35x

rm -rf blastOut
#end tblastn

#########################################################################
# Creating contigs 2bit file (DONE - 2009-08-10 - Hiram)
cd /hive/data/genomes/tetNig2/jkStuff
gapToLift tetNig2 nonBridged.lift -bedFile=nonBridged.bed
# there aren't any gaps in chrM and it doesn't show up in
# the result ! Add chrM to this lift file:
echo -e "0\tchrM.00\t16462\tchrM\t16462" >> nonBridged.lift
echo -e "chrM\t0\t16462\tchrM.00" >> nonBridged.bed
# take a look at the nonBridged.bed as a custom track

mkdir /hive/data/genomes/tetNig2/contigs
cd /hive/data/genomes/tetNig2/contigs
ln -s ../jkStuff/nonBridged.lift .
~/kent/src/hg/utils/lft2BitToFa.pl ../tetNig2.2bit nonBridged.lift \
    | gzip -c > tetNig2.contigs.fa.gz

# make sure nothing was destroyed:
faCount *.fa.gz > faCount.contigs.txt 2>&1
twoBitToFa ../tetNig2.2bit stdout | faCount stdin > faCount.2bit.txt 2>&1
tail -1 faCount.contigs.txt
# total 333797246 81050307 70114488 70126181
# 81023812 31482458 9799877
tail -1 faCount.2bit.txt
# total 358618246 81050307 70114488 70126181
# 81023812 56303458 9799877
# only the total size and N count are different

faSize tetNig2.contigs.fa.gz
# 333797246 bases (31482458 N's 302314788 real 241039472 upper 61275316
# lower) in 14352 sequences in 1 files
# %18.36 masked total, %20.27 masked real
twoBitToFa ../tetNig2.2bit stdout | faSize stdin
# 358618246 bases (56303458 N's 302314788 real 241039472 upper 61275316
# lower) in 27 sequences in 1 files
# %17.09 masked total, %20.27 masked real

# build the contigs 2bit and sizes, then stage them with the lift file
faToTwoBit tetNig2.contigs.fa.gz tetNig2.contigs.2bit
twoBitInfo tetNig2.contigs.2bit stdout | sort -k2nr > tetNig2.contigs.sizes
cp -p tetNig2.contigs.2bit tetNig2.contigs.sizes \
    /hive/data/staging/data/tetNig2
cp -p nonBridged.lift /hive/data/staging/data/tetNig2/tetNig2.contigs.lift

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2009-08-11 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add tetNig2 just before tetNig1

# tetNig2 (Tetraodon)
tetNig2.serverGenome = /hive/data/genomes/tetNig2/tetNig2.2bit
tetNig2.clusterGenome = /scratch/data/tetNig2/tetNig2.2bit
tetNig2.ooc = /scratch/data/tetNig2/tetNig2.11.ooc
tetNig2.lift = /scratch/data/tetNig2/tetNig2.contigs.lift
tetNig2.align.unplacedChroms = chr*_random
tetNig2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
tetNig2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
tetNig2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
tetNig2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
tetNig2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
tetNig2.downloadDir = tetNig2
tetNig2.genbank.mrna.xeno.loadDesc = yes
tetNig2.refseq.mrna.native.load = no

cvs ci -m "Added tetNig2" etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update

ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial tetNig2 &
# logFile: var/build/logs/2009.08.10-16:42:06.tetNig2.initalign.log
# real 578m42.777s

# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad tetNig2
# logFile: var/dbload/hgwdev/logs/2009.08.11-09:22:29.dbload.log
# real 31m29.282s

# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add tetNig2 to: etc/align.dbs etc/hgwdev.dbs
cvs ci -m "Added tetNig2 - Tetraodon Nigirividis" \
    etc/align.dbs etc/hgwdev.dbs
make etc-update

#########################################################################
# lastz swap Human hg19 (DONE - 2009-08-11 - Hiram)
# result of human alignment (date stamp on directory is incorrect)
cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
cat fb.hg19.chainTetNig2Link.txt
# 49611132 bases of 2897316137 (1.712%) in intersection

# running the swap
mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -swap > swap.log 2>&1 &
# real 13m21.591s
# forgot the qRepeats for tetNig2
rm axtChain/tetNig2.hg19.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
    -continue=load -qRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -swap > load.log 2>&1 &
# real 4m7.559s
cat fb.tetNig2.chainHg19Link.txt
# 42910930 bases of 302314788 (14.194%) in intersection

##############################################################################
# BLASTZ/CHAIN/NET oryLat2 swap (DONE - 2009-09-15 - Hiram)
# original
cd /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14
cat fb.oryLat2.chainTetNig2Link.txt
# 162783854 bases of 700386597 (23.242%) in intersection

# And, for the swap:
mkdir /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap
cd /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
    /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14/DEF \
    -swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
    -noLoadChainSplit -verbose=2 -smallClusterHub=pk -bigClusterHub=pk \
    > swap.log 2>&1 &
# real 49m49.335s
cat fb.tetNig2.chainOryLat2Link.txt
# 136115939 bases of 302314788 (45.025%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET gasAcu1 swap (DONE - 2009-09-15 - Hiram)
# original
cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
featureBits gasAcu1 chainTetNig2Link >&fb.gasAcu1.chainTetNig2Link.txt
cat fb.gasAcu1.chainTetNig2Link.txt
# 134497679 bases of 446627861 (30.114%) in intersection

mkdir /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
cd /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
# NOTE(review): -qRepeats is given twice below and -tRepeats not at all,
# while the oryLat2 swap above passes one of each -- confirm which was
# intended; the command is recorded here as it was run
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/DEF \
    -swap -qRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    > swap.log 2>&1 &
# real 33m6.310s
cat fb.tetNig2.chainGasAcu1Link.txt
# 123285586 bases of 302314788 (40.781%) in intersection

#########################################################################
# LASTZ/CHAIN/NET danRer6 (DONE - 2009-09-15,18 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15
cd /hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15
cat << '_EOF_' > DEF
# Tetraodon vs. Zebrafish
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Tetraodon TetNig2 - single chunk big enough to run single largest item
SEQ1_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ1_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ1_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ1_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ1_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50
# QUERY: Zebrafish danRer6, chunk large enough to run largest piece
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/danRer6/contigs/danRer6.contigs.2bit
SEQ2_CTGLEN=/hive/data/genomes/danRer6/contigs/danRer6.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/danRer6/contigs/danRer6.contigs.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy

time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -tRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    > do.log 2>&1 &
# real 754m56.563s
cat fb.tetNig2.chainDanRer6Link.txt
# 70626082 bases of 302314788 (23.362%) in intersection

# and the swap to danRer6:
mkdir /hive/data/genomes/danRer6/bed/blastz.tetNig2.swap
cd /hive/data/genomes/danRer6/bed/blastz.tetNig2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15/DEF \
    -swap -tRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    > swap.log 2>&1 &
# real 26m40.310s
cat fb.danRer6.chainTetNig2Link.txt
# 93349443 bases of 1506896106 (6.195%) in intersection

#########################################################################
# LASTZ/CHAIN/NET fr2 (DONE - 2009-09-15,18 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15
cd /hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15
# same alignment parameters as the danRer6 DEF above; only the SEQ2_*
# (query) settings and BASE differ
cat << '_EOF_' > DEF
# Tetraodon vs. Fugu
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Tetraodon TetNig2 - single chunk big enough to run single largest item
SEQ1_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ1_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ1_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ1_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ1_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50
# QUERY: Fugu fr2, chunk large enough to run largest piece
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/scratch/data/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy

time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -tRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    > do.log 2>&1 &
# real 1226m51.875s
# broken down during lastz due to pk problems, finished the lastz
# cluster run manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -continue=cat -tRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    > cat.log 2>&1 &
# real 63m23.911s
cat fb.tetNig2.chainFr2Link.txt
# 243965150 bases of 302314788 (80.699%) in intersection

# and the swap to fr2:
mkdir /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
cd /hive/data/genomes/fr2/bed/blastz.tetNig2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/tetNig2/bed/lastzFr2.2009-09-15/DEF \
    -swap -tRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    > swap.log 2>&1 &
# real 21m56.576s
cat fb.fr2.chainTetNig2Link.txt
# 248984008 bases of 393312790 (63.304%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET mm9/Mouse swap (DONE - 2009-09-15 - Hiram)
# the original alignment to the mouse sequence
cd /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
cat fb.mm9.chainTetNig2Link.txt
# 45642112 bases of 2620346127 (1.742%) in intersection

# running the swap to here, tetNig2:
mkdir /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
cd /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15/DEF \
    -qRepeats=windowmaskerSdust \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -swap > swap.log 2>&1 &
# real 10m34.797s
cat fb.tetNig2.chainMm9Link.txt
# 41176381 bases of 302314788 (13.620%) in intersection

#########################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# GAZE - predictions from Genoscope - (DONE - 2010-01-06 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/gaze
# work inside the new directory: the ../../genoscope paths below are
# relative to it, and the gaze.* outputs belong here
cd /hive/data/genomes/tetNig2/bed/gaze
# rewrite " ; " as a tab and drop the "CDS " prefix before a following "G"
sed -e "s/ ; /\t/g; s/CDS G/G/;" \
    ../../genoscope/annotation_tetraodon_v8.2.gff > gaze.gff
ldHgGene -nobin -genePredExt tetNig2 genscan gaze.gff -out=gaze.gp
# Read 90337 transcripts in 284179 lines in 1 files
# 90337 groups 26 seqs 1 sources 4 feature types
# 26681 gene predictions
hgLoadGenePred tetNig2 gaze gaze.gp
# peptides: strip " assembled CDS" from the names and the trailing "*"
# from each row, and rename the GSTENP prefix to GSTENT
zcat ../../genoscope/Tetraodon_peptide_v8.2.fa.gz \
    | sed -e "s/ assembled CDS//;" \
    | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
    | sed -e "s/*$//; s/^GSTENP/GSTENT/;" > gaze.pep.tab
hgPepPred tetNig2 tab gazePep gaze.pep.tab

############################################################################
# all.joiner update - (DONE - 2010-01-06,08 - Hiram)
cd $HOME/kent/src/hg/makeDb/schema
# fixup all.joiner until this is a clean output
joinerCheck -database=tetNig2 -all all.joiner

mkdir /hive/data/genomes/tetNig2/goldenPath
cd /hive/data/genomes/tetNig2/goldenPath
makeDownloads.pl tetNig2 > do.log 2>&1

# now ready for pushQ entry
mkdir /hive/data/genomes/tetNig2/pushQ
cd /hive/data/genomes/tetNig2/pushQ
makePushQSql.pl tetNig2 > tetNig2.pushQ.sql 2> stderr.out
# copy it to hgwbeta
scp -p tetNig2.pushQ.sql hgwbeta:/tmp
ssh hgwbeta
# the file was copied into /tmp on hgwbeta
cd /tmp
hgsql qapushq < tetNig2.pushQ.sql
# in that pushQ entry walk through each entry and see if the
# sizes will set properly

############################################################################
# Create cpgIslandExt (DONE - 2010-02-11 - Hiram)
mkdir /hive/data/genomes/tetNig2/bed/cpgIslands
cd /hive/data/genomes/tetNig2/bed/cpgIslands
doCpgIslands.pl -dbHost=hgwdev -workhorse=hgwdev -buildDir=`pwd` tetNig2
featureBits tetNig2 cpgIslandExt
# 22165067 bases of 302314788 (7.332%) in intersection
############################################################################