# for emacs: -*- mode: sh; -*- # Lizard - Anolis carolinensis - Broad Institute 1.0 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram) ssh kkstore05 mkdir /cluster/store12/anoCar1 ln -s /cluster/store12/anoCar1 /cluster/data/anoCar1 mkdir /cluster/data/anoCar1/downloads cd /cluster/data/anoCar1/downloads foreach f (assembly.agp \ BasicAssemblyOneLiner.out \ ForDistribution.command \ assembly.bases.gz \ assembly.links \ assembly.quals.gz \ source) wget --timestamping \ ftp://ftp.broad.mit.edu/pub/assemblies/reptiles/lizard/AnoCar1.0/$f end faSize assembly.bases.gz # 1741478929 bases (0 N's 1741478929 real 1741478929 upper 0 lower) # in 50470 sequences in 1 files # Discovered later that this quals file needs to be lifted qaToQac assembly.quals.gz stdout \ | qacAgpLift assembly.agp stdin scaffold.lifted.qac ## Calculate N50 # Find total length in sequence sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | tail -1 # 7233 1781602899 340 scaffold_7232 # half of 1781602899 is 890801449 # run this again, scanning the list until the sum reaches 890801449 sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | less # 204 889215106 2440512 scaffold_201 # 205 891654231 2439125 scaffold_205 ######################################################################### ## Create .ra file and run makeGenomeDb.pl ssh hgwdev cd /cluster/data/anoCar1 cat << '_EOF_' >anoCar1.config.ra # Config parameters for makeGenomeDb.pl: db anoCar1 clade vertebrate genomeCladePriority 66 scientificName Anolis carolinensis commonName Lizard assemblyDate Jan. 
2007 assemblyLabel Broad Institute AnoCar (1.0) orderKey 440 mitoAcc none fastaFiles /cluster/data/anoCar1/downloads/assembly.bases.gz agpFiles /cluster/data/anoCar1/downloads/assembly.agp qualFiles /cluster/data/anoCar1/downloads/scaffold.lifted.qac dbDbSpeciesDir lizard '_EOF_' time makeGenomeDb.pl -verbose=2 anoCar1.config.ra > makeGenomeDb.out 2>&1 # broken down during the quals step since assembly.quals.gz needed # to be lifted. do the qaToQac | qacAgpLift sequence, fixup the # specification above for qualFiles, and finish off the quals loading: ssh kkstore05 cd /cluster/data/anoCar1/bed/qual qacToWig -fixed ../../downloads/scaffold.lifted.qac stdout \ | wigEncode stdin qual.wig qual.wib ssh hgwdev cd /cluster/data/anoCar1/bed/qual time nice -n +19 hgLoadWiggle \ -pathPrefix=/gbdb/anoCar1/wib anoCar1 quality qual.wig ## continue ssh hgwdev cd /cluster/data/anoCar1 time makeGenomeDb.pl -verbose=2 -continue=dbDb anoCar1.config.ra \ > makeDbDb.out 2>&1 ## better orderKey to get Lizard between frog and fish hgsql -e 'update dbDb set orderKey="440" where name="anoCar1";' \ hgcentraltest ## fixup that number in the .ra file as mentioned above, was 375 ########################################################################## ## Photograph - permission to use obtained from R. 
Steven Rainwater ## (DONE - 2007-02-16 - Hiram) ## Fetch photo from: wget --timestamping \ "http://rainwaterreptileranch.org/steve/photos/herps/anole2.jpeg" \ -O R.Steven.Rainwater.Anole.Lizard.jpg convert -quality 80 -sharpen 0 -crop "168x272+264+37" \ R.Steven.Rainwater.Anole.Lizard.jpg Anolis_carolinensis.jpg ################################################ ## WINDOWMASKER (Working - 2007-02-16 - Hiram) ssh kkstore05 cd /cluster/data/anoCar1/bed/ time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl anoCar1 \ -workhorse=kolossus > wmRun.log 2>&1 & # real 172m58.813s # Save the log mv wmRun.log WindowMasker.2007-02-16 # Masking statistics cd WindowMasker.2007-02-16 twoBitToFa anoCar1.wmsk.2bit stdout | faSize stdin # 1781602899 bases (40123970 N's 1741478929 real # 1009685313 upper 731793616 lower) in 7233 sequences in 1 files twoBitToFa anoCar1.wmsk.sdust.2bit stdout | faSize stdin # 1781602899 bases (40123970 N's 1741478929 real # 1000477327 upper 741001602 lower) in 7233 sequences in 1 files ssh hgwdev hgLoadBed -strict anoCar1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 8354004 elements of size 3 # why does featureBits show more bases masked than what faSize # measured as lower case ? Because this counts the masked sequence # in the gaps, and the faSize doesn't have the gaps. time nice -n +19 featureBits anoCar1 windowmaskerSdust # 781125572 bases of 1741478929 (44.854%) in intersection time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust # 781125572 bases of 1781602899 (43.844%) in intersection # Curiously, WM overlaps gaps ? 
time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust gap # 40123970 bases of 1781602899 (2.252%) in intersection # 741001602 + 40123970 == 781125572 ######################################################################### # SIMPLE REPEATS (TRF) (DONE 2007-02-16 - Hiram) # (dropped chromEnd index 2007-05-10 - kuhn) ssh kolossus mkdir /cluster/data/anoCar1/bed/simpleRepeat cd /cluster/data/anoCar1/bed/simpleRepeat time nice -n 19 twoBitToFa ../../anoCar1.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 & # real 164m34.988s # user 159m32.859s # sys 4m31.649s ssh kkstore05 cd /cluster/data/anoCar1/bed/simpleRepeat # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev nice -n +19 hgLoadBed anoCar1 simpleRepeat \ /cluster/data/anoCar1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 569322 elements of size 16 nice -n +19 featureBits anoCar1 simpleRepeat # 50290171 bases of 1741478929 (2.888%) in intersection ######################################################################### ## Add TRF mask to WindowMasker masked sequence, and fixup the bogus ## window masked N's (DONE - 2007-02-17 - Hiram) ssh kkstore05 cd /cluster/data/anoCar1/bed/WindowMasker.2007-02-16 ## Curious, twoBitMask would not accept stdout for its output 2bit twoBitMask anoCar1.wmsk.sdust.2bit \ -add ../simpleRepeat/trfMask.bed tmp.2bit twoBitToFa tmp.2bit stdout \ | sed -e "s/n/N/g" | faToTwoBit stdin anoCar1.2bit ## check it: twoBitToFa anoCar1.2bit stdout | faSize stdin # 1781602899 bases (40123970 N's 1741478929 real # 1000242640 upper 741236289 lower) in 7233 sequences in 1 files ## trfMask contributes: awk '{sum+=$3-$2} END{print sum}' ../simpleRepeat/trfMask.bed # 16534332 ## and we measured wmsk.sdust 
before: 741001602 16534332 + 741001602 ######################################################################### ## BLASTZ Hg18 swap (DONE - 2007-02-18 - Hiram) ## the original blastz to hg18 measured time nice -n +19 featureBits hg18 chainAnoCar1Link \ > fb.hg18.chainAnoCar1Link.txt 2>&1 # real 2m28.318s # 137554843 bases of 2881515245 (4.774%) in intersection ssh kkstore05 mkdir /cluster/data/anoCar1/bed/blastz.hg18.swap cd /cluster/data/anoCar1/bed/blastz.hg18.swap time doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 & time nice -n +19 featureBits anoCar1 chainHg18Link \ > fb.anoCar1.chainHg18Link.txt 2>&1 # real 3m16.810s # 112434396 bases of 1741478929 (6.456%) in intersection ######################################################################### # GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram) ssh kkstore05 # Create a 2bit file all hard masked cd /cluster/data/anoCar1 time nice -n +10 twoBitToFa anoCar1.2bit stdout \ | maskOutFa stdin hard stdout \ | faToTwoBit stdin anoCar1.hard.2bit # make sure it still has all the unmasked sequence in it: nice -n +19 twoBitToFa anoCar1.hard.2bit stdout | faSize stdin # 1781602899 bases (781360259 N's 1000242640 real # 1000242640 upper 0 lower) in 7233 sequences in 1 files nice -n +19 twoBitToFa anoCar1.2bit stdout | faSize stdin # 1781602899 bases (40123970 N's 1741478929 real # 1000242640 upper 741236289 lower) in 7233 sequences in 1 files # same number of total bases, the lowers have become Ns: # 781360259 == 741236289 + 40123970 # the lower "reals" disappear from the "real" count: # 1000242640 == 1741478929 - 741236289 # And, make sure there aren't any sequences in this lot that have # become all N's with no sequence left in them: twoBitToFa anoCar1.hard.2bit stdout \ | faCount stdin > anoCar1.hard.faCount # 181 scaffolds 
end up with less than 100 bases of sequence left egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \ | sort -k2nr | awk '{if ($2 < 100) { print }}' | wc -l # 181 # leaving 7,052 scaffolds with more than 100 bases of sequence left: egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \ | sort -k2nr | awk '{if ($2 >= 100) { print }}' | wc -l # 7052 # make a list of those to extract their sequence: egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \ | sort -k2nr | awk '{if ($2 >= 100) { print $1 }}' \ | sort > hard.genscan.list twoBitToFa -seqList=hard.genscan.list anoCar1.hard.2bit genscan.hard.fa # What do we have left to work with: faSize genscan.hard.fa # 1780575355 bases (780338843 N's 1000236512 real # 1000236512 upper 0 lower) in 7052 sequences in 1 files # creating 4,000,000 sized chunks, the largest scaffolds remain # in single pieces, the scaffolds smaller than 4,000,000 are grouped # into 4,000,000 sized fasta files. You don't want to break these # things up because genscan will be doing its own internal 2.4 million # window on these pieces, and the gene names are going to be # constructed from the sequence name in these fasta files. The # gene names are much better when they follow this simple scaffoldN.M # numbering scheme. mkdir genscanSplit faSplit about genscan.hard.fa 4000000 genscanSplit/c_ ssh hgwdev mkdir /cluster/data/anoCar1/bed/genscan cd /cluster/data/anoCar1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/anoCar1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, chunk.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) # Since we split on gaps, we have no chunks like that. You can # verify with faCount on the chunks. 
ls -1S /cluster/data/anoCar1/genscanSplit/c_*.fa > chunk.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000 #ENDLOOP '_EOF_' # << happy emacs gensub2 chunk.list single template jobList para create jobList para try, check, push, check, ... # Completed: 319 of 319 jobs # CPU time in finished jobs: 39037s 650.62m 10.84h 0.45d 0.001 y # IO & Wait Time: 1015s 16.91m 0.28h 0.01d 0.000 y # Average job time: 126s 2.09m 0.03h 0.00d # Longest finished job: 459s 7.65m 0.13h 0.01d # Submission to last job: 3495s 58.25m 0.97h 0.04d # cat results into single files ssh kkstore05 cd /cluster/data/anoCar1/bed/genscan cat gtf/c_*.gtf > genscan.gtf cat subopt/c_*.bed > genscanSubopt.bed cat pep/c_*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/anoCar1/bed/genscan ldHgGene anoCar1 -gtf genscan genscan.gtf # Read 28102 transcripts in 184045 lines in 1 files # 28102 groups 4190 seqs 1 sources 1 feature types # 28102 gene predictions hgPepPred anoCar1 generic genscanPep genscan.pep hgLoadBed -strict anoCar1 genscanSubopt genscanSubopt.bed # Loaded 286327 elements of size 6 # check the numbers time nice -n +19 featureBits anoCar1 genscan # 31394034 bases of 1741478929 (1.803%) in intersection ######################################################################### ## BLASTZ GasAcu1/Stickleback swap (DONE - 2007-02-19 - Hiram) ## the original blastz to gasAcu1 measured time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \ > fb.gasAcu1.chainAnoCar1Link.txt 2>&1 # real 0m51.499s # 56386298 bases of 446627861 (12.625%) in intersection ssh kkstore05 mkdir /cluster/data/anoCar1/bed/blastz.gasAcu1.swap cd 
/cluster/data/anoCar1/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & time nice -n +19 featureBits anoCar1 chainGasAcu1Link \ > fb.anoCar1.chainGasAcu1Link.txt 2>&1 # real 1m14.245s # 54464074 bases of 1741478929 (3.127%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-02-19) ssh kkstore05 bash # if not using bash shell already mkdir /cluster/data/anoCar1/blastDb cd /cluster/data/anoCar1 twoBitToFa anoCar1.unmasked.2bit temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/anoCar1/blastDb cd /cluster/data/anoCar1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/anoCar1/blastDb done mkdir -p /cluster/data/anoCar1/bed/tblastn.hg18KG cd /cluster/data/anoCar1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/anoCar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 496 query.lst # we want around 200000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(200000/496) = 91.082960 mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/anoCar1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # back to bash exit ssh pk cd /cluster/data/anoCar1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
para time # Completed: 202864 of 202864 jobs # CPU time in finished jobs: 14747966s 245799.43m 4096.66h 170.69d 0.468 y # IO & Wait Time: 1561940s 26032.34m 433.87h 18.08d 0.050 y # Average job time: 80s 1.34m 0.02h 0.00d # Longest finished job: 722s 12.03m 0.20h 0.01d # Submission to last job: 93277s 1554.62m 25.91h 1.08d ssh kkstore05 cd /cluster/data/anoCar1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/anoCar1/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 409 of 409 jobs # CPU time in finished jobs: 2911s 48.52m 0.81h 0.03d 0.000 y # IO & Wait Time: 65231s 1087.18m 18.12h 0.75d 0.002 y # Average job time: 167s 2.78m 0.05h 0.00d # Longest finished job: 287s 4.78m 0.08h 0.00d # Submission to last job: 2318s 38.63m 0.64h 0.03d ssh kkstore05 cd /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoCar1/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/anoCar1/bed/tblastn.hg18KG hgLoadPsl anoCar1 blastHg18KG.psl # check coverage featureBits anoCar1 blastHg18KG # 21571582 bases of 1741478929 (1.239%) in intersection ssh kkstore05 rm -rf /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut #end tblastn ######################################################################### ## BLASTZ galGal3/Chicken swap (DONE - 2007-02-19 - Hiram) ## the original blastz to galGal3 measured time nice -n +19 featureBits galGal3 chainAnoCar1Link \ > fb.galGal3.chainAnoCar1Link.txt 2>&1 # real 0m43.752s # 106743952 bases of 1042591351 (10.238%) in intersection ssh kkstore05 mkdir /cluster/data/anoCar1/bed/blastz.galGal3.swap cd /cluster/data/anoCar1/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/anoCar1/bed/blastz.galGal3.swap time nice -n +19 featureBits anoCar1 chainGalGal3Link \ > fb.anoCar1.chainGalGal3Link.txt 2>&1 # real 1m8.359s # 109074507 bases of 1741478929 (6.263%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-02-19 - Hiram) # This will find repeats within the genome that should not be matched # against. Uses 11-mers. # Use -repMatch=620 (based on size -- for human we use 1024, and # lizard size is ~60.4% of human judging by gapless anoCar1 vs. hg18 # genome sizes from featureBits. 
# hg18 / anoCar1 non-gap bases # 2881515245 / 1741478929 = 1.654636 # anoCar1 / hg18 non-gap bases # 1741478929 / 2881515245 = 0.604362 # thus 1024 * 0.604362 ~= 620 ssh kkstore05 blat /cluster/data/anoCar1/anoCar1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/data/anoCar1/11.ooc -repMatch=620 # Wrote 32070 overused 11-mers to /cluster/data/anoCar1/11.ooc cp -p /cluster/data/anoCar1/11.ooc /san/sanvol1/scratch/anoCar1 cp -p /cluster/data/anoCar1/jkStuff/liftAll.lft \ /san/sanvol1/scratch/anoCar1 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram) # Make a liftAll.lft that specifies 5M chunks for genbank: # only a few of the largest scaffolds will be broken up, most of them not ssh kkstore05 cd /cluster/data/anoCar1 simplePartition.pl anoCar1.2bit 5000000 /tmp/anoCar1 find /tmp/anoCar1 -type f | grep lft | xargs cat > jkStuff/liftAll.lft rm -r /tmp/anoCar1 cp -p jkStuff/liftAll.lft /san/sanvol1/scratch/anoCar1 # align with latest genbank process. 
ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add anoCar1 just after xenTro2 # anoCar1 anoCar1.serverGenome = /cluster/data/anoCar1/anoCar1.2bit anoCar1.clusterGenome = /san/sanvol1/scratch/anoCar1/anoCar1.2bit anoCar1.ooc = /san/sanvol1/scratch/anoCar1/11.ooc anoCar1.lift = /san/sanvol1/scratch/anoCar1/liftAll.lft anoCar1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} anoCar1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} anoCar1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} anoCar1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} anoCar1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} anoCar1.refseq.mrna.native.load = yes anoCar1.genbank.est.native.load = no anoCar1.refseq.mrna.xeno.load = yes anoCar1.genbank.mrna.xeno.load = yes anoCar1.downloadDir = anoCar1 anoCar1.perChromTables = no cvs ci -m "Added anoCar1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. # cvs ci -m "Added Anolis carolinensis (lizard)." src/lib/gbGenome.c make install-server cd /cluster/data/genbank screen # This is a call to a script that will push our jobs out to the cluster # since it's a big job. nice -n +19 bin/gbAlignStep -initial anoCar1 & # logFile: var/build/logs/2007.02.19-22:10:25.anoCar1.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1 # real 8m9.108s # enable daily alignment and update of hgwdev (DONE - 2007-02-20 - Hiram) cd ~/kent/src/hg/makeDb/genbank cvsup # add anoCar1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added anoCar1." etc/align.dbs etc/hgwdev.dbs make etc-update ### (2007-05-16 markd) # modify genbank to not load native RefSeq, since there are none. 
# remove empty files and rerun gbDbLoadStep anoCar1.refseq.mrna.native.load = no nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1 ########################################################################## ## BLASTZ FROG xenTro2 (WORKING - 2007-02-20 - Hiram) ssh kkstore04 mkdir /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20 cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20 cat << '_EOF_' > DEF # Lizard vs frog BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ1_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ1_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Frog xenTro2 - single chunk big enough for the largest scaffold SEQ2_DIR=/san/sanvol1/scratch/xenTro2/xenTro2.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=8000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > do.log 2>&1 & ## real 1522m46.550s ## this broke down during chaining because this file: # -rw-rw-r-- 1 5982855 Feb 21 14:39 ../../pslParts/part039.lst.psl.g ## has a bogus character control-V in the middle of a number ## manually running through that one chain step with that line removed ## got the chaining completed, then continuing: time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=chainMerge \ -blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > chainMerge.log 2>&1 & # real 119m12.634s ssh hgwdev cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20 time nice -n +19 
featureBits anoCar1 chainXenTro2Link \ > fb.anoCar1.chainXenTro2Link.txt 2>&1 # real 11m33.086s # 83873500 bases of 1741478929 (4.816%) in intersection ssh kkstore04 mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap > swap.log 2>&1 & ############################################################################ # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("anoCar1", "blat11", "17780", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("anoCar1", "blat11", "17781", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### ## BLASTZ mm8/Mouse swap (DONE - 2007-02-20 - Hiram) ## the original blastz to mm8 measured time nice -n +19 featureBits mm8 chainAnoCar1Link \ > fb.mm8.chainAnoCar1Link.txt 2>&1 # real 1m37.380s # 106743952 bases of 1042591351 (10.238%) in intersection # NOTE(review): the three figures above are identical to the galGal3 result recorded in the chicken swap section and the denominator is not a mouse-sized genome; probably a copy/paste error -- verify against fb.mm8.chainAnoCar1Link.txt ssh kkstore04 mkdir /cluster/data/anoCar1/bed/blastz.mm8.swap cd /cluster/data/anoCar1/bed/blastz.mm8.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk \ -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/anoCar1/bed/blastz.mm8.swap time nice -n +19 featureBits anoCar1 chainMm8Link \ > fb.anoCar1.chainMm8Link.txt 2>&1 # real 2m1.527s # 82784787 bases of 1741478929 (4.754%) in intersection # % Coverage of Lizard by: (chainMinScore,chainLinearGap,type masking) # 6.456 - Human hg18 (5000,loose,windowMasker) # 6.263 - Chicken galGal3 (5000,loose,windowMasker) # 
4.816 - Frog xenTro2 (5000,loose,windowMasker) # 4.754 - Mouse mm8 (5000,loose,windowMasker) # 3.127 - Stickleback gasAcu1 (5000,loose,windowMasker) # % coverage of Chicken by: # 10.238 - Lizard anoCar1 (5000,loose,windowMasker) # 8.795 - Human hg18 (5000,loose,rmsk) # 6.745 - Mouse mm8 (5000,loose,rmsk) # 5.330 - Frog xenTro2 (5000,loose,rmsk) # 3.144 - Stickleback gasAcu1 (2000,loose,windowMasker) # % coverage of Frog by: # 6.217 - Lizard anoCar1 (5000,loose,windowMasker) # 5.634 - Human hg18 (5000,loose,rmsk) # 5.358 - Mouse mm8 (5000,loose,rmsk) # 4.776 - Chicken galGal3 (5000,loose,rmsk) # x.xxx - Stickleback gasAcu1 (not yet done) # % coverage of Human by: # 34.514 - Mouse mm8 (3000,medium,rmsk) # 4.774 - Lizard anoCar1 (5000,loose,windowMasker) # 3.589 - Chicken galGal3 (5000,loose,rmsk) # 2.623 - Frog xenTro2 (5000,loose,rmsk) # 1.923 - Stickleback gasAcu1 (2000,loose,rmsk) ########################################################################## ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram) ssh kkstore02 mkdir /cluster/data/anoCar1/bed/RepeatMasker cd /cluster/data/anoCar1/bed/RepeatMasker time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \ -buildDir=/cluster/data/anoCar1/bed/RepeatMasker anoCar1 > do.log 2>&1 & ############################################################################ ## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram) ssh hgwdev cd /cluster/data/anoCar1 ln -s bed/RepeatMasker/anoCar1.fa.out . 
~/kent/src/hg/utils/automation/makeDownloads.pl anoCar1 \ > makeDownloads.out 2>&1 # Doesn't work due to missing Repeat masker outputs # Create WindowMasker separate files by chrom, for downloads ssh kkstore05 cd /cluster/data/anoCar1/goldenPath/bigZips ln -s ../../bed/WindowMasker.2007-02-16/windowmasker.sdust.bed.gz \ ./anoCar1.WMSdust.bed.gz # get GenBank native mRNAs ssh hgwdev cd /cluster/data/genbank ./bin/x86_64/gbGetSeqs -db=anoCar1 -native \ GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/x86_64/gbGetSeqs -db=anoCar1 -xeno \ GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/xenoMrna.fa ssh kkstore05 cd /cluster/data/anoCar1/goldenPath/bigZips gzip *.fa md5sum *.gz > md5sum.txt # Edit the README.txt file to be correct ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/anoCar1/bigZips ln -s /cluster/data/anoCar1/goldenPath/bigZips/* . ############################################################################ ## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="scaffold_72:3056494-3141055" where name="anoCar1";' hgcentraltest ############################################################################ # SWAP ORNANA1 CHAIN/NET (DONE 5/2/07 angie) ssh kkstore05 mkdir /cluster/data/anoCar1/bed/blastz.ornAna1.swap cd /cluster/data/anoCar1/bed/blastz.ornAna1.swap doBlastzChainNet.pl -swap \ /cluster/data/ornAna1/bed/blastz.anoCar1/DEF >& do.log & tail -f do.log ln -s blastz.ornAna1.swap /cluster/data/anoCar1/bed/blastz.ornAna1 ############################################################################ # SWAP Mouse Mm9 chain/net (DONE - 2007-09-21 - hiram) ssh kkstore04 screen # control this sequence with screen # the original cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19 cat fb.mm9.chainAnoCar1Link.txt # 89239796 bases of 2620346127 (3.406%) in intersection # and for the swap mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap cd 
/cluster/data/anoCar1/bed/blastz.mm9.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \ -swap -qRepeats=windowmaskerSdust \ -chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 & # real 29m12.291s cat fb.anoCar1.chainMm9Link.txt # 85923556 bases of 1741478929 (4.934%) in intersection ######################################################################### ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # construct liftOver to anoCar2 (DONE - 2011-02-22 - Hiram) mkdir /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20 cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -debug -dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > do.log 2>&1 # hgwdev broken down, reboot # continuing manually to complete the net step cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20/run.chain time ./doNet.csh > net.log 2>&1 cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20 time doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \ -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > load.log 2>&1 # real 2m40.093s # verify this file exists: # /gbdb/anoCar1/liftOver/anoCar1ToAnoCar2.over.chain.gz # and try out the conversion on genome-test from anoCar1 to anoCar2 ############################################################################