# for emacs: -*- mode: sh; -*- # Tenrec July 2005 Broad Assembly # # http://www.ncbi.nlm.nih.gov/genome/234 # http://www.ncbi.nlm.nih.gov/genome/assembly/229058/ # http://www.ncbi.nlm.nih.gov/bioproject/60557 # http://www.ncbi.nlm.nih.gov/bioproject/12590 - 2X coverage # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAIY00 ############################################################################# # /hive/data/genomes/echTel1/echTel1.config.ra created 2012-06-07: # Config parameters for makeGenomeDb.pl: db echTel1 clade mammal scientificName Echinops telfairi commonName Tenrec assemblyDate Jul. 2005 assemblyLabel Broad/echTel1 assemblyShortLabel Broad/echTel1 orderKey 3370 mitoAcc none fastaFiles /hive/data/genomes/echTel1/echTel1.fa.gz agpFiles /hive/data/genomes/echTel1/broad/assembly.agp qualFiles /hive/data/genomes/echTel1/broad/scaffolds.qac dbDbSpeciesDir tenrec photoCreditURL http://www.hobbygarten.com/ photoCreditName Kathrin Holzer, all rights reserved ncbiGenomeId 234 ncbiAssemblyId 229058 ncbiAssemblyName echTel1 ncbiBioProject 12590 genBankAccessionID GCA_000181355.1 taxId 9371 ############################################################################# # DOWNLOAD SEQUENCE ssh kkstore02 mkdir /cluster/store11/echTel1 ln -s /cluster/store11/echTel1 /cluster/data cd /cluster/data/echTel1 mkdir jkStuff bed mkdir broad cd broad screen wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/assembly.agp" wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/assembly.bases.gz" wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/assembly.quals.gz" wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/unplaced.fasta.gz" wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/unplaced.qual.gz" gunzip assembly.bases.gz faSize assembly.bases # 2111581369 bases (0 N's 2111581369 real 2111581369 upper 0 lower) in 820972 sequences in 1 files # Total size: mean 2572.1 sd 2132.3 min 74 (contig_732603) max 39350 (contig_780069) median 1840 # N count: mean 0.0 sd 0.0 # U count: mean 2572.1 sd 2132.3 # L count: mean 0.0 sd 0.0 cd /cluster/data/echTel1/broad /cluster/bin/x86_64/agpAllToFaFile assembly.agp assembly.bases ../echTel1.fa cd .. faSize echTel1.fa # 3823724728 bases (1712143359 N's 2111581369 real 2111581369 upper 0 lower) in 325491 sequences in 1 files # Total size: mean 11747.6 sd 25476.7 min 80 (scaffold_153914) max 739734 (scaffold_324606) median 3912 # N count: mean 5260.2 sd 15360.0 # U count: mean 6487.4 sd 12037.8 # L count: mean 0.0 sd 0.0 /cluster/bin/scripts/agpToLift < broad/assembly.agp > jkStuff/assembly.lft # PARTITION SCAFFOLDS FOR REPEATMASKER RUN # glom the tiny scaffolds up into ~50k collections ssh kkstore02 cd /cluster/data/echTel1 mkdir chunks50k faSplit about broad/assembly.bases 50000 chunks50k/chunk_ cd chunks50k for i in 0 1 2 3 4 5 6 7 8 9; do mkdir $i; mv *$i.fa $i; done # RUN REPEAT MASKER # make the run directory, output directory, and job list ssh kkstore02 cd /cluster/data/echTel1 tcsh cat << '_EOF_' > jkStuff/RMRabbit #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/echTel1/$2 /bin/cp $3 /tmp/echTel1/$2/ pushd /tmp/echTel1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -species tenrec $2 popd /bin/cp /tmp/echTel1/$2/$2.out ./ /bin/rm -fr /tmp/echTel1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/echTel1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/echTel1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMRabbit # mkdir RMRun RMOut # for i in chunks800k/*.fa # do # d="/cluster/data/echTel1" # c=`basename $i` # echo "../jkStuff/RMRabbit $d/RMOut $c { check in line+ $d/$i} {check out line+ $d/RMOut/$c.out}" # done > RMRun/RMJobs exit mkdir RMRun RMOut cd RMOut mkdir 0 1 2 3 4 5 6 7 8 9 cd ../chunks50k for i in */*.fa do e=`dirname $i` d="/cluster/data/echTel1" c=`basename $i` echo "../jkStuff/RMRabbit $d/RMOut/$e $c {check in line+ $d/chunks50k/$i} {check out line+ $d/RMOut/$e/$c.out}" done > ../RMRun/RMJobs # do the run ssh pk cd /cluster/data/echTel1/RMRun para create RMJobs para try, check, push, check,... # Completed: 40479 of 40479 jobs # CPU time in finished jobs: 11523595s 192059.92m 3201.00h 133.37d 0.365 y # IO & Wait Time: 125475s 2091.24m 34.85h 1.45d 0.004 y # Average job time: 288s 4.80m 0.08h 0.00d # Longest finished job: 509s 8.48m 0.14h 0.01d # Submission to last job: 82667s 1377.78m 22.96h 0.96d # Lift up the split-scaffold .out's to scaffold .out's ssh kkstore02 cd /cluster/data/echTel1/RMOut for i in 0 1 2 3 4 5 6 7 8 9; do echo $i; liftUp -nohead $i.out ../jkStuff/assembly.lft warn $i/*.fa.out>/dev/null; done head -3 0.out > echTel1.out for i in 0 1 2 3 4 5 6 7 8 9; do tail +4 $i.out; done >> echTel1.out # Load the .out files into the database with: ssh hgwdev hgLoadOut echTel1 /cluster/data/echTel1/RMOut/echTel1.out # ... bunches of ... # Strange perc. field -3.7 line 2110359 of /cluster/data/echTel1/RMOut/echTel1.out # Loading up table echTel1_rmsk # note: 4 records dropped due to repStart > repEnd hgsql echTel1 -e 'rename table echTel1_rmsk to rmsk' # Fix up the indices too: hgsql echTel1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(16), bin); \ create index genoStart on rmsk (genoName(16), genoStart); \ create index genoEnd on rmsk (genoName(16), genoEnd);' # EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie) ssh kkstore02 mkdir /cluster/data/echTel1/bed/fakeAgp cd /cluster/data/echTel1/bed/fakeAgp faGapSizes ../../downloads/scaffolds.fasta \ -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000 # Wow, all block of N's seem to be exactly 100bp long. # hgFakeAgp's default -minContigGap of 25 will be fine. hgFakeAgp ../../downloads/scaffolds.fasta fake.agp ssh hgwdev hgLoadGap -unsplit echTel1 /cluster/data/echTel1/bed/fakeAgp/fake.agp # SIMPLE REPEATS (TRF) ssh kkstore02 tcsh mkdir /cluster/data/echTel1/bed/simpleRepeat cd /cluster/data/echTel1/bed/simpleRepeat nice trfBig -trf=/cluster/bin/i386/trf ../../echTel1.fa \ /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \ |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \ > trf.log & # check on this with tail -f trf.log # Load this into the database as so ssh hgwdev hgLoadBed echTel1 simpleRepeat /cluster/data/echTel1/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kkstore02 cd /cluster/data/echTel1/bed/simpleRepeat awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # MASK FA USING REPEATMASKER AND FILTERED TRF FILES ssh kkstore02 cd /cluster/data/echTel1 maskOutFa -soft echTel1.fa bed/simpleRepeat/trfMask.bed echTel1.simple.fa maskOutFa -softAdd echTel1.simple.fa RMOut/echTel1.out echTel1.masked.fa # Now clean up the unmasked split scaffolds to avoid confusion later. # rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft # CREATING DATABASE # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 915G 746G 56% /var/lib/mysql hgsql '' -e 'create database echTel1' # STORE SEQUENCE AND ASSEMBLY INFORMATION # Translate to 2bit ssh kolossus cd /cluster/data/echTel1 faToTwoBit echTel1.masked.fa echTel1.2bit # Make chromInfo.tab. mkdir bed/chromInfo twoBitInfo echTel1.2bit stdout | awk '{printf "%s\t%s\t/gbdb/echTel1/echTel1.2bit\n",$1,$2;}' \ | sort > bed/chromInfo/chromInfo.tab # Make symbolic a link from /gbdb/echTel1 to the 2bit. ssh hgwdev mkdir -p /gbdb/echTel1 ln -s /cluster/data/echTel1/echTel1.2bit /gbdb/echTel1/ # Load chromInfo table. hgsql echTel1 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql echTel1 -e 'load data local infile "/cluster/data/echTel1/bed/chromInfo/chromInfo.tab" into table chromInfo' # Make chrom.sizes from chromInfo contents and check scaffold count. hgsql echTel1 -N -e 'select chrom,size from chromInfo' > /cluster/data/echTel1/chrom.sizes hgsql echTel1 -N -e 'select chrom from chromInfo' > /cluster/data/echTel1/chrom.lst wc -l /cluster/data/echTel1/chrom.sizes # 325491 /cluster/data/echTel1/chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING # Copy all the data from the table "grp" # in an existing database to the new database ssh hgwdev hgsql echTel1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' cd /cluster/data/echTel1 ln -s /cluster/data/echTel1/broad/assembly.agp echTel1.agp hgGoldGapGl -noGl echTel1 echTel1.agp # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/4/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("echTel1", "July. 2005", "/gbdb/echTel1", "Tenrec", \ "", 1, 57, "Tenrec", "Echinops telfairi", "/gbdb/echTel1/html/description.html", \ 0, 0, "Broad July 2005");' \ | hgsql -h genome-testdb hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("Tenrec", "echTel1");' \ | hgsql -h genome-testdb hgcentraltest echo 'INSERT INTO genomeClade (genome,clade,priority) values ("Tenrec", "vertebrate",58);' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit trackDb/makefile to add echTel1 to the DBS variable. mkdir -p tenrec/echTel1 # Create a simple rabbit/echTel1/description.html file. cvs add tenrec/echTel1 cvs add tenrec/echTel1/description.html make update DBS=echTel1 ZOO_DBS= # go public on genome-test cvs ci makefile cvs ci rabbit/echTel1 mkdir /gbdb/echTel1/html # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # PUT SEQUENCE ON /ISCRATCH FOR BLASTZ # First, agglomerate small scaffolds into chunks of ~100k median # (many scaffolds are larger than that) so we don't have too many # files for one dir, but keep a reasonably low job run time: ssh kkr1u00 mkdir /iscratch/i/echTel1 cp -p /cluster/data/echTel1/echTel1.2bit /iscratch/i/echTel1/ iSync # PRODUCING GENSCAN PREDICTIONS (DONE 11/4/04 angie) ssh kkstore02 # Make hard-masked scaffolds and split up for processing: cd /cluster/data/echTel1 maskOutFa scaffolds.fa hard scaffolds.fa.masked mkdir chunksUnsplitMasked faSplit about scaffolds.fa.masked 100000 chunksUnsplitMasked/chunk_ mkdir /cluster/data/echTel1/bed/genscan cd /cluster/data/echTel1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list cat << '_EOF_' > gsub #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.list single gsub jobList ssh kki cd /cluster/data/echTel1/bed/genscan para create jobList para try, check, push, check, ... #Completed: 463 of 463 jobs #Average job time: 12s 0.21m 0.00h 0.00d #Longest job: 317s 5.28m 0.09h 0.00d #Submission to last job: 445s 7.42m 0.12h 0.01d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Concatenate scaffold-level results: ssh kkstore02 cd /cluster/data/echTel1/bed/genscan cat gtf/*.gtf > genscan.gtf cat subopt/*.bed > genscanSubopt.bed cat pep/*.pep > genscan.pep # Clean up rm -r /cluster/data/echTel1/chunksUnsplitMasked # Load into the database as so: ssh hgwdev cd /cluster/data/echTel1/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf echTel1 genscan genscan.gtf hgPepPred echTel1 generic genscanPep genscan.pep hgLoadBed echTel1 genscanSubopt genscanSubopt.bed # MAKE DOWNLOADABLE FILES (DONE 11/4/04 angie) # RECREATE BIGZIPS DOWNLOADS CREATE README FILE (DONE, 2006-05-04, hartera) # ADDED md5sum.txt FILE FOR LIFTOVER DOWNLOADS AND CREATED CORRECT md5sum.txt # FOR vsMm7 AND vsHg18 DOWNLOADS (DONE, 2006-05-23, hartera) ssh kkstore02 mkdir /cluster/data/echTel1/zips cd /cluster/data/echTel1 zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out zip -j zips/scaffoldFa.zip scaffolds.fa zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked zip -j zips/scaffoldTrf.zip bed/simpleRepeat/trfMask.bed foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/echTel1 cd /usr/local/apache/htdocs/goldenPath/echTel1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips cp -p /cluster/data/echTel1/zips/*.zip . md5sum *.zip > md5sum.txt # Add more bigZips downloads. Some of the above downloads don't exist # anymore in bigZips in /usr/local/apache/htdocs/goldenPath/... on hgwdev # (2006-05-04, hartera) ssh kkstore02 mkdir /cluster/data/echTel1/bigZips cd /cluster/data/echTel1 # soft-masked scaffolds sequence cp -p echTel1.masked.fa.gz ./bigZips/scaffoldSoftMask.fa.gz # assembly agp file cp -p ./broad/assembly.agp ./bigZips/ # Simple Repeats cp -p ./bed/simpleRepeat/trfMask.bed ./bigZips/trf.bed # RepeatMasker output cp -p ./RMOut/echTel1.out ./bigZips/rmsk.out # unmasked scaffolds sequences cp -p echTel1.fa.gz ./bigZips/scaffold.fa.gz cd bigZips gzip assembly.agp gzip trf.bed gzip rmsk.out # check integrity of files foreach f (*.gz) echo $f gunzip -t $f | tail -1 end md5sum *.gz > md5sum.txt # link the *.gz and *.txt files to hgwdev:/usr/local/apache/.... ssh hgwdev set gp=/usr/local/apache/htdocs/goldenPath/echTel1 mkdir -p $gp/bigZips ln -s /cluster/data/echTel1/bigZips/{*.gz,*.txt} $gp/bigZips # copy over README.txt from another bigZips directory and edit. # echTel1ToMm7.over.chain.gz is in the md5sum.txt in the # vsMm7 directory. It should be in the liftOver directory so move it there # and re-make md5sum.txt for vsMm7 dir. # Add liftOver md5sum.txt (hartera, 2006-05-23) ssh hgwdev set gp=/usr/local/apache/htdocs/goldenPath/echTel1 # copy over from hgwdevold cd /cluster/data/echTel1/bed/liftOver scp hgwdevold:$gp/liftOver/echTel1ToMm7.over.chain.gz . ln -s /cluster/data/echTel1/bed/liftOver/*.gz $gp/liftOver cd $gp/liftOver # create md5sum.txt md5sum *.gz > md5sum.txt # recreate md5sum.txt for vsMm7 files cd $gp/vsMm7 rm md5sum.txt md5sum *.gz > md5sum.txt # recreate md5sum.txt for vsHg18 files as this includes files in an axtNet # directory that is not in this directory. cd $gp/vsHg18 rm md5sum.txt md5sum *.gz > md5sum.txt # Check that README.txt is correct for vsMm7 and vsHg18. # SWAP DM1-DROANA1 BLASTZ (DONE 11/4/04 angie) ssh kkstore02 mkdir /cluster/data/echTel1/bed/blastz.dm1.swap.2004-11-03 ln -s blastz.dm1.swap.2004-11-03 /cluster/data/echTel1/bed/blastz.dm1 cd /cluster/data/echTel1/bed/blastz.dm1 set aliDir = /cluster/data/dm1/bed/blastz.echTel1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len # With 11k scaffolds, we don't want a directory with one file per # scaffold. So just make one .axt with everything -- not too huge # anyway, given these little insect genomes. cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSort stdin dm1.axt du -sh $aliDir/axtChrom dm1.axt #389M /cluster/data/dm1/bed/blastz.echTel1/axtChrom #389M dm1.axt # CHAIN MELANOGASTER BLASTZ (DONE 11/4/04 angie) # Run axtChain on kolossus (one big dm1.axt input) ssh kolossus mkdir /cluster/data/echTel1/bed/blastz.dm1/axtChain cd /cluster/data/echTel1/bed/blastz.dm1/axtChain axtChain -verbose=0 ../dm1.axt /cluster/data/echTel1/echTel1.2bit \ /cluster/data/dm1/nib stdout \ | chainAntiRepeat /cluster/data/echTel1/echTel1.2bit \ /cluster/data/dm1/nib stdin stdout \ | chainMergeSort stdin > all.chain # Load chains into database ssh hgwdev cd /cluster/data/echTel1/bed/blastz.dm1/axtChain hgLoadChain -tIndex echTel1 chainDm1 all.chain # NET MELANOGASTER BLASTZ (DONE 11/4/04 angie) ssh kkstore02 cd /cluster/data/echTel1/bed/blastz.dm1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/echTel1/bed/blastz.dm1/axtChain netClass -noAr noClass.net echTel1 dm1 melanogaster.net \ |& g -v "table gap doesn't exist" # Make a 'syntenic' subset: ssh kkstore02 cd /cluster/data/echTel1/bed/blastz.dm1/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/echTel1/bed/blastz.dm1/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet echTel1 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net \ | hgLoadNet echTel1 netSyntenyDm1 stdin # MAKE AXTNET (DONE 11/4/04 angie) ssh kkstore02 cd /cluster/data/echTel1/bed/blastz.dm1/axtChain netToAxt melanogaster.net all.chain /cluster/data/echTel1/echTel1.2bit \ /cluster/data/dm1/nib stdout \ | axtSort stdin melanogasterNet.axt # MAKE VSDM1 DOWNLOADABLES (DONE 11/4/04 angie) ssh kkstore02 cd /cluster/data/echTel1/bed/blastz.dm1/axtChain nice gzip *.{chain,net,axt} ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/echTel1/vsDm1 cd /usr/local/apache/htdocs/goldenPath/echTel1/vsDm1 cp -p /cluster/data/echTel1/bed/blastz.dm1/axtChain/all.chain.gz \ melanogaster.chain.gz cp -p /cluster/data/echTel1/bed/blastz.dm1/axtChain/melanogaster.net.gz . cp -p /cluster/data/echTel1/bed/blastz.dm1/axtChain/melanogasterNet.axt.gz . # Make a README.txt which explains the files & formats. md5sum *.gz */*.gz > md5sum.txt # MAKE 11.OOC FILE FOR BLAT (DONE 11/4/04 angie) ssh kkr1u00 mkdir /cluster/bluearc/echTel1 blat /cluster/data/echTel1/echTel1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/echTel1/11.ooc -repMatch=1024 #Wrote 9721 overused 11-mers to /cluster/bluearc/echTel1/11.ooc cp -p /cluster/bluearc/echTel1/*.ooc /iscratch/i/echTel1/ iSync # GET GENBANK mRNA AND EST COUNTS # Go to the latest GenBank full release dir and get an idea of how # many mRNAs and ESTs there are to align. ssh eieio cd /cluster/data/genbank/data/processed/genbank.144.0/full awk '$4 == "Rabbit" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c # 9 Drosophila ananassae # 1 Drosophila mojavensis # 33 Drosophila virilis # Wow, questionable whether we should have a native mRNA track here. awk '$4 == "Drosophila" {print $4 " " $5;}' est*.gbidx | sort | uniq -c # 382439 Drosophila melanogaster # 4105 Drosophila simulans # 779 Drosophila yakuba # And a native EST track isn't even a possibility for the new flies # at this point! # AUTO UPDATE GENBANK MRNA RUN (DONE 11/16/04 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # Edit etc/genbank.conf and add these lines (note scaffold-browser settings): # echTel1 (D. ananassae) echTel1.genome = /iscratch/i/echTel1/echTel1.2bit echTel1.mondoTwoBitParts = 1000 echTel1.lift = no echTel1.refseq.mrna.native.load = no echTel1.refseq.mrna.xeno.load = yes echTel1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 echTel1.genbank.mrna.xeno.load = yes # GenBank has no D. ananassae ESTs at this point... that may change. echTel1.genbank.est.native.load = no echTel1.genbank.est.xeno.load = no echTel1.downloadDir = echTel1 echTel1.perChromTables = no cvs ci etc/genbank.conf # Since D. ananassae is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. cvs diff src/lib/gbGenome.c make cvs ci src/lib/gbGenome.c # Edit src/align/gbBlat to add /iscratch/i/echTel1/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank # This is an -initial run, (xeno) RefSeq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial echTel1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad echTel1 featureBits echTel1 xenoRefGene #16520385 bases of 165766797 (9.966%) in intersection # Clean up: rm -rf work/initial.echTel1 # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial echTel1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad echTel1 featureBits echTel1 all_mrna #19602 bases of 165766797 (0.012%) in intersection featureBits echTel1 xenoMrna #17295487 bases of 165766797 (10.434%) in intersection # Clean up: rm -rf work/initial.echTel1 # MAKE GCPERCENT ssh hgwdev mkdir /cluster/data/echTel1/bed/gcPercent cd /cluster/data/echTel1/bed/gcPercent # create and load gcPercent table hgGcPercent echTel1 /cluster/data/echTel1 # MAKE HGCENTRALTEST BLATSERVERS ENTRY ssh hgwdev echo 'insert into blatServers values("echTel1", "blat15", "17782", 1, 0); \ insert into blatServers values("echTel1", "blat15", "17783", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track (DONE braney 11/17/04) ssh kkstore02 mkdir -p /cluster/data/echTel1/blastDb cd /cluster/data/echTel1/blastDb faSplit sequence ../scaffolds.fa 400 x for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/echTel1/blastDb cp /cluster/data/echTel1/blastDb/* /iscratch/i/echTel1/blastDb (iSync) 2>&1 > sync.out mkdir -p /cluster/data/echTel1/bed/tblastn.dm1FB cd /cluster/data/echTel1/bed/tblastn.dm1FB ls -1S /iscratch/i/echTel1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kkstore02 cd /cluster/data/echTel1/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/396) = 49.460400 split -l 49 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/echTel1/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 151668 of 151668 jobs # CPU time in finished jobs: 2932565s 48876.08m 814.60h 33.94d 0.093 y # IO & Wait Time: 694006s 11566.77m 192.78h 8.03d 0.022 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 2721s 45.35m 0.76h 0.03d # Submission to last job: 73860s 1231.00m 20.52h 0.85d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec # should run this on the mini-cluster or with my shove script # so you can limit the number of jobs starting to 3 or 4 para try, push... # Completed: 383 of 383 jobs # CPU time in finished jobs: 327s 5.44m 0.09h 0.00d 0.000 y # IO & Wait Time: 8218s 136.97m 2.28h 0.10d 0.000 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest job: 54s 0.90m 0.01h 0.00d # Submission to last job: 674s 11.23m 0.19h 0.01d exit # back to kkstore02 cd /cluster/data/echTel1/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/echTel1/bed/tblastn.dm1FB/blastDm1FB.psl ssh hgwdev cd /cluster/data/echTel1/bed/tblastn.dm1FB hgLoadPsl echTel1 blastDm1FB.psl # End tblastn # SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 3/2/05 angie) ssh kkstore02 mkdir /cluster/data/echTel1/bed/blastz.dm2.swap cd /cluster/data/echTel1/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.echTel1/DEF \ >& do.log & tail -f do.log # Add {chain,net}Dm2 to trackDb.ra if necessary. # Add /usr/local/apache/htdocs/goldenPath/echTel1/vsDm2/README.txt ######################################################################### # BLASTZ NOTE: with the advent of Angie's script to run the # blastz process through to chains and nets loaded into the # database and download files prepared, it is now a juggling act # to see which klusters are available. The particular options to # the script to make it go to one kluster or another are to be # determined at run-time. The typical run-times listed here will # be a factor in your choice of kluster to operate on. ######################################################################### # BLASTZ HUMAN Hg17 ssh kk mkdir /cluster/data/echTel1/bed/blastz.hg17 cd /cluster/data/echTel1/bed/blastz.hg17 cat << '_EOF_' > DEF # mouse vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse SEQ1_DIR=/panasas/store/echTel1/nib # not used SEQ1_RMSK=/panasas/store/echTel1/rmsk # not used SEQ1_FLAG=-rodent SEQ1_SMSK=/panasas/store/echTel1/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Human SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs # RMSK not currently used SEQ2_RMSK= # FLAG not currently used SEQ2_FLAG= SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/echTel1/bed/blastzHg17.2005_03_14 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/echTel1/chrom.sizes ./S1.len sort -rn +1 /cluster/data/hg17/chrom.sizes > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # real 993m28.547s # user 0m0.198s # sys 0m0.171s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kksilo screen -d -r # STARTED - 2005-03-17 21:25 # FINISHED - 2005-03-18 14:00 # Completed: 45347 of 45347 jobs # CPU time in finished jobs: 16921981s 282033.02m 4700.55h 195.86d 0.537 y # IO & Wait Time: 2381711s 39695.18m 661.59h 27.57d 0.076 y # Average job time: 426s 7.09m 0.12h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9568s 159.47m 2.66h 0.11d # Submission to last job: 58695s 978.25m 16.30h 0.68d # Completed: 331 of 331 jobs # CPU time in finished jobs: 272s 4.54m 0.08h 0.00d 0.000 y # IO & Wait Time: 1145s 19.08m 0.32h 0.01d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest job: 24s 0.40m 0.01h 0.00d # Submission to last job: 265s 4.42m 0.07h 0.00d # The kki batch doChainRun.csh appears to have failed # due to underlying changes in the location of hg17 items # fixup the symlinks which are in a state of flux today, then, # to recover: ssh kki cd /cluster/data/echTel1/bed/blastzHg17.2005_03_14/axtChain/run rm -fr chain time ./doChainRun.csh # real 22m47.917s # user 0m0.380s # sys 0m0.630s # Completed: 40 of 40 jobs # CPU time in finished jobs: 6373s 106.22m 1.77h 0.07d 0.000 y # IO & Wait Time: 552s 9.20m 0.15h 0.01d 0.000 y # Average job time: 173s 2.89m 0.05h 0.00d # Longest job: 662s 11.03m 0.18h 0.01d # Submission to last job: 1200s 20.00m 0.33h 0.01d # That was the last part of the chainRun step, can now continue: ssh kksilo cd /cluster/data/echTel1/bed/blastzHg17.2005_03_14 time /cluster/bin/scripts/doBlastzChainNet.pl -continue chainMerge `pwd`/DEF > chainMerge.run.out 2>&1 & # STARTED - 2005-03-18 15:00 # FINISHED 2005-03-18 16:33 # checking the numbers for sanity: ssh hgwdev # expect ~ 2m30 seconds for chain measurement time featureBits echTel1 chainHg17 # 2596946329 bases of 2597150411 (99.992%) in intersection time featureBits mm5 chainHg17 # 2507720521 bases of 2615483787 (95.880%) in intersection # expect ~ 2m30s seconds for net measurement time featureBits echTel1 netHg17 # 2579747741 bases of 2597150411 (99.330%) in intersection time featureBits mm5 netHg17 # 2504056038 bases of 2615483787 (95.740%) in intersection ssh kolossus # expect ~ 20-22 minutes for the chainLink measurement HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \ featureBits echTel1 chainHg17Link # 966916309 bases of 2597150411 (37.230%) in intersection HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \ featureBits mm5 chainHg17Link # 1025750185 bases of 2615483787 (39.218%) in intersection # swap results to place echTel1 alignments onto Hg17 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & # STARTED - 2005-03-29 - 15:58 # FINI - 2005-03-29 - 18:48 # real 171m26.172s # user 0m2.270s # sys 0m0.870s ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm6Link # 969459954 bases of 2866216770 (33.824%) in intersection time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link # 1020106336 bases of 2866216770 (35.591%) in intersection # A measurement script to do all featureBits combinations: cd /cluster/data/echTel1/jkStuff cat << '_EOF_' > netChainCheck.sh #!/bin/sh usage() { echo "usage: netChainCheck.sh " echo " does: featureBits net" echo " featureBits net" echo " as well as the chain and chainLink tables," echo " and on the targetDb:" echo " featureBits net" echo " featureBits net" echo " and the chain and chainLink tables." echo -e "\texample: netChainCheck.sh echTel1 mm5 fr1" } doOne() { db=$1 tbl=$2 echo " featureBits $db $tbl" echo -en " #\t" time featureBits $db $tbl } ucFirstLetter() { ucString="$1" fc=`echo "${ucString}" | sed -e "s/\(.\).*/\1/"` rest=`echo "${ucString}" | sed -e "s/.\(.*\)/\1/"` FC=`echo "${fc}" | tr '[a-z]' '[A-Z]'` echo "${FC}${rest}" } if [ "$#" -ne 3 ]; then usage exit 255 fi db0=$1 db1=$2 targetDb=$3 targetDB=`ucFirstLetter "${targetDb}"` DB0=`ucFirstLetter "${db0}"` DB1=`ucFirstLetter "${db1}"` export db0 db1 targetDb targetDB DB0 DB1 # echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}" doOne "${db0}" net${targetDB} doOne "${db1}" net${targetDB} doOne "${db0}" chain${targetDB} doOne "${db1}" chain${targetDB} doOne "${db0}" chain${targetDB}Link doOne "${db1}" chain${targetDB}Link doOne ${targetDb} net${DB0} doOne ${targetDb} net${DB1} doOne ${targetDb} chain${DB0} doOne ${targetDb} chain${DB1} doOne ${targetDb} chain${DB0}Link doOne ${targetDb} chain${DB1}Link '_EOF_' # << keep emacs coloring happy # CHAIN AND NET SWAPPED HUMAN BLASTZ (WORKING 12/22/05 kate) # Working in Brian's blastz dir (not doced). # This procedure follows conventions in doBlastzNetChain.pl, # so it can be used to complete the processing ssh kki cd /cluster/data/echTel1/bed/zb.hg17 ln -s `pwd` /cluster/data/hg17/bed/blastz.echTel1 mkdir -p axtChain/run/chain ls -1S /cluster/data/echTel1/bed/zb.hg17/axtChrom/*.axt.gz | \ sed 's/.gz//' > axtChain/run/input.lst cd axtChain/run cat > chain.csh << 'EOF' #!/bin/csh -ef zcat $1 | \ axtChain -verbose=0 -minScore=3000 -linearGap=medium stdin \ /cluster/data/hg17/nib /cluster/data/echTel1/echTel1.2bit stdout | \ chainAntiRepeat /cluster/data/hg17/nib /cluster/data/echTel1/echTel1.2bit \ stdin $2 # delay so parasol check out will succeed on output file (transient problem ?) sleep 5 'EOF' # << happy emacs cat > gsub << 'EOF' #LOOP chain.csh {check in exists $(path1).gz} {check out line+ chain/$(root1).chain} #ENDLOOP 'EOF' # << happy emacs chmod a+x chain.csh gensub2 input.lst single gsub jobList para create jobList para try para check para push # wait for completion para time > run.time # finish pipeline cd /cluster/data/hg17/bed/blastz.echTel1 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge \ -bigClusterHub=pk \ `pwd`/DEF >& blastz.out & ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # echTel1 - Tenrec - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram) ssh swarm cd /hive/data/genomes/echTel1 cat << '_EOF_' > echTel1.ensGene.ra # required db variable db echTel1 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the two genes that have invalid structures from Ensembl: # 29277: ENSETET00000011172 no exonFrame on CDS exon 14 # 44942: ENSETET00000018714 no exonFrame on CDS exon 1 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 echTel1.ensGene.ra ssh hgwdev cd /hive/data/genomes/echTel1/bed/ensGene.51 featureBits echTel1 ensGene # 25441754 bases of 2111581369 (1.205%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/echTel1/bed/ensGene.51 ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/echTel1/bed/cpgIslands cd /hive/data/genomes/echTel1/bed/cpgIslands time doCpgIslands.pl echTel1 > do.log 2>&1 # real 958m28.796s # fixing the broken command: time ./doLoadCpg.csh # real 2m19.103s # continuing: time doCpgIslands.pl -continue=cleanup echTel1 > cleanup.log 2>&1 # real 93m19.493s cat fb.echTel1.cpgIslandExt.txt # 15870914 bases of 2111581369 (0.752%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/echTel1/bed/genscan cd /hive/data/genomes/echTel1/bed/genscan time doGenscan.pl echTel1 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 514m42.545s # continuing: time doGenscan.pl -continue=load echTel1 > load.log 2>&1 # broken small clusters time doGenscan.pl -workhorse=hgwdev -continue=cleanup echTel1 \ > cleanup.log 2>&1 # real 20m24.981s cat fb.echTel1.genscan.txt # 76152256 bases of 2111581369 (3.606%) in intersection cat fb.echTel1.genscanSubopt.txt # 90237546 bases of 2111581369 (4.273%) in intersection ######################################################################### # windowMasker - (DONE - 2012-05-02 - Hiram) screen -S echTel1 mkdir /hive/data/genomes/echTel1/bed/windowMasker cd /hive/data/genomes/echTel1/bed/windowMasker # trying out new version of the script that does all the usual steps # that used to be performed manually after it was done time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev echTel1 > do.log 2>&1 # real 0m4.498s # it was looking for echTel1.unmasked.2bit file, changed the doCount.csh # file to use echTel1.2bit and running that: time ./doCount.csh # real 53m39.844s # continuing: (with symlink in place for the unmasked.2bit file) time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=mask -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev \ echTel1 > mask.log 2>&1 # real 1341m14.485s # fixing a broken command in the doLoad step: time ./lastLoad.csh # real 11m31.215s time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \ -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev \ echTel1 > cleanup.log 2>&1 # real 1m43.171s sed -e 's/^/ #\t/' fb.echTel1.windowmaskerSdust.beforeClean.txt # 2467260039 bases of 3823724728 (64.525%) in intersection sed -e 's/^/ #\t/' fb.echTel1.windowmaskerSdust.clean.txt # 755116680 bases of 3823724728 (19.748%) in intersection sed -e 's/^/ #\t/' fb.echTel1.rmsk.windowmaskerSdust.txt # 139935514 bases of 3823724728 (3.660%) in intersection ######################################################################### # update repeat masker (DONE - 2012-05-09 - Hiram) mkdir /hive/data/genomes/echTel1/bed/repeatMasker cd /hive/data/genomes/echTel1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -stop=mask -smallClusterHub=encodek echTel1 > do.log 2>&1 & # real 3512m26.535s # run debug to see how nestedRepeats are loaded: time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev -debug \ -stop=install -continue=install -smallClusterHub=encodek echTel1 \ > install.debug.log 2>&1 & # using this command from doLoad.csh hgLoadBed echTel1 nestedRepeats echTel1.nestedRepeats.bed \ -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 # 16:19:44 angie Exp $ grep RELEASE /hive/data/staging/data/RepeatMasker110426/Libraries/RepeatMaskerLib.embl # Library version: CC RELEASE 20110920; cat faSize.rmsk.txt # 3823724728 bases (1712143359 N's 2111581369 real 1541218499 upper 570362870 lower) in 325491 sequences in 1 files # Total size: mean 11747.6 sd 25476.7 min 80 (scaffold_153914) max 739734 (scaffold_324606) median 3912 # %14.92 masked total, %27.01 masked real # what is currently loaded: time featureBits echTel1 rmsk # 291821962 bases of 2111581369 (13.820%) in intersection # real 10m21.908s # Is this run different: hgLoadOut -table=rmskTest -nosplit echTel1 echTel1.sorted.fa.out time featureBits echTel1 rmskTest # 573346920 bases of 2111581369 (27.152%) in intersection # real 6m30.717s # the original was run with -species tenrec # this new one is with -species 'Echinops telfairi' # which does have a few more repeat definitions, but not that many grep "^>" /hive/data/staging/data/RepeatMasker110426/Libraries/20110920/tenrec/* | wc # 1295 1295 127870 grep "^>" /hive/data/staging/data/RepeatMasker110426/Libraries/20110920/echinops_telfairi/* | wc # 1410 1410 154902 # since the new one has significant more masking, going to replace # the old one: time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -continue=install -smallClusterHub=encodek echTel1 > install.log 2>&1 & # real 21m23.234s # and then, masking the genome with this new rmsk business, # add trfMask to rmsk: cd /hive/data/genomes/echTel1 twoBitMask echTel1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed echTel1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa echTel1.2bit stdout | faSize stdin > faSize.echTel1.2bit.txt cat faSize.echTel1.2bit.txt # 3823724728 bases (1712143359 N's 2111581369 real 1540894161 upper # 570687208 lower) in 325491 sequences in 1 files # Total size: mean 11747.6 sd 25476.7 min 80 (scaffold_153914) max 739734 (scaffold_324606) median 3912 # %14.92 masked total, %27.03 masked real # the original masked 2bit file was: # 3823724728 bases (1712143359 N's 2111581369 real 1819400226 upper # 292181143 lower) in 325491 sequences in 1 files # Total size: mean 11747.6 sd 25476.7 min 80 (scaffold_153914) # max 739734 (scaffold_324606) median 3912 # %7.64 masked total, %13.84 masked real # verify gbdb symlink is correct: ls -ogL /gbdb/echTel1/echTel1.2bit # -rw-rw-r-- 1 993224202 May 24 14:04 /gbdb/echTel1/echTel1.2bit # if it is not correct, reset it: rm /gbdb/echTel1/echTel1.2bit ln -s `pwd`/echTel1.2bit /gbdb/echTel1/echTel1.2bit ######################################################################### # construct gc5Base track (DONE - 2012-05-24 - Hiram) # from the automation script template: export topDir=../.. export db=echTel1 hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 $db \ $topDir/$db.2bit | gzip -c > $db.gc5Base.wigVarStep.gz wigToBigWig $db.gc5Base.wigVarStep.gz ../../chrom.sizes $db.gc5Base.bw # real 19m21.718s mkdir /gbdb/echTel1/bbi ln -s /cluster/data/echTel1/bed/gc5Base/echTel1.gc5Base.bw \ /gbdb/echTel1/bbi/gc5Base.bw hgsql echTel1 -e 'drop table if exists gc5BaseBw; create table gc5BaseBw (fileName varchar(255) not null); insert into gc5BaseBw values ("/gbdb/echTel1/bbi/gc5Base.bw");' ######################################################################### # create downloads and pushQ entry (DONE - 2012-05-24 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=echTel1 -keys all.joiner cd /hive/data/genomes/echTel1 time makeDownloads.pl -workhorse hgwdev echTel1 > download.log 2>&1 # real 26m13.940s mkdir /hive/data/genomes/echTel1/pushQ cd /hive/data/genomes/echTel1/pushQ time makePushQSql.pl echTel1 > echTel1.sql 2> stderr.out # real 4m1.136s # check stderr.out for no significant problems, it is common to see: WARNING: hgwdev does not have /gbdb/echTel1/wib/gc5Base.wib WARNING: hgwdev does not have /gbdb/echTel1/wib/quality.wib WARNING: hgwdev does not have /gbdb/echTel1/bbi/quality.bw WARNING: echTel1 does not have seq WARNING: echTel1 does not have extFile WARNING: echTel1 does not have estOrientInfo WARNING: echTel1 does not have gbMiscDiff WARNING: echTel1 does not have gbWarn WARNING: echTel1 does not have mrnaOrientInfo WARNING: hgwdev does not have chain/net download /gbdb/echTel1/liftOver/echTel1ToMm7.over.chain.gz ! WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of supporting and genbank tables) which tracks to assign these tables to: tableList # Remove the Mm7 references from echTel1.sql and tableList is not # important, in fact, drop that table scp -p echTel1.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < echTel1.sql ######################################################################### # quality table loaded (DONE - 2008-11-24 - Hiram) mkdir /hive/data/genomes/echTel1/bed/qual cd /hive/data/genomes/echTel1/bed/qual qacToWig -fixed ../../broad/scaffolds.qac stdout \ | wigEncode stdin qual.wig qual.wib ln -s `pwd`/qual.wib /gbdb/echTel1/wib hgLoadWiggle -pathPrefix=/gbdb/echTel1/wib echTel1 quality qual.wig wigTableStats.sh echTel1 quality # db.table min max mean count sumData stdDev viewLimits echTel1.quality 0 68 24.872 3823724728 9.51037e+10 23.6258 viewLimits=0:68 ######################################################################### # fixup search rules for the gold track (DONE - 2012-06-04 - Hiram) # scan the names to see if there is a pattern: hgsql -N -e "select frag from gold;" echTel1 | sort | head contig_0 hgsql -N -e "select frag from gold;" echTel1 | sort | tail contig_99990 # the pattern appears to be: contig_[0-9]+ # this search rule in trackDb/tenrec/trackDb.ra seems to work: searchTable gold searchMethod exact searchType gold shortCircuit 1 termRegex contig_[0-9]+ searchPriority 8 ######################################################################### # fixup search rules for the ensemblGeneScaffold (DONE - 2012-06-04 - Hiram) # scan the names to see if there is a pattern: hgsql -N -e "select name from ensemblGeneScaffold;" echTel1 | sort | head GS_1.1 hgsql -N -e "select name from ensemblGeneScaffold;" echTel1 | sort | tail GS_999.6 # the pattern appears to be: GS_[0-9]+(\.[0-9]+)? # to get the option of searching for GS_999 or GS_999.6 # this search rule in trackDb/tenrec/trackDb.ra seems to work: searchTable ensemblGeneScaffold searchMethod exact searchType bed shortCircuit 1 termRegex GS_[0-9]+(\.[0-9]+)? searchPriority 18 ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-06-07 - Hiram) cd /hive/data/genomes/echTel1 twoBitToFa echTel1.2bit stdout | faSize stdin # 3823724728 bases (1712143359 N's 2111581369 real 1540894161 upper # 570687208 lower) in 325491 sequences in 1 files # Total size: mean 11747.6 sd 25476.7 min 80 (scaffold_153914) # max 739734 (scaffold_324606) median 3912 # %14.92 masked total, %27.03 masked real # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2111581369 / 2897316137 \) \* 1024 # ( 2111581369 / 2897316137 ) * 1024 = 746.297339 # round up to 800 cd /hive/data/genomes/echTel1 time blat echTel1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/echTel1.11.ooc -repMatch=800 # Wrote 19852 overused 11-mers to jkStuff/echTel1.11.ooc # real 3m3.527s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" echTel1 | sort | uniq -c # 495481 yes # cd /hive/data/genomes/echTel1/jkStuff # gapToLift echTel1 echTel1.nonBridged.lift -bedFile=echTel1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' echTel1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-06-07 - 2012-06-22 - Hiram) # since the original genbank run was done in 2005 and daily updates # were never run, start over with an initial run. # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Echinops telfairi 4 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # fixup existing entry: # echTel1 (tenrec) echTel1.serverGenome = /hive/data/genomes/echTel1/echTel1.2bit echTel1.clusterGenome = /scratch/data/echTel1/echTel1.2bit echTel1.ooc = /hive/data/genomes/echTel1/jkStuff/echTel1.11.ooc echTel1.lift = no echTel1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} echTel1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} echTel1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} echTel1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} echTel1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} echTel1.refseq.mrna.native.load = no echTel1.refseq.mrna.xeno.load = yes echTel1.genbank.mrna.xeno.load = no echTel1.genbank.est.native.load = no echTel1.genbank.mrna.native.load = no echTel1.genbank.mrna.native.loadDesc = no echTel1.downloadDir = echTel1 echTel1.perChromTables = no git commit -m "add ooc file for tenrec/echTel1" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S echTel1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial echTel1 & # var/build/logs/2012.06.22-12:25:58.echTel1.initalign.log # real 277m20.096s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad echTel1 & # logFile: var/dbload/hgwdev/logs/2012.06.26-11:05:28.dbload.log # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add echTel1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added echTel1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # SWAP LASTZ Human hg19 (DONE - 2012-07-02 - Hiram) # original alignment to hg19: cd /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29 cat fb.hg19.chainEchTel1Link.txt # 670299345 bases of 2897316137 (23.135%) in intersection # and for the swap mkdir /hive/data/genomes/echTel1/bed/blastz.hg19.swap cd /hive/data/genomes/echTel1/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 405m49.935s cat fb.echTel1.chainMm10Link.txt # 659524096 bases of 2111581369 (31.234%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel1/bed ln -s blastz.hg19.swap lastz.hg19 ##############################################################################