# for emacs: -*- mode: sh; -*- # Heterocephalus glaber (naked mole-rat) # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_1.0/ # Project ID: 68323 # http://www.ncbi.nlm.nih.gov/bioproject/68323 # http://www.ncbi.nlm.nih.gov/nuccore?term=AFSB00000000[pacc] # http://www.ncbi.nlm.nih.gov/nuccore/AFSB00000000.1 # Beijing Genomics Institute, Shenzhen # Whole Genome Shotgun sequencing project: # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AFSB00 # WGS: AFSB01000001:AFSB01273990 # Scaffold: JH165514:JH204779 ########################################################################## # Download sequence (DONE - 2011-12-01 Chin) mkdir /hive/data/genomes/hetGla1 cd /hive/data/genomes/hetGla1 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_1.0/*" # FINISHED --2011-12-01 11:18:35-- # Downloaded: 20 files, 1003M in 35m 12s (486 KB/s) # Read ASSEMBLY_INFO faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2643961837 bases (213913418 N's 2430048419 real 2430048419 upper 0 lower) # in 39266 sequences in 1 files zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | grep "^#" > ucsc.agp # remove the .1 accession versions from the scaffold/contig names # our database tables can not function with periods in the scaffold names zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | grep -v "^#" | sed -e "s/\.1//" >> ucsc.agp gzip ucsc.agp # extract the scaffold/contig name and remove the .1 accession # version zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*JH/>JH/; s/^>.*AFSB01/>AFSB01/; s/\.1.*//" \ > ucsc.fa gzip ucsc.fa # verify nothing lost in the translation, should be same numbers as above faSize ucsc.fa.gz # 2643961837 bases (213913418 N's 2430048419 real 2430048419 upper 0 lower) # in 39266 sequences in 1 files # verify the names are OK zcat ucsc.fa.gz | grep "^>" | sort | head zcat ucsc.fa.gz | grep "^>" | sort | tail # head:tail = JH165514:JH204779 # N50 mkdir N50 gunzip ucsc.fa.gz faCount ucsc.fa | awk ' {print $1, $2}' | grep "^JH" > N50/chrom.sizes n50.pl N50/chrom.sizes # reading: N50/chrom.sizes # contig count: 39266, total size: 2643961837, one half size: 1321980918 # cumulative N50 count contig contig size 1321132235 501 JH170059 1603897 1321980918 one half size 1322735412 502 JH170231 1603177 ######################################################################### # Initial database build (DONE - 2011-12-02 - Chin) cd /hive/data/genomes/hetGla1 cat << '_EOF_' > hetGla1.config.ra # Config parameters for makeGenomeDb.pl: db hetGla1 clade mammal genomeCladePriority 35 scientificName Heterocephalus glaber commonName naked mole-rat assemblyDate Jul. 2011 assemblyLabel HetGla_1.0 (NCBI project 68323, accession GCA_000230455.1) assemblyShortLabel HetGla_1.0 orderKey 165 mitoAcc NC_015112 fastaFiles /hive/data/genomes/hetGla1/genbank/ucsc.fa.gz agpFiles /hive/data/genomes/hetGla1/genbank/ucsc.agp.gz # qualFiles none dbDbSpeciesDir moleRat taxId 10181 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev hetGla1.config.ra \ > makeGenomeDb.log 2>&1 & # real 19m42.489s # add the trackDb entries to the source tree, and the 2bit link: # we will re-ln with masked one after we done the mask business later! *** ln -s `pwd`/hetGla1.unmasked.2bit /gbdb/hetGla1/hetGla1.2bit XXXX TODO - replace *** use actual url for project etc.. in all html files # Per instructions in makeGenomeDb.log: # Search for '***' notes in each file in and make corrections (sometimes the # files used for a previous assembly might make a better template): # description.html /cluster/data/hetGla1/html/{trackDb.ra,gap.html,gold.html} # # Then copy these files to your ~/kent/src/hg/makeDb/trackDb/moleRat/hetGla1 # - cd ~/kent/src/hg/makeDb/trackDb # - edit makefile to add hetGla1 to DBS. # - git add moleRat/hetGla1/*.{ra,html} # - git commit -m "Added hetGla1 to DBS." makefile # - git commit -m "Initial descriptions for hetGla1." moleRat/hetGla1/*.{ra,html} # - git pull; git push # - Run make update DBS=hetGla1 and make alpha when done. # - (optional) Clean up /cluster/data/hetGla1/TemporaryTrackDbCheckout ######################################################################### # RepeatMasker (DONE 2011-12-02 - Chin) mkdir /hive/data/genomes/hetGla1/bed/repeatMasker cd /hive/data/genomes/hetGla1/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit hetGla1 > do.log 2>&1 & # real 581m17.442s cat faSize.rmsk.txt # 2643978223 bases (213913418 N's 2430064805 real 1624736547 upper # 805328258 lower) in 39267 sequences in 1 files # %30.46 masked total, %33.14 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps hetGla1 rmsk # 812021314 bases of 2643978223 (30.712%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ######################################################################### # simpleRepeats (DONE - 2011-12-06 - Chin) mkdir /hive/data/genomes/hetGla1/bed/simpleRepeat cd /hive/data/genomes/hetGla1/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \ hetGla1 > do.log 2>&1 & # real 68m40.246s cat fb.simpleRepeat # 50055772 bases of 2430064805 (2.060%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/hetGla1 twoBitMask hetGla1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed hetGla1.2bit # safe to ignore warnings about >=13 fields twoBitToFa hetGla1.2bit stdout | faSize stdin > hetGla1.2bit.faSize.txt cat hetGla1.2bit.faSize.txt # 2643978223 bases (213913418 N's 2430064805 real 1624077548 upper # 805987257 lower) in 39267 sequences in 1 files # %30.48 masked total, %33.17 masked real # double check with featureBits featureBits -countGaps hetGla1 gap # 213913418 bases of 2643978223 (8.091%) in intersection rm /gbdb/hetGla1/hetGla1.2bit ln -s `pwd`/hetGla1.2bit /gbdb/hetGla1/hetGla1.2bit ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2011-12-06 - Chin) mkdir /hive/data/genomes/hetGla1/bed/allGaps cd /hive/data/genomes/hetGla1/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../hetGla1.unmasked.2bit > findMotif.txt 2>&1 # real 0m30.369s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits hetGla1 -not gap -bed=notGap.bed # 2430064805 bases of 2430064805 (100.000%) in intersection featureBits hetGla1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2430064805 (0.000%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" hetGla1 | sort -n | tail -1 # 1320 # use tcsh and ctrl-c to create the here doc cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" hetGla1 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps hetGla1 other.bed # 0 bases of 2643978223 (0.000%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad hetGla1 otherGap other.bed # Loaded 0 elements of size 3 # adding this many: hgsql -e "select count(*) from gap;" hetGla1 # 234724 # Since there is no line in other.bed, skip the add new gap # hgsql hetGla1 -e 'load data local infile "bed.tab" into table gap;' # result count: # hgsql -e "select count(*) from gap;" hetGla1 ########################################################################## ## WINDOWMASKER (DONE - 2011-12-06 - Chin) mkdir /hive/data/genomes/hetGla1/bed/windowMasker cd /hive/data/genomes/hetGla1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev hetGla1 > do.log 2>&1 & # real 187m50.729s # Masking statistics twoBitToFa hetGla1.wmsk.2bit stdout | faSize stdin # 2643978223 bases (213913418 N's 2430064805 real 1648910110 upper # 781154695 lower) in 39267 sequences in 1 files # %29.54 masked total, %32.15 masked real twoBitToFa hetGla1.wmsk.sdust.2bit stdout | faSize stdin # 2643978223 bases (213913418 N's 2430064805 real 1637135710 upper # 792929095 lower) in 39267 sequences in 1 files # %29.99 masked total, %32.63 masked real hgLoadBed hetGla1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 14228036 elements of size 3 featureBits -countGaps hetGla1 windowmaskerSdust # 1006842513 bases of 2643978223 (38.081%) in intersection # eliminate the gaps from the masking featureBits hetGla1 -not gap -bed=notGap.bed # 2430064805 bases of 2430064805 (100.000%) in intersection time nice -n +19 featureBits hetGla1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 792929095 bases of 2430064805 (32.630%) in intersection # real 26m11.675s # reload track to get it clean hgLoadBed hetGla1 windowmaskerSdust cleanWMask.bed.gz # Loaded 14308684 elements of size 4 featureBits -countGaps hetGla1 windowmaskerSdust # 792929095 bases of 2643978223 (29.990%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../hetGla1.unmasked.2bit stdin \ -type=.bed hetGla1.cleanWMSdust.2bit twoBitToFa hetGla1.cleanWMSdust.2bit stdout | faSize stdin \ > hetGla1.cleanWMSdust.faSize.txt cat hetGla1.cleanWMSdust.faSize.txt # 2643978223 bases (213913418 N's 2430064805 real 1637135710 upper # 792929095 lower) in 39267 sequences in 1 files # %29.99 masked total, %32.63 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps hetGla1 rmsk windowmaskerSdust # 408993748 bases of 2643978223 (15.469%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2011-12-07 - Chin) cd /hive/data/genomes/hetGla1 twoBitMask -add bed/windowMasker/hetGla1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed hetGla1.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa hetGla1.2bit stdout | faSize stdin > faSize.hetGla1.txt cat faSize.hetGla1.txt # 2643978223 bases (213913418 N's 2430064805 real 1636844037 upper # 793220768 lower) in 39267 sequences in 1 files # %30.00 masked total, %32.64 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/hetGla1/hetGla1.2bit ln -s `pwd`/hetGla1.2bit /gbdb/hetGla1/hetGla1.2bit ######################################################################## # Create kluster run files (DONE - 2011-12-07 - Chin) # numerator is hetGla1 gapless bases "real" as reported by: featureBits -noRandom -noHap hetGla1 gap # 213913418 bases of 2430064805 (8.803%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2430064805 / 2861349177 \) \* 1024 ( 2430064805 / 2861349177 ) * 1024 = 869.654910 # ==> use -repMatch=850 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/hetGla1 blat hetGla1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/hetGla1.11.ooc \ -repMatch=850 & # Wrote 33104 overused 11-mers to jkStuff/hetGla1.11.ooc mkdir /hive/data/staging/data/hetGla1 cp -p hetGla1.2bit jkStuff/hetGla1.11.ooc /hive/data/staging/data/hetGla1 cp -p chrom.sizes /hive/data/staging/data/hetGla1 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' hetGla1 \ | sort -k7,7nr # all gaps are bridged hgsql -e "select bridge from gap;" hetGla1 | sort | uniq # bridge # yes # most non-bridges gaps have size = 100, follow pig's example (5000) # decide on a minimum gap for this break # skip the following steps # gapToLift -verbose=2 -minGap=100 hetGla1 jkStuff/nonBridged.lft \ # -bedFile=jkStuff/nonBridged.bed # cp -p jkStuff/nonBridged.lft \ # /hive/data/staging/data/hetGla1/hetGla1.nonBridged.lft # ask cluster-admin to copy (evry time if any file changed) # /hive/data/staging/data/hetGla1 directory to # /scratch/data/hetGla1 on cluster nodes # before porceed to genbank step ######################################################################## # GENBANK AUTO UPDATE (DONE - 2011-12-12 - Chin) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # check /cluster/data/genbank/data/organism.lst # to find out number of RefSeqs, genbank mRNAs, and ESTs available # for hetGla1 first # organism mrnaCnt estCnt refSeqCnt # Heterocephalus glaber 39 3 0 # Use turkey (redmine 1577) as template # Meleagris gallopavo 229 17443 0 # edit etc/genbank.conf to add hetGla1 after rn4 # hetGla1 (moleRat) hetGla1.serverGenome = /hive/data/genomes/hetGla1/hetGla1.2bit hetGla1.clusterGenome = /scratch/data/hetGla1/hetGla1.2bit hetGla1.ooc = /scratch/data/hetGla1/hetGla1.11.ooc hetGla1.lift = no hetGla1.perChromTables = no hetGla1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} hetGla1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} hetGla1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} hetGla1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} hetGla1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} hetGla1.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} hetGla1.downloadDir = hetGla1 hetGla1.downloadDir = hetGla1 hetGla1.phalus glaber.mrna.native.load = no hetGla1.refseq.mrna.xeno.load = yes hetGla1.refseq.mrna.xeno.loadDesc = yes hetGla1.genbank.mrna.native.load = yes hetGla1.genbank.mrna.native.loadDesc = yes hetGla1.genbank.mrna.xeno.load = yes hetGla1.genbank.mrna.xeno.loadDesc = yes hetGla1.genbank.est.native.load = yes hetGla1.genbank.est.native.loadDesc = yes git add etc/genbank.conf git commit -m "Added hetGla1" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. With these two lines: # static char *hetGlaNames[] = {"Heterocephalus glaber", NULL}; # ... later ... # {"hetGla", hetGlaNames}, # gbGenome.c is in # /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib # make and checkin make install-server git add src/lib/gbGenome.c git commit -m "adding hetGla1 moleRat" src/lib/gbGenome.c git pull git push ssh hgwdev # used to do this on "genbank" machine screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial hetGla1 & # logFile: var/build/logs/2011.12.13-10:43:43.hetGla1.initalign.log # real 1029m23.335s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hetGla1 & # logFile: var/dbload/hgwdev/logs/2011.12.16-08:48:35.dbload.log # real 193m46.127s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add hetGla1 to: etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added hetGla1 - moleRat" etc/align.dbs etc/hgwdev.dbs git pull git push make etc-update ######################################################################### # reset default position as JH169644:1002890-1003079 (DONE 2012-01-04 - Chin) # for hox1- (NM_214560) homeobox protein HB1 # hgsql -e \ 'update dbDb set defaultPos="JH169644:1002890-1003079" where name="hetGla1";' \ hgcentraltest ############################################################################ # running cpgIsland business (DONE -2011-12-17 - Chin) mkdir /hive/data/genomes/hetGla1/bed/cpgIsland cd /hive/data/genomes/hetGla1/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../hetGla1.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/hetGla1/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep "^JH\|^chr" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # para time # 39267 jobs in batch # 18 jobs (including everybody's) in Parasol queue or running. # Checking finished jobs # Completed: 39267 of 39267 jobs # CPU time in finished jobs: 190s 3.17m 0.05h 0.00d 0.000 y # IO & Wait Time: 100994s 1683.23m 28.05h 1.17d 0.003 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 6s 0.10m 0.00h 0.00d # Submission to last job: 123s 2.05m 0.03h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/hetGla1/bed/cpgIsland hgLoadBed hetGla1 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 31524 elements of size 10 # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading hetGla1 # cleanup rm -fr hardMaskedFa ########################################################################## # BLATSERVERS ENTRY (DONE 2011-01-05 - Chin) # After getting a blat server assigned by the Blat Server Gods, # # blat5 # ports 17784 (trans) and 17785 (untrans). hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hetGla1", "blat5", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hetGla1", "blat5", "17785", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # GENSCAN GENE PREDICTIONS (DONE 2012-02-02 - Chin) mkdir /hive/data/genomes/hetGla1/bed/genscan cd /hive/data/genomes/hetGla1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/genscanlinux # create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../hetGla1.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig # chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 39267 genome.list # Run on small cluster (more mem than big cluster). ssh swarm cd /hive/data/genomes/hetGla1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line # file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Completed: 39267 of 39267 jobs # CPU time in finished jobs: 38098s 634.96m 10.58h 0.44d 0.001 y # IO & Wait Time: 102541s 1709.02m 28.48h 1.19d 0.003 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 225s 3.75m 0.06h 0.00d # Submission to last job: 600s 10.00m 0.17h 0.01d # Make sure all files ended with linefeed # this did not work, runs out of file handles ? # run endsInLf in batch of 256 files find ./gtf -type f | xargs -n 256 endsInLf -zeroOk # Concatenate results: cd /hive/data/genomes/hetGla1/bed/genscan find ./gtf -type f | xargs cat > genscan.gtf find ./pep -type f | xargs cat > genscan.pep find ./subopt -type f | xargs cat > genscanSubopt.bed # Load into the database (without -genePredExt because no frame # info): ssh hgwdev cd /hive/data/genomes/hetGla1/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp # this produces exactly the same thing and loads the table: ldHgGene -gtf hetGla1 genscan genscan.gtf # Read 41311 transcripts in 271899 lines in 1 files # 41311 groups 5593 seqs 1 sources 1 feature types # 41311 gene predictions # ???? Don't load the Pep anymore -- redundant since it's from genomic. hgPepPred hetGla1 generic genscanPep genscan.pep # Processing genscan.pep hgLoadBed hetGla1 genscanSubopt genscanSubopt.bed # Reading genscanSubopt.bed # Read 504519 elements of size 6 from genscanSubopt.bed featureBits hetGla1 genscan # 62125158 bases of 1358334882 (4.574%) in intersection # previously: ######################################################################### # all.joiner update, downloads and in pushQ - ( DONE 2012-04-18 - Chin) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output # add hetGla after myoLuc joinerCheck -database=hetGla1 -all all.joiner mkdir /hive/data/genomes/hetGla1/goldenPath cd /hive/data/genomes/hetGla1/goldenPath makeDownloads.pl hetGla1 > do.log 2>&1 # now ready for pushQ entry mkdir /hive/data/genomes/hetGla1/pushQ cd /hive/data/genomes/hetGla1/pushQ makePushQSql.pl hetGla1 > hetGla1.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/hetGla1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/hetGla1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/hetGla1/bbi/quality.bw # WARNING: hetGla1 does not have seq # WARNING: hetGla1 does not have extFile # # copy it to hgwbeta scp -p hetGla1.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < hetGla1.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly ####################################################################### # LIFTOVER TO hetGla2 (DONE - 2012-07-23 - Hiram ) mkdir /hive/data/genomes/hetGla1/bed/blat.hetGla2.2012-07-23 cd /hive/data/genomes/hetGla1/bed/blat.hetGla2.2012-07-23 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/hetGla1/jkStuff/hetGla1.11.ooc hetGla1 hetGla2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/hetGla1/jkStuff/hetGla1.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ hetGla1 hetGla2 > do.log 2>&1 # real 195m33.266s ############################################################################# # lastz swap Mouse Mm10 (DONE - 2012-04-15 - Hiram) # original alignment cd /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 cat fb.mm10.chainHetGla2Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # and this swap mkdir /hive/data/genomes/hetGla2/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.hetGla2.chainMm10Link.txt # 879356778 bases of 2314771103 (37.989%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla2/bed ln -s blastz.mm10.swap lastz.mm10 ##############################################################################