# for emacs: -*- mode: sh; -*- # Felis Catus (domestic cat) -- Broad/Agencourt release 3.0 (March 2006) # file template copied from dm3.txt ######################################################################### # DOWNLOAD SEQUENCE (DONE Sep. 14, 2006 Heather) ssh kkstore05 mkdir /cluster/store12/felCat3 ln -s /cluster/store12/felCat3 /cluster/data/felCat3 mkdir /cluster/data/felCat3/downloads cd /cluster/data/felCat3/downloads wget "ftp://ftp.broad.mit.edu/pub/assemblies/mammals/cat/felCat3*" # trimHeader in fasta and qual files mkdir /cluster/data/felCat3/fixup zcat ../downloads/Draft_v3.agp.chromosome.fasta.gz | /cluster/home/heather/kent/src/hg/snp/snpLoad/trimHeader stdin mv trimHeader.out fasta.fix gzip fasta.fix zcat ../downloads/Draft_v3.agp.chromosome.qual.gz | /cluster/home/heather/kent/src/hg/snp/snpLoad/trimHeader stdin mv trimHeader.out qual.fix gzip qual.fix ######################################################################### # MAKE GENOME DB (DONE Oct. 5, 2006 heather) ssh kkstore05 cd /cluster/data/dm3 cat > felCat3.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log ######################################################################### # REPEATMASKER (DONE Oct 2006 Heather) ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: ~/kent/src/utils/doRepeatMasker.pl felCat3 -verbose 3 -debug # Run it for real and tail the log: ~/kent/src/utils/doRepeatMasker.pl felCat3 -verbose 3 \ >& /cluster/data/felCat3/bed/RepeatMasker.2006-07-31/do.log & tail -f /cluster/data/felCat3/bed/RepeatMasker.2006-07-31/do.log # RepeatMasker and lib version from do.log: # March 20 2006 (open-3-1-5) version of RepeatMasker #CC RELEASE 20060315; * # coverage featureBits felCat3 rmsk 618949830 bases of 1642698377 (37.679%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE Oct 2006 Heather) ssh kkr1u00 mkdir /cluster/data/felCat3/bed/simpleRepeat cd /cluster/data/felCat3/bed/simpleRepeat twoBitToFa ../../felCat3.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # 8 hours to run # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed felCat3 simpleRepeat \ /cluster/data/felCat3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Coverage featureBits felCat3 simpleRepeat 47819770 bases of 1642698377 (2.911%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE Oct 17, 2006 heather) ssh kolossus cd /cluster/data/felCat3 time twoBitMask felCat3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed felCat3.2bit # This warning is OK -- the extra fields are not block coordinates. #Warning: BED file bed/simpleRepeat/trfMask.bed has 10 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #4.857u 5.050s 0:32.63 30.3% 0+0k 0+0io 5pf+0w # Link to it from /gbdb: ssh hgwdev ln -s /cluster/data/felCat3/felCat3.2bit /gbdb/felCat3/felCat3.2bit ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE Heather Oct. 20, 2006) cd /cluster/data/felCat3 ln -s /cluster/data/felCat3/bed/RepeatMasker.2006-10-16/felCat3.fa.out /cluster/data/felCat3 ~/kent/src/utils/automation/makeDownloads.pl felCat3 -verbose=2 >& jkStuff/downloads.log & tail -f jkStuff/downloads.log # Edit README.txt files when done: # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/felCat3/goldenPath/database/README.txt # /cluster/data/felCat3/goldenPath/bigZips/README.txt ######################################################################### # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE Oct 20, 2006 heather) # pitakluster: ssh pk mkdir /san/sanvol1/scratch/felCat3 cp /cluster/data/felCat3/felCat3.2bit /san/sanvol1/scratch/felCat3/ cp /cluster/data/felCat3/chrom.sizes /san/sanvol1/scratch/felCat3/ mkdir /san/sanvol1/scratch/felCat3/rmsk cp -p /cluster/data/felCat3/felCat3.fa.out /san/sanvol1/scratch/felCat3/rmsk # small cluster: ssh kkr1u00 mkdir -p /iscratch/i/felCat3 cp -p /cluster/data/felCat3/felCat3.2bit . cp -p /cluster/data/felCat3/chrom.sizes . iSync ######################################################################### # BLASTZ/CHAIN/NET HG18 (Done Nov 09 2006 heather) # Do target = hg18 first because felCat3 is 200K scaffolds (documented in hg18.txt) # Then do swap mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.hg18.2006-11-09/DEF -swap >& do.log & # netToAxt took over 4 hours nice featureBits felCat3 chainHg18Link # 1014983843 bases of 1642698377 (61.788%) in intersection ######################################################################### # BLASTZ/CHAIN/NET MM8 (Done Nov 09 2006 heather) # Do target = mm8 first (documented in mm8.txt) mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.mm8.2006-11-14/DEF -swap >& do.log & nice featureBits felCat3 chainMm8Link # 486204459 bases of 1642698377 (29.598%) in intersection ######################################################################### # BLASTZ/CHAIN/NET CANFAM2 (Done Nov 18 2006 heather) # Do target = canFam2 first (documented in canFam2.txt) mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap doBlastzChainNet.pl /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16/DEF -swap >& do.log & nice featureBits felCat3 chainCanFam2Link # 1209933048 bases of 1642698377 (73.655%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE Oct 24 2006 heather) # Use -repMatch=600 (based on size -- for human we use 1024, and # cat size is 57% of human judging by twoBitInfo -noNs) ssh kolossus blat /cluster/data/felCat3/felCat3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/felCat3/11.ooc -repMatch=600 # Wrote 18561 overused 11-mers to /cluster/bluearc/felCat3/11.ooc ssh kkr1u00 cd /iscratch/i/felCat3 cp -p /cluster/bluearc/felCat3/11.ooc . iSync ######################################################################### # GENBANK AUTO UPDATE (DONE Nov 2006 heather) # Don't need a liftAll.lft file # Guidance from Mark: # Genbank does its own partitioning. The only thing the lift file is used # for is locating gaps. It optimizes by splitting on largish gaps. # The current threshold is 3000000. If you don't have any gaps over # this threshold, you don't need a lift file. With 200K scaffolds, # you can probably just skip this. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add felCat3 # check data/organism.lst for counts of native mrna, est, refseq # organism mrnaCnt estCnt refSeqCnt # Felis catus 593 921 219 # since small yield, include xeno mrna # always do xeno refseq # never do xeno est # felCat3 (cat) 217790 scaffolds felCat3.serverGenome = /cluster/data/felCat3/felCat3.2bit felCat3.clusterGenome = /iscratch/i/felCat3/felCat3.2bit felCat3.ooc = /iscratch/i/felCat3/11.ooc felCat3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} felCat3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} felCat3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} felCat3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} felCat3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} felCat3.lift = no felCat3.refseq.mrna.xeno.load = yes felCat3.downloadDir = felCat3 felCat3.perChromTables = no # edit src/lib/gbGenome.c # static char *felCatNames[] = {"Felis catus", NULL}; # static struct dbToSpecies dbToSpeciesMap[] = ... {"felCat", felCatNames, NULL}, ... cvs commit -m "Added cat" etc/genbank.conf src/lib/gbGenome.c make install-server ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial felCat3 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad felCat3 & # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add felCat3 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update ######################################################################### # CREATE MICROSAT TRACK (DONE Nov. 2006 Heather) ssh hgwdev cd /cluster/data/felCat3/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed felCat3 microsat microsat.bed ######################################################################### # GENSCAN (DONE Nov. 2006 Heather) # Assume none of the 200K scaffolds are all Ns # (These cause Genscan to run forever) ssh hgwdev mkdir /cluster/data/felCat3/bed/genscan cd /cluster/data/felCat3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt cvs co hg3rdParty/genscanlinux # generate hard-masked sequence ssh kkstore05 cd /cluster/data/felCat3/bed/genscan zcat /cluster/data/felCat3/goldenPath/bigZips/felCat3.fa.gz | maskOutFa stdin hard felCat3.hardmask.fa mkdir split cd split faSplit about ../felCat3.hardmask.fa 2000000 split # This yields 1939 files # Generate file list and check that no files are completed masked cd .. foreach f ( /cluster/data/felCat3/bed/genscan/split/*.fa ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # 1939 genome.list # Log into kki (not kk!). kki is the driver node for the small # cluster (kkr1u00-kkr8u00). Genscan has a problem running on the # big cluster, due to limited memory and swap space on each # processing node. ssh kki cd /cluster/data/felCat3/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy /parasol/bin/gensub2 genome.list single gsub jobList para create jobList para try para check para push # Completed: 1939 of 1939 jobs # CPU time in finished jobs: 119320s 1988.66m 33.14h 1.38d 0.004 y # IO & Wait Time: 7219s 120.32m 2.01h 0.08d 0.000 y # Average job time: 65s 1.09m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 242s 4.03m 0.07h 0.00d # Submission to last job: 10683s 178.05m 2.97h 0.12d # Concatenate ssh kkstore05 cd /cluster/data/felCat3/bed/genscan cat gtf/*.gtf > genscan.gtf cat pep/*.pep > genscan.pep cat subopt/*.bed > genscanSubopt.bed # Load into the database ssh hgwdev cd /cluster/data/felCat3/bed/genscan ldHgGene -gtf felCat3 genscan genscan.gtf # Reading genscan.gtf # Read 75440 transcripts in 283819 lines in 1 files # 75440 groups 54047 seqs 1 sources 1 feature types # 75440 gene predictions hgPepPred felCat3 generic genscanPep genscan.pep hgLoadBed felCat3 genscanSubopt genscanSubopt.bed featureBits felCat3 genscan # 46616293 bases of 1642698377 (2.838%) in intersection featureBits felCat3 genscanSubopt # 53124220 bases of 1642698377 (3.234%) in intersection # Should be zero intersection with rmsk featureBits felCat3 genscan rmsk # 3334 bases of 1642698377 (0.000%) in intersection ######################################################################### # CpG Islands (DONE Nov 22, 2006 Heather) ssh kkstore05 cd /cluster/data/felCat3/bed mkdir cgpIsland cd cgpIsland zcat ../../goldenPath/bigZips/felCat3.fa.masked.gz | faSplit about stdin 2000000 split ssh hgwdev cd /cluster/data/felCat3/bed/cgpIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/felCat3/bed/cpgIsland/ ssh kkstore05 # could also have used kolossus cd /cluster/data/felCat3/bed/cpgIsland cat << '_EOF_' > run.csh #!/bin/csh -ef foreach f (split/*) set fout=$f:t:r:r.cgp echo $fout ./cpglh.exe $f > $fout end '_EOF_' # << this line makes emacs coloring happy run.csh # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk x*.cpg > cpgIsland.bed # load into database ssh hgwdev cd /cluster/data/felCat3/bed/cpgIsland hgLoadBed felCat3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 45262 elements of size 10 wc -l cpgIsland.bed # 45262 cpgIsland.bed featureBits felCat3 cpgIslandExt # 30893611 bases of 1642698377 (1.881%) in intersection ######################################################################### # Build contigAcc table (DONE, Heather, Jan 2007) # I found the contig accessions at # http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Nucleotide&cmd=Search&term=AANG01000001:AANG01817956[PACC] # They have a regular pattern: # contig_0 --> AANG01000001 # contig_1 --> AANG01000002 # contig_817955 --> AANG01817956 ssh kkstore05 cd /cluster/data/felCat3 cat << '_EOF_' > contigAcc.csh #!/bin/tcsh set contig = 0 set acc = 1000001 while ($contig < 817956) echo "contig_$contig\tAANG0$acc" set contig=`expr $contig + 1` set acc=`expr $acc + 1` end '_EOF_' # << this line makes emacs coloring happy ./contigAcc.csh > contigAcc.out ssh hgwdev cd /cluster/data/felCat3 hgsql felCat3 < contigAcc.sql hgsql -e 'load data local infile "contigAcc.out" into table contigAcc' ########################################################################### # HUMAN (hg18) PROTEINS TRACK (in progress braney 2007-01-29) ssh kkstore05 bash # if not using bash shell already mkdir /cluster/data/felCat3/blastDb cd /cluster/data/felCat3 zcat fixup/fasta.fix.gz > temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/felCat3/blastDb cd /cluster/data/felCat3/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/felCat3/blastDb done mkdir -p /cluster/data/felCat3/bed/tblastn.hg18KG cd /cluster/data/felCat3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/felCat3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 499 query.lst # we want around 200000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(200000/499) = 91.633865 mkdir -p /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/felCat3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/felCat3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # back to bash exit ssh pk cd /cluster/data/felCat3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 134439 of 204091 jobs # Crashed: 69652 jobs # CPU time in finished jobs: 22099258s 368320.97m 6138.68h 255.78d 0.701 y # IO & Wait Time: 2034194s 33903.23m 565.05h 23.54d 0.065 y # Average job time: 180s 2.99m 0.05h 0.00d # Longest finished job: 1226s 20.43m 0.34h 0.01d # Submission to last job: 288017s 4800.28m 80.00h 3.33d # Completed: 69652 of 69652 jobs # CPU time in finished jobs: 9130572s 152176.19m 2536.27h 105.68d 0.290 y # IO & Wait Time: 396459s 6607.66m 110.13h 4.59d 0.013 y # Average job time: 137s 2.28m 0.04h 0.00d # Longest finished job: 420s 7.00m 0.12h 0.00d # Submission to last job: 52132s 868.87m 14.48h 0.60d ssh kkstore05 cd /cluster/data/felCat3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/felCat3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 409 of 409 jobs # CPU time in finished jobs: 3862s 64.37m 1.07h 0.04d 0.000 y # IO & Wait Time: 164467s 2741.12m 45.69h 1.90d 0.005 y # Average job time: 412s 6.86m 0.11h 0.00d # Longest finished job: 877s 14.62m 0.24h 0.01d # Submission to last job: 5693s 94.88m 1.58h 0.07d ssh kkstore05 cd /cluster/data/felCat3/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/felCat3/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck blastHg18KG.psl # this is ok. # load table ssh hgwdev cd /cluster/data/felCat3/bed/tblastn.hg18KG hgLoadPsl felCat3 blastHg18KG.psl # check coverage featureBits felCat3 refGene:cds blastHg18KG -enrichment # refGene:cds 0.013%, blastHg18KG 0.926%, both 0.008%, cover 60.78%, enrich 65.60x ssh kkstore05 rm -rf /cluster/data/felCat3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/felCat3/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # AUGUSTUS ab initio track (DONE, mario 2007-01-30) ssh hgwdev mkdir /cluster/data/felCat3/bed/augustus cd /cluster/data/felCat3/bed/augustus # get the program AUGUSTUS from the augustus web server wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz tar xzf augustus.2.0.1.src.tar.gz # compile the binary if necessary cd augustus/src make augustus # create output and error directory cd /cluster/data/felCat3/bed/augustus mkdir out err ssh kkstore cd /cluster/data/felCat3/bed/augustus # use the split fasta files that were already generated for genscan, see above # (completely masked files are no problem) ls ../genscan/split/*.fa >> genome.lst # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/felCat3/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0 {check in line+ $(path1)} --outfile={check out line out/$(root1).gtf} --errfile=err/$(root1).err #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # create the job list ssh pk cd /cluster/data/felCat3/bed/augustus /parasol/bin/gensub2 genome.list single gsub jobList wc -l jobList # 1939 para try para check para push # CPU time in finished jobs: 4923552s 82059.20m 1367.65h 56.99d 0.156 y # IO & Wait Time: 9930s 165.50m 2.76h 0.11d 0.000 y # Average job time: 2544s 42.41m 0.71h 0.03d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 4050s 67.50m 1.12h 0.05d # Submission to last job: 45408s 756.80m 12.61h 0.53d # check the error files, should be no errors cat err/*.err # no errors, delete the empty files rm err/*.err cat out/*.gtf | augustus/scripts/join_aug_pred.pl > augustus.pep.gff augustus/scripts/getAnnoFasta.pl augustus.pep.gff cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff # load into database ssh hgwdev cd /cluster/data/felCat3/bed/augustus/ ldHgGene -bin felCat3 augustus augustus.gff # 28510 gene predictions hgPepPred felCat3 generic augustusPep augustus.pep.aa featureBits felCat3 augustus # 21748832 bases of 1642698377 (1.324%) in intersection ############################################################################ # GENEID GENES (DONE - 2007-02-01 - Hiram) ## re-done with new data 2007-02-05 - Hiram ssh kkstore05 mkdir /cluster/data/felCat3/bed/geneid cd /cluster/data/felCat3/bed/geneid wget --timestamping \ "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/geneidv1.2/felCat3.gtf" \ -O "felCat3.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/geneidv1.2/felCat3.prot" \ -O "felCat3.prot" # Add missing .1 to protein id's perl -wpe 's/^(>scaffold\w+)$/$1.1/' felCat3.prot > felCat3-fixed.prot ssh hgwdev cd /cluster/data/felCat3/bed/geneid # load predictions nice -n +19 ldHgGene -genePredExt -gtf felCat3 geneid felCat3.gtf # Read 204448 transcripts in 509279 lines in 1 files # 204448 groups 188121 seqs 1 sources 3 feature types # load protein sequences nice -n +19 hgPepPred felCat3 generic geneidPep felCat3-fixed.prot # after loading with new data set: nice -n +19 featureBits felCat3 -enrichment refGene:CDS geneid # refGene:CDS 0.013%, geneid 2.321%, both 0.009%, cover 71.49%, # enrich 30.80x nice -n +19 featureBits felCat3 -enrichment genscan:CDS geneid # genscan:CDS 2.838%, geneid 2.321%, both 1.395%, cover 49.15%, # enrich 21.17x ## With the previous data set nice -n +19 featureBits felCat3 -enrichment refGene:CDS geneid # refGene:CDS 0.013%, geneid 1.440%, both 0.004%, cover 31.81%, # enrich 22.09x nice -n +19 featureBits felCat3 -enrichment genscan:CDS geneid # genscan:CDS 2.838%, geneid 1.440%, both 0.816%, cover 28.76%, # enrich 19.97x ########################################################################## # NSCAN track - (2007-03-08 markd) # uses hg18 as informat, with pseudogene masking # cd /cluster/data/felCat3/bed/nscan/ # obtainedf NSCAN predictions from michael brent's group # at WUSTL wget -nv http://mblab.wustl.edu/predictions/cat/felCat3.gtf wget -nv http://mblab.wustl.edu/predictions/cat/felCat3/felCat3.ptx.fa gzip -9 felCat3.* chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt felCat3 nscanGene felCat3.gtf.gz hgPepPred felCat3 generic nscanPep felCat3.ptx.fa.gz rm -f *.tab # update trackDb; need a felCat3-specific page to describe informants cat/felCat3/nscanGene.html cat/felCat3/trackDb.ra ########################################################################## # SYNTENIC NETS (Heather, Mar 2007) ssh hgwdev cd /cluster/data/felCat3/bed cd blastz.hg18.2006-11-10.swap/axtChain netFilter -syn felCat3.hg18.net.gz | hgLoadNet felCat3 netSyntenyHg18 stdin featureBits -countGaps felCat3 refGene:cds netHg18 -enrichment # refGene:cds 0.005%, netHg18 66.952%, both 0.005%, cover 99.78%, enrich 1.49x featureBits -countGaps felCat3 refGene:cds netSyntenyHg18 -enrichment # refGene:cds 0.005%, netSyntenyHg18 53.767%, both 0.004%, cover 79.25%, enrich 1.47x cd blastz.mm8.2006-11-15.swap/axtChain netFilter -syn felCat3.mm8.net.gz | hgLoadNet felCat3 netSyntenyMm8 stdin featureBits -countGaps felCat3 refGene:cds netMm8 -enrichment # refGene:cds 0.005%, netMm8 36.836%, both 0.005%, cover 99.33%, enrich 2.70x featureBits -countGaps felCat3 refGene:cds netSyntenyMm8 -enrichment # refGene:cds 0.005%, netSyntenyMm8 23.421%, both 0.003%, cover 56.22%, enrich 2.40x cd blastz.canFam2.2006-11-18.swap netFilter -syn felCat3.canFam2.net.gz | hgLoadNet felCat3 netSyntenyCanFam2 stdin featureBits -countGaps felCat3 refGene:cds netCanFam2 -enrichment # refGene:cds 0.005%, netCanFam2 78.072%, both 0.005%, cover 99.82%, enrich 1.28x featureBits -countGaps felCat3 refGene:cds netSyntenyCanFam2 -enrichment # refGene:cds 0.005%, netSyntenyCanFam2 67.261%, both 0.005%, cover 88.77%, enrich 1.32x ########################################################################## # RECIPROCAL BEST NETS (2007-03-19 kate) ssh kkstore05 cd /cluster/data/felCat3 cd bed/blastz.hg18 ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 hg18 >&! rbest.log & ssh kkstore05 cd /cluster/data/felCat3/bed cd blastz.mm8.2006-11-15.swap ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 mm8 \ -buildDir=`pwd` >&! rbest.log & ssh kkstore05 cd /cluster/data/felCat3/bed cd blastz.canFam2.2006-11-18.swap ~/kent/src/hg/utils/automation/doRecipBest.pl felCat3 canFam2 \ -buildDir=`pwd` >&! rbest.log & # downloadables ssh hgwdev ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap/axtRBestNet/felCat3.canFam2.rbest.axt.gz \ /usr/local/apache/htdocs/goldenPath/felCat3/vsCanFam2/felCat3.canFam2.rbest.axt.gz ln -s /cluster/data/felCat3/bed/blastz.hg18/axtRBestNet/felCat3.hg18.rbest.axt.gz \ /usr/local/apache/htdocs/goldenPath/felCat3/vsHg18/felCat3.hg18.rbest.axt.gz ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap/axtRBestNet/felCat3.mm8.rbest.axt.gz \ /usr/local/apache/htdocs/goldenPath/felCat3/vsMm8/felCat3.mm8.rbest.axt.gz ########################################################################## # MULTIZ4WAY (Heather, DONE, May 2007) # split; write to /san (will be using pk) # create 1024 files 000.maf through 1023.maf # Files vary from almost 2000 - over 13000 lines each # A little over 6,000,000 lines total ssh kkstore05 cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-18.swap/mafRBestNet zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/canFam2/split stdin -byTarget -useHashedName=10 cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-10.swap/mafRBestNet zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/hg18/split stdin -byTarget -useHashedName=10 cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-15.swap/mafRBestNet zcat *gz | mafSplit dummyArg /san/sanvol1/scratch/felCat3/mafNet/mm8/split stdin -byTarget -useHashedName=10 # setup multiz4way # output will be to /cluster/data/felCat3/bed/multiz4way/maf cd /cluster/data/felCat3/bed/multiz4way mkdir maf mkdir run # tree_doctor /cluster/bin/phast.new/tree_doctor --prune-all-but=felCat3,canFam2,mm8,hg18 \ /cluster/data/hg18/bed/multiz28way/cons/28way.mod | awk '$1 == "TREE:" {print $2;}' > 4way.nh # use the following for species.lst: # felCat3 canFam2 mm8 hg18 # use the following for tree.nh: # ((felCat3 canFam2) (mm8 hg18)) # use the following for tree-species.nh: # ((cat, dog), (mouse, human)) # create tree image for details page using phyloGif # The treeImage setting in trackDb.ra is # phylo/felCat3_4way.gif (relative to htdocs/images). /data/apache/cgi-bin/phyloGif -phyloGif_tree=tree-species.nh -phyloGif_width=260 \ -phyloGif_height=260 > felCat3_4way.gif # If you haven't already, check out the browser CVS tree in your ~/: # (cd; cvs co -d hgwdev:/projects/hg/cvsroot browser) cp felCat3_4way.gif ~browser/images/phylo/ cd ~/browser/images/phylo cvs add felCat3_4way.gif cvs commit -m "new image" felCat3_4way.gif cd ../.. cvs update -d make alpha cd run # create split.lst perl -we 'for ($i=0; $i<1024; $i++) { printf "split%03d\n", $i; }' > split.lst # spec file cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/felCat3/bed/multiz4way/maf/$(dir1)/$(root1).maf} #ENDLOOP 'EOF' # << emacs # autoMultiz.csh cat << 'EOF' > autoMultiz.csh #!/bin/csh -ef set db = felCat3 set c = $1 set mafOut = $2 set run = `pwd` set pairs = /san/sanvol1/scratch/$db/mafNet # set up temp dir set tmp = /scratch/tmp/$db/multiz.$c rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) if ($s == $db) then continue endif set clusterMaf = $pairs/$s/$c.maf set localMaf = $db.$s.sing.maf if (-e $clusterMaf) then cp $clusterMaf $localMaf else echo "##maf version=1 scoring=autoMZ" > $localMaf endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $mafOut rm -fr $tmp 'EOF' # << emacs # stash binaries mkdir penn cd penn cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz . cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project . cp /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ . # cluster run ssh pk cd /cluster/data/felCat3/bed/multiz4way/run gensub2 split.lst single spec jobList para create jobList para try para check para push para time Completed: 1024 of 1024 jobs # CPU time in finished jobs: 740s 12.33m 0.21h 0.01d 0.000 y # IO & Wait Time: 8688s 144.80m 2.41h 0.10d 0.000 y # Average job time: 9s 0.15m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 51s 0.85m 0.01h 0.00d # Submission to last job: 1616s 26.93m 0.45h 0.02d ########################################################################## # ANNOTATE MULTIZ4WAY MAF AND LOAD TABLES (Done, May 2007, Heather) # create felCat3.Nbed (already have it for hg18, mm8, canFam2) ssh kolossus cd /cluster/data/felCat3 twoBitInfo -nBed felCat3.2bit felCat3.N.bed # directory structure and nBeds file (input to mafAddIRows) ssh kkstore05 mkdir /cluster/data/felCat3/bed/multiz4way/anno cd /cluster/data/felCat3/bed/multiz4way/anno mkdir maf run cd run # setup.sh cat << 'EOF' > setup.sh #!/bin/sh for DB in canFam2 hg18 mm8 do ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds done 'EOF' # << emacs # setup for mafAddIRows # makeIrows.sh cat << 'EOF' > makeIrows.sh #!/bin/sh rm -f getIrows.sh twobit=/cluster/data/felCat3/felCat3.2bit outdir=../maf # do smaller jobs first for file in `ls -1rS ../../maf/*.maf` do echo "echo $file" >> getIrows.sh base=`basename $file` echo nice mafAddIRows -nBeds=nBeds $file $twobit $outdir/$base >> getIrows.sh done chmod +x getIrows.sh 'EOF' # << emacs # run mafAddIRows # takes 30 minutes or so depending on load on kolossus # could have used kki (see ornAna1.txt) ssh kolossus cd /cluster/data/felCat3/bed/multiz4way/anno/run ./getIrows.sh # concat and filter ssh kkstore05 cd /cluster/data/felCat3/bed/multiz4way/anno cat maf/*.maf >> felCat3.maf # rollup file is 29,932,626 lines mafFilter -overlap -reject=rf felCat3.maf > felCat3.mf.maf # rejected minRow: 2279 # total rejected: 2279 blocks # mafFilter rejected 2279 single-row (felCat3 only) blocks due to # its default minRow of 2. That's reasonable, so replace the original # with the filtered version. mv felCat3.maf felCat3.preFilter.maf mv felCat3.mf.maf felCat3.maf # TO DO when kkstore05 isn't so busy: # gzip -c felCat3.preFilter.maf > felCat3.preFilter.maf.gz # load ssh hgwdev mkdir -p /gbdb/felCat3/multiz4way/anno ln -s /cluster/data/felCat3/bed/multiz4way/anno/felCat3.maf /gbdb/felCat3/multiz4way/anno/ time nice hgLoadMaf -pathPrefix=/gbdb/felCat3/multiz4way/anno felCat3 multiz4way # Loaded 2344109 mafs in 1 files from /gbdb/felCat3/multiz4way/anno # 60.419u 16.819s 1:48.05 71.4% 0+0k 0+0io 2pf+0w # summary ssh kolossus cd /cluster/data/felCat3/bed/multiz4way/anno time nice hgLoadMafSummary felCat3 -minSize=30000 -mergeGap=1500 -maxSize=200000 -test \ multiz4waySummary felCat3.maf # Created 323552 summary blocks from 1510110 components and 2344109 mafs from felCat3.maf # 107.717u 21.889s 3:08.00 68.9% 0+0k 0+0io 3pf+0w # Creates multiz4waySummary.tab # load ssh hgwdev sed -e 's/mafSummary/multiz4waySummary/' ~/kent/src/hg/lib/mafSummary.sql > /tmp/multiz4waySummary.sql cd /cluster/data/felCat3/bed/multiz4way/anno time nice hgLoadSqlTab felCat3 multiz4waySummary /tmp/multiz4waySummary.sql multiz4waySummary.tab # 0.000u 0.003s 0:04.18 0.0% 0+0k 0+0io 3pf+0w # summary is only useful for scaffolds larger than 1 million, which is just 7 in felCat3 # Angie subsequently changed hgLoadMafSummary so by default it will skip small sequences mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_217569"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_213970"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_138821"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_149667"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_217324"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_132063"; mysql> insert into multiz4waySummaryNew select * from multiz4waySummary where chrom = "scaffold_194753"; mysql> rename table multiz4waySummary to multiz4waySummaryAll; mysql> rename table multiz4waySummaryNew to multiz4waySummary; ######################################################################### # MULTIZ4WAY DOWNLOADABLES (Done May 2007 Heather) ssh kolossus mkdir /cluster/data/felCat3/bed/multiz4way/mafDownloads # Tried for upstream data with nscanGene, didn't get anything # ssh hgwdev # cd /cluster/data/felCat3/bed/multiz4way/mafDownloads # nice featureBits felCat3 nscanGene:upstream:1000 -fa=/dev/null -bed=up.bad # nice featureBits felCat3 nscanGene:upstream:2000 -fa=/dev/null -bed=up.bad # nice featureBits felCat3 nscanGene:upstream:5000 -fa=/dev/null -bed=up.bad # Make a gzipped version of the monolithic annotated maf file: cd /cluster/data/felCat3/bed/multiz4way gzip -c anno/felCat3.maf > mafDownloads/felCat3.maf.gz cd mafDownloads md5sum felCat3.maf.gz > md5sum.txt ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/felCat3/multiz4way mkdir $dir ln -s /cluster/data/felCat3/bed/multiz4way/mafDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/ornAna1/multiz6way/README.txt $dir # edit README.txt # create /usr/local/apache/htdocs/goldenPath/hg18/multiz4way/4way.nh ######################################################################### # MULTIZ4WAY MAF FRAMES (Done May 2007 Heather) # using nscanGene for felCat3 # using knownGene for mm8 hg18 # using ensGene for canFam2 # genePreds (name specific fields to get around bin, knownGene extras) ssh hgwdev mkdir /cluster/data/felCat3/bed/multiz4way/frames cd /cluster/data/felCat3/bed/multiz4way/frames mkdir genes cd genes cat << '_EOF_' > getGenes.csh #!/bin/csh -ef foreach queryDb (felCat3 mm8 hg18 canFam2) if ($queryDb == "felCat3") then set geneTbl = nscanGene else if ($queryDb == "canFam2") then set geneTbl = ensGene else set geneTbl = knownGene endif hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz $queryDb.gp.gz end '_EOF_' # << this line makes emacs coloring happy ./getGenes.csh # create frames ssh kolossus cd /cluster/data/felCat3/bed/multiz4way/frames cat << '_EOF_' > getFrames.csh #!/bin/csh -ef set multizDir = /cluster/data/felCat3/bed/multiz4way set mafDir = $multizDir/mafDownloads set geneDir = $multizDir/frames/genes cd $multizDir/frames genePredToMafFrames felCat3 $mafDir/felCat3.maf.gz felCat3.mafFrames \ felCat3 genes/felCat3.gp.gz \ hg18 genes/hg18.gp.gz \ mm8 genes/mm8.gp.gz \ canFam2 genes/canFam2.gp.gz gzip felCat3.mafFrames '_EOF_' # << this line makes emacs coloring happy ./getFrames.csh # load the database ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way/frames hgLoadMafFrames felCat3 multiz4wayFrames felCat3.mafFrames.gz ########################################################################## # PHASTCONS (DONE, June 2007, Heather) # Using Angie's process from ornAna1.txt ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way mkdir phastCons cd phastCons # tree_doctor # Does the order matter in the prune-all-but argument? /cluster/bin/phast.new/tree_doctor --prune-all-but=felCat3,canFam2,mm8,hg18 \ /cluster/data/hg18/bed/multiz28way/cons/28way.mod > 4way.mod # msa_split # msa_split requires its MAF and FA inputs to contain one sequence each # input is from /cluster/data/felCat3/bed/multiz4way/maf/split*.maf # output is to /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss ssh kkstore05 cd /cluster/data/felCat3/bed/multiz4way/phastCons nice ./doSplitOnFileserver.csh > & split.log & # this took 5.5 hours to create directories split000 - split999 # not bothering to restructure as a cluster run (Angie suggested this in ornAna1.txt) # 50k WARNINGs in split.log # make a list of all output (sort each split dir by size) ./makeInput.sh # check a single chunk ssh kolossus cd /cluster/data/felCat3/bed/multiz4way/phastCons /cluster/bin/phast.new/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss/split495/scaffold_216010.1-806250.ss \ -o phyloFit.tree # get value of rho ssh kolossus mkdir run.cons cd run.cons /cluster/bin/phast.new/phastCons --estimate-rho /tmp/estimatedRho.mod \ --target-coverage 0.005 --expected-length 12 --no-post-probs \ /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ss/split495/scaffold_216010.1-806250.ss ../4way.mod # rho = 0.226082 # template goes here (uses rho) # run all ssh pk cd /cluster/data/felCat3/bed/multiz4way/phastCons/run.cons mv ../in.list . gensub2 in.list single template jobList para create jobList para try para check para push para time # Completed: 103985 of 103985 jobs # CPU time in finished jobs: 11039s 183.98m 3.07h 0.13d 0.000 y # IO & Wait Time: 1170173s 19502.89m 325.05h 13.54d 0.037 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 177s 2.95m 0.05h 0.00d # Submission to last job: 7596s 126.60m 2.11h 0.09d # create Most Conserved track # Takes about 10 minutes ssh kolossus cd /san/sanvol1/scratch/felCat3/multiz4way/phastCons find ./bed -name "*.bed" \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k7,7 -k9,9n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements4way.bed cp phastConsElements4way.bed /cluster/data/felCat3/bed/multiz4way/phastCons # estimate coverage # ornAna1 targets: 5% overall coverage, 70% CDS coverage # xenTro1 target: 4% overall coverage ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way/phastCons # attempted genome-wide, ran over an hour without completing # pick scaffolds with NSCAN annotations: featureBits -chrom=scaffold_0 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 10.166%, phastConsElements4way.bed 0.358%, both 0.358%, cover 3.53%, enrich 9.84x featureBits -chrom=scaffold_1000 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 5.882%, phastConsElements4way.bed 2.677%, both 2.286%, cover 38.87%, enrich 14.52x featureBits -chrom=scaffold_100011 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 3.830%, phastConsElements4way.bed 2.286%, both 1.993%, cover 52.04%, enrich 22.76x featureBits -chrom=scaffold_100063 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 2.668%, phastConsElements4way.bed 1.487%, both 0.701%, cover 26.29%, enrich 17.69x featureBits -chrom=scaffold_100077 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 2.671%, phastConsElements4way.bed 1.784%, both 0.968%, cover 36.25%, enrich 20.32x featureBits -chrom=scaffold_100083 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 3.633%, phastConsElements4way.bed 6.358%, both 3.633%, cover 100.00%, enrich 15.73x featureBits -chrom=scaffold_100088 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 1.384%, phastConsElements4way.bed 2.254%, both 0.625%, cover 45.15%, enrich 20.03x # the largest scaffolds: featureBits -chrom=scaffold_217569 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 0.877%, phastConsElements4way.bed 3.784%, both 0.607%, cover 69.21%, enrich 18.29x featureBits -chrom=scaffold_213970 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 0.000%, phastConsElements4way.bed 1.531%, both 0.000%, cover nan%, enrich nanx featureBits -chrom=scaffold_138821 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 6.129%, phastConsElements4way.bed 5.286%, both 3.572%, cover 58.28%, enrich 11.02x featureBits -chrom=scaffold_149667 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 0.658%, phastConsElements4way.bed 0.900%, both 0.344%, cover 52.30%, enrich 58.14x featureBits -chrom=scaffold_217324 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 0.746%, phastConsElements4way.bed 3.041%, both 0.531%, cover 71.21%, enrich 23.42x featureBits -chrom=scaffold_132063 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 1.903%, phastConsElements4way.bed 2.062%, both 1.142%, cover 60.03%, enrich 29.11x featureBits -chrom=scaffold_194753 felCat3 -enrichment nscanGene:cds phastConsElements4way.bed # nscanGene:cds 0.843%, phastConsElements4way.bed 2.334%, both 0.230%, cover 27.28%, enrich 11.69x # not as good as ornAna1, but hopefully good enough # load hgLoadBed felCat3 phastConsElements4way phastConsElements4way.bed # create merged posterior probability file and wiggle track data files ssh kolossus cd /san/sanvol1/scratch/felCat3/multiz4way/phastCons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode time find ./pp -name "*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode -noOverlap stdin phastCons4way.wig phastCons4way.wib cp -p phastCons4way.wi? /cluster/data/felCat3/bed/multiz4way/phastCons # 0.166u 1.856s 2:10.13 1.5% 0+0k 0+0io 0pf+0w cp -p phastCons4way.wi? /cluster/data/felCat3/bed/multiz4way/phastCons # load gbdb and database with wiggle ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way/phastCons ln -s /cluster/data/felCat3/bed/multiz4way/phastCons/phastCons4way.wib /gbdb/felCat3/multiz4way hgLoadWiggle -pathPrefix=/gbdb/felCat3/multiz4way felCat3 phastCons4way phastCons4way.wig # tally up tree distances ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' > 4way.phyloFit.nh /cluster/bin/phast.new/all_dists 4way.nh > 4way.distances.txt grep felCat3 4way.distances.txt | sort -k3,3n | awk '{printf ("%.4f\t%s\n", $3, $1)}' > distances.txt cat distances.txt # 0.1993 canFam2 # 0.3320 hg18 # 0.5306 mm8 # downloadables ssh kkstore05 cd /cluster/data/felCat3/bed/multiz4way mkdir phastConsDownloads cd phastConsDownloads ./make.csh nice gzip felCat3.pp md5sum felCat3.pp.gz > md5sum.txt ssh hgwdev cd /cluster/data/felCat3/bed/multiz4way/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/felCat3/phastCons4way mkdir $dir ln -s /cluster/data/felCat3/bed/multiz4way/phastConsDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/ornAna1/phastCons6way/README.txt $dir # edit README.txt cp /cluster/data/felCat3/bed/multiz4way/phastCons/4way.mod /usr/local/apache/htdocs/goldenPath/felCat3/phastCons4way # clean up ssh kkstore05 rm /cluster/data/felCat3/bed/multiz4way/phastCons/*.tab rm -r /san/sanvol1/scratch/felCat3/multiz4way/phastCons ############################################################################ ## BLASTZ swap from mm9 alignments (2007-11-10 - markd) ssh hgwdev mkdir /cluster/data/felCat3/bed/blastz.mm9.swap cd /cluster/data/felCat3/bed/blastz.mm9.swap ln -s blastz.mm9.swap ../blastz.mm9 /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/mm9/bed/blastz.felCat3/DEF >& swap.out& nice +19 featureBits felCat3 chainMm9Link # 486766552 bases of 1642698377 (29.632%) in intersection ############################################################################ # SGP GENES (DONE - 2007-12-20 - Hiram) ssh kkstore05 mkdir /cluster/data/felCat3/bed/sgp cd /cluster/data/felCat3/bed/sgp wget --timestamping \ "http://genome.imim.es/genepredictions/F.catus/golden_path_200603/SGP/200603mm8/scaffolds.gtf" \ -O felCat3.gtf ssh hgwdev cd /cluster/data/felCat3/bed/sgp ldHgGene -gtf -genePredExt felCat3 sgpGene felCat3.gtf # Read 51619 transcripts in 153058 lines in 1 files # 51619 groups 39049 seqs 1 sources 3 feature types # 51619 gene predictions featureBits felCat3 sgpGene # 18130668 bases of 1642698377 (1.104%) in intersection featureBits felCat3 -enrichment refGene:CDS sgpGene # refGene:CDS 0.014%, sgpGene 1.104%, both 0.006%, cover 41.92%, # enrich 37.98x ##################################################################### ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # felCat3 - Cat - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram) ssh kolossus cd /hive/data/genomes/felCat3 cat << '_EOF_' > felCat3.ensGene.ra # required db variable db felCat3 # do we need to translate geneScaffold coordinates geneScaffolds yes # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly skipInvalid yes # ignore the three genes that have invalid structures from Ensembl: # 2100: ENSFCAT00000006929 no exonFrame on CDS exon 16 # 14578: ENSFCAT00000010965 no exonFrame on CDS exon 1 # 26634: ENSFCAT00000009384 no exonFrame on CDS exon 0 '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 felCat3.ensGene.ra ssh hgwdev cd /hive/data/genomes/felCat3/bed/ensGene.51 featureBits felCat3 ensGene # 22158651 bases of 1642698377 (1.349%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/felCat3/bed/ensGene.51 ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # LIFTOVER TO FelCatV17e (DONE - 2010-03-29 - Chin ) mkdir /hive/data/genomes/felCat3/bed/blat.felCatV17e.2010-03-29 cd /hive/data/genomes/felCat3/bed/blat.felCatV17e.2010-03-29 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/felCat3/11.ooc \ felCat3 felCatV17e # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /scratch/data/felCat3/11.ooc \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ felCat3 felCatV17e > do.log 2>&1 & # real 36m16.693s ############################################################################ # LIFTOVER TO FelCat4 (DONE - 2010-08-20 - Chin ) mkdir /hive/data/genomes/felCat3/bed/blat.felCat4.2010-08-20 cd /hive/data/genomes/felCat3/bed/blat.felCat4.2010-08-20 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/felCat3/11.ooc \ felCat3 felCat4 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /scratch/data/felCat3/11.ooc \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ felCat3 felCat4 > do.log 2>&1 & # real 2093m9.263s # pushQ stuff cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCat3/liftOver/ md5sum felCat3ToFelCat4.over.chain.gz >> md5sum.txt ############################################################################