# for emacs: -*- mode: sh; -*- # Ornithorhynchus anatinus -- WUSTL version 5.0 (Dec. 2006) # http://www.ncbi.nlm.nih.gov/genome/110 # http://www.ncbi.nlm.nih.gov/bioproject/12885 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAPN00 ######################################################################### # DOWNLOAD SEQUENCE (DONE 12/15/06 angie - REDONE 2/5/07 angie) ssh kkstore05 mkdir /cluster/store12/ornAna1 ln -s /cluster/store12/ornAna1 /cluster/data/ornAna1 mkdir /cluster/data/ornAna1/downloads cd /cluster/data/ornAna1/downloads set baseUrl = http://genome.wustl.edu/pub/user/lhillier/private/platypus/ wget --timestamping $baseUrl/platypus.tar.gz wget --timestamping $baseUrl/sctgs_hit_by_ESTs.not_yet_submitted.gz gunzip sctgs_hit_by_ESTs.not_yet_submitted.gz tar xvzf platypus.tar.gz faSize *.fa #2016937414 bases (174725237 N's 1842212177 real 1842212177 upper 0 lower) in 20 sequences in 20 files #Total size: mean 100846870.7 sd 348736694.0 min 1399469 (chr17) max 1579857390 (chrUn) median 16302927 #N count: mean 8736261.8 sd 32604763.8 #U count: mean 92110608.8 sd 316138018.7 #L count: mean 0.0 sd 0.0 # Wow, chrUn is 1.5G even with 100bp contig gaps... that won't cause # the browser to die (though it did provoke an integer-overflow -> # infinite loop in faSize!) but it seems like it would be nicer to # have sequence for just the super- and ultra-contigs for this assembly. # OK, LaDeana made new chrUn files that contain ultracontigs... wget --timestamping $baseUrl/chrUn.agp.gz wget --timestamping $baseUrl/chrUn.fa.gz mv chrUn.fa chrUn.fa.orig mv chrUn.agp chrUn.agp.orig gunzip chrUn.fa.gz chrUn.agp.gz # checkAgpAndFa found a slight hiccup: chrUn.fa had an empty Ultra673 # which was not in any .agp file. Removed Ultra673 from chrUn.fa -- # LaDeana confirmed that's OK. 
# RELOADING 2/5/07 to pick up slight changes to chr5 and chrUn so we # will be completely in sync with GenBank cd /cluster/data/ornAna1 mv downloads downloads.bak mkdir downloads cd downloads wget --timestamping $baseUrl/platypus.070113.tar.gz tar xvzf platypus.070113.tar.gz faSize *.fa #Invalid fasta format: sequence size == 0 for element Ultra673 #1996794193 bases (154575690 N's 1842218503 real 1842218503 upper 0 lower) in 201523 sequences in 20 files #Total size: mean 9908.5 sd 331992.5 min 0 (Ultra673) max 59581953 (chr3) median 1069 #N count: mean 767.0 sd 20573.1 #U count: mean 9141.5 sd 311758.8 #L count: mean 0.0 sd 0.0 # I edited out the empty Ultra673 (LaDeana OK'd). ######################################################################### # MAKE GENOME DB (DONE 12/19/06 angie - REDONE 2/5/07 angie) ssh kkstore05 cd /cluster/data/ornAna1 cat > ornAna1.config.ra <<'_EOF_' ... '_EOF_' # [NOTE(review): the ornAna1.config.ra heredoc contents were lost in # transcription of this doc; recover them from the original # makeOrnAna1.doc in the kent source tree] makeGenomeDb.pl ornAna1.config.ra >& makeGenomeDb.log & tail -f makeGenomeDb.log # Followed instructions printed out at the end of the script for # editing the template description files and checked them in. ######################################################################### # RELOAD GOLD/GAP, REBUILD DOWNLOAD AFTER AGP TWEAK (DONE 1/8/07 angie - OBSOLETED BY TOTAL REBUILD) # LaDeana sent a manual tweak to chr5.agp: change "p" to "-" for # this one entry: # chr5 647409 657392 203 W Contig2532.3 1 9984 p # So reload the agp, re-compress for download. 
ssh kkstore05 cd /cluster/data/ornAna1 # Make the necessary edits: $EDITOR downloads/chr5.agp $EDITOR ornAna1.agp # Reload the table ssh -x hgwdev hgGoldGapGl -noGl ornAna1 /cluster/data/ornAna1/ornAna1.agp # Regenerate the download file and update md5sum.txt: cd /cluster/data/ornAna1/goldenPath/bigZips gzip -c /cluster/data/ornAna1/ornAna1.agp > ornAna1.agp.gz md5sum ornAna1.agp.gz #e8539f496b2343c5e802b2362e61c9a3 ornAna1.agp.gz $EDITOR md5sum.txt ######################################################################### # REPEATMASKER (DONE 12/20/06 angie - REDONE 2/5/07) # REPLACED 7/20/07 by beta lib re-run -- see REPEATMASKER BETA TEST below. ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: doRepeatMasker.pl ornAna1 -verbose 3 -debug # Run it for real and tail the log: doRepeatMasker.pl ornAna1 -verbose 3 \ >& /cluster/data/ornAna1/bed/RepeatMasker.2007-02-05/do.log & tail -f /cluster/data/ornAna1/bed/RepeatMasker.2007-02-05/do.log # RepeatMasker and lib version from do.log: #grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker # October 6 2006 (open-3-1-6) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl #CC RELEASE 20061006; * # Coverage: featureBits ornAna1 rmsk #633511004 bases of 1842236818 (34.388%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (DONE 12/20/06 angie - REDONE 2/6/07) # (dropped chromEnd index 07-10-07 kuhn) # The first time around I just piped twoBitToFa|trfBig but due to the # large number of scaffolds, that took 12.5 hours. This time, make it # a small cluster job. 
ssh kki mkdir /cluster/data/ornAna1/bed/simpleRepeat cd /cluster/data/ornAna1/bed/simpleRepeat # no splitting -- choose a chunk size equal to the largest sequence: set chunkSize = `awk '{print $2;}' ../../chrom.sizes | sort -nr | head -1` ~/kent/src/hg/utils/automation/simplePartition.pl ../../ornAna1.unmasked.2bit $chunkSize chunks cat > gsub <<'_EOF_' #LOOP ./doTrfBig.csh {check in exists $(path1)} {check out exists $(path1).bed} #ENDLOOP '_EOF_' # << emacs cat > doTrfBig.csh <<'_EOF_' #!/bin/csh -ef sed -e 's/^.*://' $1 \ | twoBitToFa ../../ornAna1.unmasked.2bit -seqList=stdin stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=$2 -tempDir=/tmp '_EOF_' # << emacs chmod a+x doTrfBig.csh ls -1S chunks/*/*.lst > chunks.lst gensub2 chunks.lst single gsub jobList para make jobList para time #Completed: 36 of 36 jobs #CPU time in finished jobs: 13498s 224.96m 3.75h 0.16d 0.000 y #IO & Wait Time: 749s 12.49m 0.21h 0.01d 0.000 y #Average job time: 396s 6.60m 0.11h 0.00d #Longest finished job: 1017s 16.95m 0.28h 0.01d #Submission to last job: 1017s 16.95m 0.28h 0.01d cat chunks/*/*.bed > simpleRepeat.bed # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed ornAna1 simpleRepeat \ /cluster/data/ornAna1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits ornAna1 simpleRepeat #38774128 bases of 1842236818 (2.105%) in intersection ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 12/20/06 angie - REDONE 2/6/07 - REDONE 7/24/07) # new assembly in Feb, new RM lib in July ssh kolossus cd /cluster/data/ornAna1 time twoBitMask ornAna1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ornAna1.2bit # This warning is OK -- the extra fields are not block coordinates. 
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks). #5.490u 3.275s 0:18.91 46.3% 0+0k 0+0io 4pf+0w # Link to it from /gbdb: ssh hgwdev ln -s /cluster/data/ornAna1/ornAna1.2bit /gbdb/ornAna1/ornAna1.2bit ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 12/21/06 angie - REDONE 2/6/07 - REDONE 7/24/07) # new assembly in Feb, new RM lib in July cd /cluster/data/ornAna1 # If rerunning, save the edited README.txt files aside... mkdir tmpReadmes foreach d (bigZips database) mkdir -p tmpReadmes/$d cp -p goldenPath/$d/README.txt tmpReadmes/$d/README.txt end rm -f ornAna1.fa.out ln -s bed/RepeatMasker.2007-07-17/ornAna1.fa.out . makeDownloads.pl ornAna1 -verbose=2 \ >& jkStuff/downloads.log & tail -f jkStuff/downloads.log # Edit README.txt files when done: # If rerunning, use info in tmpReadmes/ files but update where appropriate. # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/ornAna1/goldenPath/database/README.txt # /cluster/data/ornAna1/goldenPath/bigZips/README.txt rm -rf tmpReadmes ######################################################################### # WINDOWMASKER FOR COMPARISON (IN PROGRESS 1/11/07 angie) cd /cluster/data/ornAna1 ~/kent/src/hg/utils/automation/doWindowMasker.pl -debug ornAna1 ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \ >& bed/WindowMasker.2006-12-22/do.log & tail -f bed/WindowMasker.2006-12-22/do.log # Needs -workhorse kolossus so it can connect to the NCBI server... 
# ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \ ~/kent/src/hg/utils/automation/doWindowMasker.pl ornAna1 \ -buildDir /cluster/data/ornAna1/bed/WindowMasker.2006-12-22 \ -workhorse kolossus -continue mask \ >& bed/WindowMasker.2006-12-22/do.log & tail -f bed/WindowMasker.2006-12-22/do.log cd bed/WindowMasker.2006-12-22 # Compare coverage to rmsk which was 633510998 / 1842236818 = 34.388% zcat windowmasker.bed.gz | awk '{print $3 - $2}' | total #844961729 # 844961729 / 1842236818 = 45.866% zcat windowmasker.sdust.bed.gz | awk '{print $3 - $2}' | total #1011460283 # 1011460283 / 1842236818 = 54.904% # The coverage is higher, but do we really want to mask 55% of the # assembly before alignment?? #TODO - evaluate!! ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 12/22/06 angie - REDONE 2/6/07) # (probably frivolous to redo because the assembly changes were so small, # but redo anyway so it's reproduceable later.) # Use -repMatch=608 # To determine -repMatch for new species, first scale 1024 by size # relative to human (e.g. divide ornAna1 / hg18 gapless bases from # featureBits). That gives ~607, rounding up to 608. ssh kolossus blat /cluster/data/ornAna1/ornAna1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/ornAna1/11.ooc -repMatch=608 #Wrote 35954 overused 11-mers to /cluster/bluearc/ornAna1/11.ooc ######################################################################### # GENBANK AUTO UPDATE (DONE 1/5/07 angie - REDONE 2/6/07) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add ornAna1 just after dm2... # ornAna1 (Platypus -- O. 
anatinus) ornAna1.serverGenome = /cluster/data/ornAna1/ornAna1.2bit ornAna1.clusterGenome = /iscratch/i/ornAna1/ornAna1.2bit ornAna1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} ornAna1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} ornAna1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} ornAna1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} ornAna1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} ornAna1.ooc = /cluster/bluearc/ornAna1/11.ooc ornAna1.lift = no ornAna1.genbank.mrna.xeno.load = yes ornAna1.downloadDir = ornAna1 ornAna1.perChromTables = no cvs ci -m "Added ornAna1." etc/genbank.conf # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. cvs ci -m "Added O. anatinus (platypus)." src/lib/gbGenome.c make install-server ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial ornAna1 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad ornAna1 & # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add ornAna1 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added ornAna1." 
etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### # SWAP HG18 CHAIN/NET (DONE 4/9/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/blastz.hg18.swap cd /cluster/data/ornAna1/bed/blastz.hg18.swap doBlastzChainNet.pl -swap \ /cluster/data/hg18/bed/blastz.ornAna1.2007-02-21/DEF >& do.log & tail -f do.log ln -s blastz.hg18.swap /cluster/data/ornAna1/bed/blastz.hg18 ######################################################################### # SWAP MM8 CHAIN/NET (DONE 3/2/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/blastz.mm8.swap cd /cluster/data/ornAna1/bed/blastz.mm8.swap doBlastzChainNet.pl -swap \ /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27/DEF >& do.log & tail -f do.log ln -s blastz.mm8.swap /cluster/data/ornAna1/bed/blastz.mm8 ######################################################################### # SWAP GALGAL3 CHAIN/NET (DONE 4/9/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/blastz.galGal3.swap cd /cluster/data/ornAna1/bed/blastz.galGal3.swap doBlastzChainNet.pl -swap \ /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06/DEF >& do.log & tail -f do.log ln -s blastz.galGal3.swap /cluster/data/ornAna1/bed/blastz.galGal3 ######################################################################### # SWAP MONDOM4 CHAIN/NET (DONE 4/3/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/blastz.monDom4.swap cd /cluster/data/ornAna1/bed/blastz.monDom4.swap doBlastzChainNet.pl -swap -workhorse=kkr3u00 \ /cluster/data/monDom4/bed/blastz.ornAna1/DEF >& do.log & tail -f do.log ln -s blastz.monDom4.swap /cluster/data/ornAna1/bed/blastz.monDom4 ####################################################################### # BLASTZ LIZARD ANOCAR1 (DONE 4/18/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03 cd /cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03 cat << '_EOF_' > DEF # lizard vs. platypus # Use "human-fish" params. 
# If that causes us to get swamped, we can increase L. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Platypus ornAna1 SEQ1_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit SEQ1_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=100 SEQ1_LAP=10000 # QUERY: Lizard anoCar1 SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=50000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/ornAna1/bed/blastz.anoCar1.2007-04-03 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ -bigClusterHub pk -smallClusterHub pk \ >& do.log & tail -f do.log ln -s blastz.anoCar1.2007-04-03 /cluster/data/ornAna1/bed/blastz.anoCar1 ######################################################################### # UPDATE DEFAULT POSITION (DONE 4/30/07 angie) # This is a fairly well-conserved region of chrX5 with homology to # chicken chrZ and an intronEst (platypus GenBank seqs are rare!)... # also has an aligned human protein. ssh hgwdev hgsql hgcentraltest -e \ 'update dbDb set defaultPos = "chrX5:26250001-26320000" where \ name = "ornAna1";' ######################################################################### # REPEATMASKER BETA TEST (DONE 4/19/07 angie -- REDONE 7/20/07) # 7/17/07: Trying out development lib version. 
ssh kkstore05 # Run -debug to create the dir structure and preview the scripts: ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 -debug # Run it for real and tail the log: cd /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17 ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 \ -stop mask \ >& do.log & tail -f do.log # RepeatMasker and lib version from do.log: #grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker.20070717-dev/RepeatMasker # May 17 2007 (open-3-1-8) version of RepeatMasker #grep RELEASE /cluster/bluearc/RepeatMasker.20070717-dev/Libraries/RepeatMaskerLib.embl #CC RELEASE 20070717; * # Now load the new files as table rmskBeta for comparison: ssh hgwdev cd /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17 hgLoadOut -table=rmskBeta -nosplit ornAna1 ornAna1.fa.out # Coverage: featureBits ornAna1 rmskBeta #882976774 bases of 1842236818 (47.930%) in intersection # wow! big increase from the initial rmsk: #633511004 bases of 1842236818 (34.388%) in intersection hgLoadBed ornAna1 nestedRepeats ornAna1.nestedRepeats.bed \ -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql # Robert Hubley and Arian looked at it & approved for release. # Continue the doRepeatMasker run so that the rmsk table is reloaded. ~/kent/src/hg/utils/automation/doRepeatMasker.pl ornAna1 -verbose 3 \ -continue load \ -buildDir /cluster/data/ornAna1/bed/RepeatMasker.2007-07-17 \ >& do.log & tail -f do.log hgsql ornAna1 -e 'drop table rmskBeta' ######################################################################### # ENSEMBL GENES (DONE 4/26/07 angie) ssh hgwdev mkdir /cluster/data/ornAna1/bed/ensembl cd /cluster/data/ornAna1/bed/ensembl # Go to www.ensembl.org and click around their evolving interface # to get the following types of data: # 1. a tab-separated file that relates Ensembl gene, transcript and # protein IDs. Save as ensGtp.txt.gz # 2. 
peptide fasta with only transcript IDs in header -> ensPep.fa.gz # They have started dumping GTF files so we can download that directly: wget 'ftp://ftp.ensembl.org/pub/current_ornithorhynchus_anatinus/data/gtf/*' # Add "chr" to chrom names in the gene data gtf file to make # it compatible with our software. gunzip -c Ornithorhynchus_anatinus.OANA5.44.gtf.gz \ | perl -wpe 's/^([\dX]+)/chr$1/; s/^MT/chrM/;' \ > ensGene.gtf ldHgGene -gtf -genePredExt ornAna1 ensGene \ /cluster/data/ornAna1/bed/ensembl/ensGene.gtf nice featureBits ornAna1 ensGene #22058521 bases of 1842236818 (1.197%) in intersection # strip header line: gunzip -c ensGtp.txt.gz | tail +2 > ensGtp.txt hgLoadSqlTab ornAna1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt gunzip ensPep.fa.gz hgPepPred ornAna1 generic ensPep ensPep.fa ######################################################################### # MULTIZ6WAY (DONE 4/25/07 angie) # ((ornAna1 (monDom4 (hg18 mm8))) (galGal3 anoCar1)) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24 cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24 # Prune the hg18 28way tree to just these 6: /cluster/bin/phast/tree_doctor \ --prune-all-but=ornAna1,monDom4,hg18,mm8,galGal3,anoCar1 \ /cluster/data/hg18/bed/multiz28way.2007-03-20/cons/28way.mod \ | awk '$1 == "TREE:" {print $2;}' \ > 6way.nh # *carefully* edit 6way.nh to put ornAna1 first. # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 6way.nh \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst # Largest ornAna1 seq size is ~60M, so no need to split sequences (whew): awk '{print $2;}' ../../chrom.sizes | sort -nr | head -1 #59581953 wc -l ../../chrom.sizes #201523 ../../chrom.sizes # Split MAFs by sequence onto cluster-friendly server. 
# Since there are 200k scaffolds, and each other species may have # alignments to a different subset, use the new -useHashedName both to # reduce the number of split files and to ensure consistent split # file names across all other species. mkdir /san/sanvol1/scratch/ornAna1/mafNet foreach s (monDom4 hg18 mm8 galGal3 anoCar1) echo $s time nice mafSplit -byTarget -useHashedName=10 \ dummyArg /san/sanvol1/scratch/ornAna1/mafNet/$s/split \ /cluster/data/ornAna1/bed/blastz.$s/mafNet/* end ssh pk cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = ornAna1 set c = $1 set mafOut = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) if ($s == $db) then continue endif set clusterMaf = $pairs/$s/$c.maf set localMaf = $db.$s.sing.maf if (-e $clusterMaf) then cp $clusterMaf $localMaf else echo "##maf version=1 scoring=autoMZ" > $localMaf endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $mafOut rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs # List scaffolds in the dir structure created by mafSplit above: perl -we 'for ($i=0; $i<1024; $i++) { printf "split%03d\n", $i; }' \ > split.lst gensub2 split.lst single spec jobList para make jobList para time #Completed: 1024 of 1024 jobs #CPU time in finished jobs: 10728s 178.80m 2.98h 0.12d 0.000 y #IO & Wait Time: 3278s 54.63m 0.91h 0.04d 
0.000 y #Average job time: 14s 0.23m 0.00h 0.00d #Longest finished job: 330s 5.50m 0.09h 0.00d #Submission to last job: 479s 7.98m 0.13h 0.01d ######################################################################### # ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE 4/25/2007 angie) ssh kolossus mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/ornAna1/bed/multiz6way.2007-04-24/species.lst`) if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds end ssh kki cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno/run cp /dev/null jobList foreach f (/cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/*) set maf = $f:t echo mafAddIRows -nBeds=nBeds $f \ /iscratch/i/ornAna1/ornAna1.2bit ../maf/$maf >> jobList end para make jobList para time #Completed: 1024 of 1024 jobs #CPU time in finished jobs: 2571s 42.85m 0.71h 0.03d 0.000 y #IO & Wait Time: 2745s 45.75m 0.76h 0.03d 0.000 y #Average job time: 5s 0.09m 0.00h 0.00d #Longest finished job: 42s 0.70m 0.01h 0.00d #Submission to last job: 447s 7.45m 0.12h 0.01d # Consolidate multi-level maf to monolithic file # No need to sort chunks by position because the maf chunk size is greater # than the largest scaffold size. That may not be true in other # assemblies. ssh kkstore05 cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno cat maf/*.maf >> ornAna1.maf mafFilter -overlap -reject=rf ornAna1.maf > ornAna1.mf.maf #rejected minRow: 66 #total rejected: 66 blocks # mafFilter rejected 66 single-row (ornAna1 only) blocks due to # its default minRow of 2. That's reasonable, so replace the original # with the filtered version (and load the db tables w/filtered). 
gzip -c ornAna1.maf > ornAna1.preFilter.maf.gz mv ornAna1.mf.maf ornAna1.maf # Load annotated maf ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno mkdir -p /gbdb/ornAna1/multiz6way/anno ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno/ornAna1.maf \ /gbdb/ornAna1/multiz6way/anno/ time nice hgLoadMaf -pathPrefix=/gbdb/ornAna1/multiz6way/anno ornAna1 multiz6way #Loaded 2665247 mafs in 1 files from /gbdb/ornAna1/multiz6way/anno #60.126u 5.767s 1:22.72 79.6% 0+0k 0+0io 0pf+0w # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno time nice hgLoadMafSummary ornAna1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz6waySummary ornAna1.maf #Created 1130326 summary blocks from 5153951 components and 2665247 mafs from ornAna1.maf #158.600u 10.810s 2:50.16 99.5% 0+0k 0+0io 4pf+0w ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/anno sed -e 's/mafSummary/multiz6waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz6waySummary.sql time nice hgLoadSqlTab ornAna1 multiz6waySummary \ /tmp/multiz6waySummary.sql multiz6waySummary.tab #0.002u 0.000s 0:24.69 0.0% 0+0k 0+0io 4pf+0w rm *.tab ln -s multiz6way.2007-04-24 /cluster/data/ornAna1/bed/multiz6way ######################################################################### # Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram) # edit src/hg/makeDb/genbank/genbank.conf to add: ornAna1.upstreamGeneTbl = ensGene ornAna1.upstreamMaf = multiz6way /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/species.lst ######################################################################### # MULTIZ6WAY DOWNLOADABLES (DONE 4/26/2007 angie) ssh hgwdev mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads # upstream mafs... 
not sure how good these will be since I expect # a lot of genes are fragmented on contigs, so upstream may not # be upstream, but worth a try... cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits ornAna1 ensGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags ornAna1 multiz6way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log #152.641u 50.687s 5:40.89 59.6% 0+0k 0+0io 0pf+0w # Make a gzipped version of the monolithic annotated maf file: ssh kolossus cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24 time nice gzip -c anno/ornAna1.maf > mafDownloads/ornAna1.maf.gz #623.839u 5.221s 10:29.86 99.8% 0+0k 0+0io 1pf+0w cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads time nice gzip up*.maf #0.849u 0.094s 0:01.02 91.1% 0+0k 0+0io 0pf+0w time nice md5sum *.gz > md5sum.txt #3.246u 0.574s 0:03.84 99.2% 0+0k 0+0io 0pf+0w ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/ornAna1/multiz6way mkdir $dir ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/mafDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/xenTro2/multiz7way/README.txt $dir # edit README.txt ######################################################################### # MULTIZ6WAY MAF FRAMES (DONE 4/26/2007 angie) # this was redone 2011-05-12 see below - Hiram ssh hgwdev mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames mkdir genes #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using ensGene for ornAna1 # no genes for monDom4, anoCar1 # using knownGene for mm8 hg18 # using refGene for galGal3 # genePreds (name specific fields to get around bin, knownGene extras) foreach queryDb (ornAna1 mm8 hg18 galGal3) if ($queryDb == "ornAna1") then set geneTbl = ensGene else if ($queryDb == "galGal3") then set geneTbl = refGene else set geneTbl = knownGene endif hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz end #------------------------------------------------------------------------ # create frames ssh kolossus set multizDir = /cluster/data/ornAna1/bed/multiz6way.2007-04-24 set mafDir = $multizDir/mafDownloads set geneDir = $multizDir/frames/genes cd $multizDir/frames genePredToMafFrames ornAna1 $mafDir/ornAna1.maf.gz ornAna1.mafFrames \ ornAna1 genes/ornAna1.gp.gz \ hg18 genes/hg18.gp.gz \ mm8 genes/mm8.gp.gz \ galGal3 genes/galGal3.gp.gz gzip ornAna1.mafFrames #------------------------------------------------------------------------ # load the database ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/frames hgLoadMafFrames ornAna1 multiz6wayFrames ornAna1.mafFrames.gz ######################################################################### # PHASTCONS (DONE 4/27/2007 angie) # Using Kate's process from makeHg17.doc. # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 16way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. 
# These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. ssh kkstore05 mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons # Prune 28-way model to just our 6 species: /cluster/bin/phast/tree_doctor \ --prune-all-but=ornAna1,monDom4,hg18,mm8,galGal3,anoCar1 \ /cluster/data/hg18/bed/multiz28way.2007-03-20/cons/28way.mod \ > 6way.mod # Split MAF into windows and use to generate "sufficient statistics" # (ss) files for phastCons input. # msa_split (and the .ss format) can't deal with more than one # target sequence in a .maf file. So unfortunately we will need # to split the 1k files further into tens of thousands of # per-sequence files. # This could be a small cluster job, especially with the mafSplit in the # loop, but there would still be a lot of I/O so just do it all on the # fileserver... cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons cat > doSplitOnFileserver.csh << '_EOF_' #!/bin/csh -fex set WINDOWS = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss # this would be /scratch/tmp but kkstore05 won't let me create it: set tmpDir = /tmp rm -fr $WINDOWS mkdir -p $WINDOWS date foreach f (/cluster/data/ornAna1/bed/multiz6way.2007-04-24/maf/*.maf) # skip the maf files that have only comments -- those crash msa_split: if (`grep -v ^\# $f | head | wc -l`) then set c = $f:t:r mkdir $WINDOWS/$c rm -rf $tmpDir/$c mkdir $tmpDir/$c pushd $tmpDir/$c mafSplit dummyArg -byTarget tmp $f foreach t (tmp*.maf) set seq = `grep '^s ornAna1\.' 
$t | head -1 | perl -wpe 's/^s ornAna1.(\w+)\s.*$/$1/ || die;'` twoBitToFa /cluster/data/ornAna1/ornAna1.2bit -seq=$seq $seq.fa /cluster/bin/phast/$MACHTYPE/msa_split $t -i MAF \ -M $seq.fa \ -o SS -r $WINDOWS/$c/$seq -w 10000000,0 -I 1000 -B 5000 end popd rm -r $tmpDir/$c endif end date '_EOF_' # << emacs chmod a+x doSplitOnFileserver.csh nice ./doSplitOnFileserver.csh >& split.log & tail -f split.log # That took 2:45... should be a small cluster job next time. # check tree model on a single chunk, using params recommended by Adam, # (to verify branch lengths on 2X species -- though we aren't using any # of those here) ssh kolossus cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss/split873/chr3.30000001-40000000.ss \ -o phyloFit.tree # The numbers in general are a lot lower... I'll chalk it up to # bias of what can be aligned and go ahead with the human 28-way # which was generated w/more data. 
# Run phastCons mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set d = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons cp -p $san/ss/$d/$f.ss ../6way.mod $tmp pushd $tmp > /dev/null set c = $f:r:r /cluster/bin/phast/$MACHTYPE/phastCons $f.ss 6way.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$d $san/bed/$d sleep 1 mv $tmp/$f.pp $san/pp/$d mv $tmp/$f.bed $san/bed/$d rm -fr $tmp 'EOF' # << emacs chmod a+x doPhast.csh /cluster/bin/phast/$MACHTYPE/phastCons --estimate-rho /tmp/estimatedRho.mod \ --target-coverage 0.005 --expected-length 12 --no-post-probs \ /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss/split873/chr3.30000001-40000000.ss \ ../6way.mod #(rho = 0.248574) # for target-coverage 0.1: (rho = 0.229248) # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(dir1) $(file1) 12 .005 .26 #ENDLOOP 'EOF' # << emacs # Create parasol batch and run it ssh pk cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons pushd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ss cp /dev/null /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons/in.list foreach d (*) ls -1S $d/*.ss | sed 's/.ss$//' \ >> /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/run.cons/in.list end popd gensub2 in.list single template jobList para make jobList para time #Completed: 35180 of 35180 jobs #CPU time in finished jobs: 4947s 82.45m 1.37h 0.06d 0.000 y #IO & Wait Time: 128549s 2142.48m 35.71h 1.49d 0.004 y #Average job time: 4s 0.06m 0.00h 0.00d #Longest finished job: 148s 2.47m 0.04h 0.00d #Submission to last job: 527s 8.78m 0.15h 0.01d # create Most 
Conserved track ssh kolossus cd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "*.bed" \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k7,7 -k9,9n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements6way.bed cp -p phastConsElements6way.bed /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons # Measure coverage. If good, load elements into database and proceed with wiggle. # Try for somewhere in the neighborhood of 5% overall cov, and 70% CDS cov. # (Jim tried for 4% overall in xenTro1) ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons featureBits -chrom=chr1 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed featureBits -chrom=chr3 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed featureBits -chrom=chr17 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed featureBits -chrom=Ultra1 ornAna1 -enrichment ensGene:cds phastConsElements6way.bed # FIRST ITERATION: 6way.mod, (len cov rho) = (12 .005 .26) #ensGene:cds 0.762%, phastConsElements6way.bed 5.003%, both 0.604%, cover 79.35%, enrich 15.86x #ensGene:cds 0.822%, phastConsElements6way.bed 4.681%, both 0.658%, cover 80.11%, enrich 17.11x #ensGene:cds 2.775%, phastConsElements6way.bed 9.169%, both 1.663%, cover 59.92%, enrich 6.53x #ensGene:cds 0.241%, phastConsElements6way.bed 3.012%, both 0.171%, cover 70.84%, enrich 23.52x # Close enough! Let's see how it looks in the browser. 
mv phastConsElements6way.bed phastConsElements6way_12_005_26.bed # When happy: hgLoadBed ornAna1 phastConsElements6way phastConsElements6way_12_005_26.bed # Create merged posterior probability file and wiggle track data files ssh kolossus cd /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode time find ./pp -name "*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode -noOverlap stdin phastCons6way.wig phastCons6way.wib cp -p phastCons6way.wi? \ /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons ln -s `pwd`/phastCons6way.wib /gbdb/ornAna1/multiz6way/ hgLoadWiggle -pathPrefix=/gbdb/ornAna1/multiz6way ornAna1 \ phastCons6way phastCons6way.wig # Tally up tree distances. ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24 tail -1 phastCons/phyloFit.tree.mod | sed -e 's/^TREE: //' \ > 6way.phyloFit.nh /cluster/bin/phast/all_dists 6way.nh > 6way.distances.txt grep ornAna1 6way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt #0.8975 monDom4 #0.9673 hg18 #1.0957 galGal3 #1.1659 mm8 #1.2146 anoCar1 # the order in the browser display will be by tree topology, # not by distance. # Just for reference, these were the distances from phyloFit.tree.mod... # Quite different! #0.5175 monDom4 #0.5186 galGal3 #0.5363 hg18 #0.5544 anoCar1 #0.6066 mm8 # Make .gif for tree, check in to browser/images and install in # htdocs/images/phylo/... don't forget to request a push of that # file. The treeImage setting in trackDb.ra is # phylo/ornAna1_6way.gif (relative to htdocs/images). 
/cluster/bin/phast/tree_doctor \ --rename="ornAna1 -> platypus ; hg18 -> human ; mm8 -> mouse ; monDom4 -> opossum; galGal3 -> chicken; anoCar1 -> lizard;" \ 6way.nh > 6way_species.nh /data/apache/cgi-bin/phyloGif -phyloGif_tree=6way_species.nh \ -phyloGif_width=260 -phyloGif_height=260 \ > ornAna1_6way.gif # If you haven't already, check out the browser CVS tree in your ~/: # (cd; cvs co -d hgwdev:/projects/hg/cvsroot browser) cp ornAna1_6way.gif ~/browser/images/phylo/ cd ~/browser/images/phylo cvs add ornAna1_6way.gif # note: markd says it is best to use "cvs add -kb" for binary files - b0b cvs ci ornAna1_6way.gif cd ../.. cvsup make alpha ######################################################################### # PHASTCONS SCORES DOWNLOADABLES FOR 6WAY (DONE 5/1/07 angie) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads set ppDir = /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons/pp set dDir = `pwd` # Some maf's were split into .ss chunks, so make sure to cat them in order. cp /dev/null ornAna1.pp foreach d ($ppDir/split*) pushd $d >& /dev/null ls -1 *.pp \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k1,1 -k3,3n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat >> $dDir/ornAna1.pp popd >& /dev/null end nice gzip ornAna1.pp md5sum ornAna1.pp.gz > md5sum.txt ssh hgwdev cd /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/ornAna1/phastCons6way mkdir $dir ln -s /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastConsDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/xenTro2/phastCons7way/README.txt $dir # edit README.txt # Clean up after phastCons run. 
ssh kkstore05 rm /cluster/data/ornAna1/bed/multiz6way.2007-04-24/phastCons/*.tab rm -r /san/sanvol1/scratch/ornAna1/multiz6way.2007-04-24/phastCons ######################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-28) ssh kkstore05 bash mkdir /cluster/data/ornAna1/blastDb cd /cluster/data/ornAna1 ls downloads/*.fa M/*.fa | grep -v chrUn > temp.lst cat `cat temp.lst` > temp.fa faSplit gap temp.fa 10000000 blastDb/x -lift=blastDb.lft faSplit sequence downloads/chrUn.fa 100 blastDb/un rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa *.log mkdir -p /san/sanvol1/scratch/ornAna1/blastDb cd /cluster/data/ornAna1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/ornAna1/blastDb done mkdir -p /cluster/data/ornAna1/bed/tblastn.hg18KG cd /cluster/data/ornAna1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/ornAna1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 154 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(250000/154) = 22.623832 mkdir -p /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut
# one output directory per query-protein chunk
for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/ornAna1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# Per-job worker: tblastn one protein chunk ($2) against one blastDb piece
# ($1), convert raw blast output to psl, then lift target coords via
# blastDb.lft and query coords via protein.lft.  The eVal loop retries
# blastall with progressively stricter e-values until a run succeeds.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/ornAna1/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit # back to bash
ssh pk
cd /cluster/data/ornAna1/bed/tblastn.hg18KG
para create blastSpec
para try, push, etc.
# Completed: 94402 of 94402 jobs # CPU time in finished jobs: 16325847s 272097.44m 4534.96h 188.96d 0.518 y # IO & Wait Time: 603783s 10063.06m 167.72h 6.99d 0.019 y # Average job time: 179s 2.99m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1411s 23.52m 0.39h 0.02d # Submission to last job: 75065s 1251.08m 20.85h 0.87d ssh kkstore05 cd /cluster/data/ornAna1/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/ornAna1/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 613 of 613 jobs # CPU time in finished jobs: 5247s 87.45m 1.46h 0.06d 0.000 y # IO & Wait Time: 61655s 1027.58m 17.13h 0.71d 0.002 y # Average job time: 109s 1.82m 0.03h 0.00d # Longest finished job: 305s 5.08m 0.08h 0.00d # Submission to last job: 2287s 38.12m 0.64h 0.03d ssh kkstore05 cd /cluster/data/ornAna1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/ornAna1/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/ornAna1/bed/tblastn.hg18KG hgLoadPsl ornAna1 blastHg18KG.psl # check coverage nice featureBits -noRandom ornAna1 blastHg18KG # 16607325 bases of 1842236818 (0.901%) in intersection nice featureBits -noRandom -enrichment ornAna1 ensGene:cds blastHg18KG # ensGene:cds 1.193%, blastHg18KG 0.901%, both 0.763%, cover 63.91%, enrich 70.89x ssh kkstore05 rm -rf /cluster/data/ornAna1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/ornAna1/bed/tblastn.hg18KG/blastOut #end tblastn ######################################################################### # UPDATE ASSEMBLY VERSION STRING AND VERIFY SAME DATA (DONE 5/7/07 angie) # WUSTL's platypus project website now calls the assembly rev Mar. 2007, # 5.0.1 (as opposed to 5.0 or 6.0) so follow suit: ssh hgwdev hgsql hgcentraltest -e \ 'update dbDb set description = "Mar. 2007" where name = "ornAna1";' hgsql hgcentraltest -e \ 'update dbDb set sourceName = "WUSTL version 5.0.1" where name = "ornAna1";' # We got the data in Feb. not Mar., so I redownloaded files from # http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/ # and verified it's the same seq/agp. ssh kolossus mkdir /tmp/checkOrnAna1 cd /tmp/checkOrnAna1 wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/chromosomes.agp.gz wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/chromosomes.fa.gz gunzip chr*gz # Compare AGP: wc -l /cluster/data/ornAna1/ornAna1.agp #689178 /cluster/data/ornAna1/ornAna1.agp wc -l chr*.agp #689178 chromosomes.agp sort -k 1,1 -k 2n,2n /cluster/data/ornAna1/ornAna1.agp > tmp1 sort -k 1,1 -k 2n,2n chromosomes.agp > tmp2 cmp tmp1 tmp2 # No output -- good, no change!! 
# Compare FASTA: faSize chromosomes.fa #871323289 bases (58126753 N's 813196536 real 813196535 upper 1 lower) in 125840 sequences in 1 files #Total size: mean 6924.1 sd 381372.6 min 108 (Contig271463) max 59581953 (chr3) median 894 # OK, that only has chroms and Contigs, not Ultras! wget http://genome.wustl.edu/pub/organism/Other_Vertebrates/Ornithorhynchus_anatinus/assembly/Ornithorhynchus_anatinus-5.0.1/output/supercontigs.fa.gz gunzip su*.gz #1996203893 bases (153985390 N's 1842218503 real 1842218503 upper 0 lower) in 205536 sequences in 1 files # The "1842218503 real" base count matches expected. # Neither downloaded fasta file has Ultra497... but it would be # reconstructed from AGP to be the same as what we have. # We can compare chr5's affected region: echo chr5 > list faSomeRecords chromosomes.fa list stdout | faFrag stdin 647408 657392 tmp1 twoBitToFa /cluster/data/ornAna1/ornAna1.2bit \ -seq=chr5 -start=647408 -end=657392 tmp2 faCmp tmp1 tmp2 #tmp1 and tmp2 are the same # Whew! ######################################################################### # WSSD AND SEGMENTAL DUPLICATIONS (DONE 8/3/07 angie - WSSD UPDATED 9/12/07) ssh kkstore05 mkdir /cluster/data/ornAna1/bed/genomicSuperDups cd /cluster/data/ornAna1/bed/genomicSuperDups # Unpack file emailed from Ginger Cheng # in Evan Eichler's lab: tar xvzf platypusDUP_track.tar.gz ssh hgwdev cd /cluster/data/ornAna1/bed/genomicSuperDups # 9/12/07: Ginger sent new WSSD file, and said we can filter out # single-base items. awk '($3 - $2) > 1 {print;}' SmallContig_wssdGE10K_nogap.tab \ | sed -e 's/^chr_Ultra/Ultra/' \ | hgLoadBed ornAna1 wssdCoverage stdin awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' genomicSuperDups \ | hgLoadBed ornAna1 genomicSuperDups stdin \ -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql # 8/29/07 Gak! 
Kayla found that the strand values were "+" and "_" -- fix: hgsql ornAna1 -e 'update genomicSuperDups set strand = "-" where strand = "_";' ########################################################################### # Blastz Platypus ornAna1 (WORKING - 2007-11-20 - Hiram) # This is a second attempt after a blastz of Platypus query to # Marmoset target produced no results for some unknown reason. ssh kkstore05 screen # use screen to control this job mkdir /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20 cd /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20 # this chunking will produce 542,178 jobs # This takes way too long, even on pk, 23 days cat << '_EOF_' > DEF # Platypus vs Marmoset BLASTZ_M=50 BLASTZ_K=2200 BLASTZ_L=6000 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_Y=3400 # Target: Platypus ornAna1, largest chrom 59,581,953, 201,523 sequences SEQ1_DIR=/cluster/bluearc/scratch/data/ornAna1/ornAna1.2bit SEQ1_LEN=/cluster/data/ornAna1/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=300 SEQ1_LAP=10000 # QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=5000000 SEQ2_LIMIT=500 SEQ2_LAP=0 BASE=/cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -stop=blastz \ -chainMinScore=5000 -chainLinearGap=loose \ -bigClusterHub=pk > blastz.log 2>&1 & # real 5494m4.009s Completed: 541845 of 542178 jobs Crashed: 333 jobs CPU time in finished jobs: 575423244s 9590387.40m 159839.79h 6659.99d 18.247 y IO & Wait Time: 8698684s 144978.06m 2416.30h 100.68d 0.276 y Average job time: 1078s 17.97m 0.30h 0.01d Longest finished job: 2887s 48.12m 0.80h 0.03d Submission to last job: 1991620s 33193.67m 553.23h 23.05d # the crashes are tracking errors # continuing after verifying that all result files are present # testing some changes to the script time nice -n +19 
$HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 `pwd`/DEF \ -continue=cat \ -chainMinScore=5000 -chainLinearGap=loose \ -bigClusterHub=pk > cat.log 2>&1 & # real 142m40.246s mkdir /cluster/data/calJac1/bed/blastz.ornAna1.swap cd /cluster/data/calJac1/bed/blastz.ornAna1.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 40m3.380s cat fb.calJac1.chainOrnAna1Link.txt # 211526899 bases of 2929139385 (7.221%) in intersection time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ /cluster/data/ornAna1/bed/blastzCalJac1.2007-11-20/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -continue=cleanup -swap -bigClusterHub=pk > cleanup.log 2>&1 & ########################################################################### ############################################################################ # SWAP ornAna1 -> equCab2 (DONE - 2008-04-29 - larrym) mkdir /cluster/data/ornAna1/bed/blastz.equCab2.swap cd /cluster/data/ornAna1/bed/blastz.equCab2.swap time /bin/nice -n 19 doBlastzChainNet.pl \ /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -syntenicNet >& do.log & 0.883u 0.754s 2:33:21.36 0.0% 0+0k 0+0io 16pf+0w ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. 
############################################################################ # XENO REFGENE - added xeno refGene (2008-06-03 markd) add to genbank.conf ornAna1.refseq.mrna.xeno.load = yes ornAna1.refseq.mrna.xeno.loadDesc = yes ssh genbank cd /cluster/data/genbank ((./bin/gbAlignStep -initial -srcDb=refseq ornAna1)|&mail markd&) ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # QUALITY TRACK (DONE - 2008-11-24 - Hiram) mkdir /hive/data/genomes/ornAna1/downloads cd /hive/data/genomes/ornAna1/downloads qaToQac contigs.fa.qual.gz assembly.qac qacAgpLift ../ornAna1.agp assembly.qac scaffolds.qac mkdir /hive/data/genomes/ornAna1/bed/qual cd /hive/data/genomes/ornAna1/bed/qual # the qac file was created by Rico during 28-way annotations qacToWig -fixed ../../downloads/scaffolds.qac stdout \ | wigEncode stdin qual.wig qual.wib ln -s `pwd`/qual.wib /gbdb/ornAna1/wib hgLoadWiggle -pathPrefix=/gbdb/ornAna1/wib ornAna1 quality qual.wig ############################################################################ # ornAna1 - Platypus - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram) ssh kolossus cd /hive/data/genomes/ornAna1 cat << '_EOF_' > ornAna1.ensGene.ra # required db variable db ornAna1 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. 
nameTranslation "s/^\([0-9][0-9]*\)/chr\1/; s/^\(X[0-9]\)/chr\1/; s/^MT/chrM/" # ignore genes that do not properly convert to a gene pred, and contig # names that are not in the UCSC assembly, 365 items skipInvalid yes '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=51 ornAna1.ensGene.ra ssh hgwdev cd /hive/data/genomes/ornAna1/bed/ensGene.51 featureBits ornAna1 ensGene # 24491979 bases of 1842236818 (1.329%) in intersection *** All done! (through the 'makeDoc' step) *** Steps were performed in /hive/data/genomes/ornAna1/bed/ensGene.51 ############################################################################ # lastz swap from Horse EquCab2 (DONE - 2009-07-02 - Hiram) # original lastz run cd /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01 cat fb.equCab2.chainOrnAna1Link.txt # 132750149 bases of 2428790173 (5.466%) in intersection mkdir /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap cd /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap time doBlastzChainNet.pl \ /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 115m47.611s cat fb.ornAna1.chainEquCab2Link.txt # 132776204 bases of 1842236818 (7.207%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # lastz Lizard swap anoCar2 (DONE - 2011-04-27 - Hiram) # the original alignment cd /cluster/data/anoCar2/bed/lastzOrnAna1.2011-04-26 cat fb.anoCar2.chainOrnAna1Link.txt # 77594222 bases of 1701353770 (4.561%) in intersection ## running the swap mkdir /hive/data/genomes/ornAna1/bed/blastz.anoCar2.swap cd /hive/data/genomes/ornAna1/bed/blastz.anoCar2.swap time nice -n +25 doBlastzChainNet.pl -verbose=2 \ /cluster/data/anoCar2/bed/lastzOrnAna1.2011-04-26/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -swap -bigClusterHub=swarm -tRepeats=windowmaskerSdust > swap.log 2>&1 & # real 40m51.778s cat fb.ornAna1.chainAnoCar2Link.txt # 75402821 bases of 1842236818 (4.093%) in intersection ############################################################################ # rerun maf annotation since mafAddIRows can only work on one sequence # at a time mkdir /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/mafSplit cd /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/mafSplit # need to split up the single maf file into individual # per-scaffold maf files to run annotation on # create bed files to list approximately 1000 scaffolds in # a single list, approximately 202 lists cat << '_EOF_' > mkBedLists.pl #!/usr/bin/env perl use strict; use warnings; my $bedCount = 0; my $i = 0; my $bedFile = sprintf("file_%d.bed", $bedCount); open 
(BF,">$bedFile") or die "can not write to $bedFile $!";
# NOTE(review): the next two lines were garbled in transcription (the
# angle-bracket span of the open/while was stripped); reconstructed to
# read the chrom.sizes file verified by the wc -l check below -- confirm
# the exact relative path against the original makeDoc.
open (FH,"</hive/data/genomes/ornAna1/chrom.sizes") or die "can not read chrom.sizes $!";
while (my $line = <FH>) {
  chomp $line;
  if ( (($i + 1) % 1000) == 0 ) {
    printf "%s\n", $line;
    close (BF);
    ++$bedCount;
    $bedFile = sprintf("file_%d.bed", $bedCount);
    open (BF,">$bedFile") or die "can not write to $bedFile $!";
  }
  ++$i;
  my ($chr, $size) = split('\s+',$line);
  printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);   # NOTE(review): transcript said "close (BH)" -- BH is never opened; BF intended
'_EOF_'
# << happy emacs
chmod +x mkBedLists.pl
./mkBedLists.pl
cat *.bed | wc -l
# 201523
wc -l /hive/data/genomes/ornAna1/chrom.sizes
# 201523 /hive/data/genomes/ornAna1/chrom.sizes
# now, run a mafsInRegion on each one of those lists
cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/ornAna1/bed/multiz6way/anno/mafSplit"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/ornAna1/$resultDir
pushd /scratch/tmp/ornAna1/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
    /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/anno/ornAna1.maf
popd
rsync -q -a /scratch/tmp/ornAna1/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/ornAna1/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/ornAna1
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
# running this on memk with 24 CPUs
ls file*.bed > runList
gensub2 runList single template jobList
para create jobList
para try ... check ... push ...
etc # Completed: 202 of 202 jobs # CPU time in finished jobs: 7640s 127.34m 2.12h 0.09d 0.000 y # IO & Wait Time: 13270s 221.16m 3.69h 0.15d 0.000 y # Average job time: 104s 1.73m 0.03h 0.00d # Longest finished job: 245s 4.08m 0.07h 0.00d # Submission to last job: 1071s 17.85m 0.30h 0.01d cd /hive/data/genomes/ornAna1/bed/multiz6way/anno/run for DB in `cat ../../species.lst` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set outDir = ../maf/$2 set result = $3 set input = $1 mkdir -p $outDir cat $input | \ nice mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/ornAna1/ornAna1.2bit $result '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../mafSplit -type f -name "*.maf" > maf.list gensub2 maf.list single template jobList # slowing this down on swarm, 1 job per node with this memory limit: para -ram=8g create jobList para try ... check ... etc. 
para push # Completed: 83266 of 83266 jobs # CPU time in finished jobs: 17759s 295.98m 4.93h 0.21d 0.001 y # IO & Wait Time: 212068s 3534.47m 58.91h 2.45d 0.007 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 1106s 18.43m 0.31h 0.01d # put the results back together into a single file cd /hive/data/genomes/ornAna1/bed/multiz6way/anno # combine into one file head -q -n 1 maf/file_0/chr1.maf > ornAna1.6way.maf time find ./maf -type f | while read F do grep -h -v "^#" ${F} done >> ornAna1.6way.maf real 23m17.855s # these maf files do not have the end marker, this does nothing: # tail -q -n 1 maf/file_0/chr1.maf >> ornAna1.6way.maf # How about an official end marker: echo "##eof maf" >> ornAna1.6way.maf cd /cluster/data/ornAna1/bed/multiz6way/anno mkdir -p /gbdb/ornAna1/multiz6way/anno ln -s `pwd`/ornAna1.6way.maf /gbdb/ornAna1/multiz6way/anno/multiz6way.maf # by loading this into the table multiz6way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ornAna1/multiz6way/anno \ ornAna1 multiz6way # Loaded 2917036 mafs in 1 files from /gbdb/ornAna1/multiz6way/anno # real 1m17.387s # normally filter this for chrom size > 1,000,000 and only load # those chroms. 
But this is a scaffold assembly, load everything: time nice -n +19 hgLoadMafSummary ornAna1 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz6waySummary \ /gbdb/ornAna1/multiz6way/anno/multiz6way.maf # Created 572679 summary blocks from 2650531 components # and 2917036 mafs from /gbdb/ornAna1/multiz6way/anno/multiz6way.maf # real 1m13.692s # by loading this into the table multiz6waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz6way*.tab files in this /scratch/tmp directory rm multiz6way*.tab ############################################################################ # rerun frames on new annotation and more gene sources (2011-05-12 - Hiram) cd /hive/data/genomes/ornAna1/bed/multiz6way.2007-04-24/frames # ensGene is good for most for qDB in ornAna1 anoCar1 galGal3 mm8 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # monDom4 ensGene is an older version with only 10 columns for qDB in monDom4 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # knownGenes for hg18 for qDB in hg18 do echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB} hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done cat ../anno/ornAna1.6way.maf \ | nice -n +19 genePredToMafFrames ornAna1 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.lst` \ | gzip > ornAna1.mafFrames.gz # a couple of minutes # verify there are frames on everything: zcat ornAna1.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 225496 anoCar1 # 318691 galGal3 # 428985 hg18 # 489490 mm8 # 341401 monDom4 # 149620 ornAna1 ssh hgwdev cd /hive/data/genomes/ornAna1/bed/multiz6way/frames time 
hgLoadMafFrames ornAna1 multiz6wayFrames ornAna1.mafFrames.gz # real 0m15.994s ######################################################################### # SWAP BOSTAU6 CHAIN/NET (DONE 2011-05-20 - Chin) cd /cluster/data/ornAna1/bed/blastz.bosTau6.swap cat fb.ornAna1.chainBosTau6Link.txt # 190401000 bases of 1842236818 (10.335%) in intersection cd /cluster/data/ornAna1/bed ln -s blastz.bosTau6.swap lastz.bosTau6 ############################################################################ # running cpgIsland business (DONE 2011-11-22 - Chin) mkdir /hive/data/genomes/ornAna1/bed/cpgIsland cd /hive/data/genomes/ornAna1/bed/cpgIsland cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands # needed to fixup this source, adding include to readseq.c: #include "string.h" # and to cpg_lh.c: #include "unistd.h" #include "stdlib.h" # and fixing a declaration in cpg_lh.c sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c mv tmp.c cpg_lh.c make cd ../../ ln -s hg3rdParty/cpgIslands/cpglh.exe mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../ornAna1.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/ornAna1/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too # many # N's - it gets stuck cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | grep -e ^chr -e ^Contig -e ^Ultra | awk '{print $2 - $7}'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod 775 runOne gensub2 chr.list single template jobList para create jobList para try para check ... 
etc para time para problems para status # then, kick it with para push # check it with plb # when all are done, para time shows: # Checking finished jobs # Completed: 201523 of 201523 jobs # CPU time in finished jobs: 145s 2.42m 0.04h 0.00d 0.000 y # IO & Wait Time: 526359s 8772.65m 146.21h 6.09d 0.017 y # Average job time: 3s 0.04m 0.00h 0.00d # Longest finished job: 46s 0.77m 0.01h 0.00d # Submission to last job: 21512s 358.53m 5.98h 0.25d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed ssh hgwdev cd /hive/data/genomes/ornAna1/bed/cpgIsland hgLoadBed ornAna1 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Loaded 82257 elements of size 10 # Sorted # Creating table definition for cpgIslandExt # Saving bed.tab # Loading ornAna1 # cleanup rm -fr hardMaskedFa ######################################################################### # SWAP BOSTAU7 CHAIN/NET (DONE 2012-01-18 - Chin) cd /cluster/data/ornAna1/bed/blastz.bosTau7.swap cat fb.ornAna1.chainBosTau7Link.txt # 190094136 bases of 1842236818 (10.319%) in intersection cd /cluster/data/ornAna1/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ############################################################################