# for emacs: -*- mode: sh; -*- # Equus caballus - ?? # "$Id: equCab2.txt,v 1.31 2010/04/20 17:34:31 chinhli Exp $" ######################################################################### # DOWNLOAD SEQUENCE (... # Looked at kkstore05; equCab1 was 52Gigs, and there was 150+ Gigs available on /cluster/store12, # so hiram decided we had room to put equCab2 on /cluster/store12 too ssh kkstore05 mkdir /cluster/store12/equCab2 ln -s /cluster/store12/equCab2 /cluster/data/equCab2 mkdir /cluster/data/equCab2/broad cd /cluster/data/equCab2/broad wget --timestamping -r -nd ftp://ftp.broad.mit.edu/pub/assemblies/mammals/horse/Equus2 # sanity check of AGP file: zcat assembly.bases.gz | grep '^>' | sed -e 's/>//' | sort > contig_names cat mapped.agp.chromosome.agp | egrep -v 'fragment|clone' | cut -f 6 | sort -u > contig_names2 zcat assembly.quals.gz | grep '^>' | sed -e 's/>//' | sort > contig_names3 diff contig_names contig_names2 diff contig_names contig_names3 # check for non-ATCG zcat assembly.bases.gz | egrep -v '^>contig' | egrep '[^ATCG]' | wc no diffs, so the assembly, quality and chromosome files agree. # Unknown chromosome: cut -f 1 mapped.agp.chromosome.agp | sort -u | grep '^chrUn' | wc 9604 9604 96040 i.e. lots of unknown fragments cd /cluster/data/equCab2/broad/ egrep '^chrUn' mapped.agp.chromosome.agp | perl -e '$sum = 0; while(<>) { if(/^\S+\s+(\S+)\s+(\S+)/) { $n = $2 - $1 + 1; $sum += $n;}} print "sum: $sum\n"' sum: 107858955 about 1/3 of the previous horse. Coallesce many unknown chromosomes in Broad data to one chrUnk: cat << '_EOF_' > unk.pl #!/usr/bin/env perl # Process unknown regions of mapped.agp.chromosome.agp to create one unknown chromosome. # # run thus: # ./unk.pl < mapped.agp.chromosome.agp > test use warnings; use strict; my $seq = 1; my $end = 0; # fixed-up end position (field 3) of current chromosome my $prevEnd = 0; # field three of previous "chromosome" (not necessarily the previous line) my $count = 0; # current number of $prevEnd lines seen (used to detect single line Unk chromosomes). my $lastNum = ""; # "Unk" number of previous line my $ungappedLen = 1000; my $debug = 0; my $test = 0; sub printUnGapped { # print a non-bridged gap for single line unknown chromosomes. my ($prev) = @_; print "chrUn\t"; print $prev + 1, "\t"; print $prev + 1000, "\t"; print $seq++, "\t"; print "N\t$ungappedLen\tclone\tno\n"; $test++; return $prev + $ungappedLen; } while(<>) { my @a = split /\s+/; if($a[0] =~ /^chrUn(\d+)/) { my $num = $1; my $newChr = $num ne $lastNum; # XXXX if($newChr && $count) { $end = printUnGapped($end); } if($newChr) { print "chrUn$num\n" if($debug); $prevEnd = $end; } # fixup fields 2 & 3 $a[1] += $prevEnd; $a[2] += $prevEnd; # fixup sequence $a[3] = $seq++; print "chrUn\t"; for my $i (1..8) { if(defined($a[$i])) { print $a[$i],"\t"; } } print "\n"; $end = $a[2]; $lastNum = $num; $count = $newChr ? 1 : $count + 1; } } print STDERR "test: $test\n" if($debug); '_EOF_' unk.pl < mapped.agp.chromosome.agp > test # Sanity checks on unk.pl run: cut -f 6 test | egrep ^contig | wc 12980 12980 168740 cut -f 6 test | egrep ^contig | sort | uniq | wc 12980 12980 168740 # create our fixed-up up map file (ucsc.agp) cp test ucsc.agp egrep -v '^chrUn' mapped.agp.chromosome.agp >> ucsc.agp Run makeGenomeDb.pl: makeGenomeDb.pl failed b/c we forgot to lift the QA files, so we manually lifted them thus: # Lift contig quality scores to chrom coordinates: qaToQac assembly.quals.gz stdout | qacAgpLift ucsc.agp stdin equCab2.qual.qac cd /cluster/data/equCab2/bed/qual qacToWig -fixed /cluster/data/equCab2/broad/equCab2.qual.qac stdout | wigEncode stdin qual.{wig,wib} # run on hgwdev: hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 quality qual.wig & then we restarted makeGenomeDb.pl: makeGenomeDb.pl -continue dbDb equCab2.config.ra > & makeGenomeDb.log2 & and it finished correctly. # Sanity checks on makeGenomeDb data: twoBitToFa equCab2.unmasked.2bit stdout | faSize stdin 2510493021 bases (56068733 N's 2454424288 real 2454424288 upper 0 lower) in 34 sequences in 1 files Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907 N count: mean 1649080.4 sd 4017533.3 U count: mean 72188949.6 sd 35782988.9 L count: mean 0.0 sd 0.0 %0.00 masked total, %0.00 masked real zcat assembly.bases.gz | faSize stdin 2428773513 bases (0 N's 2428773513 real 2428773513 upper 0 lower) in 55316 sequences in 1 files Total size: mean 43907.3 sd 67000.2 min 601 (contig_28626) max 916837 (contig_34896) median 16149 N count: mean 0.0 sd 0.0 U count: mean 43907.3 sd 67000.2 L count: mean 0.0 sd 0.0 %0.00 masked total, %0.00 masked real 2454424288 - 2428773513 == 25650775 i.e. our file has 25,650,775 more real nucleotides: zcat assembly.bases.gz | egrep '^>contig' | tail >contig_55315 # consistent with faSize data and length of contig_names zcat mapped.agp.chromosome.fasta.gz | faSize stdin & 2500873361 bases (46465733 N's 2454407628 real 2454407628 upper 0 lower) in 10096 sequences in 1 files Total size: mean 247709.3 sd 1055871.2 min 3000 (Un9604.1-3000) max 5000000 (1.1-5000000) median 4235 N count: mean 4602.4 sd 24700.2 U count: mean 243106.9 sd 1042172.0 L count: mean 0.0 sd 0.0 %0.00 masked total, %0.00 masked real This file is consistent with our gaps: (56068733 - 46465733) - (9603 * 1000) == 0 Our file has extra real nucleotides: 2454424288 - 2454407628 == 16660 Which is OK, b/c that's the size of ChrM. Problem is that their assembly uses the same contigs in different chromosomes; see broad/dupes.txt; e.g. contig_31550 is at the end of chr27 and chr6 and broad/dupes.agp egrep -v "^track" dupes.agp | awk '{print $3-$2+1}' | ave stdin total 51268230.000000 51268230 / 2 == 25634115 So there are 25,634,115 duplicated bps. ((2454424288 - 16660) - 25634115) - 2428773513 == 0 i.e. if you take dupes and chrM into account, our data agrees with their raw data. ######################################################################### # REPEATMASKER (DONE - 2008-04-03 - larrym) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/equCab2/bed/repeatMasker cd /cluster/data/equCab2/bed/repeatMasker doRepeatMasker.pl -buildDir=/cluster/data/equCab2/bed/repeatMasker -bigClusterHub=pk equCab2 >& do.log & # XXXX Hiram, I had to run this on hgwdev to get access to mysql; does that make sense? time featureBits equCab2 rmsk >& fb.equCab2.rmsk.txt & # 1004742864 bases of 2454424288 (40.936%) in intersection # RepeatMasker and lib version from do.log: # RepeatMasker version development-$Id: equCab2.txt,v 1.31 2010/04/20 17:34:31 chinhli Exp $ # Jan 11 2008 (open-3-1-9) version of RepeatMasker # CC RELEASE 20071204 # Compare coverage to previous assembly: featureBits equCab1 rmsk 994130286 bases of 2421923695 (41.047%) in intersection ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-04-03 - larrym) ssh kkstore05 screen # use a screen to manage this job mkdir /cluster/data/equCab2/bed/simpleRepeat cd /cluster/data/equCab2/bed/simpleRepeat doSimpleRepeat.pl -buildDir=/cluster/data/equCab2/bed/simpleRepeat -bigClusterHub=memk \ equCab2 >& do.log & # add simple repeats to the RepeatMasker repeats: cd /cluster/data/equCab2 twoBitMask equCab2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed equCab2.2bit twoBitToFa equCab2.2bit stdout | faSize stdin # 2510493021 bases (56068733 N's 2454424288 real 1449746465 upper 1004677823 lower) in 34 sequences in 1 files # Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907 # N count: mean 1649080.4 sd 4017533.3 # U count: mean 42639601.9 sd 20847778.6 # L count: mean 29549347.7 sd 15784709.4 # %40.02 masked total, %40.93 masked real twoBitToFa equCab2.rmsk.2bit stdout | faSize stdin # 2510493021 bases (56068733 N's 2454424288 real 1450129420 upper 1004294868 lower) in 34 sequences in 1 files # Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907 # N count: mean 1649080.4 sd 4017533.3 # U count: mean 42650865.3 sd 20852722.8 # L count: mean 29538084.4 sd 15779986.2 # %40.00 masked total, %40.92 masked real featureBits -countGaps equCab2 rmsk # 1004742864 bases of 2510493021 (40.022%) in intersection # Slightly different from above b/c some repeats include N's in gaps. featureBits -bed=rmsk_and_gaps.bed -countGaps equCab2 rmsk gap & cat rmsk_and_gaps.bed | awk "{print $3-$2}" | ave stdin # total 448008 1004742864 - 448008 - 1004294868 = -12 We don't understand the -12 discrepancy # Run on hgwdev: rm /gbdb/equCab2/equCab2.2bit ln -s /cluster/data/equCab2/equCab2.2bit /gbdb/equCab2/equCab2.2bit ######################################################################## # add ctgPos2 track (DONE - 2008-04-04 - larrym) cd /cluster/data/equCab2/broad ./unk.pl < mapped.agp.chromosome.agp > /dev/null ./mkScaffolds.pl assembly.agp mapped.agp.chromosome.agp >> Contig.bed awk '{size = $3-$2; printf "%s\t%d\t%s\t%d\t%d\tW\n", $4, size, $1, $2, $3}' Contig.bed > ctgPos2.tab ssh hgwdev cd /cluster/data/equCab2/broad hgLoadSqlTab equCab2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ############################################################################ # BLATSERVERS ENTRY (DONE - 2008-04-04 - larrym) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("equCab2", "blat10", "17784", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("equCab2", "blat10", "17785", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## DEFAULT POSITION (DONE - 2008-04-04 - larrym) # Set default position to be same as equCab1, as found by a blat search ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr11:52,970,942-52,973,091" \ where name="equCab2";' hgcentraltest ############################################################################ ## gold.html, ctgPos2.html etc. (working - 2008-04-09 - larrym) grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | ave stdin average 43973.978823 count 55815 total 2454407628.000000 # But there is only 55316 uniq contigs: grep contig_ mapped.agp.chromosome.agp | awk '{printf "%s\t%s\t%s\n", $6, $7, $8}' | sort | uniq | wc -l 55316 # working with only unique contigs, we get numbers which agree with the Broad data: grep contig_ mapped.agp.chromosome.agp | awk '{printf "%s\t%s\t%s\n", $6, $7, $8}' | sort | uniq | awk '{print $3-$2+1}' | ave stdin average 43907.251302 total 2428773513.000000 ######################################################################### # Create OOC file for genbank runs (WORKING - 2008-04-09 - larrym) 1024 * (2.5 / 3.1) == 825.805824 # Use -repMatch=825 (based on size -- for human we use 1024, and # horse size is 84% of human judging by twoBitInfo -noNs) ssh kkstore05 cd /cluster/data/equCab2 blat equCab2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=equCab2.11.ooc -repMatch=825 ######################################################################### # make equCab2.chrUn.lift, chrUn.fa and equCab2.UnScaffolds.2bit ssh hgwdev cd /cluster/data/equCab2/jkStuff grep ^chrUn ../broad/Contig.bed | awk '{printf "%d\t%s\t%d\tchrUn\t156378573\n", $2, $4, $3-$2}' > equCab2.chrUn.lift # oops, I copied over hiram's length in the above command, so redo this grep ^chrUn ../broad/Contig.bed | awk '{printf "%d\t%s\t%d\tchrUn\t117461955\n", $2, $4, $3-$2}' > equCab2.chrUn.lift # create FASTA file with chrUn, using Broad scaffold names cp -p /cluster/data/gasAcu1/jkStuff/lft2BitToFa.pl . ./lft2BitToFa.pl ../equCab2.2bit equCab2.chrUn.lift > chrUn.fa & faSize chrUn.fa 107858955 bases (14539925 N's 93319030 real 36162792 upper 57156238 lower) in 9604 sequences in 1 files # create FASTA file w/o unknown chroms cd .. twoBitToFa equCab2.2bit stdout | \ perl -e 'my $print=0; while(<>) { if(/^>chr(.+)/) { $print = $1 ne "Un";} print if($print);}' > jkStuff/chr.fa & faSize jkStuff/chr.fa 2393031066 bases (31925808 N's 2361105258 real 1413583673 upper 947521585 lower) in 33 sequences in 1 files # corrected chr27: 2367070107 bases (31598964 N's 2335471143 real 1397621407 upper 937849736 lower) in 33 sequences in 1 files 2393031066 + 107858955 + (9603 * 1000) = 2510493021 # corrected chr27: 2367070107 + 107858955 + (9603 * 1000) = 2484532062 # i.e. correct size cd jkStuff faToTwoBit chr.fa chrUn.fa ../equCab2.UnScaffolds.2bit cd .. twoBitInfo equCab2.UnScaffolds.2bit equCab2.UnScaffolds.sizes cp -p equCab2.UnScaffolds.2bit /san/sanvol1/scratch/equCab2 cp -p equCab2.UnScaffolds.sizes /san/sanvol1/scratch/equCab2 # run via hg18.txt ######################################################################### # GENBANK AUTO UPDATE (DONE - 2008-04-10 - larrym) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add equCab2 just before equCab1 # equCab2 (Equus caballus) equCab2.serverGenome = /cluster/data/equCab2/equCab2.2bit equCab2.clusterGenome = /scratch/data/equCab2/equCab2.2bit equCab2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} equCab2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} equCab2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} equCab2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} equCab2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} equCab2.ooc = /cluster/data/equCab2/equCab2.11.ooc equCab2.lift = /cluster/data/equCab2/jkStuff/equCab2.chrUn.lift equCab2.align.unplacedChroms = chrUn equCab2.genbank.mrna.xeno.load = yes equCab2.downloadDir = equCab2 cvs ci -m "Added equCab2" etc/genbank.conf # update /cluster/data/genbank: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 & # logFile: var/build/logs/2008.04.09-11:33:43.equCab2.initalign.log # failed (problem with a machine that didn't have proper rsync, so we rsync'ed that # machine and re-ran the batch): para -retries=6 push time /bin/nice -n 19 bin/gbAlignStep -initial -continue=finish equCab2 & # logFile: var/build/logs/2008.04.10-14:21:09.equCab2.initalign.log # 2599.431u 1067.489s 1:03:48.15 95.7% 0+0k 0+0io 10pf+0w # load database when finished ssh hgwdev cd /cluster/data/genbank time /bin/nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad equCab2 & # logFile: var/dbload/hgwdev/logs/2008.04.10-15:27:13.dbload.log # 567.199u 198.673s 16:48.99 75.9% 0+0k 0+0io 4pf+0w # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add equCab2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added equCab2 - Equus caballus" etc/align.dbs etc/hgwdev.dbs make etc-update ## ### ############################################################################ # equCab2 - Horse - Ensembl Genes (DONE - 2008-04-02 - hiram) ssh kkstore04 cd /cluster/data/equCab2 cat << '_EOF_' > equCab2.ensGene.ra # required db variable db equCab2 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" # translate Ensembl chrUnNNNN names to chrUn coordinates liftUp /cluster/data/equCab2/jkStuff/chrUn.lift '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=49 equCab2.ensGene.ra ssh hgwdev cd /cluster/data/canFam2/bed/ensGene.49 featureBits equCab2 ensGene # 39746402 bases of 2454424288 (1.619%) in intersection ############################################################################ # SWAP equCab2 -> hg18 mkdir /cluster/data/equCab2/bed/blastz.hg18.swap cd /cluster/data/bosTau4/bed/blastz.hg18.swap time /bin/nice -n 19 doBlastzChainNet.pl \ /cluster/data/hg18/bed/blastz.equCab2.2008-04-10/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet >& do.log & 0.191u 0.101s 2:20:34.06 0.0% 0+0k 0+0io 6pf+0w featureBits equCab2 -chrom=chr1 chainHg18Link 126450610 bases of 183561855 (68.887%) in intersection ############################################################################ # SWAP equCab2 -> mm9 (DONE - 2008-04-17 - larrym) mkdir /cluster/data/equCab2/bed/blastz.mm9.swap cd /cluster/data/equCab2/bed/blastz.mm9.swap time /bin/nice -n 19 doBlastzChainNet.pl \ /cluster/data/mm9/bed/blastz.equCab2.2008-04-15/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet >& do.log & 0.164u 0.128s 1:32:36.50 0.0% 0+0k 0+0io 6pf+0w featureBits equCab2 -chrom=chr1 chainMm9Link 72255123 bases of 183561855 (39.363%) in intersection ############################################################################ # SWAP equCab2 -> canFam2 (DONE - 2008-04-22 - larrym) mkdir /cluster/data/equCab2/bed/blastz.canFam2.swap cd /cluster/data/equCab2/bed/blastz.canFam2.swap time /bin/nice -n 19 doBlastzChainNet.pl \ /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet >& do.log & 0.191u 0.108s 2:44:26.23 0.0% 0+0k 0+0io 1pf+0w ############################################################################ # SWAP equCab2 -> galGal3 (DONE - 2008-04-22 - larrym) mkdir /cluster/data/equCab2/bed/blastz.galGal3.swap cd /cluster/data/equCab2/bed/blastz.galGal3.swap time /bin/nice -n 19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18/DEF \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet >& do.log & 0.170u 0.107s 12:27.33 0.0% 0+0k 0+0io 0pf+0w ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-22) # (auto) establish native host of /cluster/data/ df /cluster/data/equCab2 ssh kkstore05 # bash if not using bash shell already sizes=equCab2.UnScaffolds.sizes twobit=equCab2.UnScaffolds.2bit mkdir /cluster/data/equCab2/blastDb cd /cluster/data/equCab2 awk '{if ($2 > 1000000) print $1}' $sizes > 1meg.lst if test -s 1meg.lst; then twoBitToFa -seqList=1meg.lst $twobit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa fi awk '{if ($2 <= 1000000) print $1}' $sizes > less1meg.lst if test -s less1meg.lst; then twoBitToFa -seqList=less1meg.lst $twobit temp.fa faSplit about temp.fa 1000000 blastDb/y rm temp.fa fi wc 1meg.lst less1meg.lst # 39 39 252 1meg.lst # 9598 9598 95975 less1meg.lst # 9637 9637 96227 total wc $sizes # 9637 19274 145695 equCab2.UnScaffolds.sizes rm 1meg.lst less1meg.lst cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa ls *.nsq | wc -l # 3086 mkdir -p /san/sanvol1/scratch/equCab2/blastDb cd /cluster/data/equCab2/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/equCab2/blastDb done mkdir -p /cluster/data/equCab2/bed/tblastn.hg18KG cd /cluster/data/equCab2/bed/tblastn.hg18KG echo /san/sanvol1/scratch/equCab2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3086 query.lst # we want around 100,000 jobs per gig (0.0001 jobs per base) numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/equCab2/chrom.sizes` # we want around numJobs jobs numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'` numQueries=`wc query.lst | awk '{print $1}'` numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' ` echo $numJobs # 251049 mkdir -p /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/equCab2/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then if test -s /cluster/data/equCab2/blastDb.lft then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/equCab2/blastDb.lft carry $f.2 else mv $f.2 $f.3 fi liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/equCab2/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 253052 of 253052 jobs # CPU time in finished jobs: 23683489s 394724.82m 6578.75h 274.11d 0.751 y # IO & Wait Time: 2116386s 35273.09m 587.88h 24.50d 0.067 y # Average job time: 102s 1.70m 0.03h 0.00d # Longest finished job: 405s 6.75m 0.11h 0.00d # Submission to last job: 77580s 1293.00m 21.55h 0.90d ssh kkstore05 cd /cluster/data/equCab2/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh memk cd /cluster/data/equCab2/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 82 of 82 jobs # CPU time in finished jobs: 86617s 1443.62m 24.06h 1.00d 0.003 y # IO & Wait Time: 10159s 169.31m 2.82h 0.12d 0.000 y # Average job time: 1180s 19.67m 0.33h 0.01d # Longest finished job: 3153s 52.55m 0.88h 0.04d # Submission to last job: 8853s 147.55m 2.46h 0.10d ssh kkstore05 cd /cluster/data/equCab2/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/equCab2/bed/tblastn.hg18KG/preblastHg18KG.psl cd .. pslCheck preblastHg18KG.psl liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/equCab2/jkStuff/chrUn.lift carry preblastHg18KG.psl # load table ssh hgwdev cd /cluster/data/equCab2/bed/tblastn.hg18KG hgLoadPsl equCab2 blastHg18KG.psl # check coverage featureBits equCab2 blastHg18KG # 41055754 bases of 2454424288 (1.673%) in intersection featureBits equCab2 refGene:cds blastHg18KG -enrichment # refGene:cds 0.017%, blastHg18KG 1.673%, both 0.016%, cover 90.76%, enrich 54.26x featureBits equCab2 ensGene:cds blastHg18KG -enrichment # ensGene:cds 1.294%, blastHg18KG 1.673%, both 1.023%, cover 79.08%, enrich 47.27x ssh kkstore05 rm -rf /cluster/data/equCab2/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut #end tblastn ################################################## ########################################################################### # Blastz Platypus ornAna1 (DONE - 2008-04-28 - larrym) cd /cluster/data/equCab2/jkStuff cp -p equCab2.chrUn.lift /san/sanvol1/scratch/equCab2 ssh kkstore04 screen # use screen to control this multi-day job mkdir /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24 cd /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24 cat << '_EOF_' > DEF # TARGET: Horse SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit SEQ1_LEN=/scratch/data/equCab2/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes SEQ1_LIFT=/san/sanvol1/scratch/equCab2/equCab2.chrUn.lift SEQ1_CHUNK=20000000 SEQ1_LIMIT=200 SEQ1_LAP=10000 # QUERY: Platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/equCab2/bed/blastz.ornAna1.2008-04-28 TMPDIR=/scratch/tmp _EOF_ time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -stop=partition \ -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >& do.log & 0.095u 0.050s 0:12.21 1.1% 0+0k 0+0io 16pf+0w Ran twice to determine SEQ2_LIMIT=400 (to get number of jobs down to 107525) time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -continue=blastz \ -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >>& do.log & failed on part262.lst b/c of a corrupted line in batch file; I fixed the line and reran successfully (via para push). time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -continue=cat \ -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >>& do.log & 0.209u 0.115s 42:36.73 0.0% 0+0k 0+0io 0pf+0w ######################################################################### ## 6-Way Multiz (DONE - 2008-05-01 - larrym) ssh hgwdev mkdir /cluster/data/equCab2/bed/multiz6way cd /cluster/data/equCab2/bed/multiz6way # select the 6 organisms from the 30-way recently done on mouse mm9 /cluster/bin/phast/tree_doctor \ --prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Chicken_galGal3,Horse_equCab1 \ /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \ > 6-way.fullNames.nh ((((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763,(Dog_canFam2:0.149350,Horse_equCab1:0.099323):0.038613):0.332197,Platypus_ornAna1:0.488110):0.118797,Chicken_galGal3:0.488824); rearranged to get horse on top: cat << '_EOF_' > equCab2.6-way.nh ((((Horse_equCab2:0.099323,Dog_canFam2:0.149350):0.038613, (Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763):0.332197, Platypus_ornAna1:0.488110):0.118797,Chicken_galGal3:0.488824); '_EOF_' sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' equCab2.6-way.nh | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # verify that has 6 db names in it # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' equCab2.6-way.nh \ | sed -e "s/ / /g"` > tree.6.nh # results: # ((((equCab2 canFam2) (mm9 hg18)) ornAna1) galGal3) # => for making the gif: ((((Horse,Dog), (Mouse, Human)), Platypus), Chicken) wget -O equCab2_6way.gif 'http://genome.ucsc.edu/cgi-bin/phyloGif?hgsid=106951235&phyloGif_width=219&phyloGif_height=260&boolshad.phyloGif_branchLengths=1&boolshad.phyloGif_lengthLegend=1&boolshad.phyloGif_branchLabels=1&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&boolshad.phyloGif_underscores=1&boolshad.phyloGif_htmlPage=1&phyloGif_tree=%28%28%28%28Horse%2CDog%29%2C+%28Mouse%2C+Human%29%29%2C+Platypus%29%2C+Chicken%29&phyloGif_submit=submit' # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/equCab2/bed/multiz6way foreach db (`grep -v equCab2 species.list`) set bdir = /cluster/data/equCab2/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh ./listMafs.csh canFam2 mafSynNet galGal3 mafSynNet hg18 mafSynNet mm9 mafSynNet ornAna1 mafNet /cluster/bin/phast/all_dists equCab2.6-way.nh > 6way.distances.txt grep -i equcab 6way.distances.txt | sort -k3,3n Horse_equCab2 Dog_canFam2 0.248673 Horse_equCab2 Human_hg18 0.284600 Horse_equCab2 Mouse_mm9 0.483517 Horse_equCab2 Platypus_ornAna1 0.958243 Horse_equCab2 Chicken_galGal3 1.077754 # use the calculated # distances in the table below to order the organisms and check # the button order on the browser. Zebrafish ends up before # tetraodon and fugu on the browser despite its distance. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainPonAbe2Link chain linearGap # distance on EquCab2 on other minScore # 1 0.248673 Dog_canFam2 (% 70.910) (% 70.300) 3000 medium # 2 0.284600 Human_hg18 (% 66.819) (% 57.162) 3000 medium # 3 0.483517 Mouse_mm9 (% 37.218) (% 34.821) 3000 medium # 4 0.958243 Platypus_ornAna1 (% 5.477) (% 7.217) 5000 loose # 5 1.077754 Chicken_galGal3 (% 3.031) (% 6.568) 3000 medium # copy net mafs to cluster-friendly storage, splitting chroms # into 50MB chunks to improve run-time # NOTE: splitting will be different for scaffold-based reference asemblies ssh hgwdev mkdir /cluster/data/equCab2/bed/multiz6way/run.split cd /cluster/data/equCab2/bed/multiz6way/run.split # this works by examining the rmsk table for likely repeat areas # that won't be used in blastz mafSplitPos equCab2 50 mafSplit.bed ssh kki cd /cluster/data/equCab2/bed/multiz6way/run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set targDb = "equCab2" set db = $1 set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet mkdir -p $sdir if (-e $sdir/$db) then echo "directory $sdir/$db already exists -- remove and retry" exit 1 endif set bdir = /cluster/data/$targDb/bed/blastz.$db if (! -e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done '_EOF_' # << happy emacs chmod +x doSplit.csh grep -v equCab2 ../species.list > split.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP '_EOF_' gensub2 split.list single template jobList para create jobList para -maxPush=3 push para push para time 5 jobs in batch Completed: 5 of 5 jobs CPU time in finished jobs: 3230s 53.84m 0.90h 0.04d 0.000 y IO & Wait Time: 186s 3.10m 0.05h 0.00d 0.000 y Average job time: 683s 11.39m 0.19h 0.01d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 1301s 21.68m 0.36h 0.02d Submission to last job: 1336s 22.27m 0.37h 0.02d # ready for the multiz run ssh pk cd /cluster/data/equCab2/bed/multiz6way # actually, the result directory here should be maf.split instead of maf mkdir -p maf run cd run mkdir penn # use latest penn utilities setenv P /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename bash find /san/sanvol1/scratch/equCab2/splitStrictMafNet -type f \ | while read F; do basename $F; done \ | sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list exit wc -l chromChunks.list # 64 chromChunks.list cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = equCab2 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree.6.nh ../species.list $tmp pushd $tmp foreach s (`cat species.list`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == equCab2) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/equCab2/bed/multiz6way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs gensub2 chromChunks.list single template jobList para create jobList para try para push # 64 jobs in batch # Completed: 64 of 64 jobs # CPU time in finished jobs: 45922s 765.37m 12.76h 0.53d 0.001 y # IO & Wait Time: 626s 10.43m 0.17h 0.01d 0.000 y # Average job time: 727s 12.12m 0.20h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1141s 19.02m 0.32h 0.01d # Submission to last job: 2752s 45.87m 0.76h 0.03d # put the split maf results back together into single chroms ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way # here is where the result directory maf should have already been maf.split mv maf maf.split mkdir maf # going to sort out the redundant header garbage to leave a cleaner maf for C in `ls maf.split | sed -e "s#__.*##" | sort -u` do echo ${C} head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/equCab2/multiz6way/maf ln -s /cluster/data/equCab2/bed/multiz6way/maf/*.maf /gbdb/equCab2/multiz6way/maf # this generates a large 1 Gb multiz6way.tab file in the directory # where it is running. Best to run this over in scratch. cd /scratch/tmp time /bin/nice -n 19 hgLoadMaf -pathPrefix=/gbdb/equCab2/multiz6way/maf equCab2 multiz6way # 101.386u 33.294s 2:57.36 75.9% 0+0k 0+0io 2pf+0w # Loaded 5609508 mafs in 34 files from /gbdb/equCab2/multiz6way/maf # load summary table time /bin/nice -n 19 cat /gbdb/equCab2/multiz6way/maf/*.maf | hgLoadMafSummary equCab2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz6waySummary stdin & # 0.112u 11.365s 4:13.79 4.5% 0+0k 0+0io 0pf+0w # Created 705120 summary blocks from 12830585 components and 5609498 mafs from stdin # Gap Annotation # prepare bed files with gap info ssh kkstore04 mkdir /cluster/data/equCab2/bed/multiz6way/anno cd /cluster/data/equCab2/bed/multiz6way/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v equCab2 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh kki cd /cluster/data/equCab2/bed/multiz6way/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/equCab2/bed/multiz6way set c = $1 cat $dir/maf/${c}.maf | \ nice mafAddIRows -nBeds=nBeds stdin /cluster/data/equCab2/equCab2.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/equCab2/bed/multiz6way/anno/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/equCab2/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try... para push... # 34 jobs in batch # Completed: 34 of 34 jobs # CPU time in finished jobs: 2287s 38.12m 0.64h 0.03d 0.000 y # IO & Wait Time: 899s 14.98m 0.25h 0.01d 0.000 y # Average job time: 94s 1.56m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 363s 6.05m 0.10h 0.00d # Submission to last job: 363s 6.05m 0.10h 0.00d ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/anno mkdir -p /gbdb/equCab2/multiz6way/anno/maf ln -s /cluster/data/equCab2/bed/multiz6way/anno/maf/*.maf /gbdb/equCab2/multiz6way/anno/maf # by loading this into the table multiz6way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time /bin/nice -n 19 hgLoadMaf -pathPrefix=/gbdb/equCab2/multiz6way/anno/maf equCab2 multiz6way 122.861u 43.946s 3:23.21 82.0% 0+0k 0+0io 0pf+0w Loaded 6251465 mafs in 34 files from /gbdb/equCab2/multiz6way/anno/maf cat /cluster/data/equCab2/chrom.sizes | \ awk '{if ($2 > 1000000) { print $1 }}' | while read C do echo /gbdb/equCab2/multiz6way/anno/maf/$C.maf done | xargs cat | \ hgLoadMafSummary equCab2 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz6waySummary stdin # Created 705120 summary blocks from 12830585 components and 6251455 mafs from stdin # # by loading this into the table multiz6waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz6way*.tab files in this /scratch/tmp directory rm multiz6way* ############################################################################# ## Annotate 6-way multiple alignment with gene annotations ## (DONE - 2008-05-02 - larrym) ssh hgwdev mkdir /cluster/data/equCab2/bed/multiz6way/frames cd /cluster/data/equCab2/bed/multiz6way/frames # dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql equCab2 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql equCab2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' chmod +x ./showGenes.csh ./showGenes.csh # canFam2: Tables: ensGene: 29749, refGene: 889, Mrnas: 1667 # equCab2: Tables: ensGene: 27192, refGene: 354, Mrnas: 38402 # galGal3: Tables: ensGene: 22946, refGene: 4300, Mrnas: 27182 # hg18: Tables: ensGene: 55906, knownGene: 56722, mgcGenes: 28715, refGene: 26306, Mrnas: 231949 # mm9: Tables: ensGene: 43973, knownGene: 49409, mgcGenes: 22666, refGene: 21569, Mrnas: 222212 # ornAna1: Tables: ensGene: 30089, refGene: 10, Mrnas: 147 # from this information, conclusions: # (hiram sez use ensGene rather than knownGene for hg18 (even though knownGene has more genes): # use knownGene for mm9 # use ensGene for hg18, galGal3, canFam2 and ornAna1 mkdir genes # knownGene for DB in mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene # for DB in galGal3 canFam2 ornAna1 # for DB in hg18 for DB in equCab2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done ls -og genes # -rw-r--r-- 1 1865308 May 2 13:10 canFam2.gp.gz # -rw-r--r-- 1 1916011 May 16 15:11 equCab2.gp.gz # -rw-r--r-- 1 1557911 May 2 13:10 galGal3.gp.gz # -rw-r--r-- 1 2008806 May 2 13:09 hg18.gp.gz # -rw-r--r-- 1 1965274 May 2 13:09 mm9.gp.gz # -rw-r--r-- 1 1347330 May 2 13:10 ornAna1.gp.gz after switching hg18 => ensGene: # -rw-r--r-- 1 2151412 May 6 14:46 genes/hg18.gp.gz ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way/frames time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \ mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 35858 galGal3 # 236512 mm9 # 236761 hg18 # 249158 canFam2 # 252980 ornAna1 redo with changed hg18: ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way/frames time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \ mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 35858 galGal3 # 236512 mm9 # 249158 canFam2 # 252980 ornAna1 # 254414 hg18 # redo above with updated equCab2 (2008-05-16): ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way/frames time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout equCab2 genes/equCab2.gp.gz canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \ mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1 # see what it looks like in terms of number of annotations per DB: zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 35858 galGal3 # 186570 equCab2 # 236512 mm9 # 249158 canFam2 # 252980 ornAna1 # 254414 hg18 # load the resulting file ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/frames time /bin/nice -n 19 hgLoadMafFrames equCab2 multiz6wayFrames multiz6way.mafFrames.gz 7.176u 0.735s 0:17.24 45.8% 0+0k 0+0io 2pf+0w # re-run with equCab2: 8.523u 1.119s 4:46.07 3.3% 0+0k 0+0io 2pf+0w # enable the trackDb entries: # frames multiz6wayFrames # irows on ############################################################################# # phastCons 6-way (DONE - 2008-05-09 - larrym) # split 6way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh kki mkdir /cluster/data/equCab2/bed/multiz6way/msa.split cd /cluster/data/equCab2/bed/multiz6way/msa.split mkdir -p /san/sanvol1/scratch/equCab2/multiz6way/cons/ss cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/equCab2/bed/multiz6way/maf set WINDOWS = /san/sanvol1/scratch/equCab2/multiz6way/cons/ss pushd $WINDOWS set c = $1 rm -fr $c mkdir $c twoBitToFa -seq=$c /scratch/data/equCab2/equCab2.2bit /scratch/tmp/equCab2.$c.fa # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$ perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF /cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \ -M /scratch/tmp/equCab2.$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $CLEAN_MAF /scratch/tmp/equCab2.$c.fa popd date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 34 of 34 jobs # CPU time in finished jobs: 1600s 26.66m 0.44h 0.02d 0.000 y # IO & Wait Time: 1183s 19.72m 0.33h 0.01d 0.000 y # Average job time: 82s 1.36m 0.02h 0.00d # Longest finished job: 241s 4.02m 0.07h 0.00d # Submission to last job: 418s 6.97m 0.12h 0.00d # take the cons and noncons trees from the mouse 30-way # Estimates are not easy to make, probably more correctly, # take the 30-way .mod file, and re-use it here. ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh memk mkdir -p /cluster/data/equCab2/bed/multiz6way/cons/run.cons cd /cluster/data/equCab2/bed/multiz6way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals # Well, that's what it was when used in the Mm9 30-way, # in this instance, there is only the directory "all" cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin set subDir = $1 set f = $2 set c = $2:r set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/equCab2/bed/multiz6way/cons mkdir -p $tmp set san = /san/sanvol1/scratch/equCab2/multiz6way/cons if (-s $cons/$grp/$grp.non-inf) then cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp else cp -p $cons/$grp/$grp.mod $tmp cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir sleep 4 touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir rm -f $san/$grp/pp/$subDir/$f.pp rm -f $san/$grp/bed/$subDir/$f.bed mv $tmp/$f.pp $san/$grp/pp/$subDir mv $tmp/$f.bed $san/$grp/bed/$subDir rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/equCab2/multiz6way/cons/all/pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/equCab2/multiz6way/cons find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" > /cluster/data/equCab2/bed/multiz6way/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \ ../../mm9.30way.mod \ --prune-all-but=equCab1,hg18,galGal3,mm9,canFam2,ornAna1 \ > all.mod # edit all.mod to s/equCab2/equCab1/; # TREE: ((((mm9:0.325818,hg18:0.136019):0.019763,(canFam2:0.149350,equCab2:0.099323):0.038613):0.332197,ornAna1:0.488110):0.118797,galGal3:0.488824); cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/equCab2/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # Completed: 268 of 268 jobs # CPU time in finished jobs: 8054s 134.23m 2.24h 0.09d 0.000 y # IO & Wait Time: 2459s 40.98m 0.68h 0.03d 0.000 y # Average job time: 39s 0.65m 0.01h 0.00d # Longest finished job: 52s 0.87m 0.01h 0.00d # Submission to last job: 543s 9.05m 0.15h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/equCab2/bed/multiz6way/cons/all # load into database ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/cons/all time /bin/nice -n 19 hgLoadBed equCab2 phastConsElements6way mostConserved.bed # 3.212u 0.439s 0:22.68 16.0% 0+0k 0+0io 0pf+0w # Loaded 1036081 elements of size 5 # Try for 5% overall cov, and 70% CDS cov # We don't have any gene tracks to compare CDS coverage # --rho .31 --expected-length 45 --target-coverage .3 featureBits equCab2 phastConsElements6way # 134628278 bases of 2454424288 (5.485%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all mkdir -p phastCons6wayScores cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP for D in pp/chr* do C=${D/pp\/} out=phastCons6wayScores/${C}.data.gz echo "${D} > ${C}.data.gz" ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \ gzip > ${out} done '_EOF_' chmod +x gzipAscii.sh time /bin/nice -n 19 ./gzipAscii.sh 1244.107u 83.541s 22:46.45 97.1% 0+0k 0+0io 0pf+0w # rereun to fix corrupted file 1310.778u 34.480s 22:56.12 97.7% 0+0k 0+0io 0pf+0w # XXXX NOT DONE YET # copy the phastCons8wayScores to: # /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all time /bin/nice -n 19 ls phastCons6wayScores/*.data.gz | xargs zcat | wigEncode -noOverlap stdin phastCons6way.wig phastCons6way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 23m55.813s time /bin/nice -n 19 cp -p *.wi? /cluster/data/equCab2/bed/multiz6way/cons/all # 0.010u 10.862s 0:49.38 22.0% 0+0k 0+0io 0pf+0w # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/cons/all ln -s `pwd`/phastCons6way.wib /gbdb/equCab2/multiz6way/phastCons6way.wib time /bin/nice -n 19 hgLoadWiggle -pathPrefix=/gbdb/equCab2/multiz6way equCab2 phastCons6way phastCons6way.wig # 12.758u 3.986s 0:42.43 39.4% 0+0k 0+0io 2pf+0w # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/cons/all time /bin/nice -n 19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=equCab2 phastCons6way > histogram.data 2>&1 # real 3m29.727s cat << '_EOF_' > gnuplot.txt set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Horse EquCab2 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' gnuplot < gnuplot.txt > histo.png ############################################################################# ## Create downloads for 6-way business (DONE - 2008-05-21 - larrym) ssh kkstore04 mkdir -p /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way cd /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way cp -p /san/sanvol1/scratch/equCab2/multiz6way/cons/all/phastCons6wayScores/* . ln -s ../../cons/all/all.mod ./6way.mod cp /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt . # edit that README.txt to be correct for this 6-way alignment cd .. mkdir multiz6way cd multiz6way cp -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/README.txt . # edit that README.txt to be correct for this 6-way alignment ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way ln -s ../../equCab2.6-way.nh ./6way.nh mkdir -p /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf cp -p /cluster/data/equCab2/bed/multiz6way/anno/maf/*.maf . time /bin/nice -n 19 gzip -v *.maf & 1796.135u 69.195s 31:47.39 97.7% 0+0k 0+0io 0pf+0w # creating upstream files ssh hgwdev cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf # creating upstream files from ensGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=equCab2 GENE=ensGene NWAY=multiz6way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time /bin/nice -n 19 ./mkUpstream.sh 87.355u 87.965s 4:47.94 60.8% 0+0k 0+0io 6pf+0w # -rw-rw-r-- 1 larrym protein 38856776 May 19 13:59 upstream5000.maf.gz # -rw-rw-r-- 1 larrym protein 15604291 May 19 13:58 upstream2000.maf.gz # -rw-r--r-- 1 larrym protein 8206298 May 19 13:57 upstream1000.maf.gz # check the names in these upstream files to ensure sanity: # should be a list of the other 4 species with a high count, # then ensGene names. zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' | sort | uniq -c | sort -rn | head 7030 ornAna1 7030 mm9 7030 hg18 7030 galGal3 7030 canFam2 1 ENSECAT00000027191 1 ENSECAT00000027190 ssh kkstore04 cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf md5sum *.maf.gz > md5sum.txt cd .. ln -s ../../equCab2.6-way.nh ./6way.nh ln -s $HOME/kent/src/hg/makeDb/trackDb/horse/equCab2/multiz6way.html . # copy README.txt from mm9 30-way downloads and edit to represent # this directory cd .. mkdir -p phastCons6way/phastConsScores cd phastCons6way/phastConsScores cp -p /san/sanvol1/scratch/equCab2/multiz6way/cons/all/phastCons6wayScores/*.data.gz . md5sum *.data.gz > md5sum.txt cd .. ln -s ../../cons/all/all.mod 6way.mod # copy README.txt from mm9 30way downloads and edit to represent # this directory # enable them for hgdownload rsync transfer ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/equCab2 mkdir multiz6way phastCons6way cd multiz6way ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/6way.nh . ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/*.html . ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/R*.txt . mkdir maf cd maf ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf/*.maf.gz . ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf/md5sum.txt cd ../../phastCons6way mkdir phastConsScores cd phastConsScores ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores/*.gz . ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores/md5sum.txt . cd .. ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/6way.mod . ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/README.txt . ############################################################################# # N-SCAN gene predictions (nscanGene) - (2008-04-29 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/equCab2/bed/nscan/ wget http://mblab.wustl.edu/predictions/horse/equCab2/equCab2.gtf wget http://mblab.wustl.edu/predictions/horse/equCab2/equCab2.prot.fa # updated readme.html wget http://mblab.wustl.edu/predictions/horse/equCab2/repl.html bzip2 equCab2.* chmod a-w * # load track gtfToGenePred -genePredExt equCab2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt equCab2 nscanGene stdin hgPepPred equCab2 generic nscanPep equCab2.prot.fa.bz2 rm *.tab # update trackDb; need a equCab2-specific page to describe informants horse/equCab2/nscanGene.html (copy from repl.html) horse/equCab2/trackDb.ra # set search regex to termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+ ############################################################################# ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ########################################################################## # verify all.joiner entries (DONE - 2008-07-08 - larrym) # try to get joinerCheck tests clean output on these commands ssh hgwdev cd ~/kent/src/hg/makeDb/schema joinerCheck -database=equCab2 -tableCoverage all.joiner joinerCheck -database=equCab2 -keys all.joiner joinerCheck -database=equCab2 -times all.joiner ########################################################################## # make downloads (DONE - 2008-07-08 - larrym) # verify all tracks have html pages for their trackDb entries cd /cluster/data/equCab2 /cluster/bin/scripts/makeDownloads.pl equCab2 # fix the README files # re-build the upstream files: cd /cluster/data/equCab2/goldenPath/bigZips for size in 1000 2000 5000 do echo $size featureBits equCab2 ensGene:upstream:$size -fa=stdout \ | gzip -c > ensGene.upstream$size.fa.gz done ########################################################################## # create pushQ entry (DONE - 2008-07-08 - larrym) # verify all tracks have html pages for their trackDb entries ssh hgwdev cd /cluster/data/equCab2/jkStuff /cluster/bin/scripts/makePushQSql.pl equCab2 > equCab2.pushQ.sql # equCab2 does not have seq scp -p equCab2.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < equCab2.pushQ.sql # verify file sizes in the pushQ entries # *** All done! # *** Please edit the output to ensure correctness before using. # *** 1. Resolve any warnings output by this script. # *** 2. Remove any entries which should not be pushed. # *** 3. Add tables associated with the main track table (e.g. *Pep tables # for gene prediction tracks). # *** 4. Add files associated with tracks. First, look at the results # of this query: # hgsql equCab2 -e 'select distinct(path) from extFile' # Then, look at file(s) named in each of the following wiggle tables: # hgsql equCab2 -e 'select distinct(file) from phastCons6way' # hgsql equCab2 -e 'select distinct(file) from gc5Base' # hgsql equCab2 -e 'select distinct(file) from quality' # Files go in the second field after tables (it's tables, cgis, files). # *** 5. This script currently does not recognize composite tracks. If equCab2 # has any composite tracks, you should manually merge the separate # per-table entries into one entry. # *** 6. Make sure that qapushq does not already have a table named equCab2: # ssh hgwbeta hgsql qapushq -NBe "'desc equCab2;'"ssh hgwbeta hgsql qapushq -NBe "'desc equCab2;'" # You *should* see this error: # ERROR 1146 at line 1: Table 'qapushq.equCab2' doesn't exist # If it already has that table, talk to QA and figure out whether # it can be dropped or fixed up (by sql or the Push Queue web app). # *** When everything is complete and correct, use hgsql on hgwbeta to # execute the sql file. Then use the Push Queue web app to check the # contents of all entries. # *** If you haven't already, please add equCab2 to makeDb/schema/all.joiner ! # It should be in both $gbd and $chainDest. XXXX TODO # *** When equCab2 is on the RR (congrats!), please doBlastz -swap if you haven't # already, and add Push Queue entries for those other databases' chains # and nets to equCab2. Added following files to pushQ entry: /gbdb/equCab2/multiz6way/maf/chr*.maf /gbdb/equCab2/multiz6way/anno/maf/chr*.maf /gbdb/equCab2/multiz6way/phastCons6way.wib /gbdb/equCab2/wib/gc5Base.wib /gbdb/equCab2/wib/qual.wib ########################################################################## ########################################################################## # Fix duplicate scaffold_34 (DONE - 2008-09-09 - larrym) Start 2008-08-19: insert into dbDb (name, description, nibPath, organism, defaultPos, active, genome, scientificName) values ('equCab2Copy', 'Sep. 2007 (test)', '/gbdb/equCab2Copy', 'Horse', 'chr11:52,970,942-52,973,091', 1, 'Horse', 'Equus caballus'); chr6 (correct location): scaffold_34 - chr6:58759118-84719076 chr27: scaffold_24 - chr27:1-39960074 - 39,960,074 scaffold_34 - chr27:39961075-65921033 (25959959 bases long) Example of errors: human protein FLJ13236 - aligns to chr6:67090333-67092571 and chr27:48292290-48294528 mRNA AJ514427 - aligns to chr6:67600011-67601519 and chr27:48801968-48803476 est CX605339 - aligns to chr6:67238940-67239361 and chr27:48440897-48441318 intronEst CD469838 - aligns to chr6:71553983-71575277 and chr27:52755940-52777234 xenoMrna BC033236 - aligns to chr6:67088771-67092632 and chr27:48290728-48294589 The region to delete starts at the gap precediing scaffold_34: chr27: 39960074 (zero based) 65921033 - 39960075 + 1 = 25960959 bp's being deleted. Tables that cross gap: quality nscanGene ... Tracks to ignore while testing: oligoMatch (Short Match) a lot of tables are split, so I will have to write some C to do this: run delete program (/cluster/data/equCab2/fixEquCab2.c): #table:chromName:number rows deleted blastHg18KG:tName:1138 chr27_chainCanFam2:tName:94250 chr27_chainGalGal3:tName:30247 chr27_chainHg18:tName:82111 chr27_chainMm9:tName:108140 chr27_chainOrnAna1:tName:49989 chr27_est:tName:823 chr27_gap:chrom:499 chr27_gold:chrom:499 chr27_intronEst:tName:429 chr27_mrna:tName:27 chr27_rmsk:genoName:37468 ctgPos2:chrom:1 ensGene:chrom:0 fixedStep:chrom:0 gc5Base:chrom:5052 multiz6way:chrom:71172 nestedRepeats:chrom:3894 netCanFam2:tName:37661 netGalGal3:tName:4908 netHg18:tName:31495 netMm9:tName:49546 netOrnAna1:tName:8094 nscanGene:chrom:351 phastConsElements6way:chrom:13664 quality:chrom:25353 refGene:chrom:7 simpleRepeat:chrom:2933 testBed:chrom:0 transMapAlnMRna:tName:8972 transMapAlnRefSeq:tName:933 transMapAlnSplicedEst:tName:177200 transMapAlnUcscGenes:tName:1713 xenoMrna:tName:21370 Things that apparently didn't get covered by the script: multiz6wayFrames, multiz6waySummary, phastCons6way do manually: delete from multiz6wayFrames where chrom = 'chr27' and chromStart >= 39960074; delete from multiz6waySummary where chrom = 'chr27' and chromStart >= 39960074; delete from phastCons6way where chrom = 'chr27' and (chromStart >= 39960074 || (chromStart < 39960074 && chromEnd >= 39960074)); delete from all_mrna where tName = 'chr27' and tStart >= 39960074; 27 rows delete from all_est where tName = 'chr27' and tStart >= 39960074; 823 rows delete from estOrientInfo where chrom = 'chr27' and chromStart >= 39960074; 823 rows delete from refSeqAli where tName = 'chr27' and tStart >= 39960074; 7 rows select * from ensGene where chrom = 'chr27' and txStart >= 39960074; 0 rows delete from refFlat where chrom = 'chr27' and txStart >= 39960074; 7 rows delete from mrnaOrientInfo where chrom = 'chr27' and chromStart >= 39960074; 34 rows sanity check: dbSnoop equCab2Copy stdout | grep ^2008 | egrep -v 'chr[0-9MX]+_' | grep -v trackDb | grep -v hgFindSpec | grep -v chrUn | sort Truncate chromInfo: update chromInfo set size = 39960074 where chrom = 'chr27'; # re-build 2bit files: # create /cluster/data/equCab2/seqList.txt to use as parameter to twoBitToFa: chr1 ... chr27:0-39960074 chr28 ... chrX chrUn chrM mv equCab2.2bit equCab2.2bit.old mv equCab2.rmsk.2bit equCab2.rmsk.2bit.old mv equCab2.unmasked.2bit equCab2.unmasked.2bit.old mv equCab2.UnScaffolds.2bit equCab2.UnScaffolds.2bit.old twoBitToFa -seqList=seqList.txt equCab2.2bit.bak equCab2.fa faSize equCab2.fa 2484532062 bases (55741889 N's 2428790173 real 1433784199 upper 995005974 lower) in 34 sequences in 1 files 2510493021 - 25960959 == 2484532062 agrees! # twoBitToFa is putting in "chr27:0-39960074, so we have to fix that up with the perl -ne # all commands run on kkstore05: twoBitToFa -seqList=seqList.txt equCab2.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.2bit twoBitToFa -seqList=seqList.txt equCab2.rmsk.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.rmsk.2bit twoBitToFa -seqList=seqList.txt equCab2.unmasked.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.unmasked.2bit twoBitToFa -seqList=seqList.txt equCab2.UnScaffolds.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.UnScaffolds.2bit 2nd run of fixEquCab2: overlaps: gc5Base chrom chromStart 1 overlaps: quality chrom chromStart 1 overlaps: nscanGene chrom txStart 1 overlaps: xenoMrna tName tStart 2 overlaps: blastHg18KG tName tStart 1 overlaps: nscanGene chrom txStart 1 select * from gc5Base where chrom = 'chr27' and chromStart < 39960074 && chromEnd > 39960074; select * from quality where chrom = 'chr27' and chromStart < 39960074 && chromEnd > 39960074; # Fixed gc5Base and quality below delete from xenoMrna where tName = 'chr27' and tStart < 39960074 && tEnd > 39960074; 2 rows delete from blastHg18KG where tName = 'chr27' and tStart < 39960074 && tEnd > 39960074; 1 row 1 overlapping row in nscanGene: delete from nscanGene where chrom = 'chr27' and txStart < 39960074 and txEnd > 39960074; 1 row Sent email to cluster-admin to reload blat from 2bit file. After reload, a portion of previously duplicated DNA: TATTGCAGGCAGGATGACCACTATTGCAACGCTCTCCGGCCTCGAAATGG now shows up with only one hit. # download fixed agp from broad cd /cluster/data/equCab2/broad mkdir 2008-04-15-updates cd 2008-04-15-updates wget ftp://ftp.broad.mit.edu/pub/assemblies/mammals/horse/Equus2/mapped.agp.chromosome.agp cp -p ucsc.agp ucsc.agp.old cp -p Contig.bed Contig.bed.old cp -p ucsc.agp ucsc.agp.old cp -p mapped.agp.chromosome.agp mapped.agp.chromosome.agp.old # manually remove relevant section (line 51417-52415) diff mapped.agp.chromosome.agp 2008-04-15-updates/mapped.agp.chromosome.agp # no differences (which is good, means they changed what I expected them to change) # on kkstore05 qaToQac assembly.quals.gz stdout | qacAgpLift ucsc.agp stdin equCab2.qual.qac Read 55316 qacs from stdin Got 33 chroms in ucsc.agp ... chr27 size=39960074 ... cd /cluster/data/equCab2/bed/qual qacToWig -fixed /cluster/data/equCab2/broad/equCab2.qual.qac stdout | wigEncode stdin qual.wig qual.wib # run on hgwdev: hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 quality qual.wig & cd /cluster/data/equCab2/bed/gc5Base hgGcPercent -wigOut -doGaps -win=5 -file=stdout -verbose=0 equCab2 /cluster/data/equCab2/equCab2.2bit | wigEncode stdin gc5Base.wig gc5Base.wib hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 gc5Base gc5Base.wig & hg18.chainEquCab2 etc. grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | ave stdin count 55316 total 2428773513 grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | sort -r -n | ~/bin/n50.pl 2428773513 count: 6246; len: 112381 cp -p Contig.bed Contig.bed.old # manually remove extra contig in chr27 grep scaffold_ Contig.bed | awk '{print $3-$2}' | ave stdin count 73 average 30724621.506849 total 2242897370 grep scaffold_ Contig.bed | awk '{print $3-$2}' | ~/bin/n50.pl 2242897370 count: 33; len: 53663962 checkTableCoords equCab2 -daysOld=1000 equCab2.chr27_chainCanFam2Link has 1604165 records with end > chromSize. equCab2.chr27_chainGalGal3Link has 374961 records with end > chromSize. equCab2.chr27_chainHg18Link has 1351976 records with end > chromSize. equCab2.chr27_chainMm9Link has 1671652 records with end > chromSize. equCab2.chr27_chainOrnAna1Link has 696995 records with end > chromSize. equCab2.xenoMrna has 3 records with end > chromSize. delete from chr27_chainCanFam2Link where tName = 'chr27' and tStart >= 39960074; 1604165 rows delete from chr27_chainGalGal3Link where tName = 'chr27' and tStart >= 39960074; 374961 rows delete from chr27_chainHg18Link where tName = 'chr27' and tStart >= 39960074; 1351976 rows delete from chr27_chainMm9Link where tName = 'chr27' and tStart >= 39960074; 1671652 rows delete from chr27_chainOrnAna1Link where tName = 'chr27' and tStart >= ; 696995 rows I don't know how the xenoMrna records got missed: delete from xenoMrna where tName = 'chr27' and tEnd > 39960074; 3 rows hg18, mm9, canFam2, ornAna1, galGal3 chr27_chainCanFam2Link hg18, mm9, canFam2, ornAna1, galGal3 In a meeting we (kuhn, hiram et al.) decided to do the nets and chains later, and focus on releasing core stuff now; i.e. I will re-run all net/chain stuff from scratch later. # misc fixes: # fixup genoLeft column in rmsk table: update chr27_rmsk set genoLeft = genoLeft + 25959959; ########################################################################## # re-make downloads (DONE - 2008-09-09 - larrym) Edit to remove end of chr27: chrom.sizes, rmsk_and_gaps.bed equCab2.agp, 27/chr27.agp, 27/chr27.fa.out bed/simpleRepeat/simpleRepeat.bed, bed/simpleRepeat/bed.tab bed/simpleRepeat/trfMask.bed, trfMaskChrom/chr27.bed bed/repeatMasker/bed.tab, bed/repeatMasker/equCab2.fa.out bed/chromInfo/chromInfo.tab broad/ctgPos2.tab, broad/scaffolds.bed # verify all tracks have html pages for their trackDb entries cd /cluster/data/equCab2 cp -rp goldenPath goldenPath.old /cluster/bin/scripts/makeDownloads.pl equCab2 # restore the older README files cd /cluster/data/equCab2 for dir in bigZips database liftOver chromosomes do cp -p goldenPath.old/$dir/README.txt goldenPath/$dir/README.txt done # re-build the upstream files: for size in 1000 2000 5000 do echo $size featureBits equCab2 ensGene:upstream:$size -fa=stdout \ | gzip -c > ensGene.upstream$size.fa.gz done ######################################################################### # Re-run creation of OOC file for genbank runs (DONE - 2008-08-27 - larrym) 1024 * (2.5 / 3.1) == 825.805824 # Use -repMatch=825 (based on size -- for human we use 1024, and # horse size is 84% of human judging by twoBitInfo -noNs) cd /cluster/data/equCab2 blat equCab2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=equCab2.11.ooc -repMatch=825 & ######################################################################### # REDO-GENBANK AUTO UPDATE (DONE - 2008-09-11 - larrym) ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 & # logFile: var/build/logs/2008.09.10-13:33:41.equCab2.initalign.log # kill'ed b/c /scratch/data hadn't been rsync'ed # email to cluster-admin: # please rsync /hive/data/staging/data/equCab2 to /scratch/data/equCab2 on all cluster nodes # gbAlignStep hadn't got the parasol step, so markd said I could just re-run it: rm -rf /cluster/genbank/genbank/build/work/initial.equCab2 time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 & # logFile: var/build/logs/2008.09.10-15:27:53.equCab2.initalign.log # 8970.392u 3173.177s 8:20:43.48 40.4% 0+0k 0+0io 1233pf+0w # load database when finished ssh hgwdev cd /cluster/data/genbank time /bin/nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad equCab2 & # logFile: var/dbload/hgwdev/logs/2008.09.11-11:50:22.dbload.log # 632.112u 222.505s 18:36.80 76.5% # test some genbank stuff - look at chr6:67,086,866-71,661,987 mRNA AJ514427 - aligns just to chr6:67600011-67601519 est CX605339 - aligns just to chr6:67238940-67239361 intronEst CD469838 - aligns just to chr6:71553983-71575277 xenoMrna BC033236 - aligns just to chr6:67088771-67092632 ######################################################################### # QA by larrym RefSeq Gene COL2A1 - why is it 1 base shorter now? - chr6:65629018-65660099 vs. chr6:65629017-65660099 http://hgwdev-larrym.cse.ucsc.edu/cgi-bin/hgc?hgsid=1559182&o=65629017&t=65660099&g=refGene&i=NM_001081764&c=chr6&l=62511743&r=76237109&db=equCab2 http://hgwbeta.cse.ucsc.edu/cgi-bin/hgc?hgsid=1501039&o=65629016&t=65660099&g=refGene&i=NM_001081764&c=chr6&l=62511743&r=76237109&db=equCab2 Alignment is on reverse strand; older version is one base longer; why? Similar with RefSeq Gene IL23 (which is not on reverse strand). select * from refGene where name = 'NM_001082522'; echo "select * from refGene where name = 'NM_001082522';" | hgsql equCab2 vs. echo "select * from refGene where name = 'NM_001082522';" | hgsql equCab2Old True on other chromosomes; e.g.: RefSeq Gene TPH2 - chr28:559352-671879 Update_time: 2008-09-11 12:08:02 # markd says don't worry about it - probably a blat change. ########################################################################## # re-verify all.joiner entries (DONE - 2008-09-15 - larrym) ssh hgwdev cd ~/kent/src/hg/makeDb/schema joinerCheck -database=equCab2 -tableCoverage all.joiner joinerCheck -database=equCab2 -keys all.joiner Error: 352 of 19964 elements of equCab2.nscanPep.name are not in key nscanGene.name line 589 of all.joiner Example miss: CHR27.199.1 equCab2.transMapInfoRefSeq.mappedId - hits 50099 of 51032 Error: 933 of 51032 elements of equCab2.transMapInfoRefSeq.mappedId are not in key transMapAlnRefSeq.qName line 2408 of all.joiner Example miss: NM_000020.2-1.2 equCab2.transMapInfoUcscGenes.mappedId - hits 94700 of 96413 Error: 1713 of 96413 elements of equCab2.transMapInfoUcscGenes.mappedId are not in key transMapAlnUcscGenes.qName line 2413 of all.joiner Example miss: UC001RME.1-1.2 equCab2.transMapInfoMRna.mappedId - hits 474538 of 483510 Error: 8972 of 483510 elements of equCab2.transMapInfoMRna.mappedId are not in key transMapAlnMRna.qName line 2418 of all.joinerw Example miss: A10156.1-1.2 equCab2.transMapInfoSplicedEst.mappedId - hits 6221957 of 6399157 Error: 177200 of 6399157 elements of Example miss: AA000021.1-1.2 select count(*) from nscanPep where name >= 'CHR27.199.1' and name <= 'CHR28'; 352 delete from nscanPep where name >= 'CHR27.199.1' and name <= 'CHR28'; Query OK, 352 rows affected (0.01 sec) ~/src/delOrphans.pl transMapInfoRefSeq mappedId transMapAlnRefSeq qName deleted: 933 ~/src/delOrphans.pl transMapInfoUcscGenes mappedId transMapAlnUcscGenes qName deleted: 1713 ~/src/delOrphans.pl transMapInfoMRna mappedId transMapAlnMRna qName deleted: 8972 ~/src/delOrphans.pl transMapInfoSplicedEst mappedId transMapAlnSplicedEst qName deleted: 177200 re-run: joinerCheck -database=equCab2 -keys all.joiner no errors joinerCheck -database=equCab2 -times all.joiner ############################################################################ ## RE-COMPUTE DEFAULT POSITION (DONE - 2008-09-16 - larrym) # determine new default position using liftOver (via convert menu) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr11:52,970,942-53,028,811" where name="equCab2";' hgcentraltest ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: equCab2.upstreamGeneTbl = refGene equCab2.upstreamMaf = multiz6way /hive/data/genomes/equCab2/bed/multiz6way/species.list ################################################ # Fixed tSizes in transMap tracks (2008-10-15 markd) update equCab2.transMapAlnUcscGenes set tSize=39960074 where tName="chr27" and tSize=65921033; update equCab2.transMapAlnRefSeq set tSize=39960074 where tName="chr27" and tSize=65921033; update equCab2.transMapAlnMRna set tSize=39960074 where tName="chr27" and tSize=65921033; update equCab2.transMapAlnSplicedEst set tSize=39960074 where tName="chr27" and tSize=65921033; ############################################################################ # Re-Run GalGal3 lastz (DONE - 2009-07-01 - Hiram) # primary run cd /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29 cat fb.galGal3.chainEquCab2Link.txt # 68476046 bases of 1042591351 (6.568%) in intersection # running the swap mkdir /hive/data/genomes/equCab2/bed/blastz.galGal3.swap cd /hive/data/genomes/equCab2/bed/blastz.galGal3.swap time doBlastzChainNet.pl \ /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=pk \ -swap -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 10m1.381s cat fb.equCab2.chainGalGal3Link.txt # 73336799 bases of 2428790173 (3.019%) in intersection ############################################################################ # Re-Run ornAna1 alignment (DONE - 2009-07-01,02 - Hiram) mkdir /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01 cd /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01 cat << '_EOF_' > DEF # Horse vs. Platypus BLASTZ_M=50 # TARGET: Horse SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit SEQ1_LEN=/scratch/data/equCab2/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ1_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ1_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ1_CHUNK=20000000 SEQ1_LIMIT=100 SEQ1_LAP=0 # QUERY: Platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1413m12.090s cat fb.equCab2.chainOrnAna1Link.txt # 132750149 bases of 2428790173 (5.466%) in intersection mkdir /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap cd /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap time doBlastzChainNet.pl \ /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 115m47.611s cat fb.ornAna1.chainEquCab2Link.txt # 132776204 bases of 1842236818 (7.207%) in intersection ############################################################################ # lastz swap to Dog CanFam2 (DONE - 2009-07-01 - Hiram) # the original lastz cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29 cat fb.canFam2.chainEquCab2Link.txt # 1676663178 bases of 2384996543 (70.300%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap time doBlastzChainNet.pl \ /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \ -swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 286m51.658s fb.equCab2.chainCanFam2Link.txt # 1721407500 bases of 2428790173 (70.875%) in intersection ############################################################################ # lastz swap to Mouse Mm9 (DONE - 2009-07-02 - Hiram) # the original lastz cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29 cat fb.mm9.chainEquCab2Link.txt # 912421053 bases of 2620346127 (34.821%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.mm9.swap cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap time doBlastzChainNet.pl \ /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \ -swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 122m25.314s cat fb.equCab2.chainMm9Link.txt # 902295813 bases of 2428790173 (37.150%) in intersection ############################################################################ # lastz swap to Opossum MonDom5 (DONE - 2009-07-02 - Hiram) # the original lastz cd /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29 cat fb.monDom5.chainEquCab2Link.txt # 355004426 bases of 3501660299 (10.138%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.monDom5.swap cd /hive/data/genomes/equCab2/bed/blastz.monDom5.swap time doBlastzChainNet.pl \ /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 320m3.573s cat fb.equCab2.chainMonDom5Link.txt # 351787662 bases of 2428790173 (14.484%) in intersection ######################################################################### # lastz swap to Human Hg18 (DONE - 2009-07-02 - Hiram) # the original lastz cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29 cat fb.hg18.chainEquCab2Link.txt # 1647122438 bases of 2881515245 (57.162%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap time doBlastzChainNet.pl \ /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 238m42.004s cat fb.equCab2.chainHg18Link.txt # 1622340736 bases of 2428790173 (66.796%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # xenoRefGene - (2010-02-04 markd) # enable xeno RefSeq track per user request: add to etc/genbank.conf: equCab2.refseq.mrna.xeno.load = yes ##################################################################### # oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16 cd /hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16 cat << '_EOF_' > DEF # Sheep vs. Mouse BLASTZ_M=50 # TARGET: Horse EquCab2 SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit SEQ1_LEN=/scratch/data/equCab2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 397m26.503s cat fb.equCab2.chainOviAri1Link.txt # 1012763540 bases of 2428790173 (41.698%) in intersection mkdir /hive/data/genomes/oviAri1/bed/blastz.equCab2.swap cd /hive/data/genomes/oviAri1/bed/blastz.equCab2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/equCab2/bed/lastzOviAri1.2010-04-16/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m0.724s cat fb.oviAri1.chainEquCab2Link.txt # 940763026 bases of 1201271277 (78.314%) in intersection ############################################################################ # DBSNP BUILD 130 -> SNP130 (DONE 12/6/10 angie) mkdir -p /hive/data/outside/dbSNP/130/horse cd /hive/data/outside/dbSNP/130/horse # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (horse_9796 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_*_([0-9]+_[0-9]) # -- use the first num for build and the second num_num for buildAssembly: cat > config.ra <& do.log & tail -f do.log #*** b130_ContigInfo_2_1 has 9688 contig_acc values #*** that are not in /hive/data/genomes/equCab2/chrom.sizes . #*** They are listed in /hive/data/outside/dbSNP/130/horse/dbSnpContigsNotInUcsc.txt # #*** b130_ContigInfo_2_1 has coords for 84 sequences; these have been written to #*** /hive/data/outside/dbSNP/130/horse/suggested.lft . #*** 9604 lines of b130_ContigInfo_2_1.bcp.gz either had no lift-coords #*** or had unrecognized chrom names; see #*** /hive/data/outside/dbSNP/130/horse/cantLiftUpSeqNames.txt . # #*** You must account for those in config.ra, in the liftUp file #*** and/or the ignoreDbSnpContigs regex. #*** Then run again with -continue=translate . # suggested.lft looks reasonable. We don't already have an equivalent # in equCab2/jkStuff so rename suggested.lft, and add that to config.ra: mv suggested.lft /hive/data/genomes/equCab2/jkStuff/liftNcbiContigs.lft echo "liftUp /hive/data/genomes/equCab2/jkStuff/liftNcbiContigs.lft" \ >> config.ra # cantLiftUpSeqNames.txt seems to be all-Un, and Un seems like a specific # enough string to safely use with egrep -vw (see doDbSnp.pl -help). # Add it to config.ra: echo "ignoreDbSnpContigs Un" >> config.ra # Try again with updated config: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=translate >>& do.log & tail -f do.log # 110668 MultipleAlignments # 662 ObservedMismatch # 144 SingleClassLongerSpan # 101 SingleClassZeroSpan # 35 FlankMismatchGenomeShorter # 30 FlankMismatchGenomeLonger # 3 FlankMismatchGenomeEqual # # *** All done! # doDbSnp.pl needs to show much more info and guidance for the developer # at the end. But at least the processing works now! :) ############################################################################ # SWAP lastz mm10 (DONE - 2012-03-19 - Hiram) # original alignment to mm10 cat /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16/fb.mm10.chainEquCab2Link.txt # 912967841 bases of 2652783500 (34.415%) in intersection # and this swap mkdir /hive/data/genomes/equCab2/bed/blastz.mm10.swap cd /hive/data/genomes/equCab2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 87m2.261s cat fb.equCab2.chainMm10Link.txt # 901995882 bases of 2428790173 (37.138%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/equCab2/bed ln -s blastz.mm10.swap lastz.mm10 ########################################################################### # lastz White Rhino cerSim1 (DONE - 2012-10-23 - Hiram) screen -S equCab2CerSim1 mkdir /hive/data/genomes/equCab2/bed/lastzCerSim1.2012-10-23 cd /hive/data/genomes/equCab2/bed/lastzCerSim1.2012-10-23 cat << '_EOF_' > DEF # Horse vs White Rhino BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_M=50 # TARGET: Horse EquCab2 SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit SEQ1_LEN=/scratch/data/equCab2/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ1_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ1_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ1_CHUNK=20000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/equCab2/bed/lastzCerSim1.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > do.log 2>&1 & # real 7847m25.844s # broken chain run on encodek, attempted finish up on swarm last 34 jobs # something is odd for these chaining operations, running manually, # the last 20 jobs on hgwdev, total time parallel running them all: # real 5539m3.710s -> over 92 hours ! # and they are gigantic files: # -rw-rw-r-- 1 6979748026 Nov 4 23:57 equCab2.UnScaffolds.2bit:chrX:.chain # -rw-rw-r-- 1 6415261877 Nov 5 04:59 equCab2.UnScaffolds.2bit:chr1:.chain time nice -n +19 doBlastzChainNet.pl \ `pwd`/DEF \ -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainMerge -qRepeats=windowmaskerSdust \ -bigClusterHub=swarm -verbose=2 > chainMerge.log 2>&1 & XXX - running - Mon Nov 5 10:39:45 PST 2012 ############################################################################ # NUMTS TRACK (working 2013-02-05 - Chin) # copy and unzip # http://193.204.182.50/files/tracks.zip and # http://193.204.182.50/files/other%20NumtS%20tracks.zip # to /hive/data/outside/NumtS/2012/tracks mkdir /hive/data/genomes/equCab2/bed/NumtS cd /hive/data/genomes/equCab2/bed/NumtS cp /hive/data/outside/NumtS/2012/tracks/equCab2/* . cat tracks_nuclear_NumtS_ECA | grep -v "^track" > equCab2NumtS.bed cat tracks_numtS_assembled_ECA | grep -v "^track" > equCab2NumtSAssembled.bed cat tracks_mit_NumtS_ECA | grep -v "^track" > equCab2NumtSMitochondrion.bed # load the 3 bed files to equCab2 hgLoadBed equCab2 numtSAssembled equCab2NumtSAssembled.bed hgLoadBed equCab2 numtS equCab2NumtS.bed hgLoadBed equCab2 numtSMitochondrion equCab2NumtSMitochondrion.bed # Make /gbdb/ links and load bam mkdir /gbdb/equCab2/NumtS ln -s `pwd`/EQ-2_NumtS_seqs.sorted.bam{,.bai} /gbdb/equCab2/NumtS/ hgBbiDbLink equCab2 bamAllNumtSSorted /gbdb/equCab2/NumtS/EQ-2_NumtS_seqs.sorted.bam # setup trackDb for equCab2 ##############################################################################