# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the # Chicken (Gallus gallus) May 2006 release. # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ ######################################################################### # CREATE BUILD DIRECTORY (DONE 5/14/06 angie) ssh kkstore03 mkdir /cluster/store6/galGal3 ln -s /cluster/store6/galGal3 /cluster/data/ ######################################################################### # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 5/14/06 angie) mkdir /cluster/data/galGal3/M cd /cluster/data/galGal3/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "gallus mitochondrion genome". That shows the gi number: # 5834843 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Gallus gallus mitochondrion complete genome, and then replace the # header line with just ">chrM". ######################################################################### # DOWNLOAD, UNPACK AND CHECK CHROM FASTA & AGP (DONE 5/14/06 angie) mkdir /cluster/data/galGal3/downloads cd /cluster/data/galGal3/downloads wget ftp://genome.wustl.edu/pub/user/lhillier/private/chicken_060412.tar.gz tar xvzf chicken_060412.tar.gz cp /dev/null ../chrom.lst foreach agp (*.agp) set chr = $agp:r echo $chr set fa = $chr.fa if (! -e $fa) then echo "*** No fasta for $agp" break endif set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'` if (! -d ../$c) mkdir ../$c mv $agp $fa ../$c end cd .. # checkAgpAndFa prints out way too much info -- keep the end/stderr only: ls -1d ?{,?} E* | sed -e 's@/$@@' > chrom.lst foreach c (`cat chrom.lst`) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 endif end end faSize */chr*.fa #Total size: mean 19306674.4 sd 39157213.7 min 1028 (chr32) max 200994015 (chr1) median 2031799 ######################################################################### # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/14/06 angie) ssh kkstore03 cd /cluster/data/galGal3 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c{,_random}.agp) if (-e $agp) then set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # splitFaIntoContigs makes new dirs for _randoms. Move their contents # back into the main chrom dirs and get rid of the _random dirs. foreach d (*_random) set base = `echo $d | sed -e 's/_random$//'` mv $d/lift/oOut.lst $base/lift/rOut.lst mv $d/lift/ordered.lft $base/lift/random.lft mv $d/lift/ordered.lst $base/lift/random.lst rmdir $d/lift mv $d/* $base rmdir $d end # Un/ has Un_random but for some reason o* files were created. Rename: foreach f (Un/lift/*) set g = `echo $f | sed -e 's/ordered/random/; s@/o@/r@;'` mv $f $g end # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst set mSize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0 M/chrM_1 $mSize chrM $mSize" > M/lift/ordered.lft ######################################################################### # REPEAT MASKING (DONE 5/15/06 angie) # RepeatMasker version and library version: # March 20 2006 (open-3-1-5) version of RepeatMasker grep RELEASE /cluster/bluearc/RepeatMasker060320/Libraries/RepeatMaskerLib.embl #CC RELEASE 20060315; * #- Split contigs into 500kb chunks, at gaps if possible: ssh kkstore03 cd /cluster/data/galGal3 foreach c (`cat chrom.lst`) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/galGal3 mkdir jkStuff cat << '_EOF_' > jkStuff/RMChicken #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/galGal3/$2 /bin/cp $2 /tmp/galGal3/$2/ cd /tmp/galGal3/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2 popd /bin/cp /tmp/galGal3/$2/$2.out ./ if (-e /tmp/galGal3/$2/$2.align) /bin/cp /tmp/galGal3/$2/$2.align ./ if (-e /tmp/galGal3/$2/$2.tbl) /bin/cp /tmp/galGal3/$2/$2.tbl ./ if (-e /tmp/galGal3/$2/$2.cat) /bin/cp /tmp/galGal3/$2/$2.cat ./ /bin/rm -fr /tmp/galGal3/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3 '_EOF_' # << emacs chmod +x jkStuff/RMChicken mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}{,_random}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/galGal3/jkStuff/RMChicken \ /cluster/data/galGal3/$d $f \ '{'check out line+ /cluster/data/galGal3/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh pk cd /cluster/data/galGal3/RMRun para make RMJobs para time #Completed: 2492 of 2492 jobs #CPU time in finished jobs: 2613579s 43559.64m 725.99h 30.25d 0.083 y #IO & Wait Time: 17626s 293.77m 4.90h 0.20d 0.001 y #Average job time: 1056s 17.60m 0.29h 0.01d #Longest finished job: 1335s 22.25m 0.37h 0.02d #Submission to last job: 8741s 145.68m 2.43h 0.10d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore03 cd /cluster/data/galGal3 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/galGal3 hgLoadOut galGal3 */chr*.fa.out # Compare coverage to previous release: featureBits galGal3 rmsk #102214417 bases of 1042591351 (9.804%) in intersection # galGal2 coverage: #104249260 bases of 1054197620 (9.889%) in intersection # Uh-oh -- in the past, a drop in coverage has meant an RM problem. # However, spot-checking the per-chrom coverage of galGal2 vs. galGal3, # it seems like many small or random chroms have simply had a lot # of repetitive sequence cut from them (significant drop in size as well # as drop in coverage), while for many of the large chroms, the coverage # has gone up. Some chroms grew but the rmsk coverage did not keep up. # chrUn is quite a bit smaller (121M --> 53M). I saved the per-chrom # coverage figures in /cluster/data/galGal3/RMCompare/ . #galGal2:chr1 24219568 183744490 13.181 #galGal3:chr1 25804441 195192300 13.220 #galGal2:chr1_random 186317 1261352 14.771 #galGal3:chr1_random 12930 213918 6.044 #galGal2:chr10 807202 18954178 4.259 #galGal3:chr10 858376 20484937 4.190 #galGal2:chr10_random 525320 3515812 14.942 #galGal3:chr10_random 644 13679 4.708 #galGal2:chr2 16579726 143798269 11.530 [exception. #galGal3:chr2 17182793 150358687 11.428 [ #galGal2:chr2_random 4670 53846 8.673 [ #galGal3:chr2_random 16589 127706 12.990 [ #galGal2:chr18 461248 8797585 5.243 #galGal3:chr18 507614 10513347 4.828 #galGal2:chrE22C19W28 2628 47202 5.568 #galGal2:chrE50C23 690 10171 6.784 #galGal3:chrE22C19W28_E50C23 42486 822662 5.164 #galGal2:chrUn 18697113 121198700 15.427 #galGal3:chrUn_random 11075578 53400422 20.741 #galGal2:chrW 530067 4135691 12.817 #galGal3:chrW 133707 233885 57.168 #galGal2:chrW_random 109531 229903 47.642 #galGal3:chrW_random 311315 625108 49.802 #galGal2:chrZ 4629721 30832492 15.016 #galGal3:chrZ 10599358 67536383 15.694 #galGal2:chrZ_random 2172389 14348615 15.140 #galGal3:chrZ_random 115568 309836 37.300 # So I'm inclined to think that it's most likely not a RepeatMasker bug # but a change in the underlying assembly. Something to run by LaDeana # after reading the release notes... ######################################################################### # MAKE LIFTALL.LFT (DONE 5/15/06 angie) ssh kkstore03 cd /cluster/data/galGal3 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft ######################################################################### # CREATING DATABASE (DONE 5/15/06 angie) ssh hgwdev echo 'create database galGal3' | hgsql '' # old hgwdev 5/15: # We are awfully tight here but at least not out of space: df -h /var/lib/mysql #/dev/sdc1 1.8T 1.6T 65G 97% /var/lib/mysql # New hgwdev 5/18: df -h /data/mysql #/dev/sdc1 1.8T 685G 1.1T 40% /data/mysql # Copy over the grp table from previous release: echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \ | hgsql galGal3 ######################################################################### # GOLD AND GAP TRACKS (DONE 5/15/06 angie) ssh hgwdev cd /cluster/data/galGal3 hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 . # featureBits complains if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql galGal3 # oops, that won't work until v4.1, so do this for the time being: echo "create table chrM_gap select * from chr1_gap where 0=1" \ | hgsql galGal3 ######################################################################### # SIMPLE REPEATS (TRF) (DONE 5/15/06 angie) # TRF runs pretty quickly now... it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on the # local fileserver ssh kkstore03 mkdir /cluster/data/galGal3/bed mkdir /cluster/data/galGal3/bed/simpleRepeat cd /cluster/data/galGal3/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/galGal3/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end csh -efx jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh #259 jobs.csh ls -1 trf | wc -l #259 endsInLf trf/* #trf/chr12_random_1.bed zero length #trf/chr13_random_1.bed zero length # That's OK. # When job is done do: liftUp simpleRepeat.bed /cluster/data/galGal3/jkStuff/liftAll.lft warn \ trf/*.bed # Load into the database: ssh hgwdev hgLoadBed galGal3 simpleRepeat \ /cluster/data/galGal3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits galGal3 simpleRepeat #9650062 bases of 1042591351 (0.926%) in intersection # galGal2 coverage: #8434365 bases of 1054197620 (0.800%) in intersection ########################################################################### # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/galGal3/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed galGal3 microsat microsat.bed ######################################################################### # PROCESS SIMPLE REPEATS INTO MASK (DONE 5/15/06 angie) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore03 cd /cluster/data/galGal3/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/galGal3 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/galGal3/bed/simpleRepeat/trfMaskChrom/*.bed \ > /tmp/filtTrf.bed featureBits galGal3 /tmp/filtTrf.bed #4110365 bases of 1042591351 (0.394%) in intersection # galGal2 coverage: #4510381 bases of 1054197620 (0.428%) in intersection ######################################################################### # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 5/15/06 angie) # Note: just to keep things consistent, redid chr1 and chr2 2/26 with # the ProcessRepeats-only rerun results (only masking changes were # 5bp in chr1 and 3bp in chr2) ssh kkstore03 cd /cluster/data/galGal3 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # Make 2bit for blat/browser usage: faToTwoBit */chr*.fa galGal3.2bit # Make soft-masked nib for blastz: mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end ######################################################################### # MAKE CHROMINFO TABLE WITH 2BIT (DONE 5/15/06 angie) ssh kkstore03 cd /cluster/data/galGal3 mkdir bed/chromInfo twoBitInfo galGal3.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/galGal3/galGal3.2bit";}' \ > bed/chromInfo/chromInfo.tab # Link to 2bit from /gbdb/galGal3/: ssh hgwdev mkdir /gbdb/galGal3 ln -s /cluster/data/galGal3/galGal3.2bit /gbdb/galGal3/ # Load /gbdb/galGal3/galGal3.2bit paths into database and save size info. # Make a special chromInfo.sql with large index size so that # chrE22C19W28_E50C23 and chrE22C19W28_E50C23_random don't get collapsed. cd /cluster/data/galGal3/bed/chromInfo perl -wpe 's/chrom\([0-9]+/chrom\(21/' \ $HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql hgLoadSqlTab galGal3 chromInfo chromInfo.sql chromInfo.tab hgsql -N galGal3 -e "select chrom,size from chromInfo" \ > /cluster/data/galGal3/chrom.sizes # take a look at chrom.sizes size wc chrom.sizes # 57 114 947 ../../chrom.sizes ######################################################################### # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 5/15/06 angie) # Use -repMatch=380 (based on size -- for human we use 1024, and # chicken size is ~37% of human) ssh kkr1u00 cd /cluster/data/galGal3 mkdir /cluster/bluearc/galGal3 blat galGal3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/galGal3/11.ooc -repMatch=380 #Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc blat galGal3.2bit /dev/null /dev/null -tileSize=10 \ -makeOoc=/cluster/bluearc/galGal3/10.ooc -repMatch=380 #Wrote 166633 overused 10-mers to /cluster/bluearc/galGal3/10.ooc mkdir /iscratch/i/galGal3 cp -p /cluster/bluearc/galGal3/*.ooc /iscratch/i/galGal3/ iSync ######################################################################### # PUT NIBS ON /SCRATCH (DONE 5/15/06 angie) ssh kkstore03 mkdir /cluster/bluearc/scratch/hg/galGal3 rsync -av /cluster/data/galGal3/nib/* /cluster/bluearc/scratch/hg/galGal3/nib/ cp -p /cluster/data/galGal3/galGal3.2bit /cluster/bluearc/scratch/hg/galGal3/ # Ask cluster-admin to distribute to /scratch on big & small cluster ######################################################################### # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/15/06 angie) # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvsup # Add trackDb directories and a description.html mkdir chicken/galGal3 cvs add chicken/galGal3 cvs add chicken/galGal3/description.html cvs ci -m "Initial description for galGal3." chicken/galGal3 # Edit that makefile to add galGal3 in all the right places and do make update DBS=galGal3 mkdir /gbdb/galGal3/html mkdir /cluster/data/galGal3/html ln -s /cluster/data/galGal3/html/description.html /gbdb/galGal3/html/ cvs ci -m "Added galGal3." makefile # Go public on genome-test. In a clean tree (no mods, up-to-date): cvs up makefile make alpha # Note: hgcentral*.genome values must correspond # with defaultDb.genome values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("galGal3", "May 2006", "/gbdb/galGal3", "Chicken", \ "chr5:57710001-57780000", 1, 30, "Chicken", \ "Gallus Gallus", "/gbdb/galGal3/html/description.html", \ 0, 0, "Chicken Genome Sequencing Consortium May 2006 release");' ######################################################################### # PRODUCING GENSCAN PREDICTIONS (DONE 5/16/06 angie) ssh hgwdev mkdir /cluster/data/galGal3/bed/genscan cd /cluster/data/galGal3/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/galGal3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/galGal3/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # 259 genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para make jobList para time #Completed: 255 of 259 jobs #Crashed: 4 jobs #CPU time in finished jobs: 82258s 1370.97m 22.85h 0.95d 0.003 y #IO & Wait Time: 1053s 17.55m 0.29h 0.01d 0.000 y #Average job time: 327s 5.45m 0.09h 0.00d #Longest finished job: 20131s 335.52m 5.59h 0.23d #Submission to last job: 23171s 386.18m 6.44h 0.27d # If there are crashes, diagnose with "para problems" / "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". ssh kkr5u00 cd /cluster/data/galGal3/bed/genscan /cluster/bin/x86_64/gsBig /cluster/data/galGal3/4/chr4_13/chr4_13.fa.masked gtf/chr4_13.fa.gtf -trans=pep/chr4_13.fa.pep -subopt=subopt/chr4_13.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/galGal3/3/chr3_14/chr3_14.fa.masked gtf/chr3_14.fa.gtf -trans=pep/chr3_14.fa.pep -subopt=subopt/chr3_14.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/galGal3/2/chr2_22/chr2_22.fa.masked gtf/chr2_22.fa.gtf -trans=pep/chr2_22.fa.pep -subopt=subopt/chr2_22.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/galGal3/Un/chrUn_random_3/chrUn_random_3.fa.masked gtf/chrUn_random_3.fa.gtf -trans=pep/chrUn_random_3.fa.pep -subopt=subopt/chrUn_random_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 # Convert these to chromosome level files as so: ssh kkstore03 cd /cluster/data/galGal3/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/galGal3/bed/genscan ldHgGene galGal3 genscan genscan.gtf hgPepPred galGal3 generic genscanPep genscan.pep hgLoadBed galGal3 genscanSubopt genscanSubopt.bed ######################################################################### # MAKE GCPERCENT (DONE 5/15/06 angie) ssh kolossus mkdir /cluster/data/galGal3/bed/gc5Base cd /cluster/data/galGal3/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 galGal3 \ /cluster/data/galGal3 \ | wigEncode stdin gc5Base.wig gc5Base.wib ssh hgwdev mkdir /gbdb/galGal3/wib cd /cluster/data/galGal3/bed/gc5Base ln -s `pwd`/gc5Base.wib /gbdb/galGal3/wib hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 gc5Base gc5Base.wig ######################################################################### # CPGISSLANDS (WUSTL) (DONE 5/15/06 angie) ssh hgwdev mkdir /cluster/data/galGal3/bed/cpgIsland cd /cluster/data/galGal3/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/galGal3/bed/cpgIsland/ ssh kolossus cd /cluster/data/galGal3/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout nice ./cpglh.exe $f > $fout end # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd /cluster/data/galGal3/bed/cpgIsland hgLoadBed galGal3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed ######################################################################### # CPGISLANDS (ANDY LAW) (DONE 5/15/06 angie) # See notes in makeGalGal2.doc ssh kolossus mkdir /cluster/data/galGal3/bed/cpgIslandGgfAndy cd /cluster/data/galGal3/bed/cpgIslandGgfAndy # Build the preProcGgfAndy program in # kent/src/oneShot/preProcGgfAndy into your ~/bin/ # Use unmasked sequence since this is not a mammal... cp /dev/null cpgIslandGgfAndy.bed foreach f (../../*/chr*.fa) set chr = $f:t:r:r echo preproc and run on unmasked $chr ~/bin/x86_64/preProcGgfAndy $f \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end wc -l ../cpgIsland/cpgIsland.bed *bed # 22806 ../cpgIsland/cpgIsland.bed # 76488 cpgIslandGgfAndy.bed # load into database: ssh hgwdev cd /cluster/data/galGal3/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed galGal3 cpgIslandGgfAndy -tab \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed featureBits galGal3 cpgIslandExt #15533065 bases of 1042591351 (1.490%) in intersection featureBits galGal3 cpgIslandGgfAndy #62026174 bases of 1042591351 (5.949%) in intersection ######################################################################### # MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 5/22/06 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific, but could exclude these from chicken as ancestral: # L3, L3a, L3b, MIR, MIR3, MIRb, MIRm ssh kkstore03 cd /cluster/data/galGal3 mkdir -p /san/sanvol1/galGal3/linSpecRep foreach f (*/chr*.fa.out) awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \ > /san/sanvol1/galGal3/linSpecRep/$f:t:r:r.out.spec end # rebuild these as they appear to have become lost mkdir /hive/data/staging/data/galGal3/linSpecRep foreach f (*/chr*.fa.out) awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \ > /hive/data/staging/data/galGal3/linSpecRep/$f:t:r:r.out.spec end ######################################################################### # SWAP CHAINS/NET HG18 (DONE 5/23/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.hg18.swap cd /cluster/data/galGal3/bed/blastz.hg18.swap doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.galGal3/DEF \ >& do.log & tail -f do.log ln -s blastz.hg18.swap /cluster/data/galGal3/bed/blastz.hg18 ######################################################################### # SWAP CHAINS/NET MM8 (DONE 5/24/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.mm8.swap cd /cluster/data/galGal3/bed/blastz.mm8.swap doBlastzChainNet.pl -swap /cluster/data/mm8/bed/blastz.galGal3/DEF \ >& do.log & tail -f do.log ln -s blastz.mm8.swap /cluster/data/galGal3/bed/blastz.mm8 ######################################################################### # SWAP CHAINS/NET RN4 (DONE 5/25/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.rn4.swap cd /cluster/data/galGal3/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal3/DEF \ >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/galGal3/bed/blastz.rn4 ######################################################################### # SWAP CHAINS/NET MONDOM4 (DONE 5/26/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.monDom4.swap cd /cluster/data/galGal3/bed/blastz.monDom4.swap doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.galGal3/DEF \ >& do.log & tail -f do.log ln -s blastz.monDom4.swap /cluster/data/galGal3/bed/blastz.monDom4 ######################################################################### # BLASTZ/CHAIN/NET XENTRO2 (DONE 5/27/06 angie) ssh pk mkdir /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26 cd /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26 cat << '_EOF_' > DEF # chicken vs. frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/san/sanvol1/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/galGal3XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-05-26 /cluster/data/galGal3/bed/blastz.xenTro2 ######################################################################### # SWAP CHAINS/NET DANRER4 (DONE 5/31/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.danRer4.swap cd /cluster/data/galGal3/bed/blastz.danRer4.swap doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.galGal3/DEF \ >& do.log & tail -f do.log ln -s blastz.danRer4.swap /cluster/data/galGal3/bed/blastz.danRer4 ######################################################################### # GENBANK AUTO UPDATE (DONE 6/9/06 angie) # align with revised genbank process. drop xeno ESTs. cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add galGal3 # galGal3 galGal3.serverGenome = /cluster/data/galGal3/galGal3.2bit galGal3.clusterGenome = /scratch/hg/galGal3/galGal3.2bit galGal3.ooc = /cluster/bluearc/galGal3/11.ooc galGal3.align.unplacedChroms = chr*_random galGal3.lift = /cluster/data/galGal3/jkStuff/liftAll.lft galGal3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} galGal3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} galGal3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} galGal3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} galGal3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} galGal3.refseq.mrna.native.load = yes galGal3.refseq.mrna.xeno.load = yes galGal3.genbank.mrna.xeno.load = yes galGal3.downloadDir = galGal3 cvs ci etc/genbank.conf # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial galGal3 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad galGal3 & # enable daily alignment and update of hgwdev cd ~/kent/src/makeDb/genbank cvsup # add galGal3 to: etc/align.dbs etc/hgwdev.dbs cvs commit make etc-update ########################################################################### # HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE, 2006-06-16, braney) ssh kkstore03 bash # if not using bash shell already # make Blast database for non-random chrom sequences mkdir -p /cluster/data/galGal3/blastDb cd /cluster/data/galGal3/blastDb ls ../*/*/*.lft | grep -v lift | sed 's/lft/fa/' > list ln -s `cat list` . for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.log *.fa list cd /cluster/data/galGal3 for i in */*/*.lft do cat $i ; done > jkStuff/subChr.lft mkdir -p /san/sanvol1/scratch/galGal3/blastDb; cd /cluster/data/galGal3/blastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/galGal3/blastDb; done mkdir /cluster/data/galGal3/bed/tblastnHg18KG cd /cluster/data/galGal3/bed/tblastnHg18KG echo /san/sanvol1/scratch/galGal3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 259 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(250000/259) = 38.049172 mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/galGal3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/galGal3/jkStuff/liftAll.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # then run the Blast cluster jobs ssh kk cd /cluster/data/galGal3/bed/tblastn.hg18KG para create blastSpec para try, check, push, check etc. # pushed 100,000 jobs at a time so need to do para push again later para time # Completed: 158767 of 158767 jobs # CPU time in finished jobs: 9352133s 155868.88m 2597.81h 108.24d 0.297 y # IO & Wait Time: 526347s 8772.45m 146.21h 6.09d 0.017 y # Average job time: 62s 1.04m 0.02h 0.00d # Longest finished job: 221s 3.68m 0.06h 0.00d # Submission to last job: 104854s 1747.57m 29.13h 1.21d ssh kkstore03 cd /cluster/data/galGal3/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS \ /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/galGal3/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 613 of 613 jobs # CPU time in finished jobs: 26409s 440.15m 7.34h 0.31d 0.001 y # IO & Wait Time: 7787s 129.79m 2.16h 0.09d 0.000 y # Average job time: 56s 0.93m 0.02h 0.00d # Longest finished job: 553s 9.22m 0.15h 0.01d # Submission to last job: 1834s 30.57m 0.51h 0.02d ssh kkstore03 cd /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl # this is ok. # load table ssh hgwdev cd /cluster/data/galGal3/bed/tblastn.hg18KG hgLoadPsl galGal3 blastHg18KG.psl # check coverage featureBits galGal3 blastHg18KG # 19638830 bases of 1042591351 (1.884%) in intersection featureBits galGal2 blastHg16KG # 17057637 bases of 1054197620 (1.618%) in intersection featureBits galGal3 refGene:cds blastHg18KG -enrichment # refGene:cds 0.504%, blastHg18KG 1.884%, both 0.417%, cover 82.71%, enrich 43.91x featureBits galGal2 refGene:cds blastHg16KG -enrichment # refGene:cds 0.497%, blastHg16KG 1.618%, both 0.335%, cover 67.42%, enrich 41.66x ssh kkstore04 rm -rf /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut #end human proteins ######################################################################### # BACEND PAIRS TRACK # DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 6/20/06 angie) ssh kkstore03 cd /cluster/data/galGal3 # Plenty of room at the moment: df -h . # 2.0T 1.6T 280G 86% /cluster/store6 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping \ ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/gallus_gallus/\* cd .. # Extract unversioned accessions from headers and combine into one # uncompressed file which we will link to from /gbdb/: zcat ncbi/*ends*.mfa.gz \ | perl -wpe 'if (/^>/) { \ s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \ > cloneEnds.fa # Make sure the sequences are intact after the header-munging: faSize ncbi/*.mfa.gz #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 8 files #Total size: mean 1084.6 sd 183.8 min 64 (gi|30716935|gb|CC322877.1|) max 2431 (gi|30609044|gb|CC263290.1|) median 1095 faSize cloneEnds.fa #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 1 files #Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095 # Extract pairings from info files: zcat ncbi/*info*.txt.gz \ | /cluster/bin/scripts/convertCloneEndInfo stdin #55099 pairs and 22655 singles # split for cluster run mkdir /cluster/bluearc/galGal3/cloneEnds faSplit sequence cloneEnds.fa 100 /cluster/bluearc/galGal3/cloneEnds/cloneEnds # Check to ensure no breakage: faSize /cluster/bluearc/galGal3/cloneEnds/*.fa #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 99 files #Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095 # same numbers as before # load sequences ssh hgwdev mkdir /gbdb/galGal3/cloneend ln -s /cluster/data/galGal3/bed/cloneend/cloneEnds.fa /gbdb/galGal3/cloneend/ cd /tmp hgLoadSeq galGal3 /gbdb/galGal3/cloneend/cloneEnds.fa #135327 sequences # BACEND SEQUENCE ALIGNMENTS (DONE 7/17/06 angie) ssh kkstore03 # Make unmasked fasta. cd /cluster/data/galGal3 mkdir /cluster/bluearc/galGal3/noMask foreach f (?{,?}/chr*.fa) echo $f:t:r perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \ > /cluster/bluearc/galGal3/noMask/$f:t end # kluster run [san is down today so use kk this time] ssh kk mkdir /cluster/data/galGal3/bed/bacends cd /cluster/data/galGal3/bed/bacends mkdir out # allow blat to run politely in /tmp while it writes output, then # copy results to results file: cat << '_EOF_' > runBlat.csh #!/bin/csh -ef set root1 = $1 set root2 = $2 set result = $3 rm -fr /scratch/tmp/${root1}_${root2} mkdir /scratch/tmp/${root1}_${root2} pushd /scratch/tmp/${root1}_${root2} /cluster/bin/i386/blat /cluster/bluearc/galGal3/noMask/${root1}.fa \ /cluster/bluearc/galGal3/cloneEnds/${root2}.fa ${root1}.${root2}.psl \ -ooc=/cluster/bluearc/galGal3/10.ooc -tileSize=10 popd mkdir -p out/${root2} rm -f ${result} mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result} rm -fr /scratch/tmp/${root1}_${root2} '_EOF_' # << emacs chmod +x runBlat.csh cat << '_EOF_' > template #LOOP ./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs ls -1S /cluster/bluearc/galGal3/cloneEnds/cloneEnds???.fa > bacEnds.lst ls -1S /cluster/bluearc/galGal3/noMask/chr*.fa > contig.lst gensub2 contig.lst bacEnds.lst template jobList para make jobList #Completed: 5247 of 5247 jobs #CPU time in finished jobs: 69864s 1164.39m 19.41h 0.81d 0.002 y #IO & Wait Time: 614130s 10235.51m 170.59h 7.11d 0.019 y #Average job time: 130s 2.17m 0.04h 0.00d #Longest finished job: 3306s 55.10m 0.92h 0.04d #Submission to last job: 3674s 61.23m 1.02h 0.04d ssh kkstore03 cd /cluster/data/galGal3/bed/bacends mkdir temp time pslSort dirs raw.psl temp out/* #9.349u 1.074s 0:10.54 98.7% 0+0k 0+0io 2pf+0w time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \ bacEnds.psl /dev/null #4.838u 0.158s 0:05.09 97.8% 0+0k 0+0io 2pf+0w # BACEND PAIRS TRACK (DONE 7/17/06 angie) ssh kolossus cd /cluster/data/galGal3/bed/bacends time /cluster/bin/x86_64/pslPairs -tInsert=10000 \ -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../cloneend/cloneEndPairs.txt all_bacends bacEnds #1.076u 0.292s 0:01.68 80.9% 0+0k 0+0io 3pf+0w # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort -u /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl wc -l bacEnds.* # 136054 bacEnds.psl # 106769 bacEnds.load.psl # 29879 bacEnds.pairs # 5 bacEnds.long # 99 bacEnds.lst # 56 bacEnds.mismatch # 20125 bacEnds.orphan # 106 bacEnds.short # 78 bacEnds.slop # load into database ssh hgwdev cd /cluster/data/galGal3/bed/bacends hgLoadBed -strict -notItemRgb galGal3 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql #Loaded 29879 elements of size 11 # note - the "Bad" track isn't pushed to RR, just used for assembly QA. hgLoadBed -strict -notItemRgb galGal3 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql #Loaded 20360 elements of size 11 time hgLoadPsl galGal3 -table=all_bacends bacEnds.load.psl #1.123u 0.104s 0:04.23 28.8% 0+0k 0+0io 3pf+0w # Compares favorably to previous release (good cov up, bad cov down): featureBits galGal3 all_bacends #58629720 bases of 1042591351 (5.623%) in intersection featureBits galGal2 all_bacends #52184234 bases of 1054197620 (4.950%) in intersection featureBits galGal3 bacEndPairs #43710407 bases of 1042591351 (4.192%) in intersection featureBits galGal2 bacEndPairs #39206208 bases of 1054197620 (3.719%) in intersection featureBits galGal3 bacEndPairsBad #15784430 bases of 1042591351 (1.514%) in intersection featureBits galGal2 bacEndPairsBad #27538232 bases of 1054197620 (2.612%) in intersection # Clean up. rm psl.tab psl.tab.loaded raw.psl bed.tab rm -r out rmdir temp rm -r /cluster/bluearc/galGal3/noMask/ /cluster/bluearc/galGal3/cloneEnds/ # end BACEND PAIRS TRACK ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 7/13/06 angie - REDONE 8/4) # REDONE 8/4 after makeDownloads.pl was fixed to not try to include complete # paths in the tar files. ssh hgwdev cd /cluster/data/galGal3 ~/kent/src/utils/makeDownloads.pl galGal3 -verbose=2 \ >& jkStuff/downloads.log & tail -f jkStuff/downloads.log # Edit README.txt files when done: # *** Edit each README.txt to resolve any notes marked with "***": # /cluster/data/galGal3/goldenPath/database/README.txt # /cluster/data/galGal3/goldenPath/bigZips/README.txt # /cluster/data/galGal3/goldenPath/chromosomes/README.txt # (The htdocs/goldenPath/galGal3/*/README.txt "files" are just links to those.) ######################################################################### # MAKE EMPTY CHRM_GOLD (DONE 7/14/06 angie) # Kayla found that hgTracks was complaining about missing chrM_gold. # We can use hgFakeAgp to make a fake chrM.agp, and load that. # However, then the chrM Assembly track shows faked contents that do not # match the track/assembly description. So remove the faked contents, # leaving an empty table which is fine -- chrM is not part of the assembly. ssh hgwdev cd /cluster/data/galGal3 hgFakeAgp M/chrM.fa M/chrM.agp hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 . hgsql galGal3 -e 'delete from chrM_gold' ######################################################################### # MULTIZ7WAY (DONE 7/18/06 angie) # (((galGal3 ((hg18 (mm8 rn4)) monDom4)) xenTro2) danRer4) ssh kkstore03 mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18 cd /cluster/data/galGal3/bed/multiz7way.2006-07-18 # copy MAF's to cluster-friendly server mkdir /san/sanvol1/scratch/galGal3/mafNet foreach s (rn4 mm8 hg18 monDom4 xenTro2 danRer4) echo $s rsync -av /cluster/data/galGal3/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/galGal3/mafNet/$s/ end # Prune the hg17 17way tree to just these 9 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \ --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; chicken_galGal2 -> chicken_galGal3 ; zebrafish_danRer3 -> zebrafish_danRer4 ; xenopus_xenTro1 -> xenopus_xenTro2" \ /cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh # *carefully* edit 7way.nh to put galGal3 first. /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt grep galGal3 7way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt #0.9847 human_hg18 #1.0149 monodelphis_monDom4 #1.3425 mouse_mm8 #1.3472 rat_rn4 #1.3604 xenopus_xenTro2 #1.6727 zebrafish_danRer4 # the order in the browser display will be by tree topology, # not by distance, but in this case (unlike human) they happen to match. # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \ > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/galGal3/bed/multiz7way.2006-07-18 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = galGal3 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/galGal3/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para make jobList para time #Completed: 57 of 57 jobs #CPU time in finished jobs: 6545s 109.08m 1.82h 0.08d 0.000 y #IO & Wait Time: 206s 3.43m 0.06h 0.00d 0.000 y #Average job time: 118s 1.97m 0.03h 0.00d #Longest finished job: 890s 14.83m 0.25h 0.01d #Submission to last job: 891s 14.85m 0.25h 0.01d # Make .gif for tree by pasting the contents of 7way.nh into Galt's # cgi-bin/phyloGif tool. Save the .gif in your checkout of the browser/ # CVS tree as browser/images/phylo/galGal3_7way.gif . ssh hgwdev cd ~/browser/images/phylo cvs add galGal3_7way.gif # note: markd says it is best to use "cvs add -kb" for binary files - b0b cvs ci -m "phyloGif-generated galGal3 7way tree image." galGal3_7way.gif # Do a "make alpha" to install the file in htdocs/images/phylo/ : cd ~/browser cvs up -d -P make alpha # Include the file in the push request, and include # "treeImage phylo/galGal3_7way.gif" in the multiz7way trackDb.ra entry. ######################################################################### # ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 7/18/2006 angie - REDONE 7/28) # REDONE 7/28/06 after Kayla found overlaps and Brian fixed mafAddIRows. ssh kolossus mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/galGal3/bed/multiz7way.2006-07-18/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/galGal3/galGal3.2bit ../maf/$f:t >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # 23min # Run mafFilter -overlap to make sure everything is cool before loading: mafFilter -overlap -reject=/tmp/chr1.rej.maf ../maf/chr1.maf > /dev/null #rejected 0 blocks # Load anno/maf ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf mkdir -p /gbdb/galGal3/multiz7way/anno/maf ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf/*.maf \ /gbdb/galGal3/multiz7way/anno/maf nice hgLoadMaf -pathPrefix=/gbdb/galGal3/multiz7way/anno/maf \ galGal3 multiz7way #Loaded 1488826 mafs in 57 files from /gbdb/galGal3/multiz7way/anno/maf # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf cat *.maf \ | nice hgLoadMafSummary galGal3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \ -test multiz7waySummary stdin #Created 518561 summary blocks from 3179814 components and 1488826 mafs from stdin ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7waySummary.sql time nice hgLoadSqlTab galGal3 multiz7waySummary /tmp/multiz7waySummary.sql \ multiz7waySummary.tab #0.000u 0.002s 0:08.00 0.0% 0+0k 0+0io 3pf+0w rm *.tab ln -s multiz7way.2006-07-18 /cluster/data/galGal3/bed/multiz7way ######################################################################### # MULTIZ7WAY DOWNLOADABLES (DONE 7/18/2006 angie - REDONE 7/28) # Annotated MAF is now documented, so use anno/maf for downloads. ssh hgwdev mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits galGal3 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags galGal3 multiz7way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' # << emacs time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log #10.144u 15.154s 0:52.22 48.4% 0+0k 0+0io 5pf+0w ssh kkstore03 cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # << emacs time csh -efx downloads.csh >&! downloads.log #355.340u 3.718s 5:59.55 99.8% 0+0k 0+0io 1pf+0w nice gzip up*.maf nice md5sum up*.maf.gz >> md5sum.txt # Make a README.txt ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/galGal3/multiz7way mkdir $dir ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads/{*.gz,*.txt} $dir ######################################################################### # MULTIZ7WAY MAF FRAMES (DONE 7/18/2006 angie - REDONE 7/31) ssh hgwdev mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames # The following is adapted from MarkD's Makefile used for mm7... #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: danRer4 mkdir genes foreach queryDb (danRer4) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for rn4 mm8 hg18 # using refGene for galGal3 # using mgcGenes for xenTro2 # no genes for monDom4 # genePreds; (must keep only the first 10 columns for knownGene) foreach queryDb (galGal3 rn4 mm8 hg18 xenTro2) unset geneTbl range if ($queryDb == "xenTro2") then set geneTbl = mgcGenes set range = 1-10 else if ($queryDb == "galGal3") then set geneTbl = refGene set range = 2-11 else set geneTbl = knownGene set range = 1-10 endif echo $queryDb hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f $range \ | genePredSingleCover stdin stdout \ | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz end #------------------------------------------------------------------------ # create frames ssh kkstore03 nice tcsh # easy way to get process niced cd /cluster/data/galGal3/bed/multiz7way/frames (cat ../anno/maf/*.maf \ | time genePredToMafFrames galGal3 stdin stdout \ danRer4 genes/danRer4.gp.gz \ rn4 genes/rn4.gp.gz \ hg18 genes/hg18.gp.gz \ mm8 genes/mm8.gp.gz \ xenTro2 genes/xenTro2.gp.gz \ galGal3 genes/galGal3.gp.gz \ | gzip > multiz7way.mafFrames.gz) >& mafFrames.log & tail -f mafFrames.log #------------------------------------------------------------------------ # load the database ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames hgLoadMafFrames galGal3 multiz7wayFrames multiz7way.mafFrames.gz ######################################################################### # PHASTCONS (DONE 7/20/2006 angie) # Using Kate's process from makeHg17.doc. # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 17way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. ssh kkstore03 cd /cluster/data/galGal3 foreach f (`cat chrom.lst`) echo $f cp -p $f/*.fa /san/sanvol1/scratch/galGal3/chrom/ end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons /cluster/bin/phast/tree_doctor \ --prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \ --rename="galGal2 -> galGal3 ; rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4 ; xenTro1 -> xenTro2 ; danRer3 -> danRer4" \ /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \ > elliotsEncodePruned.mod mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /san/sanvol1/scratch/galGal3/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para make jobList para time #Completed: 54 of 57 jobs #Crashed: 3 jobs #CPU time in finished jobs: 570s 9.49m 0.16h 0.01d 0.000 y #IO & Wait Time: 229s 3.82m 0.06h 0.00d 0.000 y #Average job time: 15s 0.25m 0.00h 0.00d #Longest finished job: 112s 1.87m 0.03h 0.00d #Submission to last job: 223s 3.72m 0.06h 0.00d para crashed #chr12_random, chr17_random and chr32 crashed because their maf's contain #nothing but comments -- no alignments. That's OK, no windows for them. # check tree model on 5MB chunk, using params recommended by Adam, # (to verify branch lengths on 2X species) ssh kolossus cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \ -o phyloFit.tree /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s REV \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \ -o phyloFit.rev.tree # Comment from makeHg17.doc: # # he ok'ed the results -- not necessary for next human run # TODO: maybe run these by Adam... the numbers in the RATE_MAT and # TREE are different between the phyloFit and elliotsEncode models, # not sure if the differences are significant. I will proceed with # elliotsEncode as in makeHg17.doc. # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # << emacs chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 12 .05 .20 #ENDLOOP 'EOF' # << emacs # Create parasol batch and run it ssh pk cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons pushd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons ls -1S ss/chr*/chr*.ss \ | sed 's/.ss$//' \ > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons/in.list popd gensub2 in.list single template jobList para make jobList para time #Completed: 151 of 151 jobs #CPU time in finished jobs: 2832s 47.20m 0.79h 0.03d 0.000 y #IO & Wait Time: 644s 10.74m 0.18h 0.01d 0.000 y #Average job time: 23s 0.38m 0.01h 0.00d #Longest finished job: 34s 0.57m 0.01h 0.00d #Submission to last job: 181s 3.02m 0.05h 0.00d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "chr*.bed" \ | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \ | sort -k7,7 -k9,9n \ | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \ | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed cp -p phastConsElements7way.bed /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons # Measure coverage. If good, load elements into database and proceed with wiggle. # Try for 5% overall cov, and 70% CDS cov. ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons featureBits galGal3 -enrichment refGene:cds phastConsElements7way.bed featureBits galGal3 -enrichment xenoRefGene:cds phastConsElements7way.bed # FIRST ITERATION: doPhast (len cov rho) = (12 .1 .27) #refGene:cds 0.504%, phastConsElements7way.bed 7.891%, both 0.414%, cover 82.18%, enrich 10.41x #xenoRefGene:cds 1.785%, phastConsElements7way_12_100_27.bed 7.891%, both 1.610%, cover 90.22%, enrich 11.43x mv phastConsElements7way.bed phastConsElements7way_12_100_27.bed # SECOND ITERATION: doPhast (len cov rho) = (12 .1 .3) #refGene:cds 0.504%, phastConsElements7way.bed 8.336%, both 0.421%, cover 83.61%, enrich 10.03x #xenoRefGene:cds 1.785%, phastConsElements7way.bed 8.336%, both 1.635%, cover 91.59%, enrich 10.99x mv phastConsElements7way.bed phastConsElements7way_12_100_30.bed # THIRD ITERATION: doPhast (len cov rho) = (12 .05 .27) #refGene:cds 0.504%, phastConsElements7way.bed 7.650%, both 0.415%, cover 82.26%, enrich 10.75x #xenoRefGene:cds 1.785%, phastConsElements7way.bed 7.650%, both 1.615%, cover 90.49%, enrich 11.83x mv phastConsElements7way.bed phastConsElements7way_12_050_27.bed # FOURTH ITERATION: doPhast (len cov rho) = (12 .05 .20) #refGene:cds 0.504%, phastConsElements7way.bed 6.480%, both 0.392%, cover 77.82%, enrich 12.01x #xenoRefGene:cds 1.785%, phastConsElements7way.bed 6.480%, both 1.535%, cover 86.02%, enrich 13.28x mv phastConsElements7way.bed phastConsElements7way_12_050_20.bed # The coverage doesn't want to drop too much and maybe it shouldn't -- # I don't like the idea of throwing extreme parameters in order to get # the desired coverage figure from a model that doesn't want to go there. # Load this up and see how it looks... # When happy: hgLoadBed -strict galGal3 phastConsElements7way \ phastConsElements7way_12_050_20.bed # Create merged posterier probability file and wiggle track data files ssh kolossus cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode #next time try Angie's simpler sort, below time find ./pp -name "chr*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode stdin phastCons7way.wig phastCons7way.wib cp -p phastCons7way.wi? \ /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons ln -s `pwd`/phastCons7way.wib /gbdb/galGal3/multiz7way/phastCons7way.wib hgLoadWiggle -pathPrefix=/gbdb/galGal3/multiz7way galGal3 \ phastCons7way phastCons7way.wig ######################################################################### # PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 7/20/2006 angie) ssh kolossus cd /cluster/data/galGal3/bed/multiz7way.2006-07-18 mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/pp foreach chr (`awk '{print $1}' /cluster/data/galGal3/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/$chr.gz end date 'EOF' # << emacs csh -efx downloads.csh >&! downloads.log & tail -f downloads.log # ~3min md5sum *.gz > md5sum.txt # Make a README.txt ssh hgwdev cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/galGal3/phastCons7way mkdir $dir ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/{*.gz,*.txt} $dir # Clean up after phastCons run. ssh kkstore03 rm /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/*.tab rm -r /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons ######################################################################### # SELF CHAINS (DONE 9/11/06 angie) # requested by user ssh kk mkdir /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08 cd /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08 cat << '_EOF_' > DEF # chicken vs chicken BLASTZ=/cluster/bin/penn/i386/blastz BLASTZ_H=2000 BLASTZ_M=200 # TARGET: chicken galGal3 SEQ1_DIR=/scratch/hg/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: chicken galGal3 SEQ2_DIR=/scratch/hg/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.galGal3.2006-09-08 '_EOF_' # << emacs doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \ -blastzOutRoot=/cluster/bluearc/galGal3Self \ -bigClusterHub=kk -workhorse=kkr7u00 DEF \ >& do.log & tail -f do.log ln -s blastz.galGal3.2006-09-08 /cluster/data/galGal3/bed/blastz.galGal3 ######################################################################### # SNP & GENES FROM BEIJING GENOME INST. (DONE 12/4/06 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/bgi cd /cluster/data/galGal3/bed/bgi wget --timestamping \ 'ftp://61.50.158.101/BGI/snps_by_strain/2006-11-28/*' # (username & password in email from Yong Zhang 11/29) # Each .tar.gz unpacks into identical sets of files so they will # overwrite each other if unpacked into the same directory. So create # a subdir for each strain, and unpack its files in the subdir: foreach f (*.tar.gz) set strain = `echo $f:r:r:r | perl -wpe 's/^(\w)/\u$1/'` mkdir -p $strain pushd $strain tar -xvzf ../$f popd end # Make coverage bed4: cp /dev/null bgiCov.bed foreach strain (Broiler Layer Silkie) foreach f ($strain/ContigCov/ContigCov-chr*.txt) set chr = `echo $f:t:r | sed -e 's/ContigCov-//'` grep -v ^Covered $f \ | perl -wpe 'if (/^(\d+)\s+(\d+)\s*$/) { \ $s = $1-1; $_ = "'$chr'\t$s\t$2\t'$strain'.'$chr'.$1.$2\n"; \ } else {die};' \ >> bgiCov.bed end end # Make bgiSnp bed+: ~/kent/src/hg/snp/bgi/bgiSnp.pl */SNPtables/*.txt # The Layer files had the wrong strain ID code (last digit) -- the # script substituted in the correct strain ID (Yong Zhang's suggestion). # Load tables ssh hgwdev cd /cluster/data/galGal3/bed/bgi hgLoadBed galGal3 bgiCov bgiCov.bed hgLoadBed galGal3 bgiSnp bgiSnp.bed \ -sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql # Got this warning but I think it's because the final column is always # empty so bed.tab doesn't have the final column... table looks OK. #load of bgiSnp did not go as planned: 3306633 record(s), 0 row(s) skipped, 3306633 warning(s) loading bed.tab ######################################################################### # STS? ######################################################################### ######################################################################### # SWAP STICKLEBACK GASACU1 ssh pk mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & # failed on the net step: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [galGal3.gasAcu1.]all.chain[.gz]). time doBlastzChainNet.pl -verbose=2 \ /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 & # real 7m11.286s featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt # 32781658 bases of 1042591351 (3.144%) in intersection ####################################################################### ## BLASTZ Chicken chroms vs Stickleback chroms and random contigs ## The above swap of gasAcu1 does *not* include the stickleback chrUn ## To get alignments on the stickleback chrUn, do this alignment of ## stickleback with chrUn to chicken, and swap them back to stickleback. ## Don't do any Db loading, pick out the stickleback randoms after the swap ## (WORKIN - 2007-01-11 - Hiram) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11 cd /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11 cat << '_EOF_' > DEF # Chicken chroms vs. Stickleback chroms and chrUn in contigs # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 chroms only, no random sequence # The largest is 200 million bases, there are 34 of them SEQ1_DIR=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Stickleback gasAcu1 chroms and chrUn in contigs # The largest chrom is 32M bases, the largest contig 418,670 bases # The smallest chrom chrM is 15,742 bases, smallest contig 60 bases # there are 5,115 chroms and contig pieces # total size 1,089,690,673 bases # 47107538 N's 1042583135 real 834274605 upper 208308530 lower SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/galGal3ChrGasAcu1Un \ -stop=net -bigClusterHub=pk > do.log 2>&1 & # real 116m59.888s # And swapping: mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.swap cd /cluster/data/gasAcu1/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11/DEF \ -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=net -bigClusterHub=pk > swap.log 2>&1 & ####################################################################### ## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram) ssh kkstore05 mkdir /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18 cd /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18 cat << '_EOF_' > DEF # chicken vs. lizard # Use same params as used for galGal3-xenTro2 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/san/sanvol1/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > do.log 2>&1 time doBlastzChainNet.pl -verbose=2 DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net \ -blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > net.log 2>&1 & time nice -n +19 featureBits galGal3 chainAnoCar1Link \ > fb.galGal3.chainAnoCar1Link.txt 2>&1 # real 0m43.752s # 106743952 bases of 1042591351 (10.238%) in intersection ## swap documented in anoCar1.txt time doBlastzChainNet.pl -verbose=2 \ /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & ####################################################################### ## BLASTZ FUGU FR2 - (DONE - 2007-03-09 - 2007-03-12 - Hiram) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.fr2.2007-03-09 cd /cluster/data/galGal3/bed/blastz.fr2.2007-03-09 cat << '_EOF_' > DEF # chicken vs. fugu # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/san/sanvol1/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Fugu fr2 SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.fr2.2007-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -qRepeats=windowmaskerSdust \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot=/cluster/bluearc/galGal3Fr2 > do.log 2>&1 cat fb.galGal3.chainFr2Link.txt # 30935323 bases of 1042591351 (2.967%) in intersection mkdir /cluster/data/fr2/bed/blastz.galGal3.swap cd /cluster/data/fr2/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net -swap > swap_net.log 2>&1 & # real 3m1.239s cat fb.fr2.chainGalGal3Link.txt # 36175581 bases of 393312790 (9.198%) in intersection ####################################################################### ## BLASTZ FUGU FR1 - (DONE - 2007-03-09 - 2007-03-12 - Hiram) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.fr1.2007-03-09 cd /cluster/data/galGal3/bed/blastz.fr1.2007-03-09 cat << '_EOF_' > DEF # chicken vs. fugu # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/san/sanvol1/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Fugu fr1 SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.fr1.2007-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot=/cluster/bluearc/galGal3Fr1 > do.log 2>&1 cat fb.galGal3.chainFr1Link.txt # 29604522 bases of 1042591351 (2.840%) in intersection mkdir /cluster/data/fr1/bed/blastz.galGal3.swap cd /cluster/data/fr1/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & # real 3m14.494s cat fb.fr1.chainGalGal3Link.txt # 33536400 bases of 315518167 (10.629%) in intersection ####################################################################### # BLASTZ PLATYPUS ORNANA1 (DONE 4/9/07 angie) ssh kkstore03 mkdir /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06 cd /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06 cat << '_EOF_' > DEF # chicken vs. platypus # Use same params as used for galGal3-xenTro2 but back off L to 6000. # If that causes us to get swamped, we can go back up to 8000. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/iscratch/i/galGal3/galGal3.2bit SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Platypus ornAna1 - largest chunk big enough for largest scaffold SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ >& do.log & tail -f do.log ln -s blastz.ornAna1.2007-04-06 /cluster/data/galGal3/bed/blastz.ornAna1 ######################################################################### # BLASTZ/CHAIN/NET HORSE AND CHICKEN (DONE 2/27/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 cat << '_EOF_' > DEF # Horse vs. Chicken BLASTZ_M=50 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log & tail -f do.log ssh hgwdev cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22 ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3 nice featureBits equCab1 -chrom=chr1 chainGalGal3Link # 9136777 bases of 177498097 (5.148%) in intersection bash time nice -n 19 featureBits equCab1 chainGalGal3Link \ > fb.equCab1.chainGalGal3Link.txt 2>&1 # 114216918 bases of 2421923695 (4.716%) in intersection ssh kkstore05 mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap cd /cluster/data/galGal3/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & tail -f swap.log # the above script stopped with the following lines in the .log file: # netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout # axtSort stdin stdout # gzip -c # Processing chrM # end # netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout # axtSort stdin stdout # gzip -c # Processing chrUn_random # Hiram helped to created the following temp .csh: cat finiNet.csh #!/bin/csh -efx # This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl # from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF # It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain . # It generates nets (without repeat/gap stats -- those are added later on # hgwdev) from chains, and generates axt, maf and .over.chain from the nets. # This script will fail if any of its commands fail. cd /cluster/data/galGal3/bed/blastz.equCab1.swap foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz end # Make mafNet for multiz: one .maf per galGal3 seq. mkdir mafNet foreach f (axtNet/*.galGal3.equCab1.net.axt.gz) axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \ /cluster/data/galGal3/chrom.sizes /san/sanvol1/scratch/equCab1/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end ssh kolossus cd /cluster/data/galGal3/bed/blastz.equCab1.swap fininet.csh ssh hgwdev cd /cluster/data/galGal3/bed/blastz.equCab1.swap bash time nice -n 19 featureBits galGal3 chainEquCab1Link \ > fb.galGal3.chainEquCab1Link.txt 2>&1 # 104418042 bases of 1042591351 (10.015%) in intersection ########################################################################### # ENSEMBL GENES (DONE 11/2/07 angie) ssh hgwdev mkdir /cluster/data/galGal3/bed/ensembl cd /cluster/data/galGal3/bed/ensembl # Go to www.ensembl.org and click around their evolving interface # to get the following types of data: # 1. a tab-separated file that relates Ensembl gene, transcript and # protein IDs. Save as ensGtp.txt.gz # 2. peptide fasta with only transcript IDs in header -> ensPep.fa.gz # They have started dumping GTF files so we can download that directly: # wget # 'ftp://ftp.ensembl.org/pub/current_gallus_gallus/data/gtf/*' # -- but I forgot to do that and just grabbed the GTF from Ensembl # BioMart along with the GTP and Pep. # Add "chr" to chrom names in the gene data gtf file to make # it compatible with our software. Also change MT --> M. zcat ensembl.gff.gz | sed -e 's/^MT/M/; s/^/chr/;' > ensGene.gtf ldHgGene -gtf -genePredExt galGal3 ensGene ensGene.gtf nice featureBits galGal3 ensGene #30850267 bases of 1042591351 (2.959%) in intersection # strip header line: zcat ensGtp.txt.gz | tail +2 \ | hgLoadSqlTab galGal3 ensGtp ~/kent/src/hg/lib/ensGtp.sql \ /dev/stdin -notOnServer zcat ensPep.fa.gz | sed -e 's/^>.*ENSGALT/>ENSGALT/' \ | hgPepPred galGal3 generic ensPep stdin ############################################################################ # BLASTZ petMar1 Lamprey (WORKING - 2008-04-11 - Hiram) ssh kkstore03 screen # use a screen to control this multi-day job mkdir /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11 cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11 cat << '_EOF_' > DEF # Chicken vs Lamprey BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/scratch/data/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Lamprey petMar1 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=memk -verbose=2 > do.log 2>&1 & # Completed: 83468 of 83468 jobs # CPU time in finished jobs: 2199234s 36653.91m 610.90h 25.45d 0.070 y # IO & Wait Time: 426454s 7107.56m 118.46h 4.94d 0.014 y # Average job time: 31s 0.52m 0.01h 0.00d # Longest finished job: 414s 6.90m 0.12h 0.00d # Submission to last job: 103466s 1724.43m 28.74h 1.20d # continuing after some kluster interruptions time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=cat -bigClusterHub=memk -verbose=2 > cat.log 2>&1 & # real 13m54.957s cat fb.galGal3.chainPetMar1Link.txt # 20134896 bases of 1042591351 (1.931%) in intersection # and the swap mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap cd /cluster/data/petMar1/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 & # real 33m41.587s cat fb.petMar1.chainGalGal3Link.txt # 22308118 bases of 831696438 (2.682%) in intersection ############################################################################ # BLASTZ petMar1 Lanclet (WORKING - 2008-04-15 - Hiram) ssh kkstore03 screen # use a screen to control this multi-day job mkdir /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15 cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15 cat << '_EOF_' > DEF # Chicken vs Lanclet BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/scratch/data/galGal3/nib SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold # Largest scaffold 7,200,735 - 3032 scaffolds + chrM SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 168m49.357s cat fb.galGal3.chainBraFlo1Link.txt # 19795687 bases of 1042591351 (1.899%) in intersection # and the swap mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap cd /cluster/data/braFlo1/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl \ /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 & # real 4m45.054s cat fb.braFlo1.chainGalGal3Link.txt # 30287175 bases of 923355587 (3.280%) in intersection ############################################################################# # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym) ssh kkstore04 screen # use screen to control this multi-day job mkdir /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18 cd /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18 cat << '_EOF_' > DEF # Chicken vs. Horse BLASTZ_M=50 # TARGET: Chicken calGal3 SEQ1_DIR=/scratch/data/galGal3/nib SEQ1_LEN=/cluster/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/cluster/data/equCab2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/galGal3/bed/blastz.equCab2.2008-04-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log & failed b/c para.results got currupted: para recover jobList recoverJobList para create recoverJobList Checking input files 0 jobs written to batch so all jobs had successfully completed. Generate run.time file manually: time doBlastzChainNet.pl `pwd`/DEF -continue=cat \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >>& do.log & but the batch file has disappeared ... I just decided to remove the psl files and re-run from scratch: time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log & 0.248u 0.190s 6:10:43.78 0.0% 0+0k 0+0io 26pf+0w ln -s blastz.equCab2.2008-04-18 /cluster/data/galGal3/bed/blastz.equCab2 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ##################################################### # build 2bit with contigs from random chroms and chrUn (DONE braney 2008-08-30) cd /cluster/data/galGal3 fgrep -h 'random Un' */*.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6}' \ | while read chr start stop contig; \ do \ echo ">$contig"; twoBitToFa galGal3.2bit:$chr:$start-$stop stdout \ | tail -n +2; \ done > unRandom.fa fgrep -v 'random Un' chrom.sizes | awk '{print $1}' | \ twoBitToFa galGal3.2bit -seqList=stdin noUn.fa cat noUn.fa unRandom.fa | faToTwoBit stdin galGal3.blastz.2bit twoBitInfo galGal3.blastz.2bit galGal3.blastz.sizes ######################################################################### # BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-04) ssh swarm screen mkdir /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06 cd /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06 cat << _EOF_ > DEF # chicken vs. zebra finch BLASTZ_T=2 BLASTZ_M=50 # TARGET: Chicken galGal3 # SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit # SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_DIR=/hive/data/genomes/galGal3/galGal3.2bit SEQ1_LEN=/hive/data/genomes/galGal3/chrom.sizes # SEQ1_CTGDIR=/hive/data/genomes/galGal3/galGal3.blastz.2bit # SEQ1_CTGLEN=/hive/data/genomes/galGal3/galGal3.blastz.sizes # SEQ1_LIFT=/hive/data/genomes/galGal3/jkStuff/liftAll.lft # one chrom at a time SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Zebra finch taeGut1 # SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit # SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/galGal3/bed/blastz.taeGut1.2008-09-06 _EOF_ # << emacs doBlastzChainNet.pl -syntenicNet \ -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \ -smallClusterHub=swarm DEF -workhorse=swarm \ -qRepeats=windowmaskerSdust > do.log 2>&1 & cd /cluster/data/galGal3/bed rm -f blastz.taeGut1 ln -s blastz.taeGut1.2008-09-06 /cluster/data/galGal3/bed/blastz.taeGut1 ############ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: galGal3.upstreamGeneTbl = refGene galGal3.upstreamMaf = multiz7way /hive/data/genomes/galGal3/bed/multiz7way/species.lst ############################################################################ # QUALITY TRACK (REDONE 12/2/09 angie) # originally done 2008-11-24 - Hiram) cd /hive/data/genomes/galGal3 # create an agp file for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort` do cat $C/chr${C}.agp done > galGal3.agp for C in `cut -f1 chrom.sizes | grep random | sed -e "s/chr//; s/_random//" | sort` do F=$C/chr${C}_random.agp if [ -s $F ]; then cat $F fi done >> galGal3.agp cd /hive/data/genomes/galGal3/downloads qaToQac contigs.fa.qual.gz assembly.qac qacAgpLift -mScore=99 -verbose=2 ../galGal3.agp assembly.qac scaffolds.qac mkdir /hive/data/genomes/galGal3/bed/qual cd /hive/data/genomes/galGal3/bed/qual qacToWig -fixed ../../downloads/scaffolds.qac stdout \ | wigEncode stdin qual.wig qual.wib #Converted stdin, upper limit 99.00, lower limit 0.00 ln -s `pwd`/qual.wib /gbdb/galGal3/wib hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 quality qual.wig ############################################################################# # N-SCAN gene predictions (nscanGene) - (2009-03-13 markd) # obtained NSCAN predictions from michael brent's group # at WUSTL cd /cluster/data/galGal3/bed/nscan/ http://mblab.wustl.edu/predictions/chicken/galGal3/ wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.gtf wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.fa wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/README bzip2 galGal3.* chmod a-w * # load track ldHgGene -bin -gtf -genePredExt galGal3 nscanGene galGal3.gtf.bz2 hgPepPred galGal3 generic nscanPep galGal3.fa.bz2 rm *.tab # chicken/galGal3/trackDb.ra, add: track nscanGene override informant Chicken N-SCAN uses Zerba Finch (taeGut1) as the informant, updated with PASA clusters of chicken cDNAs. searchTable nscanGene searchType genePred termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)\.[0-9]+(\.[0-9a-z]+)? searchPriority 50 ############################################################################ # Re-Run equCab2 alignment (DONE - 2009-06-29,07-01 - Hiram) mkdir /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29 cd /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29 cat << '_EOF_' > DEF # Chicken vs. Horse BLASTZ_M=50 # TARGET: Human galGal3 SEQ1_DIR=/scratch/data/galGal3/bothMaskedNibs SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=pk \ -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 220m54.623s # cat fb.galGal3.chainEquCab2Link.txt # 68476046 bases of 1042591351 (6.568%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.galGal3.swap cd /hive/data/genomes/equCab2/bed/blastz.galGal3.swap time doBlastzChainNet.pl \ /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=pk \ -swap -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 10m1.381s cat fb.equCab2.chainGalGal3Link.txt # 73336799 bases of 2428790173 (3.019%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ ## caCondor 454 alignments (2009-12-08 markd) ############################################################################ mkdir /hive/data/genomes/galGal3/bed/caCondor454 # get WUGSC 454 cDNA sequences from the trace archives cd data wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/gymnogyps_californianus/anc.gymnogyps_californianus.001.gz wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/gymnogyps_californianus/fasta.gymnogyps_californianus.001.gz cd .. # get trace ids for this project, and then extract fasta CENTER_NAME = WUGSC column $6 CENTER_PROJECT = U_FC-MATT_TECHD2 column $7 TI column $57 zcat data/anc.gymnogyps_californianus.001.gz |tawk '$6=="WUGSC" && $7=="U_FC-MATT_TECHD2" {print "gnl|ti|"$57}' >data/wugsc-454-cdna.ids faSomeRecords data/fasta.gymnogyps_californianus.001.gz data/wugsc-454-cdna.ids data/wugsc-454-cdna.fa # obtained PASA cDNA clusters for galGal3 from the Brent lab at WUStL http://mblab.wustl.edu/~jeltje/for_Mark/ -> data/pasa-pre-2009-28/ bzcat data/pasa-pre-2009-28/chr*.gtf.bz2 | gtfToGenePred stdin data/pasa.gp getRnaPred galGal3 data/pasa.gp all data/pasa.fa -pslOut=data/pasa.psl # obtain N-SCAN gene predictiosn getRnaPred galGal3 nscanGene all data/nscan.fa -pslOut=data/nscan.psl # combine to make files for transMap cat data/pasa.fa data/nscan.fa >data/nscanPasa.fa cat data/pasa.psl data/nscan.psl >data/nscanPasa.psl # modify makefiles that were create during mapping experiments for alignments # these run two cluster batches. The first aligns the reads make >&log& # load tracks hgLoadPsl galGal3 -table=caCondor454 results/nscanPasaUnmappedRealign/combined.psl.gz ln -s /hive/data/genomes/galGal3/bed/caCondor454/data/wugsc-454-cdna.fa /gbdb/galGal3/caCondor454.fa hgLoadSeq galGal3 /gbdb/galGal3/caCondor454.fa ####################################################################### # BLASTZ/CHAIN/NET Turkey vs Chicken (DONE - 2011-03-24 - Chin) # use a screen to manage this multi-day job mkdir /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24 cd /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24 cat << '_EOF_' > DEF # Chicken vs. Turkey BLASTZ_M=50 # TARGET: Chicken GalGal3 SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: Turkey MelGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 120m58.775s # *** All done ! Elapsed time: 120m59s # *** Make sure that goldenPath/galGal3/vsMelGal1/README.txt is accurate. # *** Add {chain,net}MelGal1 tracks to trackDb.ra if necessary. cd /hive/data/genomes/galGal3/bed ln -s lastzMelGal1.2011-03-24 lastz.melGal3 cat fb.galGal3.chainMelGal1Link.txt # 924755061 bases of 1042591351 (88.698%) in intersection cd /hive/data/genomes/galGal3/bed ln -s lastzMelGal1.2011-03-24 lastz.melGal3 # and then swap mkdir /hive/data/genomes/melGal1/bed/blastz.galGal3.swap cd /hive/data/genomes/melGal1/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 38m4.406s cat fb.melGal1.chainGalGal3Link.txt # 874579772 bases of 935922386 (93.446%) in intersection cd /hive/data/genomes/melGal1/bed ############################################################################ # LASTZ Lizard AnoCar2 (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25 cd /hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25 cat << '_EOF_' > DEF # Chicken vs lizard BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Chicken galGal3 SEQ1_DIR=/scratch/data/galGal3/nib SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Lizard anoCar2 SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +25 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \ -bigClusterHub=swarm -tRepeats=windowmaskerSdust \ -qRepeats=windowmaskerSdust > do.log 2>&1 & # real 210m1.468s cat fb.galGal3.chainAnoCar2Link.txt # 113851392 bases of 1042591351 (10.920%) in intersection # running the swap - DONE - 2011-04-26 mkdir /hive/data/genomes/anoCar2/bed/blastz.galGal3.swap cd /hive/data/genomes/anoCar2/bed/blastz.galGal3.swap time nice -n +25 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -syntenicNet -swap -qRepeats=windowmaskerSdust \ -tRepeats=windowmaskerSdust > swap.log 2>&1 & # real 11m8.229s cat fb.anoCar2.chainGalGal3Link.txt # 116069635 bases of 1701353770 (6.822%) in intersection ############################################################################## # LASTZ Xenopus xenTro3 (DONE - 2010-12-17 - Hiram) mkdir /hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20 cd /hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20 cat << '_EOF_' > DEF # chicken vs. X. tropicalis BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Zebrafish galGal3 SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit SEQ1_LEN=/scratch/data/galGal3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: X. tropicalis xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 209m23.536s cat fb.galGal3.chainXenTro3Link.txt # 54978665 bases of 1042591351 (5.273%) in intersection mkdir /hive/data/genomes/xenTro3/bed/blastz.galGal3.swap cd /hive/data/genomes/xenTro3/bed/blastz.galGal3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \ -swap -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > swap.log 2>&1 & # real 13m36.708s cat fb.xenTro3.chainGalGal3Link.txt # 64250673 bases of 1358334882 (4.730%) in intersection ######################################################################### # construct liftOver to galGal4 (DONE - 2012-05-25 - Hiram) screen -S galGal3 # manage this longish running job in a screen mkdir /hive/data/genomes/galGal3/bed/blat.galGal4.2012-05-25 cd /hive/data/genomes/galGal3/bed/blat.galGal4.2012-05-25 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/galGal3/11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev galGal3 galGal4 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/galGal3/11.ooc \ -dbHost=hgwdev -workhorse=hgwdev galGal3 galGal4 > do.log 2>&1 # real 20m44.646s # verify this file exists: # /gbdb/galGal3/liftOver/galGal3ToGalGal4.over.chain.gz # and try out the conversion on genome-test from galGal3 to galGal4 ############################################################################