# for emacs: -*- mode: sh; -*- # This file describes browser build for the panTro2 chimp genome: Feb 2006 # # "$Id: panTro2.txt,v 1.19 2010/04/11 16:48:45 markd Exp $" # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+-------------------------+ # | tableName | type | # +-------------+-------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred | # | nscanGene | genePred nscanPep | # | genscan | genePred genscanPep | # +-------------+-------------------------+ ####################################################################### # DOWNLOAD 6X CHIMP SEQUENCE FROM LaDeana HIllier, WUSTL (2006-02-27 - kate) # # panTro2 # 6X Chimp from LaDeana Hillier, WUSLT # NCBI accession for the project is: AADA01000000 (02-SEP-2005) # NOTE: chrom numbering follows hunan orthology, a change from panTro1. # Human chrom 2 is orthologus to chimp chroms 2a and 2b. # SETUP BUILD AREA AND DOWNLOAD ASSEMBLY (DONE 2006-02-27 kate) ssh kkstore01 mkdir /cluster/store8/panTro2 ln -s /cluster/store8/panTro2 /cluster/data cd /cluster/data/panTro2 # annotations dir mkdir bed # temp dir for lift files and # scripts copy-pasted from this doc mkdir jkStuff # make download dir wget ftp://genome.wustl.edu/pub/user/lhillier/panTro2.tar.gz tar xvfz *.gz mv panTro2 wustl # for finished chr21, there is chr21.agp, chr21.clones.agp, # and chr21.fa (complete chrom fasta file) # for partially finished chrY, there is: # .agp, .clones.agp, and .fa for chrY and chrY_random # for chr7, that includes a 5M finished region, there # are: chr7.agp, chr7.contigs.fa, and chr7.clones.fa # (also chr7_random.{agp,contigs.fa} # For all other chroms there are AGP files plus contig fasta: # chr*.agp, chr*.contigs.fa files grep '>' wustl/*.contigs.fa | wc -l # 246371 # NOTE: Release notes list 265882 contigs # (due to replacement of contigs by clone sequence in finished chroms?) grep '>' wustl/*.clones.fa | wc -l # 505 # BUILD CHROM FASTA FILES (2006-02-28 kate) # Most chroms are built from contigs, a few have # clones in the AGP, chr7 has both! cut -f1 wustl/*.agp | uniq | grep -v '^$' | grep -v random | \ sed 's/chr//' > chrom.lst wc -l chrom.lst # 28 # 1,2a,2b,3-22,X,Y,M,Un, and chr6_hla_hap1 cat > jkStuff/makeChroms.csh << 'EOF' date foreach d (`cat chrom.lst`) set c = chr$d echo $c mkdir $d cp wustl/$c.{agp,*.fa} $d if (-e $d/$c.clones.fa) then set fa = clones.fa # chr7 has both contigs and clones and clone file # doesn't have genbank sequence header -- merge them if ($c == "chr7") then cat $d/$c.contigs.fa >> $d/$c.clones.fa rm $d/$c.contigs.fa else # extract clone id from sequence name line to please agpToFa awk -F\| '{if (/^>/) printf(">%s\n", $4); else print}' \ wustl/$c.clones.fa > $d/$c.clones.fa endif else set fa = contigs.fa endif agpToFa -simpleMultiMixed $d/$c.agp $c $d/$c.fa $d/$c.$fa if (-e wustl/${c}_random.agp) then set r = ${c}_random echo $r cp wustl/$r.{agp,*.fa} $d if (-e $d/$r.clones.fa) then set fa = clones.fa awk -F\| '{if (/^>/) printf(">%s\n", $4); else print}' \ wustl/$r.clones.fa > $d/$r.clones.fa else set fa = contigs.fa endif agpToFa -simpleMultiMixed $d/$r.agp $r $d/$r.fa $d/$r.$fa endif end date 'EOF' # << happy emacs csh jkStuff/makeChroms.csh >&! jkStuff/makeChroms.log & # 5 minutes cat > jkStuff/checkChroms.csh << 'EOF' date foreach d (`cat chrom.lst`) set c = chr$d echo $c faSize $d/$c.fa checkAgpAndFa $d/$c.agp $d/$c.fa | tail -1 if (-e $d/${c}_random.fa) then set r = ${c}_random echo $r faSize $d/$r.fa checkAgpAndFa $d/$r.agp $d/$r.fa | tail -1 endif end date 'EOF' # << happy emacs csh jkStuff/checkChroms.csh >&! jkStuff/checkChroms.log & # 3 minutes # Replace chrM and chrM_random from this assembly with # chrM from NCBI. # so retrieving it from NCBI, via method described in canFam2 make doc. # (search "pan troglodytes mitochondrion genome" finds NC_001643 rm -fr M mkdir M cd M wget -O chrM.fa 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=5835121&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Canis familiaris mitochondrion complete genome, and then replace the # header line with just ">chrM". faSize chrM.fa # 16554 bases # cleanup chrom dirs cd .. foreach d (`cat chrom.lst`) set c = chr$d mkdir $d/parts mv $d/*.clones.fa $d/*.contigs.fa $d/parts end # retrieve fixed chr9.agp from LaDeana (2006-03-02) mkdir wustl.chr9 cd wustl.chr9 wget ftp://genome.wustl.edu/pub/user/lhillier/pub/chr9.agp.gz gunzip *.gz cp -p chr9.agp ../9 mv wustl/chr9.agp wustl/chr9.agp.old cp -p chr9.agp wustl/chr9.agp cd ../9 agpToFa -simpleMultiMixed chr9.agp chr9 chr9.fa parts/chr9.contigs.fa checkAgpAndFa chr9.agp chr9.fa | tail -1 ######################## # QUALITY SCORES (2006-03-15 kate) # From Joanne Nelson, WUSTL # Redo chr7 with finished clone scores newly available (2006-07-13 kate) # Finished chr21 scores are being submitted to Genbank by Riken, # but are not yet available ssh kkstore04 cd /cluster/data/panTro2 mkdir bed/quality cd bed/quality wget -r ftp://genome.wustl.edu/private/963082470306293/upload_for_kate mkdir wustl mv genome.wustl.edu/private/*/upload*/*.qvl wustl rm -fr genome.wustl.edu grep '>' wustl/chr*.qvl | wc -l # 246371 # same as #contigs in sequence files, above gzip wustl/*.qvl cat > makeQuals.csh << 'EOF' date mkdir -p qac set b = /cluster/data/panTro2 foreach f (`ls wustl/*.qvl.gz | grep -v chrM | grep -v ContigF`) set c = $f:t:r:r:r echo $c set d = `echo $c | sed 's/chr//;s/_random//'` # contig names in quality files are slightly different from AGP's sed 's/Contig/bld2_Cont/' $b/$d/$c.agp > agp.tmp nice zcat wustl/$c.contigs.qvl | \ nice qaToQac stdin stdout | \ nice qacAgpLift agp.tmp stdin qac/$c.qac end date 'EOF' # << happy emacs csh makeQuals.csh >&! makeQuals.log & #chr7 #AC161123.3 not found # need special handling for chr7, that has both contigs # (with quality scores), and clones (w/o) # For chimpHiQualDiffs, construct a chr7 in one piece, with # missing data filled in with score=98 (manually assigned "low quality"), # although these are clones, and should have high quality. # By request of Daryl Thomas sed 's/Contig/bld2_Cont/' /cluster/data/panTro2/7/chr7.agp > agp.tmp nice zcat wustl/chr7.contigs.qvl.gz | \ nice qaToQac stdin stdout | \ nice qacAgpLift -mScore=98 agp.tmp stdin qac/chr7.qac # Redo chr7 with finished clone scores newly available (2006-07-13) wget -nd -r ftp://genome.wustl.edu/private/723378195268200/chimpqual # oops -- no longer at this site -- inquiring with Joanne Nelson # Also, create dummy chrY, chrY_random, chr21, all of which # are finished sequence from clones cat > makeFakeQuals.csh << 'EOF' date foreach c (chr21 chrY chrY_random) echo $c set d = `echo $c | sed 's/chr//;s/_random//'` # pick a small set of quality scores, needed so utils don't complain nice zcat wustl/chr22_random.contigs.qvl.gz | \ nice qaToQac stdin stdout | \ nice qacAgpLift -mScore=98 /cluster/data/panTro2/$d/$c.agp \ stdin qac/$c.qac end date 'EOF' # And a dummy chrM echo 'chrM 1 16554 1 W Unknown 1 16554 +' \ > agp.tmp nice zcat wustl/chr7.contigs.qvl.gz | \ nice qaToQac stdin stdout | \ nice qacAgpLift -mScore=98 agp.tmp stdin qac/chrM.qac # Load track table, excluding "dummy" chroms cat > toWig.csh << 'EOF' foreach f (`ls qac/chr*.qac | egrep -v 'chrY|chr21|chrM'`) qacToWig -fixed $f stdout end 'EOF' # << happy emacs date csh toWig.csh | nice wigEncode stdin quality.wig quality.wib date ssh hgwdev cd /cluster/data/panTro2/bed/quality ln -s /cluster/data/panTro2/bed/quality/quality.wib /gbdb/panTro2/wib hgLoadWiggle panTro2 quality quality.wig # create downloads for quality scores ssh kkstore04 cd /cluster/data/panTro2/bed/quality mkdir qa cat > makeDownload.csh >> 'EOF' foreach f (qac/chr*.qac) set c = $f:t:r echo $c qacToQa $f stdout | gzip > qa/$c.qa.gz end cd qa tar cvfz chromQuals.tar.gz chr*.qa.gz 'EOF' rm -fr qa ssh hgwdev ln -s /cluster/data/panTro2/bed/quality/chromQuals.tar.gz \ /usr/local/apache/htdocs/goldenPath/panTro2/bigZips # edit README.txt and update md5sum.txt ######################## # CREATE DATABASE AND GRP TABLE (DONE 2006-02-28 kate) ssh hgwdev hgsql '' -e 'create database panTro2' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # /dev/sdc1 1.8T 1.6T 71G 96% /var/lib/mysql # oops -- not enough ? hgsql panTro2 -e \ "create table grp (PRIMARY KEY(NAME)) select * from mm8.grp" # remove encode track groups hgsql panTro2 -e "delete from grp where name like 'encode%'" ######################## # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 2006-02-28 kate) # Redo to fix chr9 (2006-03-02 kate) # Make .2bit, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kkstore01 cd /cluster/data/panTro2 nice faToTwoBit ?{,?}/chr*.fa 6_hla_hap1/chr*.fa panTro2.2bit mkdir bed/chromInfo twoBitInfo panTro2.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/panTro2/panTro2.2bit";}' \ > bed/chromInfo/chromInfo.tab # Make symbolic links from /gbdb/panTro2/ to the real .2bit. ssh hgwdev mkdir /gbdb/panTro2 ln -s /cluster/data/panTro2/panTro2.2bit /gbdb/panTro2/ # Load /gbdb/panTro2/panTro2.2bit paths into database and save size info. cd /cluster/data/panTro2 hgsql panTro2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql panTro2 -e 'load data local infile \ "/cluster/data/panTro2/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' echo "select chrom,size from chromInfo" | hgsql -N panTro2 > chrom.sizes wc chrom.sizes # 52 # GOLD AND GAP TRACKS (DONE 2006-02-28 kate) # Redo to fix chr9 (2006-03-02 kate) # Change chrY centromere to unbridged (2006-07-11 kate) ssh hgwdev cd /cluster/data/panTro2 cp -p wustl/chrY.agp wustl/chrY.agp.old sed '/centromere/s/yes/no/' wustl/chrY.agp.old > wustl/chrY.agp cp wustl/chrY.agp Y cat wustl/*.agp | grep -v chrM | hgGoldGapGl -noGl panTro2 stdin [kate@hgwdev panTro2]$ nice featureBits panTro2 -countGaps -noRandom gap 423043673 bases of 3175632892 (13.322%) in intersection [kate@hgwdev panTro2]$ nice featureBits panTro1 -countGaps -noRandom gap 671583599 bases of 3083993401 (21.776%) in intersection [kate@hgwdev panTro2]$ nice featureBits hg15 -countGaps -noRandom gap 238329157 bases of 3070537687 (7.762%) in intersection [kate@hgwdev panTro2]$ nice featureBits hg18 -countGaps -noRandom gap 222401287 bases of 3091592211 (7.194%) in intersection chr1 12771775 bases of 229974691 (5.554%) in intersection chr10 9298149 bases of 135001995 (6.887%) in intersection chr11 10601065 bases of 134204764 (7.899%) in intersection chr12 5495847 bases of 135371336 (4.060%) in intersection chr13 28069601 bases of 115868456 (24.225%) in intersection chr14 21093350 bases of 107349158 (19.649%) in intersection chr15 23087195 bases of 100063422 (23.073%) in intersection chr16 16171896 bases of 90682376 (17.834%) in intersection chr17 9948345 bases of 83384210 (11.931%) in intersection chr18 3076876 bases of 77261746 (3.982%) in intersection chr19 12470245 bases of 64473437 (19.342%) in intersection chr20 4187763 bases of 62293572 (6.723%) in intersection chr21 13764311 bases of 46489110 (29.608%) in intersection chr22 17821597 bases of 50165558 (35.526%) in intersection chr2a 8581091 bases of 114460064 (7.497%) in intersection chr2b 120728260 bases of 248603653 (48.563%) in intersection chr3 8990241 bases of 203962478 (4.408%) in intersection chr4 7932593 bases of 194897272 (4.070%) in intersection chr5 8761164 bases of 183994906 (4.762%) in intersection chr6 9202606 bases of 173908612 (5.292%) in intersection chr6_hla_hap1 0 bases of 34169 (0.000%) in intersection chr7 9183564 bases of 160261443 (5.730%) in intersection chr8 6927299 bases of 145085868 (4.775%) in intersection #chr9 115285361 bases of 224587821 (51.332%) in intersection # Seems too big -- notified LaDeana of huge gap (80MB) # She fixed the AGP, and here's the result chr9 29207531 bases of 138509991 (21.087%) in intersection chrM 0 bases of 16554 (0.000%) in intersection chrUn 8188633 bases of 58616431 (13.970%) in intersection chrX 24409881 bases of 155361357 (15.712%) in intersection chrY 1261428 bases of 23952694 (5.266%) in intersection # HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 2006-03-03 kate) ssh hgwdev cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add in all the right places and do make update cvs commit makefile mkdir -p chimp/panTro2 cvs add chimp/panTro2 cvs ci -m "trackDb dir for second chimp genome(s)" chimp/panTro2 # Do this in a clean (up-to-date, no edits) tree: make alpha DBS=panTro2 # Add dbDb entry (not a new organism so defaultDb and genomeClade already # have entries): hgsql -h genome-testdb hgcentraltest \ -e 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("panTro2", "Jan. 2006", \ "/gbdb/panTro2", "Chimp", "chr7:115705331-115981791", 1, \ 15, "Chimp", "Pan troglodytes", \ "/gbdb/panTro2/html/description.html", 0, 0, \ "Washington University Build 2 Version 1");' # SPLIT SEQUENCE FOR REPEATMASKER (2006-03-01 kate) # Split into 500K chunks, at gaps if possible ssh kkstore01 cd /cluster/data/panTro2 mkdir split500K foreach d (`cat chrom.lst`) set c = chr$d mkdir split500K/$c faSplit gap -minGapSize=100 -verbose=2 $d/$c.fa 500000 \ split500K/$c/${c}_ -lift=split500K/$c.lft set r = ${c}_random if (-e $d/$r.fa) then mkdir split500K/$r faSplit gap -minGapSize=100 -verbose=2 $d/$r.fa 500000 \ split500K/$r/${r}_ -lift=split500K/$r.lft endif end mkdir lifts cat split500K/*.lft > lifts/split500K.lft # Redo chr9 (2006-03-02 kate) rm -fr split500K/chr9 mkdir split500K/chr9 faSplit gap -minGapSize=100 -verbose=2 9/chr9.fa 500000 \ split500K/chr9/chr9_ -lift=split500K/chr9.lft # forgot to redo this -- doing it now (2006-03-21 kate) cat split500K/*.lft > lifts/split500K.lft # REPEATMASKER RUN (2006-03-01 kate) # Using -species pan option to get chimp repeats # Redone 3/10 with additional chimp repeats from # Evan Eichler (left out of -pan lib) ssh pk cd /cluster/data/panTro2 mkdir RMRun cd RMRun # Record RM version used ls -l /cluster/bluearc/RepeatMasker #lrwxrwxrwx 1 hiram protein 18 Jan 20 13:13 /cluster/bluearc/RepeatMasker -> RepeatMasker060120/ cat /cluster/bluearc/RepeatMasker/Libraries/version > RM.version #RM database version 20060120 # Run RepeatMasker on a dummy input, just to make it initialize its chimp # libraries once before the cluster run /cluster/bluearc/RepeatMasker/RepeatMasker -spec pan /dev/null # Building species libraries in: /cluster/bluearc/RepeatMasker060120/Libraries/20060120/pan pushd /cluster/data/panTro2/split500K ls -1S */*.fa | sort > ../RMRun/split.lst popd cat << 'EOF' > template #LOOP ./RMChimp.csh $(dir1) $(root1) {check out line out/$(dir1)/$(root1).out} #ENDLOOP 'EOF' # << for emacs cat > RMChimp.csh << 'EOF' #!/bin/csh -ef set d = /cluster/data/panTro2 set tmp = /scratch/tmp/panTro2/$2 mkdir -p $tmp mkdir -p out/$1 cp $d/split500K/$1/$2.fa $tmp pushd $tmp /cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species pan $2.fa popd cp -p $tmp/$2.fa.out $3 if (-e $tmp/$2.fa.align) cp $tmp/$2.fa.align out/$1 if (-e $tmp/$2.fa.tbl) cp $tmp/$2.fa.tbl out/$1 if (-e $tmp/$2.fa.cat) cp $tmp/$2.fa.cat out/$1 rm -fr $tmp/* rmdir --ignore-fail-on-non-empty $tmp rmdir --ignore-fail-on-non-empty /scratch/tmp/panTro2 'EOF' # << for emacs chmod +x RMChimp.csh gensub2 split.lst single template jobList para create jobList # 6538 jobs para try # CPU time in finished jobs: 32493557s 541559.29m 9025.99h 376.08d 1.030 y # IO & Wait Time: 40999s 683.31m 11.39h 0.47d 0.001 y # Average job time: 4976s 82.94m 1.38h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7607s 126.78m 2.11h 0.09d # Submission to last job: 121890s 2031.50m 33.86h 1.41d # 34 hours # Lift up the 500KB .out's to chrom level ssh kkstore01 cd /cluster/data/panTro2/RMRun cat > lift.csh << 'EOF' foreach d (`cat ../chrom.lst`) set c = chr$d echo $c liftUp ../$d/$c.fa.out ../lifts/split500K.lft error out/$c/*.out set r = ${c}_random if (-e out/$r) then echo $r liftUp ../$d/$r.fa.out ../lifts/split500K.lft error out/$r/*.out endif end 'EOF' # << for emacs csh lift.csh >&! lift.log & # Redo chr9 after remaking split500K.lft (2006-03-24 kate) liftUp ../9/chr9.fa.out ../lifts/split500K.lft error out/chr9/*.out # Load .outs into database # Redone after relift (2006-03-24 kate) ssh hgwdev cd /cluster/data/panTro2 ls */*.fa.out | wc -l # 52 (one per chrom)... load them up hgLoadOut panTro2 */*.fa.out # Several messages like: Strange perc. field -3.1 line 327855 of 1/chr1.fa.out # And at end: 953 records dropped due to repStart > repEnd # These are noted in rn4 and other make docs. # TODO: run it with -verbose=2 and send output to Robert Hubley/Arian Smit # VERIFY REPEATMASKER RESULTS (2006-03-03 kate) # TODO: Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on previous genome build, and compare: #nice featureBits panTro2 rmsk # w/o 2 Eichler repeats: # 1396630330 bases of 2909512873 (48.002%) in intersection # with 2 Eichler repeats: # 1401134418 bases of 2909512873 (48.157%) in intersection nice featureBits panTro1 rmsk # 1311281819 bases of 2733948177 (47.963%) in intersection # after chr9 .out reload... # 1311281819 bases of 2733948177 (47.963%) in intersection # looks good -- similar but just a bit higher ssh hgwdev cd /cluster/data/panTro2/RMRun # take a look at hgsql panTro2 -e "select distinct(repFamily) from chr1_rmsk" > repFamilies.panTro2 hgsql panTro2 -e "select distinct(repName) from chr1_rmsk where repFamily <> 'Simple_repeat' and repFamily <> 'Low_complexity' order by repName" > repNames.panTro2 wc -l repNames.panTro2 hgsql panTro1 -e "select distinct(repFamily) from chr1_rmsk" > repFamilies.panTro1 hgsql panTro1 -e "select distinct(repName) from chr1_rmsk" > repNames.panTro1 hgsql panTro1 -e "select distinct(repName) from chr1_rmsk where repFamily <> 'Simple_repeat' and repFamily <> 'Low_complexity' order by repName" > repNames.panTro1 wc -l *Names* # 864 repNames.panTro1 # 906 repNames.panTro2 # 40 new repeats (+ 2 added from Eichler) wc -l *Families* # 38 repFamilies.panTro1 # 38 repFamilies.panTro2 # no new repeat families # RUN TRF TO IDENTIFY SIMPLE REPEATS (2006-03-01 kate) ssh kkstore01 cd /cluster/data/panTro2 # First break up sequence into 5MB chunks at contigs/gaps # NOTE: Probably unnecessary -- Hiram has run on whole chroms # using small cluster mkdir split5M foreach d (`cat chrom.lst`) set c = chr$d mkdir split5M/$c faSplit gap -minGapSize=100 -verbose=2 $d/$c.fa 5000000 \ split5M/$c/${c}_ -lift=split5M/$c.lft set r = ${c}_random if (-e $d/$r.fa) then mkdir split5M/$r faSplit gap -minGapSize=100 -verbose=2 $d/$r.fa 5000000 \ split5M/$r/${r}_ -lift=split5M/$r.lft endif end mkdir lifts cat split5M/*.lft > lifts/split5M.lft # Run TRF mkdir -p bed/simpleRepeat/trf cd bed/simpleRepeat cat > runTrf.csh << 'EOF' date foreach d (`find /cluster/data/panTro2/split5M/* -type d`) cd $d foreach f (*.fa) set b = $f:r echo $f trfBig -trf=/cluster/bin/i386/trf $f /dev/null -tempDir=/tmp \ -bedAt=/cluster/data/panTro2/bed/simpleRepeat/trf/$b.bed end end date 'EOF' csh runTrf.csh >&! runTrf.log & # ~6 hours run-time # Redo chr9 (2006-03-02 kate) cd /cluster/data/panTro2 rm -fr split5M/chr9 mkdir split5M/chr9 faSplit gap -minGapSize=100 -verbose=2 9/chr9.fa 5000000 \ split5M/chr9/chr9_ -lift=split5M/chr9.lft cd bed/simpleRepeat cp runTrf.csh runTrf.chr9.csh # edit for just chr9 csh runTrf.chr9.csh >&! runTrf.chr9.log & # check for EOF in output files to give some assurance endsInLf trf/* # trf/chrM_0.bed zero length # lift to chromosome coordinates liftUp simpleRepeat.bed /cluster/data/panTro2/lifts/split5M.lft \ warn trf/*.bed # forgot to include the new lift in the all split (2006-03-21 kate) # so do it now and reload table cd /cluster/data/panTro2 cat split5M/*.lft > lifts/split5M.lft cd bed/simpleRepeat liftUp simpleRepeat.bed /cluster/data/panTro2/lifts/split5M.lft \ warn trf/*.bed # load into database ssh hgwdev cd /cluster/data/panTro2/bed/simpleRepeat hgLoadBed panTro2 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql nice featureBits panTro2 simpleRepeat # 82897828 bases of 2909512873 (2.849%) in intersection nice featureBits panTro1 simpleRepeat # 53732632 bases of 2733948177 (1.965%) in intersection # seems quite a bit higher -- compare it to hg17 chrom1 # nice featureBits panTro2 simpleRepeat -chrom=chr1 # 3068104 bases of 217202916 (1.413%) in intersection nice featureBits hg17 simpleRepeat -chrom=chr1 # 3438627 bases of 222827847 (1.543%) in intersection # darn similar # PROCESS SIMPLE REPEATS INTO MASK # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/panTro2/bed/simpleRepeat mkdir -p trfMask trfMaskChrom foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # lift filtered simple repeats to chrom coordinates, by chrom cat > liftChroms.csh << 'EOF' foreach d (`cat /cluster/data/panTro2/chrom.lst`) set c = chr$d echo $c liftUp trfMaskChrom/$c.bed /cluster/data/panTro2/split5M/$c.lft \ warn trfMask/${c}_?{,?}.bed set r = ${c}_random if (-e /cluster/data/panTro2/$d/$r.fa) then liftUp trfMaskChrom/$r.bed /cluster/data/panTro2/split5M/$r.lft \ warn trfMask/${r}_?{,?}.bed endif end 'EOF' # << for emacs csh liftChroms.csh >&! liftChroms.log & # check coverage of filtered TRF hgwdev cd /cluster/data/panTro2/bed/simpleRepeat cat trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits panTro2 /tmp/filtTrf.bed # 28800863 bases of 2909512873 (0.990%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (2006-03-03) # soft-mask (lower-case) the chrom fasta files # then make hard-masked versions from the soft-masked # note: next time simplify this (as below for chr9) ssh kkstore01 cd /cluster/data/panTro2 set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (?{,?}/chr*.fa 6_hla_hap1/chr*.fa) cp $f $f.unmasked echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f cp $f $f.rmsk set c = $f:t:r maskOutFa -softAdd $f $trfChr/$c.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # remask chr9 after relifting .out's (2006-03-24) set f = 9/chr9.fa set c = chr9 maskOutFa -soft $f.unmasked $f.out $f.rmsk maskOutFa -softAdd $f.rmsk $trfChr/$c.bed $f maskOutFa $f hard $f.masked # a few messages (e.g. 10 per chrom) like: # WARNING: negative rEnd: -531 chr1:14090705-14090763 L1MEc # after checking, remove .unmasked and .rmsk # make 2bit for browser and blat # redone for chr9 correction (2006-03-24 kate) faToTwoBit ?{,?}/chr*.fa 6_hla_hap1/chr*.fa panTro2.2bit mkdir /cluster/bluearc/scratch/hg/panTro2 ssh kkr1u00 cp -p /cluster/data/panTro2/panTro2.2bit /iscratch/i/panTro2/panTro2.2bit ~kent/bin/iSync # after chr9 remask: cp -p /cluster/data/panTro2/panTro2.2bit /iscratch/i/panTro2 cp -p /cluster/data/panTro2/nib/chr9.nib /iscratch/i/panTro2/nib ~kent/bin/iSync # make nibs for blastz w/linSpecRep mkdir nib foreach f (?{,?}/chr*.fa 6_hla_hap1/chr*.fa) echo $f:t:r faToNib -softMask $f nib/$f:t:r.nib end # remake chr9 (2006-03-24 kate) faToNib -softMask 9/chr9.fa nib/chr9.nib # MAKE 11.OOC # #Remade after chr9 remask (2006-03-24 kate) ssh kolossus cd /cluster/data/panTro2 mkdir /cluster/bluearc/panTro2 blat panTro2.2bit /dev/null /dev/null \ -tileSize=11 -makeOoc=/cluster/bluearc/panTro2/11.ooc -repMatch=1024 cp -p /cluster/bluearc/panTro2/11.ooc . # COPY SEQUENCE TO CLUSTER NODES FOR CLUSTER RUNS set scratch = /cluster/bluearc/scratch/hg/panTro2 mkdir $scratch cp -p panTro2.2bit $scratch cp -rp nib $scratch cp -rp chrom.sizes $scratch cp -p 11.ooc $scratch # request cluster-admin sync /scratch/hg/panTro2 to cluster nodes # update 2bit and chr9 nib after updating chr9 masking cp -p panTro2.2bit $scratch cp -p nib/chr9.nib $scratch cp -p 11.ooc $scratch # request cluster-admin sync /scratch/hg/panTro2 to cluster nodes # GENBANK ALIGNMENTS (DONE 2006-03-25 kate/markd) # restarted 3/13 4pm # Make a lift file that identifies gap locations for Genbank # This is a pseudo "liftAll" made from the AGP files, however negative # strand contigs are renamed to indicate they need to be # reversed. ssh kkstore01 cd /cluster/data/panTro2 cat ?{,?}/*.agp 6_hla_hap1/*.agp | agpToLift -revStrand > \ lifts/genbank.lft ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add panTro2 # the hack in the genbank code for panTro1 as changed for # panTro2. panTro2 treats both chimp and human cDNAs as # native, and all else as xeno. Don't do xeno ESTs any more. panTro2.serverGenome = /cluster/data/panTro2/panTro2.2bit panTro2.clusterGenome = /scratch/hg/panTro2/panTro2.2bit panTro2.ooc = /cluster/bluearc/panTro2/11.ooc panTro2.align.unplacedChroms = chrUn,chr*_random panTro2.lift = /cluster/data/panTro2/lifts/genbank.lft panTro2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} panTro2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} panTro2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} panTro2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} panTro2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} panTro2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} panTro2.downloadDir = panTro2 panTro2.genbank.est.xeno.load = no panTro2.refseq.mrna.native.load = yes panTro2.refseq.mrna.xeno.load = yes panTro2.refseq.mrna.xeno.loadDesc = yes cvs ci etc/genbank.conf make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial panTro2 & # various problems with ssh, bluearc, etc.... nice bin/gbAlignStep -initial -continue=copy panTro2 & ssh hgwdev nice bin/gbDbLoadStop -initialLoad -drop panTro2 & # MAKE LINEAGE-SPECIFIC REPEATS FOR BLASTZ (2006-03-15 kate) ssh kkstore01 cd /cluster/data/panTro2 mkdir linSpecRep cd linSpecRep mkdir dates cd dates # run Arian's script to annotate .out files with species-specificity # include all currently supported comparison species cat > outRepeats.csh << 'EOF' date set d = /cluster/data/panTro2 foreach f ($d/?{,?}/chr*.fa.out $d/6_hla_hap1/chr*.fa.out) set c = $f:t:r:r echo $c cp $f . /cluster/bluearc/RepeatMasker060120/DateRepeats $c.fa.out -query human \ -comp mouse -comp rat -comp dog -comp cow -comp rabbit mv $c.fa.*cuniculus $c.fa.dates.out end date 'EOF' # << for emacs csh outRepeats.csh >&! outRepeats.log & # 25 minutes # Redo for fixed chr9 (2006-03-24 kate) cp /cluster/data/panTro2/9/chr9.fa.out . set c = chr9 /cluster/bluearc/RepeatMasker060120/DateRepeats $c.fa.out -query human \ -comp mouse -comp rat -comp dog -comp cow -comp rabbit mv $c.fa.*cuniculus $c.fa.dates.out # run our script to extract lineage-specific by column # from the annotated .outs # just run on chr1 to assess which variants produce any # differences. mkdir testChr1 cd testChr1 cat > testChr1.csh << 'EOF' set script = /cluster/bin/scripts/extractRepeats set f = ../chr1.fa.dates.out echo mouse $script 1 $f > notInMouse.out.spec echo rat $script 2 $f > notInRat.out.spec echo dog $script 3 $f > notInDog.out.spec echo cow $script 4 $f > notInCow.out.spec echo rabbit $script 5 $f > notInRabbit.out.spec 'EOF' csh testChr1.csh >&! testChr1.log & ls -l *.spec -rw-rw-r-- 1 kate protein 17678799 Mar 15 09:51 notInCow.out.spec -rw-rw-r-- 1 kate protein 17678799 Mar 15 09:51 notInDog.out.spec -rw-rw-r-- 1 kate protein 17773939 Mar 15 09:48 notInHuman.out.spec -rw-rw-r-- 1 kate protein 17773939 Mar 15 09:50 notInMouse.out.spec -rw-rw-r-- 1 kate protein 17773939 Mar 15 09:51 notInRabbit.out.spec -rw-rw-r-- 1 kate protein 17773939 Mar 15 09:50 notInRat.out.spec # this indicates there are 2 unique LSR files: # 2) notInRodent (includes Mouse, Rat, Rabbit) # 3) notInOther (includes Cow & Dog) # Extract mouse (column 1) and dog (column 3) cd /cluster/data/panTro2/linSpecRep mkdir notInRodent notInOthers cat > makeLSR.csh << 'EOF' foreach f (dates/*.fa.dates.out) set c = $f:t:r:r:r echo $c set out = $c.out.spec /cluster/bin/scripts/extractRepeats 1 $f > notInRodent/$out /cluster/bin/scripts/extractRepeats 3 $f > notInOthers/$out end 'EOF' # << happy emacs csh makeLSR.csh >&! makeLSR.log & # copy to bluearc for blastz runs set d = /cluster/bluearc/panTro2/linSpecRep mkdir -p $d cp -rp notInRodent $d cp -rp notInOthers $d # update for chr9 (2006-03-24 kate) set c = chr9 extractRepeats 1 dates/$c.fa.dates.out > \ notInRodent/$c.out.spec extractRepeats 3 dates/$c.fa.dates.out > \ notInOthers/$c.out.spec cp -rp notInRodent $d cp -rp notInOthers $d # CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH (2006-04-07 kate) # Treat all repeats as lineage-specific ssh kkstore04 cd /cluster/data/panTro2 set d = /cluster/bluearc/panTro2/linSpecRep/notInZebrafish mkdir $d foreach f ({{?{,?}},6_hla_hap1}/chr*.fa.out) set c = $f:t:r:r echo $c cp -p $f $d/$c.out.spec end # SET UP BLAT SERVER (2006-03-06 kate) # Request blat server from cluster-admin, then enter ports in database ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panTro2", "blat12", "17790", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panTro2", "blat12", "17791", "0", "1");' \ hgcentraltest # test it with some sequence # GC 5 BASE TRACK (2006-03-06 kate) ssh kkstore01 cd /cluster/data/panTro2/bed mkdir gc5base cd gc5base time hgGcPercent -wigOut -doGaps -file=stdout -win=5 panTro2 \ /cluster/data/panTro2 | wigEncode stdin gc5Base.wig gc5Base.wib sh hgwdev cd /cluster/data/panTro2/bed/gc5base mkdir -p /gbdb/panTro2/wib ln -s `pwd`/*.wib /gbdb/panTro2/wib hgLoadWiggle panTro2 gc5Base gc5Base.wig # CPGISLANDS (2006-03-06 - kate) # Redo 3/13/06 after second Repeatmasker run # Redone for chr9 remasking (2006-03-24 kate) ssh hgwdev mkdir /cluster/data/panTro2/bed/cpgIsland cd /cluster/data/panTro2/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe cd ../.. ln -s hg3rdParty/cpgIslands/cpglh.exe . # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. ssh kkstore01 cd /cluster/data/panTro2/bed/cpgIsland cat > doCpg.csh << 'EOF' foreach f (../../?{,?}/chr*.fa.masked ../../6_hla_hap1/chr*.fa.masked) set c = $f:t:r:r echo $c ./cpglh.exe $f > $c.cpg end 'EOF' # << happy emacs csh doCpg.csh >&! doCpg.log & # Several chroms have 0 results: # -rw-rw-r-- 1 0 Feb 16 15:19 chr10_random.cpg # -rw-rw-r-- 1 0 Feb 16 15:20 chr15_random.cpg # -rw-rw-r-- 1 0 Feb 16 15:22 chr8_random.cpg # -rw-rw-r-- 1 0 Feb 16 15:22 chr9_random.cpg # -rw-rw-r-- 1 0 Feb 16 15:22 chrM.cpg # -rw-rw-r-- 1 0 Feb 16 15:22 chrX_random.cpg # -rw-rw-r-- 1 0 Feb 16 15:22 chrY.cpg # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # happy emacs awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed ssh hgwdev cd /cluster/data/panTro2/bed/cpgIsland hgLoadBed -strict panTro2 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed featureBits panTro2 cpgIslandExt # 19412837 bases of 2909512873 (0.667%) in intersection # BAC Ends (2006-03-08 IN PROGRESS kate) # download BAC ends from NCBI. ssh kkstore cd /cluster/data/ncbi/bacends/chimp mkdir bacends.chimp.2 cd bacends.chimp.2 # wget ftp.ncbi.nih.gov/genomes/CLONEEND/pan_troglodytes/* # trouble with wget on files in this dir -- they're hanging, # so I used ftp mget w/o prompt # creates files: 9598_clone_end???.mfa and 95998_clone_info???.txt # 10 files of each type # NOTE: the *.info files contain a superset of the data in the # previous cl_acc_gi_len file -- extract fields 1-5 and 8 to # use as input to Terry's script that generaes the .pair file # for compatibility with downstream tools ftp ftp.ncbi.nih.gov cd /genomes/CLONEEND/pan_troglodytes prompt mget * gunzip *.gz # setup for track build area cd /cluster/data/panTro2 mkdir -p bed/bacends cd bed/bacends ln -s /cluster/data/ncbi/bacends/chimp/bacends.chimp.2 ncbi # generate pairs info file by/for processing by Terry Furey's scripts grep -h -v '^#' ncbi/*.txt | \ awk '{printf ("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $8)}' \ > cl_acc_gi_len wc -l cl_acc_gi_len # 175757 cl_acc_gi_len # previous load (2004): 158588 entries faSize ncbi/*.mfa # 175757 sequences in 10 files # Info and sequence catch match -- good! /cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len # 85911 pairs and 3935 singles # previous: 78846 pairs and 896 singles # Note large increase in singles -- either lower quality in # new data set, or script could use some updating. # Just note this for now. wc -l bacEndPairs.txt bacEndSingles.txt # 85911 bacEndPairs.txt # 3935 bacEndSingles.txt # create sequence file cp /cluster/data/ncbi/bacends/chimp/bacends.chimp.1/convert.pl . cat ncbi/*.mfa | ./convert.pl > BACends.fa faSize BACends.fa # 175757 sequences # make accessible to track ssh hgwdev mkdir -p /gbdb/panTro2/bacends ln -s /cluster/data/panTro2/bed/bacends/BACends.fa /gbdb/panTro1/bacends # split for aligning on cluster ssh kkstore01 cd /cluster/data/panTro2/bed/bacends set tmp = /san/sanvol1/scratch/panTro2/bacends/ mkdir -p $tmp faSplit BACends.fa sequence 10 $tmp # blat vs. unmasked sequence in 5M chunks # 652 chunks * 10 bacends files = 6520 jobs ssh pk cd /cluster/data/panTro2/bed/bacends mkdir run cd run mkdir ../out # list chrom chunks and bacends chunks set dir = /cluster/data/panTro2/split5M ls $dir/*/*.fa | sed "s^$dir/^^" > chromSplit.lst set dir = /san/sanvol1/scratch/panTro2/bacends ls $dir/*.fa | sed "s^$dir/^^" > bacends.lst cat > align.csh << 'EOF' #!/bin/csh -ef set d = $1:h set f = $1:t set e = $2 mkdir -p ../out/$d set tmp = /scratch/tmp/panTro2/align.$$ mkdir -p $tmp cp /cluster/data/panTro2/split5M/$d/$f $tmp cp /san/sanvol1/scratch/panTro2/bacends/$e $tmp blat $tmp/$f $tmp/$e -ooc=/cluster/bluearc/panTro2/11.ooc ../out/$d/$f:r.$e:r.psl rm -fr $tmp 'EOF' # << happy emacs chmod +x align.csh cat > gsub << 'EOF' #LOOP ./align.csh $(path1) $(path2) {check out line+ ../out/$(dir1)/$(root1).$(root2).psl} #ENDLOOP 'EOF' # << happy emacs gensub2 chromSplit.lst bacends.lst gsub jobList para create jobList # 6520 jobs para try # NOTE: these run quickly ######################## # MAKE DOWNLOADABLE SEQUENCE FILES (2006-04-02 kate) ssh kkstore04 cd /cluster/data/panTro2 mkdir -p downloads/bigZips downloads/chromosomes cd downloads cat > makeChroms.csh << 'EOF' date foreach f (../?{,?}/*.fa ../6_hla_hap1/*.fa) echo $f:t:r nice gzip -c $f > chromosomes/$f:t.gz end date 'EOF' csh makeChroms.csh >&! makeChroms.log & # 14 minutes cat > bigZips.csh << 'EOF' cd /cluster/data/panTro2 tar cvzf downloads/bigZips/chromAgp.tar.gz \ ?{,?}/chr*.agp 6_hla_hap1/*.agp tar cvzf downloads/bigZips/chromFa.tar.gz \ ?{,?}/chr*.fa 6_hla_hap1/chr*.fa tar cvzf downloads/bigZips/chromFaMasked.tar.gz \ ?{,?}/chr*.fa.masked 6_hla_hap1/chr*.fa.masked tar cvzf downloads/bigZips/chromOut.tar.gz \ ?{,?}/chr*.fa.out 6_hla_hap1/chr*.fa.out cd /cluster/data/panTro2/bed/simpleRepeat tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMaskChrom 'EOF' csh bigZips.csh >&! bigZips.log & # get GenBank native mRNAs and refGene ssh hgwdev cd /cluster/data/genbank time ./bin/i386/gbGetSeqs -db=panTro2 -native GenBank mrna \ /cluster/data/panTro2/downloads/bigZips/mrna.fa cd /cluster/data/panTro2/downloads/bigZips cd /cluster/data/panTro2/downloads/bigZips foreach I (1000 2000 5000) echo "upstream${I} working ... " featureBits panTro2 refGene:upstream:${I} -fa=stdout \ | gzip -c > upstream${I}.fa.gz echo "upstream${I} done" end # real 11m25.493s ssh kkstore04 cd /cluster/data/panTro2/downloads/chromosomes # copy previous release README.txt scp hgwdev:/usr/local/apache/htdocs/goldenPath/panTro1/chromosomes/README.txt . # edit it to bring it up to date md5sum *.gz > md5sum.txt cd /cluster/data/panTro2/downloads/bigZips gzip mrna.fa cp -p ../../panTro2.2bit . # copy previous release README.txt scp hgwdev:/usr/local/apache/htdocs/goldenPath/panTro1/bigZips/README.txt . # edit README.txt to indicate proper version of sequence and # RepeatMasker md5sum *.gz *.2bit README.txt > md5sum.txt ssh hgwdev set d = /usr/local/apache/htdocs/goldenPath/panTro2 mkdir $d ln -s /cluster/data/panTro2/downloads/bigZips $d ln -s /cluster/data/panTro2/downloads/chromosomes $d ######################## # BLASTZ HUMAN hg18 (2006-05-23 kate) # Do not use lineage-specific repeats, on recommendation of Arian Smit # Redo with human/chimp scoring matrix ssh pk cd /cluster/data/panTro2/bed mkdir blastz.hg18.2006-05-23 rm blastz.hg18 ln -s blastz.hg18.2006-05-23 blastz.hg18 cd blastz.hg18 cat << '_EOF_' > DEF # chimp vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Chimmp panTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes # QUERY: Human hg18 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/hg18/hg18.2bit SEQ2_LEN=/scratch/hg/hg18/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-05-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \ `pwd`/DEF >&! blastz.out & ######################## # BLASTZ HUMAN hg18 (2006-03-13 kate) # Do not use lineage-specific repeats, on recommendation of Arian Smit ssh pk cd /cluster/data/panTro2/bed mkdir blastz.hg18.2006-03-13 ln -s blastz.hg18.2006-03-13 blastz.hg18 cd blastz.hg18 cat << '_EOF_' > DEF # chimp vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Chimmp panTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes # QUERY: Human hg18 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/hg18/hg18.2bit SEQ2_LEN=/scratch/hg/hg18/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \ `pwd`/DEF >&! blastz.out & # failed during chaining step, because /scratch/hg/panTro2 # doesn't exist on minicluster nodes # for now, just symlink /iscratch/i/panTro2 there # so that the 2bit is available to this pipeline -- later # we'll make sure the nodes are completely synced up. /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \ -continue=chainRun \ `pwd`/DEF >&! blastz.2.out & # chr19 failed with out-of-mem, so try it on kolossus # moved to hgwdev by request.... ssh kolossus cd /cluster/data/panTro2/bed/blastz.hg18/axtChain/run csh chain.csh panTro2.2bit:chr19: chain/panTro2.2bit:chr19:.chain >&! \ chainChr19.log & # Create run.time file and remove liftedChain so we can continue /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -continue=chainMerge \ `pwd`/DEF >&! blastz.3.out & # SWAP ALIGNMENTS TO PANTRO2 (2006-03-19 kate) /cluster/bin/scripts/doBlastzChainNet.pl -swap \ -workhorse=hgwdev64 \ /cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log & # Redo of chr9 (2006-03-28 kate) # Create blastz run dir for just chr9, and run manually on pk. ssh pk cd /cluster/data/panTro2/bed/blastz.hg18 mkdir run.blastz.chr9 cd run.blastz.chr9 cp ../run.blastz/*.csh . grep chr9 ../run.blastz/jobList | grep -v random > jobList ln -s ../run.blastz/qParts . para create jobList # 28 jobs para push para time > run.time # merge chr9 psl's on small cluster ("cat" step in automated script) ssh kki cd /cluster/data/panTro2/bed/blastz.hg18 mkdir run.cat.chr9 cd run.cat.chr9 cp ../run.cat/cat.csh . grep chr9 ../run.cat/jobList > jobList para create jobList # 14 jobs para push para time > run.time cd ../axtChain mkdir run.chr9 cd run.chr9 cp ../run/chain.csh . grep chr9 ../run/jobList > jobList mkdir chain para create jobList # 1 job para push para time > run.time # replace chr9 chain in all.chain ssh hgwdev64 # kolossus busy cd /cluster/data/panTro2/bed/blastz.hg18/axtChain/run mkdir chain set all = panTro2.hg18.all.chain.gz chainSplit chain/ ../$all rm chain/chr9.chain cp ../run.chr9/chain/*.chain chain chainMergeSort chain/*.chain | nice gzip -c > ../$all rm -fr chain/* cd .. mkdir chain chainSplit chain/ $all # automate the rest ssh pk cd /cluster/data/panTro2/bed/blastz.hg18 rm -fr axtNet mafNet ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsHg18" /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=hgwdev64 \ -continue=net \ `pwd`/DEF >&! blastz.net.chr9.out & # done # swap after chr9 fix (2006-03-30 kate) rm -fr /cluster/data/hg18/bed/blastz.panTro2.swap /cluster/bin/scripts/doBlastzChainNet.pl -swap \ -workhorse=kkr4u00 \ /cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log & # downloads failed -- need to remove the old dir ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg18/vsPanTro2" /cluster/bin/scripts/doBlastzChainNet.pl -swap -continue=download\ /cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.download.log & # Reciprocal best (DONE 2006-04-02 kate) # These are best alignments in both directions # Start with the best chains from the net (these are in the liftOver chain). # Swap to human-reference, and net them # Finally extract the best chains from this net, and make axt's # REDO with improved blastz params (2006-07-27 kate) ssh kolossus cd /cluster/data/hg18/bed ln -s blastz.panTro2.swap blastz.panTro2 cd blastz.panTro2/axtChain mkdir rBestNet cat > rbest.csh << 'EOF' set swapChain = panTro2ToHg18.swap.chain echo "merge and swap $swapChain" zcat /cluster/data/panTro2/bed/liftOver/panTro2ToHg18.over.chain.gz | \ chainSwap stdin stdout | \ chainMergeSort stdin > $swapChain echo "split chain" chainSplit swapChain/ $swapChain echo "run netter" chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \ /cluster/bluearc/panTro2/chrom.sizes stdout | \ chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \ /cluster/bluearc/panTro2/chrom.sizes \ stdout /dev/null | \ netSyntenic stdin rbest.noClass.net echo "split net" netSplit rbest.noClass.net rBestNet echo "extract chains from net" netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \ chainMergeSort stdin | \ gzip -c > hg18.panTro2.rbest.chain.gz echo "split extracted chain" chainSplit rBestChain/ hg18.panTro2.rbest.chain.gz cd .. mkdir axtRBestNet echo "extract axts" foreach f (axtChain/rBestNet/*.net) echo $f:t:r netToAxt $f axtChain/swapChain/$f:t:r.chain \ /san/sanvol1/scratch/hg18/hg18.2bit \ /cluster/bluearc/panTro2/panTro2.2bit stdout | \ axtSort stdin stdout | \ gzip -c > axtRBestNet/$f:t:r.hg18.panTro2.net.axt.gz end 'EOF' csh rbest.csh >&! rbest.log & # load chains for viewing on genome-test ssh hgwdev cd /cluster/data/hg18/bed/blastz.panTro2/axtChain set chain = hg18.panTro2.rbest.chain # skip for now -- load by request cd rBestChain cat > loadRBestChain.csh << 'EOF' date foreach c (`awk '{print $1;}' /cluster/bluearc/hg18/chrom.sizes`) echo $c set f = $c.chain if (! -e $f) then echo no chains for $c set f = /dev/null endif hgLoadChain hg18 ${c}_chainRBestPanTro2 $f end date 'EOF' cd .. #end skip for now # compare to previous set (other blastz run) and also full set featureBits hg18 chainRBestOldPanTro2Link -chrom=chr7 # 146029338 bases of 154952424 (94.241%) in intersection featureBits hg18 chainRBestPanTro2Link -chrom=chr7 # 145403551 bases of 154952424 (93.838%) in intersection featureBits hg18 chainPanTro2Link -chrom=chr7 # 148298785 bases of 154952424 (95.706%) in intersection # some additional processing of nets # add classification info netClass -noAr rbest.noClass.net hg18 panTro2 hg18.panTro2.rbest.net gzip hg18.panTro2.rbest.net # link to downloads md5sum *.gz > md5sum.txt cd .. md5sum axtNet/*.gz >> axtChain/md5sum.txt md5sum axtRBestNet/*.gz >> axtChain/md5sum.txt cd /usr/local/apache/htdocs/goldenPath/hg18/vsPanTro2 set d = /cluster/data/hg18/bed/blastz.panTro2 ln -s $d/axtChain/*.gz . mkdir axtRBestNet ln -s /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet/*.axt.gz axtRBestNet # edit README.txt # cleanup: TODO after release cd /cluster/data/hg18/bed/blastz.panTro2/axtChain rm -f rbest.noClass.net panTro2ToHg18.swap.chain rm -fr swapChain rBestChain rBestNet ######################## # BLASTZ MACAQUE rheMac2 (2006-03-14 kate) ssh pk cd /cluster/data/panTro2/bed mkdir blastz.rheMac2.2006-03-14 ln -s blastz.rheMac2.2006-03-14 blastz.rheMac2 cd blastz.rheMac2 cat << '_EOF_' > DEF # chimp vs monkey export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Chimp panTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes # QUERY: Macaque rheMac2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/rheMac2/rheMac2.2bit SEQ2_LEN=/scratch/hg/rheMac2/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.rheMac2.2006-03-14 TMPDIR=/scratch/tmp '_EOF_' # NOTE: probably should have used chainMinScore=3000 # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.rheMac2 \ `pwd`/DEF >&! blastz.out & # Started Mar 14 18:40 # Done Mar 15 18:50 # Copy MAF files to san for use in multiple alignment set dir = /san/sanvol1/scratch/panTro2/mafNet mkdir -p $dir cp -rp mafNet $dir/rheMac2 ssh hgwdev featureBits panTro2 chainRheMac2Link # crashes out-of-mem on hgwdev (try on kolossus) # SWAP ALIGNMENTS TO PANTRO2 (2006-03-15 kate) /cluster/bin/scripts/doBlastzChainNet.pl -swap \ /cluster/data/panTro2/bed/blastz.rheMac2/DEF >&! swap.log & # failed due to missing file hgwdev:/scratch/hg/rheMac2/chrom.sizes # so I copied the file and ran the "loadUp" script ssh hgwdev cd /cluster/data/rheMac2/bed/blastz.panTro2.swap/axtChain nice loadUp.csh >&! loadUp.log & # Redo of chr9 (2006-03-28 kate) # Create blastz run dir for just chr9, and run manually on pk. ssh pk cd /cluster/data/panTro2/bed/blastz.rheMac2 mkdir run.blastz.chr9 cd run.blastz.chr9 cp ../run.blastz/*.csh . grep chr9 ../run.blastz/jobList | grep -v random > jobList ln -s ../run.blastz/qParts . para create jobList # 28 jobs para try para push para time > run.time # merge chr9 psl's on small cluster ("cat" step in automated script) ssh kki cd /cluster/data/panTro2/bed/blastz.rheMac2 mkdir run.cat.chr9 cd run.cat.chr9 cp ../run.cat/cat.csh . grep chr9 ../run.cat/jobList > jobList para create jobList # 14 jobs para push para time > run.time cd ../axtChain mkdir run.chr9 cd run.chr9 cp ../run/chain.csh . grep chr9 ../run/jobList > jobList mkdir chain para create jobList # 1 job para push para time > run.time # replace chr9 chain in all.chain ssh hgwdev64 # kolossus busy cd /cluster/data/panTro2/bed/blastz.rheMac2/axtChain/run mkdir chain set all = panTro2.rheMac2.all.chain.gz chainSplit chain/ ../$all rm chain/chr9.chain cp ../run.chr9/chain/*.chain chain chainMergeSort chain/*.chain | nice gzip -c > ../$all rm -fr chain/* cd .. mkdir chain chainSplit chain/ $all # GOT HERE # automate the rest ssh pk cd /cluster/data/panTro2/bed/blastz.rheMac2 rm -fr axtNet mafNet ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsRheMac2" /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=hgwdev64 \ -continue=net \ `pwd`/DEF >&! blastz.net.chr9.out & # gotta remove another file for the automation rm axtChain/panTro2.rheMac2.net.gz /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=kkr1u00 \ -continue=load \ `pwd`/DEF >&! blastz.load.chr9.out & # missing noClass.net, work around it ssh hgwdev cd /cluster/data/panTro2/bed/blastz.rheMac2/axtChain cat net/*.net > noClass.net # create a script from loadUp.csh, containing only netClass and hgLoadNet csh loadUp2.csh >&! ../blastz.load2.chr9.out & /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=kkr1u00 \ -continue=download \ `pwd`/DEF >&! blastz.download.chr9.out & featureBits panTro2 chainRheMac2Link >&! chainRheMac2Link.fb # 2452377221 bases of 2909512873 (84.288%) in intersection # NOTE: had to run this on kolossus as hgwdev ran out of mem. # (needed to push chromInfo and chr*_gap tables over first). # swap alignments after chr9 fix ssh pk rm -fr /cluster/data/rheMac2/bed/blastz.panTro2.swap cd /cluster/data/panTro2/bed/blastz.rheMac2 /cluster/bin/scripts/doBlastzChainNet.pl -swap \ -workhorse=kkr1u00 \ `pwd`/DEF >&! swap.log & ######################## # BLASTZ DOG canFam2 (2006-03-15 kate) ssh pk cd /cluster/data/panTro2/bed mkdir blastz.canFam2.2006-03-15 ln -s blastz.canFam2.2006-03-15 blastz.canFam2 cd blastz.canFam2 cat << '_EOF_' > DEF # chimp vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for dog (per Webb email to Brian Raney) BLASTZ_ABRIDGE_REPEATS=1 # NOTE: must use nibs with repeat abridging # TARGET: Chimp panTro2 SEQ1_DIR=/scratch/hg/panTro2/nib SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.canFam2.2006-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.canFam2 \ `pwd`/DEF >&! blastz.out & # Started 2006-03-15 15:40 # Failed due to missing chimp nibs on small cluster /scratch/hg # copied them over manually, then continued, running from # fileserver, as pk is down /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainRun \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.canFam2 \ `pwd`/DEF >&! blastz.2.out & # Started 2006-03-16 17:50 # Failed due to missing chrom.sizes on hgwdev:/scratch/hg/panTro2 # Copied manually then continued /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -continue=load \ `pwd`/DEF >&! blastz.3.out & # Redo of chr9 (2006-03-28 kate) # Create blastz run dir for just chr9, and run manually on pk. ssh pk cd /cluster/data/panTro2/bed/blastz.canFam2 mkdir run.blastz.chr9 cd run.blastz.chr9 cp ../run.blastz/*.csh . grep panTro2/nib/chr9.nib ../run.blastz/jobList > jobList para create jobList # 14 jobs para try para push para time > run.time # merge chr9 psl's on small cluster ("cat" step in automated script) ssh kki cd /cluster/data/panTro2/bed/blastz.canFam2 mkdir run.cat.chr9 cd run.cat.chr9 cp ../run.cat/cat.csh . grep chr9 ../run.cat/jobList > jobList para create jobList # 15 jobs para push para time > run.time cd ../axtChain mkdir run.chr9 cd run.chr9 cp ../run/chain.csh . grep chr9 ../run/jobList | grep -v random > jobList mkdir chain para create jobList # 1 job para push para time > run.time # replace chr9 chain in all.chain ssh hgwdev64 # kolossus busy cd /cluster/data/panTro2/bed/blastz.canFam2/axtChain/run mkdir chain set all = panTro2.canFam2.all.chain.gz chainSplit chain/ ../$all rm chain/chr9.chain cp ../run.chr9/chain/*.chain chain chainMergeSort chain/*.chain | nice gzip -c > ../$all rm chain/* chainSplit chain/ ../$all mv chain .. # automate the rest ssh pk cd /cluster/data/panTro2/bed/blastz.canFam2 rm -fr axtNet mafNet rm axtChain/{noClass.net,panTro2.canFam2.net.gz} ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsCanFam2" /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=kkr8u00 \ -continue=net \ `pwd`/DEF >&! blastz.net.chr9.out & # done featureBits panTro2 chainCanFam2Link > chainCanFam2Link.fb # 1516576565 bases of 2909512873 (52.125%) in intersection ######################################################################### # BLASTZ galGal2 (2006-03-18 kate) ssh pk cd /cluster/data/panTro2/bed/ mkdir blastz.galGal2.2006-03-18 ln -s blastz.galGal2.2006-03-18 blastz.galGal2 cd blastz.galGal2 cat << '_EOF_' > DEF # chimp vs chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Chimp panTro2 SEQ1_DIR=/scratch/hg/panTro2/nib SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.galGal2.2006-03-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.galGal2 \ `pwd`/DEF >&! blastz.out & # error in net step -- can't find all.chain for some reason # try restarting /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net \ `pwd`/DEF >&! blastz.2.out & # Redo of chr9 (2006-03-28 kate) # Create blastz run dir for just chr9, and run manually on pk. ssh pk cd /cluster/data/panTro2/bed/blastz.galGal2 mkdir run.blastz.chr9 cd run.blastz.chr9 cp ../run.blastz/*.csh . grep panTro2/nib/chr9.nib ../run.blastz/jobList > jobList # 162 jobs para create jobList para push para time > run.time # merge chr9 psl's on small cluster ("cat" step in automated script) ssh kki cd /cluster/data/panTro2/bed/blastz.galGal2 mkdir run.cat.chr9 cd run.cat.chr9 cp ../run.cat/cat.csh . grep chr9 ../run.cat/jobList > jobList para create jobList # 15 jobs para push para time > run.time cd ../axtChain mkdir run.chr9 cd run.chr9 cp ../run/chain.csh . grep chr9 ../run/jobList | grep -v random > jobList mkdir chain para create jobList # 1 job para push # GOT HERE para time > run.time # replace chr9 chain in all.chain ssh kkstore01 # kolossus busy cd /cluster/data/panTro2/bed/blastz.galGal2/axtChain/run mkdir chain set all = panTro2.galGal2.all.chain.gz nice chainSplit chain/ ../$all rm chain/chr9.chain cp ../run.chr9/chain/*.chain chain nice chainMergeSort chain/*.chain | nice gzip -c > ../$all rm chain/* cd .. mkdir chain nice chainSplit chain/ $all # automate the rest ssh pk cd /cluster/data/panTro2/bed/blastz.galGal2 rm -fr axtNet mafNet rm axtChain/{noClass.net,panTro2.galGal2.net.gz} ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsGalGal2" /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -workhorse=kkr7u00 \ -continue=net \ `pwd`/DEF >&! blastz.net.chr9.out & featureBits panTro2 chainGalGal2Link > chainGalGal2Link.fb # 79103883 bases of 2909512873 (2.719%) in intersection # swap alignments to other browser /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap `pwd`/DEF >&! swap.out & ############################################################################ # GENSCAN PREDICTIONS (2006-03-18 kate) ssh kkstore04 cd /cluster/data/panTro2 mkdir bed/genscan # setup to break hardmasked chrom sequence into 5MB chunks at # unbridged contigs/gaps set sdir = /san/sanvol1/scratch/panTro2/splitHard foreach d (`cat chrom.lst | grep -v M`) mkdir -p $sdir/$d foreach agp ($d/chr*.agp) set c = $agp:t:r echo $c cp $agp $sdir/$d cp $d/$c.fa.masked $sdir/$d end end ssh pk cd /cluster/data/panTro2 cd bed/genscan # Make 3 subdirectories for genscan output files mkdir gtf pep subopt # finish split cat > bed/genscan/doSplit.csh << 'EOF' set sdir = /san/sanvol1/scratch/panTro2/splitHard cd /san/sanvol1/scratch/panTro2/splitHard foreach agp (*/chr*.agp) set d = $agp:h set c = $agp:t:r set fa = $d/$c.fa.masked echo splitting $agp $fa nice splitFaIntoContigs $agp $fa . -nSize=5000000 end 'EOF' csh bed/genscan/doSplit.csh >&! bed/genscan/doSplit.log & # add chrM # create list of chunks for cluster run cd bed/genscan rm -f genome.list set sdir = /san/sanvol1/scratch/panTro2/splitHard # slip in chrM mkdir -p $sdir/M/chrM_1 cp ../../M/chrM.fa.masked $sdir/M/chrM_1/chrM_1.fa mkdir $sdir/M/lift awk '/chrM/ {printf("%d\t%s\t%d\t%s\t%d\n", 0, "M/chrM", $2, "chrM", $2)}' \ /cluster/data/panTro2/chrom.sizes > $sdir/M/lift/ordered.lft cd $sdir foreach f (*/chr*_*/*.fa) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $sdir/$f >> \ /cluster/data/panTro2/bed/genscan/genome.list end # retrieve executable ssh hgwdev cd /cluster/data/panTro2/bed/genscan cvs co hg3rdParty/genscanlinux ssh kki cd /cluster/data/panTro2/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList # 646 jobs written to batch para try para check para push # CPU time in finished jobs: 444979s 7416.32m 123.61h 5.15d 0.014 y # IO & Wait Time: 1898s 31.63m 0.53h 0.02d 0.000 y # Time in running jobs: 150791s 2513.18m 41.89h 1.75d 0.005 y # Average job time: 693s 11.55m 0.19h 0.01d # Longest running job: 150791s 2513.18m 41.89h 1.75d # Longest finished job: 86693s 1444.88m 24.08h 1.00d # Submission to last job: 87402s 1456.70m 24.28h 1.01d # Excessively long on chunks with centromere not at start or end of chunk # Next time, try to break these. # Convert these to chromosome level files ssh kkstore04 cd /cluster/data/panTro2/bed/genscan set sdir = /san/sanvol1/scratch/panTro2/splitHard cat $sdir/*/lift/*.lft > $sdir/liftAll.lft liftUp genscan.gtf $sdir/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed $sdir/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database ssh hgwdev cd /cluster/data/panTro2/bed/genscan ldHgGene panTro2 genscan genscan.gtf # 42844 gene predictions hgsql panTro1 -Ne "select count(*) from genscan" # 41471 hgPepPred panTro2 generic genscanPep genscan.pep hgLoadBed -strict panTro2 genscanSubopt genscanSubopt.bed featureBits panTro2 genscan # 53758249 bases of 2909485072 (1.848%) in intersection featureBits panTro1 genscan # 53757565 bases of 2909485072 (1.848%) in intersection QA NOTE: Ran mytouch for genscanPep table and changed the Update_time to 2006-07-19 18:00:00, because runJoiner was complaining. ####################################################################### # OPOSSUM BLASTZ - (WORKING - 2006-03-20 - Hiram) ssh pk mkdir /cluster/data/panTro2/bed/blastzMonDom4.2006-03-20 cd /cluster/data/panTro2/bed ln -s blastzMonDom4.2006-03-20 blastz.monDom4 cd /cluster/data/panTro2/bed/blastzMonDom4.2006-03-20 cat << '_EOF_' > DEF # chimp vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Chimp (panTro2) SEQ1_DIR=/scratch/hg/panTro2/nib SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastzMonDom4.2006-03-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # running 2006-03-20 11:30 # SWAP CHAINS/NET RN4 (DONE 3/27/06 angie) # This run used the updated chr9.nib. ssh kkstore01 mkdir /cluster/data/panTro2/bed/blastz.rn4.swap cd /cluster/data/panTro2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.panTro2/DEF \ -workhorse kkr8u00 >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/panTro2/bed/blastz.rn4 ############################################################################ ### swap of Mm8 alignments - (DONE - 2006-03-30 - Hiram) ssh pk mv /cluster/data/panTro2/bed/blastz.mm8.swap \ /cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21 mkdir /cluster/data/panTro2/bed/blastz.mm8.swap cd /cluster/data/panTro2/bed/blastz.mm8.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \ > blastz.out 2>&1 & # XXX started 2006-03-30 time nice -n +19 featureBits panTro2 chainMm8Link \ > fb.panTro2.chainMm8Link # XXXX first time was: # 986978326 bases of 2909512873 (33.922%) in intersection ############################################################################ # SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM PANTRO1 (2006-03-13 kate) ssh kkr1u00 cd /cluster/data/panTro2/bed mkdir liftOver cd liftOver makeLoChain-split panTro2 /cluster/data/panTro2/nib >&! split.log & ######################## # BLASTZ HUMAN HG17 (2006-06-17 kate) # Do not use lineage-specific repeats, on recommendation of Arian Smit # Do use latest human-chimp scoring matrix and gap penalties from Webb Miller # via rico (Richard Burhans) at PSU # NOTE: these alignments have been extensively reviewed by Kateryna # Makerova, Richard Burnhams and Webb Miller at Penn state to # verify parameters are suitable. The same parameters were # used for the HG18 alignments, below. cat /cluster/data/penn/human_chimp.v2.q # A C G T # 90 -330 -236 -356 # -330 100 -318 -236 # -236 -318 100 -330 # -356 -236 -330 90 # O = 600 E = 150 ssh pk cd /cluster/data/panTro2/bed mkdir blastz.hg17.2006-06-17-05 ln -s blastz.hg17.2006-06-17-05 blastz.hg17 cd blastz.hg17 ssh kkr1u00 cp -p /cluster/data/hg17/hg17.2bit /scratch/hg/hg17 ~kent/bin/iSync # failed for some reason, so I copied the files manually to # kkr*u00 cat << '_EOF_' > DEF # chimp vs human # Specially tuned blastz parameters from Webb Miller export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q # TARGET: Chimmp panTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes # QUERY: Human hg17 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/hg17/hg17.2bit SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.hg17.2006-06-17-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \ `pwd`/DEF >&! blastz.out & # failed due to badly formatted scoring matrix file, which # fortuitously pointed out the need for a trailing line # containing gap penalties in the scoring matrix file. # If it's missing (as in all of our current .q files), # axtChain uses defaults, not the penalties used for the # blastz run (although it could now get these from the axt files). # I'm adding this to the matrix file, also will change library # (axt.c) to add the gap values used by axtChain to the header comment /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \ -continue=chainRun \ `pwd`/DEF >&! blastz.2.out & # failed due to ssh key failure on hgwdev /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \ -continue=load \ `pwd`/DEF >&! blastz.3.out & nice featureBits panTro2 chainHg17Link # 2759308860 bases of 2909512873 (94.837%) in intersection nice featureBits panTro2 chainHg17OldLink # from alignments with default matrix #2782705676 bases of 2909512873 (95.642%) in intersection hgsql panTro2 -e "select count(*) from chr7_chainHg17" # 75376 hgsql panTro2 -e "select count(*) from chr7_chainHg17Old" # 241200 # swap alignments to hg17 reference /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk \ `pwd`/DEF >&! swap.out & nice featureBits hg17 chainPanTro2Link # 2719566113 bases of 2866216770 (94.883%) in intersection nice featureBits hg17 chainPanTro2LinkOld # 2739413164 bases of 2866216770 (95.576%) in intersection hgsql panTro2 -e "select count(*) from chr7_chainPanTro2" # 105398 hgsql panTro2 -e "select count(*) from chr7_chainPanTro2Old" # 333069 nice featureBits panTro2 chainHg17Link | tee fb.panTro2.chainHg17Link # 2759308860 bases of 2823407242 (97.730%) in intersection ######################## # BLASTZ HUMAN HG18 (2006-07-9 kate) # Do not use lineage-specific repeats, on recommendation of Arian Smit # Do use latest human-chimp scoring matrix and gap penalties from Webb Miller # via rico (Richard Burhans) at PSU cat /cluster/data/penn/human_chimp.v2.q # A C G T # 90 -330 -236 -356 # -330 100 -318 -236 # -236 -318 100 -330 # -356 -236 -330 90 # O = 600 E = 150 ssh pk cd /cluster/data/panTro2/bed mkdir blastz.hg18.2006-07-09 ln -s blastz.hg18.2006-07-09 blastz.hg18 cd blastz.hg18 cat << '_EOF_' > DEF # chimp vs human # Specially tuned blastz parameters from Webb Miller export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q # TARGET: Chimmp panTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes # QUERY: Human hg18 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/hg18/hg18.2bit SEQ2_LEN=/cluster/bluearc/hg18/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-07-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \ `pwd`/DEF >&! blastz.out & # Mini-cluster nodes were dead, so restart required after admins fixed. /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \ -continue=cat \ `pwd`/DEF >&! blastz.2.out & hgsql panTro2 -Nse "select count(*) from chr7_chainHg18" # 76122 hgsql panTro2 -Nse "select count(*) from chr7_chainHg17" # 75376 featureBits panTro2 -chrom=chr7 chr7_chainHg18Link # 146031865 bases of 151077879 (96.660%) in intersection featureBits panTro2 -chrom=chr7 chr7_chainHg17Link # 145990849 bases of 151077879 (96.633%) in intersection nice featureBits panTro2 chainHg18Link >& fb.panTro2.chainHg18Link & cat fb.panTro2.chainHg18Link # 2760856313 bases of 2823407242 (97.785%) in intersection cat ../blastz.hg17/fb.panTro2.chainHg17Link # 2759308860 bases of 2823407242 (97.730%) in intersection # SWAP ALIGNMENTS (2006-07-11 kate) ssh kkstore02 cd /cluster/data/hg18/bed mkdir blastz.panTro2.swap cd blastz.panTro2 /cluster/bin/scripts/doBlastzChainNet.pl -swap \ /cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log & # failed during liftover chain copy (netChains.csh) /cluster/bin/scripts/doBlastzChainNet.pl -swap -continue=download\ /cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.download.log & ######################################################################### # BLASTZ danRer3 (2006-04-07 kate) # Includes both randoms ssh pk mkdir /cluster/data/panTro2/bed/blastz.danRer3.2006-04-07 cd /cluster/data/panTro2/bed ln -s blastz.danRer3.2006-04-07 blastz.danRer3 cd blastz.danRer3 cat << 'EOF' > DEF # human target, zebrafish query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help # (previously used for hg16-fr1, danrer1-mm5) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: Human panTro2 SEQ1_DIR=/scratch/hg/panTro2/nib SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInZebrafish SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes # QUERY: zebrafish danRer3 # Use all chroms, including both randoms (chrUn and chrNA) SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=1000 BASE=/cluster/data/panTro2/bed/blastz.danRer3.2006-04-07 TMPDIR=/scratch/tmp 'EOF' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer3 \ `pwd`/DEF >& blastz.out & # failed on 'cat.run' step, for no good reason. # continuing manually with 'para shove -retries=20'... ssh kki cd /cluster/data/panTro2/bed/blastz.danRer3/run.cat para time > run.time ssh pk cd /cluster/data/panTro2/bed/blastz.danRer3 /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainMerge \ `pwd`/DEF >& blastz.chainMerge.out & # failed due to bad NFS mount on one of the mini-cluster nodes. # finished manually, and created run,time ######################################################################### # BLASTZ danRer4 (2006-07-10 kate) # Includes both randoms ssh pk mkdir /cluster/data/panTro2/bed/blastz.danRer4.2006-07-10 cd /cluster/data/panTro2/bed ln -s blastz.danRer4.2006-07-10 blastz.danRer4 cd blastz.danRer4 cat << 'EOF' > DEF # human target, zebrafish query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help # (previously used for hg16-fr1, danrer1-mm5) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: Human panTro2 SEQ1_DIR=/scratch/hg/panTro2/nib SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInZebrafish SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes # QUERY: zebrafish danRer4 # Use all chroms, including both randoms (chrUn and chrNA) SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=1000 BASE=/cluster/data/panTro2/bed/blastz.danRer4.2006-07-10 TMPDIR=/scratch/tmp 'EOF' # << happy emacs /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer4 \ `pwd`/DEF >& blastz.out & # 2 jobs crashed on kki, pk was rebooted # finish jobs with para push on kki, then continue /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer4 \ -continue=chainRun \ `pwd`/DEF >& blastz.2.out & # swap alignemnts to other assembly /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap `pwd`/DEF >& swap.out & ######################################################################### # LifUunder to panTro1 (2005-07-11 kate) ssh pk cd /cluster/data/panTro2 doSameSpeciesLiftOver.pl panTro2 panTro1 >& liftUnder.out & ln -s panTro1.2005-07-14 blat.panTro1 # stalled out looking for workhorse, restarting doSameSpeciesLiftOver.pl panTro2 panTro1 \ -buildDir=/cluster/data/panTro2/bed/blat.panTro1 \ -continue=net -workhorse=kolossus >& liftOver.2.out & ############################################################################# # 8-WAY MULTIZ ALIGNMENTS (2006-07-15 kate) # hg18 # rheMac2 # mm8 # canFam2 # monDom4 # galGal2 # danRer4 # copy net mafs to cluster-friendly storage for multiz run ssh kkstore04 cd /cluster/data/panTro2/bed set d = /san/sanvol1/scratch/panTro2/mafNet mkdir -p $d foreach s (hg18 rheMac2 mm8 canFam2 monDom4 galGal2 danRer4) echo $s cp -Rp blastz.$s/mafNet $d/$s end # make output dir and run dir ssh pk cd /cluster/data/panTro2/bed mkdir -p multiz8way.2006-07-15 ln -s multiz8way.2006-07-15 multiz8way cd multiz8way cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod . cat elliotsEncode.mod ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV TRAINING_LNL: -988246.132962 BACKGROUND: 0.295 0.205 0.205 0.295 RATE_MAT: -1.165221 0.315494 0.589884 0.259843 0.189778 -0.878194 0.208718 0.479698 0.444622 0.261535 -0.885604 0.179447 0.234867 0.720815 0.215191 -1.170872 TREE: (((((((((((((hg17:0.006690,panTro1:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn3:0.081728,mm7:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom2:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro1:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067); # create species list and stripped down tree for autoMZ sed -n '/TREE: /s///p' elliotsEncode.mod > elliotsEncode.nh /cluster/bin/phast/tree_doctor elliotsEncode.nh \ --prune-all-but=hg17,panTro1,rheMac1,mm7,canFam2,monDom2,galGal2,danRer3 \ --rename="hg17->hg18;panTro1->panTro2;rheMac1->rheMac2;mm7->mm8;monDom2->monDom4;danRer3->danRer4" > 8way.nh sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 8way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst /cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt grep panTro2 8way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\t%s\n", $3, $2, $1)}' > distances.txt cat distances.txt # 0.0143 chimp_panTro2 human_hg18 # 0.0910 macaque_rheMac2 chimp_panTro2 # 0.2660 dog_canFam2 chimp_panTro2 # 0.4686 mouse_mm8 chimp_panTro2 # 0.7128 monodelphis_monDom4 chimp_panTro2 # 0.9855 chicken_galGal2 chimp_panTro2 # 1.7488 zebrafish_danRer4 chimp_panTro2 mkdir -p maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = panTro2 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == panTro2) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/panTro2/bed/multiz8way.2006-07-15/maf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/panTro2/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList # 52 files para try para check para push para time > run.time # CPU time inn finished jobs: 99662s 1661.03m 27.68h 1.15d 0.003 y # IO & Wait Time: 2009s 33.49m 0.56h 0.02d 0.000 y # Average job time: 1955s 32.59m 0.54h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8172s 136.20m 2.27h 0.09d # Submission to last job: 8172s 136.20m 2.27h 0.09d # annotate mafs ssh kolossus cd /cluster/data/panTro2/bed/multiz8way mkdir anno cd anno mkdir maf run cd run rm -f sizes nBeds foreach i (`cat /cluster/data/panTro2/bed/multiz8way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/panTro2/panTro2.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh > jobs.log & # Load into database ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/anno/maf mkdir -p /gbdb/panTro2/multiz8way/anno/maf ln -s /cluster/data/panTro2/bed/multiz8way/anno/maf/*.maf \ /gbdb/panTro2/multiz8way/anno/maf cat > loadMaf.csh << 'EOF' nice hgLoadMaf -pathPrefix=/gbdb/panTro2/multiz8way/anno/maf panTro2 multiz8way cat *.maf | \ nice hgLoadMafSummary panTro2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz8waySummary stdin 'EOF' # 16414516 mafs, 1281405 summary blocks #<< happy emacs csh loadMaf.csh >&! loadMaf.log & # generate .gif of tree phyloGif ---------------------------------------------------------------------------- 2009-08-20 markd with guidance from braney ### # mafs not correctly displayed due to incorrectly using -sizes=sizes # with mafAddIRows ### # annotate mafs ssh kolossus cd /cluster/data/panTro2/bed/multiz8way/anno rm -f maf/*.maf run/*.maf cd run # edit jobs.csh to remove -sizes=sizes csh jobs.csh >& jobs.log & >>>>>>>>>>>>>>>>> # Load into database ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/anno/maf mkdir -p /gbdb/panTro2/multiz8way/anno/maf csh loadMaf.csh >& loadMaf.log & ############################################################################# # PHASTCONS CONSERVATION (2006-07-18 kate) # create cluster-friendly copies of chrom fasta files ssh kkstore04 cd /cluster/data/panTro2 set sdir = /san/sanvol1/scratch/panTro2/chrom mkdir -p $sdir foreach d (`cat chrom.lst`) echo $d cp -p $d/chr*.fa $sdir end # split mafs ssh pk cd /cluster/data/panTro2/bed/multiz8way mkdir cons cd cons mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/panTro2/multiz8way/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/panTro2/bed/multiz8way/maf set WINDOWS = /san/sanvol1/scratch/panTro2/multiz8way/cons/ss set sdir = /san/sanvol1/scratch/panTro2/chrom cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M $sdir/$c.fa -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 52 jobs para try para check para push #CPU time in finished jobs: 2426s 40.44m 0.67h 0.03d 0.000 y #IO & Wait Time: 988s 16.46m 0.27h 0.01d 0.000 y #Average job time: 66s 1.09m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 234s 3.90m 0.07h 0.00d #Submission to last job: 346s 5.77m 0.10h 0.00d # run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cd .. # create tree model file from ENCODE model file, with tree # pruned and renamed for this set of alignments sed '/TREE: /d' ../elliotsEncode.mod > 8way.mod echo "TREE: `cat ../8way.nh`" >> 8way.mod cat 8way.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -988246.132962 #BACKGROUND: 0.295 0.205 0.205 0.295 #RATE_MAT: # -1.165221 0.315494 0.589884 0.259843 # 0.189778 -0.878194 0.208718 0.479698 # 0.444622 0.261535 -0.885604 0.179447 # 0.234867 0.720815 0.215191 -1.170872 #TREE: #(((((((hg18:0.006690,panTro2:0.007571):0.024272,rheMac2:0.059256):0.107134,mm8:0.329630):0.023026,canFam2:0.187181):0.262899,monDom4:0.371073):0.189124,galGal2:0.454691):0.279364,danRer4:0.938628); # calculate GC in model file awk '$1 == "BACKGROUND:" {print $3 + $4;}' 8way.mod #0.41 mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/panTro2/multiz8way/cons cp -p $san/ss/$c/$f.ss ../8way.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss 8way.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative hg18,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file # start with parameters from previous human 8way cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 13 .01 .23 #ENDLOOP 'EOF' # happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/panTro2/multiz8way/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/panTro2/bed/multiz8way/cons/run.cons/in.list popd gensub2 in.list single template jobList grep chr7 jobList > jobList.chr7 para make jobList.chr7 # 346 jobs # using para make because jobs are so small they crash # Note: 1/3 of jobs crashed, but succeeded on retry # These jobs are probably too small/too fast. # Consider increasing the size of the msa_split even further # in future runs. para time #CPU time in finished jobs: 10586s 176.44m 2.94h 0.12d 0.000 y #IO & Wait Time: 5465s 91.08m 1.52h 0.06d 0.000 y #Average job time: 46s 0.77m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 66s 1.10m 0.02h 0.00d #Submission to last job: 581s 9.68m 0.16h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/panTro2/multiz8way/cons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "chr*.bed" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/panTro2/bed/multiz8way/cons # load into database ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/cons hgLoadBed -strict panTro2 phastConsElements8way mostConserved.bed # compare with previous tracks hgsql panTro2 -e "select count(*) from phastConsElements8way" # 1596760 elements hgsql hg18 -e "select count(*) from phastConsElements17way" # 1601903 # Try for 5% overall cov, and 70% CDS cov featureBits panTro2 -enrichment refGene:cds phastConsElements8way # refGene:cds 1.020%, phastConsElements8way 4.999%, both 0.731%, cover 71.68%, enrich 14.34x # experiments featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way # refGene:cds 0.883%, phastConsElements17way 4.838%, both 0.621%, cover 70.31%, enrich 14.53x featureBits panTro2 -chrom=chr7 -enrichment refGene:cds phastConsElements8way # 13 .01 .23 # refGene:cds 0.861%, phastConsElements8way 4.914%, both 0.626%, cover 72.71%, enrich 14.80x # 13 .007 .23 # refGene:cds 0.861%, phastConsElements8way 4.780%, both 0.626%, cover 72.69%, enrich 15.21x # 12 .007 .23 # refGene:cds 0.861%, phastConsElements8way 4.599%, both 0.623%, cover 72.34%, enrich 15.73x # 12 .006 .23 # refGene:cds 0.861%, phastConsElements8way 4.553%, both 0.623%, cover 72.35%, enrich 15.89x # 12 .006 .24 # refGene:cds 0.861%, phastConsElements8way 4.657%, both 0.630%, cover 73.11%, enrich 15.70x # 12 .006 .25 # refGene:cds 0.861%, phastConsElements8way 4.765%, both 0.636%, cover 73.90%, enrich 15.51x # 12 .004 .27 # refGene:cds 0.861%, phastConsElements8way 4.845%, both 0.647%, cover 75.14%, enrich 15.51x # 12 .006 .27 # refGene:cds 0.861%, phastConsElements8way 4.962%, both 0.647%, cover 75.17%, enrich 15.15x # 13 .007 .27 # refGene:cds 0.861%, phastConsElements8way 5.241%, both 0.651%, cover 75.57%, enrich 14.42x # 12 .008 .28 # refGene:cds 0.861%, phastConsElements8way 5.152%, both 0.653%, cover 75.82%, enrich 14.72x # 12 .007 .27 # phastConsElements8way 5.011%, both 0.648%, cover 75.19%, enrich 15.01x # 11 .005 .27 # refGene:cds 0.861%, phastConsElements8way 4.673%, both 0.644%, cover 74.78%, enrich 16.00x # 14 .008 .28 # refGene:cds 0.861%, phastConsElements8way 5.629%, both 0.659%, cover 76.57%, enrich 13.60x # 10, .11 # refGene:cds 0.861%, phastConsElements8way 7.936%, both 0.671%, cover 77.92%, enrich 9.82x # 12, .20 # refGene:cds 0.861%, phastConsElements8way 8.316%, both 0.668%, cover 77.56%, enrich 9.33x # 12, .11 # refGene:cds 0.861%, phastConsElements8way 7.304%, both 0.666%, cover 77.34%, enrich 10.59x # 14, .11 # refGene:cds 0.861%, phastConsElements8way 7.936%, both 0.671%, cover 77.92%, enrich 9.82x # check conservation using consEntropy, at Adam's recommendation # first, generate second model file set rho = .23 tree_doctor --scale $rho # create wiggle files ssh pk cd /san/sanvol1/scratch/panTro2/multiz8way/cons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode find ./pp -name "chr*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode stdin phastCons8way.wig phastCons8way.wib # half an hour or so mv phastCons8way.* /cluster/data/panTro2/bed/multiz8way/cons ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/cons ln -s /cluster/data/panTro2/bed/multiz8way/cons/phastCons8way.wib \ /gbdb/panTro2/multiz8way/phastCons8way.wib hgLoadWiggle -pathPrefix=/gbdb/panTro2/multiz8way panTro2 \ phastCons8way phastCons8way.wig # add Mark D's codon framing ssh kkstore04 cd /cluster/data/panTro2/bed/multiz8way mkdir frames cd frames cp /cluster/data/hg17/bed/multiz17way/frames/mkMafFrames . cp /cluster/data/hg17/bed/multiz17way/frames/Makefile.genes . #edit Makefile to correct species names and set and sanDir, prune out all # but getGenes target, as the remainder is below. ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/frames make getGenes >&! getGenes.log & nice tcsh # easy way to get process niced (cat ../maf/*.maf | genePredToMafFrames panTro2 stdin stdout panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz | gzip > multiz8way.mafFrames.gz)>&log& ################################################################ # Use ensGene to annotate panTro2, reload multiz8wayFrames table # (working 2010-07-29 - Chin) cd /cluster/data/panTro2/bed/multiz8way/frames for DB in panTro2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done time (cat ../maf/*.maf | genePredToMafFrames panTro2 stdin stdout panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz | gzip > multiz8way.mafFrames.gz) > frames.log 2>&1 # real 17m21.258s zcat multiz8way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n # 4403 rheMac2 # 13552 canFam2 # 84969 galGal2 # 188785 hg18 # 190678 panTro2 # 285468 mm8 ssh hgwdev cd /cluster/data/panTro2/bed/multiz8way/frames time nice -n +19 hgLoadMafFrames panTro2 multiz8wayFrames \ multiz8way.mafFrames.gz > loadFrames.log 2>&1 & # real 0m8.607s # check it: hgsql -N -e "select src from multiz8wayFrames;" panTro2 \ | sort | uniq -c | grep panTro2 # 190678 panTro2 ########################################################################## # NSCAN track - (2006-07-28 markd) cd /cluster/data/panTro2/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv -r -np http://ardor.wustl.edu/jeltje/panTro2/chr_gtf wget -nv -r -np http://ardor.wustl.edu/jeltje/panTro2/chr_ptx # clean up and rename downloaded directorys: mv ardor.wustl.edu/jeltje/panTro2/chr_gtf . mv ardor.wustl.edu/jeltje/panTro2/chr_ptx . rm -rf ardor.wustl.edu rm chr_*/index.html* gzip chr_*/* chmod a-w chr_**.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt panTro2 nscanGene chr_gtf/chr*.gtf.gz # add .a suffix to match transcript id # add .a suffix to match transcript id hgPepPred -suffix=.a panTro2 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a panTro2-specific page to describe informants chimp/panTro2/nscanGene.html (from CanFam1_hg17_nscan_10_18_2005/description.html ) chimp/panTro2/trackDb.ra ########################################################################## # AUGUSTUS ab initio track (DONE, mario, 1/30/2007) ssh hgwdev mkdir /cluster/data/panTro2/bed/augustus cd /cluster/data/panTro2/bed/augustus # get the program AUGUSTUS wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz tar xzf augustus.2.0.src.tar.gz # compile the binary if necessary cd augustus/src make augustus # create output directory cd /cluster/data/panTro2/bed/augustus mkdir out # create file with sequences and their sizes by modifying chrom.sizes cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)/\/cluster\/data\/panTro2\/$1\/chr$1.fa.masked/; print;}' > seq.lst # create the job list augustus/scripts/createAugustusJoblist.pl --sequences seq.lst --chunksize 5300000 --overlap 300000 --command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" --outputdir /cluster/data/panTro2/bed/augustus/out/ --joblist job.lst para try para check para push # CPU time in finished jobs: 3785702s 63095.03m 1051.58h 43.82d 0.120 y # IO & Wait Time: 38307s 638.45m 10.64h 0.44d 0.001 y # Average job time: 5785s 96.42m 1.61h 0.07d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8767s 146.12m 2.44h 0.10d # Submission to last job: 20341s 339.02m 5.65h 0.24d # check the error files cat out/*.err # In this case no error was reported. rm out/*.err cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff augustus/scripts/getAnnoFasta.pl augustus.pep.gff cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff # load into database ssh hgwdev cd /cluster/data/panTro2/bed/augustus/ ldHgGene -bin panTro2 augustus augustus.gff # 33218 gene predictions hgPepPred panTro2 generic augustusPep augustus.pep.aa featureBits panTro2 augustus # 32820465 bases of 2909485072 (1.128%) in intersection ####################################################################### # BLASTZ/CHAIN/NET HORSE AND CHIMP (DONE 2/21/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 cd /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 cat << '_EOF_' > DEF # Horse vs. Chimp BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Chimp panTro2 SEQ2_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ2_LEN=/cluster/data/panTro2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1PanTro2 >& do.log & tail -f do.log ln -s blastz.panTro2.2007-02-19 /cluster/data/equCab1/bed/blastz.panTro2 nice featureBits equCab1 -chrom=chr1 chainPanTro2Link # 121391547 bases of 177498097 (68.390%) in intersection bash time nice -n 19 featureBits equCab1 chainPanTro2Link \ > fb.equCab1.chainPanTro2Link.txt 2>&1 # 1593914101 bases of 2421923695 (65.812%) in intersection ssh kkstore04 mkdir /cluster/data/panTro2/bed/blastz.equCab1.swap cd /cluster/data/panTro2/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -verbose=2 -swap -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/equCab1PanTro2 > swap.log 2>&1 & # The -blastzOutRoot /cluster/bluearc/equCab1PanTro2 option above # should not be there. It caused an error message during cleaning up. # Don't specify it for -swap next time. ssh hgwdev cd /cluster/data/panTro2/bed/blastz.equCab1.swap bash time nice -n 19 featureBits panTro2 chainEquCab1Link \ > fb.panTro2.chainEquCab1Link.txt 2>&1 # 1636782924 bases of 2909485072 (56.257%) in intersection ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram) ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13 cd /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13 cat << '_EOF_' > DEF # Chimp vs marmoset BLASTZ_M=50 # TARGET: Chimp PanTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_LEN=/cluster/data/panTro2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset calJac1 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastzCalJac1.2007-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=pk > blastz.log 2>&1 & # real 1039m19.171s cat fb.panTro2.chainCalJac1Link.txt # 2220169777 bases of 2909485072 (76.308%) in intersection mkdir /cluster/data/calJac1/bed/blastz.panTro2.swap cd /cluster/data/calJac1/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 320m14.293s cat fb.calJac1.chainPanTro2Link.txt # 2264115411 bases of 2929139385 (77.296%) in intersection ########################################################################### # Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram) ssh kkstore04 screen # use screen to control this job mkdir /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13 cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13 cat << '_EOF_' > DEF # Chimp vs marmoset BLASTZ_M=50 # TARGET: Chimp PanTro2 SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit SEQ1_LEN=/cluster/data/panTro2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Orangutan ponAbe2 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > blastz.log 2>&1 & # real 388m20.443s # Completed: 129160 of 129168 jobs # Crashed: 8 jobs # CPU time in finished jobs: 23694314s 394905.23m 6581.75h 274.24d 0.751 y # IO & Wait Time: 2536376s 42272.93m 704.55h 29.36d 0.080 y # Average job time: 203s 3.38m 0.06h 0.00d # Longest finished job: 5665s 94.42m 1.57h 0.07d # Submission to last job: 170709s 2845.15m 47.42h 1.98d # some jobs failed due to memory limits on kk nodes # finished them manually on kolossus and hgwdev, then continuing time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -bigClusterHub=kk > cat.log 2>&1 & # real 754m45.014s cat fb.panTro2.chainPonAbe2Link.txt # 2648603020 bases of 2909485072 (91.033%) in intersection mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 160m51.639s cat fb.ponAbe2.chainPanTro2Link.txt # 2766615040 bases of 3093572278 (89.431%) in intersection # real 2597m24.771s ########################################################################### ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: panTro2.upstreamGeneTbl = refGene ############################################################################ # SWAP HG19 Chain/Net (DONE - 2009-04-25 - Hiram) # running the swap ssh swarm mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ > swap.log 2>&1 & # real 723m41.377s cat fb.panTro2.chainHg19Link.txt # 2761343871 bases of 2909485072 (94.908%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11 cd /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11 # same kind of parameters as used in human<->marmoset cat << '_EOF_' > DEF # chimp vs. marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Chimp PanTro2 SEQ1_DIR=/scratch/data/panTro2/panTro2.2bit SEQ1_LEN=/scratch/data/panTro2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -stop=partition -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 311m31.090s cat fb.panTro2.chainCalJac3Link.txt # 2016331285 bases of 2909485072 (69.302%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.panTro2.swap cd /hive/data/genomes/calJac3/bed/blastz.panTro2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 118m42.203s cat fb.calJac3.chainHg19Link.txt # 1990168262 bases of 2752505800 (72.304%) in intersection ############################################################################ # mrna, est, refGene, xenoRefGene (2010-04-10 markd) # removed special-casing of including Human mRNAs, ESTs, and RefSeqs # delete old data files # to be included with chimp ones. ssh genbank cd /cluster/genbank/genbank (./bin/gbAlignStep -initial panTro2 |& mail markd&) ssh hgwdev cd /cluster/genbank/genbank ./bin/gbDbLoadStep -drop -initialLoad panTro2 ############################################################################ # DENISOVA (ANCIENT HUMAN) (DONE 11/16/10 angie) mkdir /hive/data/genomes/panTro2/bed/denisova cd /hive/data/genomes/panTro2/bed/denisova # Files for both hg18 and panTro2 were downloaded in hg18/bed/denisova, # see analogous section in hg18.txt. # Combine the two sequence-lib files for Denisova into one bam. First ensure that # the headers are identical: set dataDir = /hive/data/genomes/hg18/bed/denisova/cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2 samtools view -H $dataDir/SL3003/SL3003-pt2.bam > h1 samtools view -H $dataDir/SL3004_100122/SL3004-pt2.bam > h2 cmp h1 h2 # No output, and they seem to be sorted by position, good to go: samtools merge SL3003_SL3004_100122-panTro2.bam \ $dataDir/SL3003/SL3003-pt2.bam $dataDir/SL3004_100122/SL3004-pt2.bam #1017.838u 23.035s 17:36.97 98.4% 0+0k 0+0io 0pf+0w # Build BAM index (.bam.bai) files. samtools index SL3003_SL3004_100122-panTro2.bam #85.714u 3.458s 2:03.46 72.2% 0+0k 0+0io 0pf+0w # Make /gbdb/ links and load database tables mkdir /gbdb/panTro2/denisova ln -s `pwd`/SL3003_SL3004_100122-panTro2.bam{,.bai} /gbdb/panTro2/denisova/ hgBbiDbLink panTro2 bamSLDenisova /gbdb/panTro2/denisova/SL3003_SL3004_100122-panTro2.bam # to see the grp table: hgsql -e "select * from grp order by priority;" panTro2 # add new denisova group: hgsql panTro2 -e "INSERT INTO grp VALUES ('denisova', 'Denisova Assembly and Analysis', 6.6);" ############################################################################ # construct liftOver to panTro3 (DONE - 2011-02-22 - Hiram) cd /hive/data/genomes/panTro2/bed mkdir blat.panTro3.2011-02-22 cd blat.panTro3.2011-02-22 time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev panTro2 panTro3 > do.log 2>&1 # real 84m22.796s # check /gbdb/panTro2/liftOver for new file there to panTro3 # verify convert functions on genome-test panTro2 to panTro3 # and file is present on htdocs-hgdownload/goldenPath/panTro2/liftOver ############################################################################