# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on the # Dog (Canis familiaris) May 2005 release. # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | nscanGene | genePred nscanPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # CREATE BUILD DIRECTORY (DONE 6/1/05 angie) ssh kkstore01 mkdir /cluster/store9/canFam2 ln -s /cluster/store9/canFam2 /cluster/data/canFam2 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 6/1/05 angie) mkdir /cluster/data/canFam2/M cd /cluster/data/canFam2/M # go to http://www.ncbi.nih.gov/ and search Nucleotide for # "canis familiaris mitochondrion genome". That shows the gi number: # 17737322 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Canis familiaris mitochondrion complete genome, and then replace the # header line with just ">chrM". # MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/1/05 angie) # This used to hold scripts -- better to keep them inline in the .doc # so they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/canFam2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/canFam2/bed # DOWNLOAD AGP, FASTA & QUAL (DONE 6/2/05 angie) ssh kkstore01 mkdir /cluster/data/canFam2/broad cd /cluster/data/canFam2/broad ftp ftp.broad.mit.edu prompt bin cd pub/assemblies/mammals/canFam2 mget contigs.bases.gz AAEX02.full_AGP Dog2.0.agp supercontigs mget assembly.agp assembly.format contigs.quals.gz supercontigs.summary bye # Jean Chang is going to remove AAEX02.full_AGP to avoid confusion, # so make sure we can tell people how to quickly regenerate it from # Dog2.0.agp: perl -wpe 'if (/contig_(\d+)/) { \ $a = sprintf "AAEX020%05d", $1+1; s/contig_\d+/$a/; }' \ Dog2.0.agp > /tmp/1.agp diff /tmp/1.agp AAEX02.full_AGP | wc -l #0 # Meanwhile, the AGP has chr01, chr02 instead of chr1, chr2... yecch. 
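# (A hedged side note for the chrM.fa header edit described above: that edit
# was done by hand, but assuming the downloaded FASTA holds a single sequence
# with its header on line 1, something like this would do the same thing.
# These commands were not part of the original build, so they are shown
# commented out:)
#   grep '^>' /cluster/data/canFam2/M/chrM.fa
#     (verify it says Canis familiaris mitochondrion, complete genome)
#   perl -i -wpe 's/^>.*/>chrM/ if $. == 1;' /cluster/data/canFam2/M/chrM.fa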
# Substitute those out: sed -e 's/^chr0/chr/' Dog2.0.agp > UCSC_Dog2.0.agp # BUILD CHROM FA (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2/broad awk '{print $1;}' UCSC_Dog2.0.agp | uniq > ../chrom.lst nice gunzip contigs.bases.gz foreach chr (`cat ../chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` mkdir ../$c awk '$1 == "'$chr'" {print;}' UCSC_Dog2.0.agp > ../$c/$chr.agp agpToFa -simpleMultiMixed ../$c/$chr.agp $chr ../$c/$chr.fa contigs.bases end faSize ../*/chr*.fa #2531673953 bases (146677410 N's 2384996543 real 2384996543 upper 0 lower) in 41 sequences in 41 files #Total size: mean 61748145.2 sd 25246010.8 min 16727 (chrM) max 126883977 (chrX) median 61280721 #N count: mean 3577497.8 sd 1401887.8 #U count: mean 58170647.4 sd 24664411.9 #L count: mean 0.0 sd 0.0 # checkAgpAndFa prints out way too much info -- keep the end/stderr only: cd /cluster/data/canFam2 foreach agp (?{,?}/chr*.agp) set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 end nice gzip broad/contigs.bases echo "chrM" >> chrom.lst # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2 foreach agp (?{,?}/chr*.agp) set fa = $agp:r.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak nice splitFaIntoContigs $agp $fa . -nSize=5000000 end # No _random's in this assembly, so no need to clean up after them. # Make a "pseudo-contig" for processing chrM too: mkdir M/chrM_1 sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa mkdir M/lift echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst echo "chrM_1" > M/lift/ordered.lst set msize = `faSize M/chrM.fa | awk '{print $1;}'` echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft foreach f ( ?{,?}/chr*.fa.bak ) nice faCmp $f $f:r end # MAKE LIFTALL.LFT (DONE 6/28/05 angie) ssh kkstore01 cd /cluster/data/canFam2 cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # CREATING DATABASE (DONE 6/28/05 angie) ssh hgwdev hgsql '' -e 'create database canFam2' # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql df -h /var/lib/mysql #/dev/sdc1 1.8T 940G 720G 57% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/28/05 angie) ssh hgwdev hgsql canFam2 -e \ "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 6/28/05 angie) # Make .2bit, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kkstore01 cd /cluster/data/canFam2 nice faToTwoBit ?{,?}/chr*.fa canFam2.2bit mkdir bed/chromInfo twoBitInfo canFam2.2bit stdout \ | awk '{print $1 "\t" $2 "\t/gbdb/canFam2/canFam2.2bit";}' \ > bed/chromInfo/chromInfo.tab # Make symbolic links from /gbdb/canFam2/ to the real .2bit. ssh hgwdev mkdir /gbdb/canFam2 ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/ # Load /gbdb/canFam2/canFam2.2bit paths into database and save size info. cd /cluster/data/canFam2 hgsql canFam2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql canFam2 -e 'load data local infile \ "/cluster/data/canFam2/bed/chromInfo/chromInfo.tab" \ into table chromInfo;' echo "select chrom,size from chromInfo" | hgsql -N canFam2 > chrom.sizes # take a look at chrom.sizes, should be 41 lines wc chrom.sizes # 41 82 603 chrom.sizes # GOLD AND GAP TRACKS (DONE 6/28/05 angie) ssh hgwdev cd /cluster/data/canFam2 hgGoldGapGl -noGl -chromLst=chrom.lst canFam2 /cluster/data/canFam2 . 
# featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql canFam2 # oops, that won't work until v4.1, so do this for the time being: hgsql canFam2 -e "create table chrM_gap select * from chr1_gap where 0=1" # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM2 (DONE 8/1/05 angie) ssh hgwdev cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add canFam2 in all the right places and do make update cvs commit makefile mkdir -p dog/canFam2 cvs add dog dog/canFam2 cvs ci -m "trackDb dir for dog genome(s)" dog/canFam2 # Do this in a clean (up-to-date, no edits) tree: make alpha # Add dbDb entry (not a new organism so defaultDb and genomeClade already # have entries): hgsql -h genome-testdb hgcentraltest \ -e 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("canFam2", "May 2005", \ "/gbdb/canFam2/nib", "Dog", "chr14:11072309-11078928", 1, \ 18, "Dog", "Canis familiaris", \ "/gbdb/canFam2/html/description.html", 0, 0, \ "Broad Institute v. 2.0");' # REPEAT MASKING (DONE braney/angie 7-10-05) #- Split contigs into 500kb chunks, at gaps if possible: ssh kkstore01 cd /cluster/data/canFam2 foreach chr (`cat chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/canFam2 cat << '_EOF_' > jkStuff/RMDog #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/canFam2/$2 /bin/cp $2 /tmp/canFam2/$2/ cd /tmp/canFam2/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec dog $2 popd /bin/cp /tmp/canFam2/$2/$2.out ./ if (-e /tmp/canFam2/$2/$2.align) /bin/cp /tmp/canFam2/$2/$2.align ./ if (-e /tmp/canFam2/$2/$2.tbl) /bin/cp /tmp/canFam2/$2/$2.tbl ./ if (-e /tmp/canFam2/$2/$2.cat) /bin/cp /tmp/canFam2/$2/$2.cat ./ /bin/rm -fr /tmp/canFam2/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDog mkdir RMRun cp /dev/null RMRun/RMJobs foreach chr (`cat chrom.lst`) set c = `echo $chr | sed -e 's/chr//'` foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/canFam2/jkStuff/RMDog \ /cluster/data/canFam2/$d $f \ '{'check out line+ /cluster/data/canFam2/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/canFam2/RMRun para create RMJobs para try, para check, para check, para push, para check,... # Completed: 6149 of 6149 jobs # CPU time in finished jobs: 32138805s 535646.75m 8927.45h 371.98d 1.019 y # IO & Wait Time: 346449s 5774.15m 96.24h 4.01d 0.011 y # Average job time: 5283s 88.05m 1.47h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8642s 144.03m 2.40h 0.10d # Submission to last job: 147094s 2451.57m 40.86h 1.70d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/canFam2 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c set dir=`echo $c | sed "s/chr//" ` cd $dir liftUp $c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null cd .. 
end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/canFam2 hgLoadOut canFam2 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 2005-07-11 braney) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on canFam2 and on a comparable genome build, and compare: ssh hgwdev featureBits canFam2 rmsk # 968054174 bases of 2384996543 (40.589%) in intersection #canFam1 is #896773874 bases of 2359845093 (38.001%) in intersection # SIMPLE REPEATS (TRF) (DONE 2005-07-11 braney) # PARTIALLY REDONE 2005-12-02 angie -- See partial redo notes below... ssh kkstore01 mkdir /cluster/data/canFam2/bed/simpleRepeat cd /cluster/data/canFam2/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/canFam2/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh end end csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* # When job is done do: liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed # Load into the database: ssh hgwdev hgLoadBed canFam2 simpleRepeat \ /cluster/data/canFam2/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # load of simpleRepeat did not go as planned: 637318 record(s), 0 row(s) skipped, 1172 warning(s) loading bed.tab featureBits canFam2 simpleRepeat # 52855902 bases of 2384996543 (2.216%) in intersection # canFam1 is #36509895 bases of 2359845093 (1.547%) in intersection # 2005-12-02 Error with simpleRepeats found during QA. 1622 entries have what # appears to be a parsing error. The "sequence" value is a float number of # format X.XX instead of an ATCG sequence. Lines removed from active tables. # Copy of original data on hgwdev.canFam2.simpleRepeat_1202problem # Partial redo 12/2/05 -- Jen found some lines with invalid values in # the database. All were from trf/chr10_1.bed which I moved aside to # trf/chr10_1.bed.bad. When I reran the chr10_1 job from jobs.csh, # it produced a valid output file. So I did the rest of the steps and # reloaded. ssh kolossus cd /cluster/data/canFam2/bed/simpleRepeat mv trf/chr10_1.bed{,.bad} /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf /cluster/data/canFam2/10/chr10_1/chr10_1.fa /dev/null -bedAt=trf/chr10_1.bed -tempDir=/tmp mv simpleRepeat.bed simpleRepeat.bed.bad liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed # on hgwdev: featureBits canFam2 simpleRepeat #52885863 bases of 2384996543 (2.217%) in intersection # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/canFam2/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed canFam2 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE 2005-07-11 braney) # 2005-12-02 angie: continuing redo of chr10_1, see below. 
# After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/canFam2/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/canFam2 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) set dir=`echo $c | sed "s/chr//" ` if (-e $dir/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' $dir/lift/ordered.lst > $dir/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$dir.bed jkStuff/liftAll.lft warn `cat $dir/lift/oTrf.lst` endif end # Here's the coverage for the filtered TRF: ssh hgwdev cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits canFam2 /tmp/filtTrf.bed #23111877 bases of 2384996543 (0.969%) in intersection # canFam1 was #23017541 bases of 2359845093 (0.975%) in intersection featureBits canFam2 /tmp/filtTrf.bed \!rmsk # 1205611 bases of 2384996543 (0.051%) in intersection #canFam1 was #1275941 bases of 2359845093 (0.054%) in intersection # 12/2/2005 -- since I regenerated trf/chr10_1.bed above, continue here... ssh kolossus cd /cluster/data/canFam2/bed/simpleRepeat mv trfMask/chr10_1.bed{,.orig} awk '{if ($5 <= 12) print;}' trf/chr10_1.bed > trfMask/chr10_1.bed mv trfMaskChrom/chr10.bed{,.orig} cd ../.. liftUp bed/simpleRepeat/trfMaskChrom/chr10.bed jkStuff/liftAll.lft warn `cat 10/lift/oTrf.lst` ssh hgwdev cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed featureBits canFam2 /tmp/filtTrf.bed #23133898 bases of 2384996543 (0.970%) in intersection featureBits canFam2 /tmp/filtTrf.bed \!rmsk #1206965 bases of 2384996543 (0.051%) in intersection # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2005-07-11 braney) # REDONE 2005-12-02 angie (to pick up the chr10_1 simpleRepeat redo) ssh kkstore01 cd /cluster/data/canFam2 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # Tons of warnings like this, mostly for L1M*: #WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e foreach c (`cat chrom.lst`) set c=`echo $c | sed "s/chr//" ` echo "repeat- and trf-masking contigs of chr$c" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the nib files, using the soft masking in the fa: foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/canFam2/nib because hgBlat looks there: faToTwoBit */chr*.fa canFam2.2bit ssh hgwdev ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/nib/ # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 8/1/05 angie) ssh hgwdev mkdir /gbdb/canFam2/html # Write ~/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html # with a description of the assembly and some sample position queries. 
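# (A hedged sketch of what such a page typically contains -- the real
# description.html was written by hand, and the text below is placeholder
# content rather than the page that was actually checked in.  The sample
# position is the dbDb defaultPos entered above.)
cat << '_EOF_' > $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
<P>
May 2005 draft assembly of the dog (<em>Canis familiaris</em>) genome,
Broad Institute v. 2.0, UCSC database canFam2.
</P>
<P>
Sample position queries: a chromosome range such as chr14:11072309-11078928,
or a gene symbol or mRNA accession.
</P>
'_EOF_'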
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/canFam2/html # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 8/1/05 angie) # pitakluster (blastz etc): ssh pk mkdir /san/sanvol1/scratch/canFam2 rsync -av /cluster/data/canFam2/nib /san/sanvol1/scratch/canFam2/ mkdir /san/sanvol1/scratch/canFam2/rmsk cp -p /cluster/data/canFam2/*/chr*.fa.out \ /san/sanvol1/scratch/canFam2/rmsk # small cluster (chaining etc): ssh kkr1u00 mkdir -p /iscratch/i/canFam2/nib rsync -av /cluster/data/canFam2/nib /iscratch/i/canFam2/ iSync # big cluster (genbank): ssh kkstore01 mkdir /cluster/bluearc/scratch/hg/canFam2 rsync -av /cluster/data/canFam2/nib /cluster/bluearc/scratch/hg/canFam2/ # ask cluster-admin to rsync that to all big cluster nodes' /scratch/... # MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 8/1/05 angie) ssh kolossus cd /san/sanvol1/scratch/canFam2/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Human in extra column 1, Mouse in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query dog -comp human -comp mouse end # Now extract human (extra column 1), mouse (extra column). cd .. mkdir linSpecRep.notInHuman mkdir linSpecRep.notInMouse foreach f (rmsk/*.out_homo-sapiens_mus-musculus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInHuman/$base.out.spec /cluster/bin/scripts/extractLinSpecReps 2 $f > \ linSpecRep.notInMouse/$base.out.spec end wc -l rmsk/*.out #4533630 total wc -l linSpecRep.notInHuman/* #1542788 total wc -l linSpecRep.notInMouse/* #1546408 total # Clean up. rm rmsk/*.out_h* # PRODUCING GENSCAN PREDICTIONS (DONE 8/11/05 angie) ssh hgwdev mkdir /cluster/data/canFam2/bed/genscan cd /cluster/data/canFam2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/canFam2/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/data/canFam2/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list #495 # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para make jobList #Completed: 493 of 495 jobs #Crashed: 2 jobs #Average job time: 918s 15.30m 0.25h 0.01d #Longest finished job: 30383s 506.38m 8.44h 0.35d #Submission to last job: 132236s 2203.93m 36.73h 1.53d # If there are crashes, diagnose with "para problems" and "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". 
ssh kkr7u00 cd /cluster/data/canFam2/bed/genscan /cluster/bin/x86_64/gsBig /cluster/data/canFam2/1/chr1_23/chr1_23.fa.masked gtf/chr1_23.fa.gtf -trans=pep/chr1_23.fa.pep -subopt=subopt/chr1_23.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 /cluster/bin/x86_64/gsBig /cluster/data/canFam2/27/chr27_1/chr27_1.fa.masked gtf/chr27_1.fa.gtf -trans=pep/chr27_1.fa.pep -subopt=subopt/chr27_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 ls -1 gtf | wc -l # 495 endsInLf gtf/* # Convert these to chromosome level files as so: ssh kkstore01 cd /cluster/data/canFam2/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/canFam2/bed/genscan ldHgGene -gtf canFam2 genscan genscan.gtf hgPepPred canFam2 generic genscanPep genscan.pep hgLoadBed canFam2 genscanSubopt genscanSubopt.bed # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 8/1/05 angie) ssh kolossus # numerator is canFam2 gapless bases as reported by featureBits, # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 2384996543 / 2867328468 \) \* 1024 # ( 2384996543 / 2867328468 ) * 1024 = 851.746316 # ==> use -repMatch=852 according to size scaled down from 1024 for human. mkdir /cluster/bluearc/canFam2 mkdir /cluster/data/canFam2/bed/ooc cd /cluster/data/canFam2/bed/ooc ls -1 /cluster/data/canFam2/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/canFam2/11.ooc -repMatch=852 #Wrote 27388 overused 11-mers to /cluster/bluearc/canFam2/11.ooc # AUTO UPDATE GENBANK MRNA RUN (DONE 8/8/05 angie) # 2/22/06: added refGene ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . 
# Edit etc/genbank.conf and add these lines: # canFam2 (dog) canFam2.genome = /scratch/hg/canFam2/nib/chr*.nib canFam2.lift = /cluster/data/canFam2/jkStuff/liftAll.lft canFam2.refseq.mrna.native.load = no canFam2.refseq.mrna.xeno.load = yes canFam2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 canFam2.genbank.mrna.xeno.load = yes canFam2.genbank.est.xeno.load = no canFam2.downloadDir = canFam2 cvs ci etc/genbank.conf # Edit src/align/gbBlat to add /cluster/bluearc/canFam2/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) refseq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 featureBits canFam2 xenoRefGene #41278562 bases of 2384996543 (1.731%) in intersection # Clean up: rm -rf work/initial.canFam2 ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 featureBits canFam2 mrna #2061533 bases of 2384996543 (0.086%) in intersection featureBits canFam2 xenoMrna #63057460 bases of 2384996543 (2.644%) in intersection # Clean up: rm -rf work/initial.canFam2 ssh eieio # -initial for ESTs: nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam2 & # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 canFam2 & featureBits canFam2 intronEst #16241242 bases of 2384996543 (0.681%) in intersection featureBits canFam2 est #41719045 bases of 2384996543 (1.749%) in intersection # Clean up: rm -rf work/initial.canFam2 # 2/22/06: add native refseq ssh hgwdev cd ~/kent/src/hg/makeDb/genbank # genbank.conf: canFam2.refseq.mrna.native.load = yes make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \ canFam2 & tail -f var/build/logs/2006.02.22-21:03:48.canFam2.initalign.log ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 & tail -f var/dbload/hgwdev/logs/2006.02.22-21:19:53.dbload.log featureBits canFam2 refGene #1222436 bases of 2384996543 (0.051%) in intersection # SWAP CHAINS FROM HG17, BUILD NETS ETC. (DONE 8/3/05 angie - REDONE 12/13/05 - REDONE 2/8/06) # hg17-canFam2 was redone 12/12/05 (makeHg17.doc) so re-running this... # and again 2/8/06, sheesh... mkdir /cluster/data/canFam2/bed/blastz.hg17.swap cd /cluster/data/canFam2/bed/blastz.hg17.swap doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.canFam2/DEF \ >& do.log & tail -f do.log # Add {chain,net}Hg17 to trackDb.ra if necessary. # RE-RUN NETTOAXT, AXTTOMAF FOR HG17 (DONE 10/31/05 angie) # Obsoleted 12/13/05 by re-run of canFam2-hg17 above. # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.hg17.swap/axtChain netSplit canFam2.hg17.net.gz net chainSplit chain canFam2.hg17.all.chain.gz cd .. 
mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /iscratch/i/hg17/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.hg17.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.hg17.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=hg17. $f \ /cluster/data/canFam2/chrom.sizes /cluster/data/hg17/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig # QUALITY SCORES (DONE 8/11/05 angie) ssh kkstore01 mkdir /cluster/data/canFam2/bed/quality cd /cluster/data/canFam2/bed/quality qaToQac ../../broad/contigs.quals.gz stdout \ | qacAgpLift ../../broad/UCSC_Dog2.0.agp stdin chrom.qac mkdir wigData # Build .wig, .wib files in current directory so that "wigData/" doesn't # appear in the .wig's: cd wigData foreach agp (../../../*/chr*.agp) set chr = $agp:t:r set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'` echo $chr to qual_$abbrev wiggle qacToWig -fixed ../chrom.qac -name=$chr stdout \ | wigEncode stdin qual_$abbrev.{wig,wib} end # Verify size of .wib file = chrom length foreach f (*.wib) set abbrev = $f:t:r set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'` set wibLen = `ls -l $f | awk '{print $5;}'` set chromLen = `grep -w $chr ../../../chrom.sizes | awk '{print $2;}'` if ($wibLen != $chromLen) then echo "ERROR: $chr size is $chromLen but wib size is $wibLen" else echo $chr OK. endif end # /gbdb & load: ssh hgwdev cd /cluster/data/canFam2/bed/quality/wigData mkdir -p /gbdb/canFam2/wib ln -s `pwd`/*.wib /gbdb/canFam2/wib hgLoadWiggle canFam2 quality *.wig # GC 5 BASE WIGGLE TRACK (DONE 8/11/05 angie) ssh kki mkdir /cluster/data/canFam2/bed/gc5Base cd /cluster/data/canFam2/bed/gc5Base cat > doGc5Base.csh <<'_EOF_' #!/bin/csh -fe set chr = $1 set c = `echo $chr | sed -e 's/^chr//; s/_random/r/;'` /cluster/bin/x86_64/hgGcPercent \ -chr=${chr} -wigOut -doGaps -file=stdout -win=5 canFam2 \ /iscratch/i/canFam2/nib \ | /cluster/bin/x86_64/wigEncode stdin gc5Base_${c}{.wig,.wib} '_EOF_' # << this line makes emacs coloring happy chmod a+x doGc5Base.csh cp /dev/null spec foreach c (`cat ../../chrom.lst`) echo "./doGc5Base.csh $c" >> spec end para make spec para time #Completed: 41 of 41 jobs #Average job time: 23s 0.38m 0.01h 0.00d #Longest finished job: 44s 0.73m 0.01h 0.00d #Submission to last job: 259s 4.32m 0.07h 0.00d # /gbdb and load track on hgwdev ssh hgwdev cd /cluster/data/canFam2/bed/gc5Base mkdir -p /gbdb/canFam2/wib ln -s `pwd`/*.wib /gbdb/canFam2/wib hgLoadWiggle canFam2 gc5Base *.wig # EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 8/12/05 angie) ssh kolossus cd /panasas/store/canFam2/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query dog -comp rat end # Now extract rat (extra column 1): cd .. mkdir linSpecRep.notInRat foreach f (rmsk/*.out_rattus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractRepeats 1 $f > \ linSpecRep.notInRat/$base.out.spec end # Clean up. 
rm rmsk/*.out_rat*

# LOAD CPGISLANDS (DONE 8/12/05 angie)
ssh hgwdev
mkdir -p /cluster/data/canFam2/bed/cpgIsland
cd /cluster/data/canFam2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/canFam2/bed/cpgIsland/
ssh kolossus
cd /cluster/data/canFam2/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
  set fout=$f:t:r:r.cpg
  echo running cpglh on $f to $fout
  ./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
  $2 = $2 - 1;
  width = $3 - $2;
  printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIsland
hgLoadBed canFam2 cpgIslandExt -tab -noBin \
  -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
wc -l cpgIsland.bed
# 47804 cpgIsland.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection

# ANDY LAW CPGISLANDS (DONE 8/12/05 angie)
# See notes in makeGalGal2.doc.
ssh kolossus
mkdir /cluster/data/canFam2/bed/cpgIslandGgfAndy
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
  set chr = $f:t:r:r
  echo preproc and run on masked $chr
  /cluster/home/angie/bin/i386/preProcGgfAndy $f \
  | /cluster/home/angie/ggf-andy-cpg-island.pl \
  | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
      $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
      $pGc = (100.0 * $gc / $n); \
      $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
           "$pCpG\t$pGc\t$oE\n";' \
  >> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
  $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed canFam2 cpgIslandGgfAndyMasked -tab -noBin \
  -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
featureBits canFam2 cpgIslandGgfAndyMasked
#99187178 bases of 2384996543 (4.159%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 47804 ../cpgIsland/cpgIsland.bed
# 138037 cpgIslandGgfAndyMasked.bed

# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & BEYOND (DONE 8/15/05 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz.  Do the same for dog.
# Scripts expect *.out.spec filenames, so set that up:
ssh kkstore01
cd /cluster/data/canFam2
mkdir /panasas/store/canFam2/linSpecRep.notInNonMammal
foreach f (/panasas/store/canFam2/rmsk/chr*.fa.out)
  ln $f /panasas/store/canFam2/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end

# SWAP CHAINS FROM MM6, BUILD NETS ETC.
(DONE 8/15/05 angie) # REDONE by hiram 12/7/05 -- tables loaded as {chain,net}Mm6u1 then # renamed to {chain,net}Mm6 12/13/05 angie mkdir /cluster/data/canFam2/bed/blastz.mm6.swap cd /cluster/data/canFam2/bed/blastz.mm6.swap doBlastzChainNet.pl -swap /cluster/data/mm6/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.mm6.swap/do.log" \ | mail -s "check do.log" $USER # Add {chain,net}Mm6 to trackDb.ra if necessary. # RE-RUN NETTOAXT, AXTTOMAF FOR MM6 (DONE 11/1/05 angie) # Obsoleted by re-run of canFam2-mm6 above. # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain netSplit canFam2.mm6.net.gz net chainSplit chain canFam2.mm6.all.chain.gz cd .. mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /cluster/data/mm6/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.mm6.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.mm6.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=mm6. $f \ /cluster/data/canFam2/chrom.sizes /cluster/data/mm6/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig # SWAP CHAINS FROM RN3, BUILD NETS ETC. (DONE 8/16/05 angie) mkdir /cluster/data/canFam2/bed/blastz.rn3.swap cd /cluster/data/canFam2/bed/blastz.rn3.swap doBlastzChainNet.pl -swap /cluster/data/rn3/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.rn3.swap/do.log" \ | mail -s "check do.log" $USER # Add {chain,net}Rn3 to trackDb.ra if necessary. # Found that downloads in rn3 was missing 2005-12-21 - remake - Hiram cd /cluster/data/rn3/bed/blastz.canFam2 /cluster/bin/scripts/doBlastzChainNet.pl -continue=download \ /cluster/data/rn3/bed/blastz.canFam2/DEF \ -chainMinScore=1000 -chainLinearGap=loose > downloads.out 2>&1 # RE-RUN NETTOAXT, AXTTOMAF FOR RN3 (DONE 11/2/05 angie) # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. ssh kolossus cd /cluster/data/canFam2/bed/blastz.rn3.swap/axtChain netSplit canFam2.rn3.net.gz net chainSplit chain canFam2.rn3.all.chain.gz cd .. mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/canFam2/nib /cluster/data/rn3/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.canFam2.rn3.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.canFam2.rn3.net.axt.gz) axtToMaf -tPrefix=canFam2. -qPrefix=rn3. 
$f \
  /cluster/data/canFam2/chrom.sizes /cluster/data/rn3/chrom.sizes \
  stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig

# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (DONE 11/28/05 angie)
hgsql -h genome-testdb hgcentraltest \
  -e 'update defaultDb set name = "canFam2" where genome = "Dog";'

# MAKE Human Proteins track (DONE braney 2005-12-10)
ssh kkstore01
cd /cluster/data/canFam2
cat */chr*/*.lft > jkStuff/subChr.lft
mkdir blastDb
for i in */*/*_*_*.fa; do ln `pwd`/$i blastDb; done
cd blastDb
for i in *.fa; do /cluster/bluearc/blast229/formatdb -p F -i $i; rm $i; done
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p $destDir
cd /cluster/data/canFam2/blastDb
for i in nin nsq nhr; do cp *.$i $destDir; done
mkdir -p /cluster/data/canFam2/bed/tblastn.hg17KG
cd /cluster/data/canFam2/bed/tblastn.hg17KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(500000/6149) = 459.514770
split -l 460 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
  mv $f.8 $f.1
  break;
fi
done
if test -f $f.1
then
  if /cluster/bin/i386/blastToPsl $f.1 $f.2
  then
    liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
    liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
    liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
    mv $3.tmp $3
    rm -f $f.1 $f.2
    exit 0
  fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28863259s 481054.31m 8017.57h 334.07d 0.915 y
# IO & Wait Time: 4255045s 70917.42m 1181.96h 49.25d 0.135 y
# Average job time: 66s 1.09m 0.02h 0.00d
# Longest finished job: 443s 7.38m 0.12h 0.01d
# Submission to last job: 166648s 2777.47m 46.29h 1.93d

ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg17KG
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst ../../chrom.lst chainGsub chainSpec
para create chainSpec
para push
# Completed: 3362 of 3362 jobs
# CPU time in finished jobs: 1078406s 17973.43m 299.56h 12.48d 0.034 y
# IO & Wait Time: 85905s 1431.75m 23.86h 0.99d 0.003 y
# Average job time: 346s 5.77m 0.10h 0.00d
# Longest finished job: 22556s 375.93m 6.27h 0.26d
# Submission to last job: 106376s 1772.93m 29.55h 1.23d

ssh kkstore01
cd /cluster/data/canFam2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
  awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
  sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
  awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
  echo $i
done
sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/canFam2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
wc blastHg17KG.psl
# 64343 1351203 10913814 blastHg17KG.psl
pslUniq blastHg17KG.psl stdout | wc
# 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg17KG
hgLoadPsl canFam2 blastHg17KG.psl
featureBits canFam2 blastHg17KG
# 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kkstore01
rm -rf blastOut
# End tblastn

# BLASTZ SELF (DONE braney 8-29-2005)
# For future reference -- doBlastzChainNet.pl should have been used
ssh pk
mkdir -p /cluster/data/canFam2/bed/blastz.canFam2
cd /cluster/data/canFam2/bed/blastz.canFam2
cat << '_EOF_' > DEF
# dog vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Dog
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/canFam2/bed/blastz.canFam2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
cp /cluster/data/canFam2/chrom.sizes S1.len
cp /cluster/data/canFam2/chrom.sizes S2.len
mkdir run
cd run
partitionSequence.pl 10000000 10000 /cluster/data/canFam2/nib ../S1.len \
  -xdir xdir.sh -rawDir ../lav \
  > canFam2.lst
csh -ef xdir.sh
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 canFam2.lst canFam2.lst gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 70756 of 70756 jobs
# CPU time in finished jobs: 5743034s 95717.24m 1595.29h 66.47d 0.182 y
# IO & Wait Time: 199800s 3330.00m 55.50h 2.31d 0.006 y
# Average job time: 84s 1.40m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2758s 45.97m 0.77h 0.03d
# Submission to last job: 17852s 297.53m 4.96h 0.21d

# convert lav files to psl
# First, combine all files per partition (too many files per chrom
# to do in one swoop!).  Then do another round to collect all files
# per chrom.
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir pslChrom pslParts run.lavToPsl
cd run.lavToPsl
ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst
# For self alignments, we need lavToAxt's -dropSelf behavior,
# so go lav->axt->psl... could add -dropSelf to lavToPsl, but that
# might be somewhat invasive and perhaps not really much faster (?)
# because the sequence must be dug up in order to rescore...
cat << '_EOF_' > do.csh #!/bin/csh -ef cat $1/*.lav \ | lavToAxt -dropSelf stdin /iscratch/i/canFam2/nib \ /iscratch/i/canFam2/nib stdout \ | axtToPsl stdin ../S1.len ../S2.len stdout \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 parts.lst single gsub jobList para create jobList para try, check, push, check, ... # Completed: 266 of 275 jobs # Crashed: 9 jobs # CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y # IO & Wait Time: 7117s 118.61m 1.98h 0.08d 0.000 y # Average job time: 31s 0.52m 0.01h 0.00d # Longest finished job: 74s 1.23m 0.02h 0.00d # Submission to last job: 641s 10.68m 0.18h 0.01d awk '{print $1;}' /cluster/data/canFam2/chrom.sizes > chroms.lst cat << '_EOF_' > doChrom.csh #!/bin/csh -ef zcat ../pslParts/$1.*.psl.gz \ | sort -k 14,14 -k 16n,17n \ | gzip -c > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChrom.csh cat << '_EOF_' > gsub.chroms #LOOP ./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz } #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chroms.lst single gsub.chroms jobList para create jobList para try, check, push, check, ... # Completed: 40 of 40 jobs # CPU time in finished jobs: 463s 7.71m 0.13h 0.01d 0.000 y # IO & Wait Time: 178s 2.97m 0.05h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest finished job: 38s 0.63m 0.01h 0.00d # Submission to last job: 61s 1.02m 0.02h 0.00d # CHAIN SELF BLASTZ # For future reference -- doBlastzChainNet.pl should have been used # REDONE partially, see below (from the filtering step onward, to get rid # of duplicates, 12/6/05 angie) # CHAINS REMERGED to reassign IDs so that they're unique genome-wide, and # nets deleted to avoid confusion, 12/14/05 angie. The IDs were not unique # genome-wide (only per-chrom) because the filtering was done directly on # run1/chain files which had not yet had their IDs reassigned by # chainMergeSort, and after filtering, chainMergeSort -saveId was used... doh. # It would have been OK if the filtering had been done post-merging, # or if -saveId had been omitted after filtering. caught by joinerCheck. # Also could have been avoided by just using doBlastzChainNet.pl the first time. # Run axtChain on little cluster ssh kki cd /cluster/data/canFam2/bed/blastz.canFam2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S ../../pslChrom/*.psl.gz > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef setenv PATH /cluster/bin/x86_64:$PATH axtChain -verbose=0 -psl $1 /iscratch/i/canFam2/nib \ /iscratch/i/canFam2/nib stdout \ | chainAntiRepeat /iscratch/i/canFam2/nib /iscratch/i/canFam2/nib \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 40 of 41 jobs #Crashed: 1 jobs #Average job time: 185s 3.09m 0.05h 0.00d #Longest job: 490s 8.17m 0.14h 0.01d #Submission to last job: 74789s 1246.48m 20.77h 0.87d # now on the cluster server, sort chains ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain # Had these steps actually been followed it would have saved some # trouble, but instead some custom filtering was performed directly # on run1/chain files and they never got their IDs reassigned: chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # trim to minScore=20000 to cut some of the fluff mkdir chainFilt foreach f (chain/*.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t end gzip -c all.chain > all.chain.unfiltered.gz rm -r chain mv chainFilt chain chainMergeSort -saveId chain/*.chain > all.chain # Load chains into database ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain canFam2 ${c}_chainSelf $i end # 12/6/05 angie -- Jen found that all chains were duplicated -- # turns out that axtChain/run1/chain/ contained some chr*.filt.chain # in addition to chr*.psl.chain, so *.chain resulted in duplicates. # Redo the filtering and chain merge, then re-run # doBlastzChainNet.pl -continue net # to redo the subsequent steps. ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainMergeSort run1/chain/chr*.psl.chain \ | gzip -c > all.chain.unfiltered.gz rm -r chain chainSplit chain all.chain.unfiltered.gz mkdir chainFilt foreach f (run1/chain/chr*.psl.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t:r:r.chain end rm -r chain mv chainFilt chain chainMergeSort -saveId chain/*.chain \ | gzip -c > canFam2.canFam2.all.chain.gz ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2 rm -r axtNet mafNet rm -r /usr/local/apache/htdocs/goldenPath/canFam2/vsSelf/ doBlastzChainNet.pl -continue net DEF >& redo.log & tail -f redo.log # 12/14/05 angie -- Jen ran joinerCheck and it complained about # non-unique IDs in the chain tables. fixing... ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainSplit chain canFam2.canFam2.all.chain.gz mv canFam2.canFam2.all.chain.gz canFam2.canFam2.all.chain.badIDs.gz chainMergeSort chain/*.chain \ | gzip -c > canFam2.canFam2.all.chain.gz rm -r chain chainSplit chain canFam2.canFam2.all.chain.gz # Load chains into database ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain canFam2 ${c}_chainSelf $i end cd .. rm -r chain # I removed the self net in order to avoid confusion, so no need to # regenerate it. 
# NET SELF BLASTZ # For future reference -- doBlastzChainNet.pl should have been used # REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above ssh kolossus cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain chainPreNet all.chain.gz ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain netClass -noAr noClass.net canFam2 canFam2 self.net rm noClass.net # Load the nets into database netFilter -minGap=10 self.net | hgLoadNet canFam2 netSelf stdin # Add entries for chainSelf, netSelf to dog/canFam2 trackDb # ADD SELF DOWNLOADABLE FILES (DONE 11/21/05 angie) # For future reference -- doBlastzChainNet.pl should have been used for # all steps from blastz up to this point. # REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain mv self.net canFam2.canFam2.net gzip canFam2.canFam2.net cd .. doBlastzChainNet.pl -continue download -stop download DEF \ >& doDownload.log tail -f doDownload.log # MULTIZ.V10 4WAY (DOG/H/M/R) (DONE 11/7/05 angie - REDONE 12/13/05) # hg17 and mm6 alignments were redone -- re-run this to pick up improvements. # Tree: ((canFam2 hg17) (mm6 rn3)) ssh kkstore01 mkdir /cluster/data/canFam2/bed/multiz4way.2005-12-13 rm -f /cluster/data/canFam2/bed/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way.2005-12-13 \ /cluster/data/canFam2/bed/multiz4way cd /cluster/data/canFam2/bed/multiz4way # Setup: Copy pairwise MAF to /san/sanvol1/scratch: mkdir /san/sanvol1/scratch/dogMultiz4way foreach db (hg17 mm6 rn3) echo $db cp -pR /cluster/data/canFam2/bed/blastz.$db/mafNet \ /san/sanvol1/scratch/dogMultiz4way/$db end ls -lLR /san/sanvol1/scratch/dogMultiz4way # Make output dir and script to use /cluster/bin/scripts/runMultizV10.csh: mkdir maf cat << '_EOF_' > doMultizAll.csh #!/bin/csh -fex set chr = $1 set tmpDir = /scratch/dogMultiz4way.$chr mkdir $tmpDir set mafScratch = /san/sanvol1/scratch/dogMultiz4way # Really should write a perl script to take a tree like this and generate # commands like the ones below: # ((canFam2 hg17) (mm6 rn3)) /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/mm6/$chr.maf.gz \ $mafScratch/rn3/$chr.maf.gz \ 0 canFam2.$chr $tmpDir $tmpDir/$chr.MusRat.maf /cluster/bin/scripts/runMultizV10.csh \ $mafScratch/hg17/$chr.maf.gz \ $tmpDir/$chr.MusRat.maf \ 1 canFam2.$chr $tmpDir maf/$chr.maf rm -f $tmpDir/*.maf rmdir $tmpDir '_EOF_' # << for emacs chmod 775 doMultizAll.csh awk '{print "./doMultizAll.csh " $1;}' /cluster/data/canFam2/chrom.sizes \ > jobs.lst # Run on brawny cluster ssh pk cd /cluster/data/canFam2/bed/multiz4way para make jobs.lst para time #Completed: 41 of 41 jobs #Average job time: 1296s 21.61m 0.36h 0.02d #Longest finished job: 2879s 47.98m 0.80h 0.03d #Submission to last job: 2879s 47.98m 0.80h 0.03d ls -1 missing* #ls: No match. 
# make /gbdb/ links to 4way maf files: ssh hgwdev mkdir -p /gbdb/canFam2/multiz4way/maf/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \ /gbdb/canFam2/multiz4way/maf/multiz4way/ # load into database cd /tmp hgLoadMaf -warn canFam2 multiz4way \ -pathPrefix=/gbdb/canFam2/multiz4way/maf/multiz4way # load summary table to replace pairwise cat /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \ | nice hgLoadMafSummary canFam2 multiz4waySummary stdin # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_2" hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_3" # put 4way MAF out for download ssh kolossus cd /cluster/data/canFam2/bed/multiz4way mkdir mafDownload foreach f (maf/*.maf) nice gzip -c $f > mafDownload/$f:t.gz end cd mafDownload md5sum *.maf.gz > md5sum.txt # make a README.txt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way ln -s /cluster/data/canFam2/bed/multiz4way/mafDownload/{*.maf.gz,*.txt} \ /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way # Cleanup rm -rf /san/sanvol1/scratch/dogMultiz4way/ # PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 11/9/05 angie - REDONE 12/13/05) # ((canFam2,hg17),(mm6,rn3)) # redo of hg17, mm6 -> multiz -> phastCons. ssh kkstore01 mkdir -p /san/sanvol1/scratch/canFam2/chrom cp -p /cluster/data/canFam2/?{,?}/chr*.fa /san/sanvol1/scratch/canFam2/chrom/ # Split chrom fa into smaller windows for phastCons: ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.split cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.split set WINDOWS = /san/sanvol1/scratch/canFam2/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/san/sanvol1/scratch/canFam2/chrom set WINDOWS=/san/sanvol1/scratch/canFam2/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O canFam2,hg17,mm6,rn3 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/canFam2/bed/multiz4way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in exists+ $file}" >> jobList endif end para make jobList para time #Completed: 41 of 41 jobs #Average job time: 132s 2.21m 0.04h 0.00d #Longest finished job: 232s 3.87m 0.06h 0.00d #Submission to last job: 232s 3.87m 0.06h 0.00d ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. 
# Use phyloFit to make an initial model: ssh kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons /cluster/bin/phast/msa_view ../maf/chr{2,20,37}.maf \ --aggregate canFam2,hg17,mm6,rn3 \ -i MAF -o SS > all.ss /cluster/bin/phast/phyloFit all.ss \ --tree "((canFam2,hg17),(mm6,rn3))" \ -i SS --out-root starting-tree cat starting-tree.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #TRAINING_LNL: -378601835.277927 #BACKGROUND: 0.285684 0.213599 0.213743 0.286974 #RATE_MAT: # -0.889874 0.169851 0.563150 0.156874 # 0.227172 -1.149929 0.168661 0.754097 # 0.752694 0.168547 -1.148977 0.227736 # 0.156169 0.561285 0.169622 -0.887076 #TREE: ((canFam2:0.161521,hg17:0.166335):0.114597,(mm6:0.071038,rn3:0.076403):0.114597); # also get GC content from model -- if similar enough, no need to extract it # separately above. awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod #0.427342 # OK, use .427 for --gc below. # Use values of --target-coverage and --expected-lengths sort of like # those in the latest run on human, 0.25 and 12, just as a starting point. # Multiply each subst rate on the TREE line by 3.75 which is roughly the # ratio of noncons to cons in # /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate/ave.* /cluster/bin/phast/tree_doctor -s 3.75 starting-tree.mod \ > starting-tree.noncons.mod /cluster/bin/phast/consEntropy --NH 9.7834 0.25 12 \ starting-tree{,.noncons}.mod #( Solving for new omega: 12.000000 13.069907 13.002421 13.002166 ) #Transition parameters: gamma=0.250000, omega=12.000000, mu=0.083333, nu=0.027778 #Relative entropy: H=0.803151 bits/site #Expected min. length: L_min=11.957635 sites #Expected max. length: L_max=9.271001 sites #Phylogenetic information threshold: PIT=L_min*H=9.603785 bits #Recommended expected length: omega=13.002166 sites (for L_min*H=9.783400) # OK, use --expected-lengths 13. ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.18 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH. # -- it gets that error unless I raise coverage to 0.34. Well, # stick with 13 for now... # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.15 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH. # -- it gets that error unless I raise coverage to 0.31. Well, # stick with 13 for now... # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate # REDO ONLY cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod . # REDO ONLY -- skip the first iteration stuff from here on out, and # do subsequent iteration stuff using params from 11-07 run. # FIRST ITERATION: Use ../starting-tree.mod: # REDO ONLY cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod . # REDO ONLY -- skip the First iteration stuff from here on out, and # do subsequent iteration stuff using params from 11-07 run. 
cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 13 --target-coverage 0.25 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 13 --target-coverage 0.15 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end para make jobList para time #Completed: 2434 of 2434 jobs #Average job time: 29s 0.48m 0.01h 0.00d #Longest finished job: 59s 0.98m 0.02h 0.00d #Submission to last job: 214s 3.57m 0.06h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: ((canFam2:0.063241,hg17:0.067179):0.047808,(mm6:0.029756,rn3:0.031924):0.047808); #ave.noncons.mod:TREE: ((canFam2:0.201135,hg17:0.213229):0.151937,(mm6:0.094208,rn3:0.101249):0.151937); # SECOND ITERATION: #ave.cons.mod:TREE: ((canFam2:0.056147,hg17:0.059693):0.042487,(mm6:0.026610,rn3:0.028533):0.042487); #ave.noncons.mod:TREE: ((canFam2:0.192186,hg17:0.203958):0.145480,(mm6:0.090727,rn3:0.097451):0.145480); # THIRD ITERATION: #ave.cons.mod:TREE: ((canFam2:0.052533,hg17:0.055873):0.039758,(mm6:0.024967,rn3:0.026765):0.039758); #ave.noncons.mod:TREE: ((canFam2:0.188439,hg17:0.200074):0.142743,(mm6:0.089242,rn3:0.095829):0.142743); # REDO: #ave.cons.mod:TREE: ((canFam2:0.052883,hg17:0.056022):0.039955,(mm6:0.024815,rn3:0.026742):0.039955); #ave.noncons.mod:TREE: ((canFam2:0.188924,hg17:0.199837):0.142839,(mm6:0.088391,rn3:0.095457):0.142839); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # FIRST ITERATION ONLY: # Check the total entropy figure to see if we're way off. 
# This takes an hour for 4way (exponential in #species) and has never # produced a different answer from the input after the first iteration, # so do this for the first iteration only: /cluster/bin/phast/consEntropy --NH 9.7834 0.25 13 ave.{cons,noncons}.mod #ERROR: too many iterations, not converging; try without --NH # Dang. That is a new one. Oh well, I'll proceed with 13. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. The we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 13 --target-coverage 0.15 \ --quiet --seqname $chr --idpref $pref \ --viterbi /san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/$chr/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS} foreach chr (`awk '{print $1;}' /cluster/data/canFam2/chrom.sizes`) mkdir /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}/$chr end rm -f jobList foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para make jobList para time #Completed: 2434 of 2434 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest finished job: 39s 0.65m 0.01h 0.00d #Submission to last job: 74s 1.23m 0.02h 0.00d # back on kolossus: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/canFam2/bed/multiz4way/phastCons cp /dev/null all.bed foreach d (/san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/*) echo $d:t awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ $d/*.bed \ | /cluster/bin/scripts/lodToBedScore >> all.bed end ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. # However we don't have dog gene annotations -- we just have xenoRefGene # which is of course biased towards conserved genes. So cover will be # artificially high. Shoot for "all.bed 5%" and make sure cover is # somewhat higher than 68.9%. cd /cluster/data/canFam2/bed/multiz4way/phastCons featureBits -enrichment canFam2 xenoRefGene:cds all.bed # FIRST ITERATION: too high, reduce target-coverage: #xenoRefGene:cds 1.268%, all.bed 6.812%, both 1.091%, cover 86.06%, enrich 12.63x # SECOND ITERATION: still too high. #xenoRefGene:cds 1.268%, all.bed 5.432%, both 1.048%, cover 82.67%, enrich 15.22x # THIRD ITERATION: close enough. #xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.83%, enrich 16.29x # REDO: close enough. #xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.85%, enrich 16.30x # Having met the CDS coverage target, load up the results. 
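# (Optional hedged cross-check, not part of the original run: the "all.bed N%"
#  figure from featureBits can be approximated straight from the BED plus
#  chrom.sizes.  featureBits divides by non-gap bases (2384996543), so this
#  rough version, which divides by total chrom.sizes bases, will read slightly
#  low.)
    awk '{bases += $3 - $2} END {print bases, "bases covered by elements"}' all.bed
    awk '{total += $2} END {print total, "bases in chrom.sizes"}' \
        /cluster/data/canFam2/chrom.sizes
    # covered/total should land near the ~5% target discussed above.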
hgLoadBed canFam2 phastConsElements4way all.bed # Create wiggle ssh pk mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib rm -rf /san/sanvol1/scratch/canFam2/phastCons/wib mkdir -p /san/sanvol1/scratch/canFam2/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /san/sanvol1/scratch/canFam2/phastCons/wib zcat `ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS \ | sed -e 's/\/$//'`) echo doWigEncode $chr >> jobList end para make jobList para time #Completed: 41 of 41 jobs #Average job time: 35s 0.59m 0.01h 0.00d #Longest finished job: 67s 1.12m 0.02h 0.00d #Submission to last job: 67s 1.12m 0.02h 0.00d # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from san/sanvol1 cd /cluster/data/canFam2/bed/multiz4way/phastCons rm -rf wib POSTPROBS rsync -av /san/sanvol1/scratch/canFam2/phastCons/wib . rsync -av /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/canFam2/multiz4way/wib cd /cluster/data/canFam2/bed/multiz4way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/canFam2/multiz4way/wib/ hgLoadWiggle canFam2 phastCons4way \ -pathPrefix=/gbdb/canFam2/multiz4way/wib wib/*.wig rm wiggle.tab # and clean up san/sanvol1. rm -r /san/sanvol1/scratch/canFam2/phastCons/{ELEMENTS,POSTPROBS,wib} rm -r /san/sanvol1/scratch/canFam2/chrom # Offer raw scores for download since fly folks are likely to be interested: # back on kolossus cd /cluster/data/canFam2/bed/multiz4way/phastCons/POSTPROBS mkdir ../postprobsDownload foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`) zcat `ls -1 $chr/$chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \ > ../postprobsDownload/$chr.pp.gz end cd ../postprobsDownload md5sum *.gz > md5sum.txt # Make a README.txt there too. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way cd /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way ln -s /cluster/data/canFam2/bed/multiz4way/phastCons/postprobsDownload/* . # UNCERTIFIED ASSEMBLY REGIONS (DONE 11/9/05 angie) ssh hgwdev mkdir /cluster/data/canFam2/bed/uncertified cd /cluster/data/canFam2/bed/uncertified mv ../../broad/canFam2.0uncertifiedRegions.txt . tail +2 canFam2.0uncertifiedRegions.txt \ | awk -F"\t" '{print "chr" $1 "\t" $2 "\t" $3 "\t" $7 "\t" $6;}' \ | sed -e 's/Missing read partners/MRP/; s/Haplotype inconsistency/HI/; \ s/Negative gap/NG/; s/Linking inconsistency/LI/; \ s/Illogical links/IL/; s/; /+/g;' \ > uncertified.bed hgLoadBed -tab canFam2 uncertified uncertified.bed # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/21/05 angie) # REDONE parts: Sequence and simpleRepeat. 12/5/05 angie ssh kolossus cd /cluster/data/canFam2 #- Build the .tar.gz files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. 
'_EOF_' # << this line makes emacs coloring happy csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. cd bigZips md5sum *.gz > md5sum.txt # Make a README.txt cd .. mkdir chromGz foreach f ( ?{,?}/chr*.fa ) echo $f:t:r gzip -c $f > chromGz/$f:t.gz end cd chromGz md5sum *.gz > md5sum.txt # Make a README.txt #- Link the .gz and .txt files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/canFam2 mkdir -p $gp/bigZips ln -s /cluster/data/canFam2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips mkdir -p $gp/chromosomes ln -s /cluster/data/canFam2/chromGz/{chr*.gz,*.txt} $gp/chromosomes # Take a look at bigZips/* and chromosomes/* # Can't make refGene upstream sequence files - no refSeq for dog. mkdir database # Create README.txt files in database/ to explain the files. # CHORI BAC END PAIRS (DONE 11/22/05 angie) # Rachel downloaded and parsed BAC end sequences and pair info from # CHORI and NCBI -- seee makeCanFam1.doc "BAC END PAIRS". Use those # files and align to canFam2. # Do BLAT alignments of BAC ends to the genome on the pitakluster. # copy over masked contigs to the san ssh kkstore01 cd /cluster/data/canFam2 mkdir /san/sanvol1/scratch/canFam2/maskedContigs foreach d (?{,?}/chr*_?{,?}) echo $d:t cp -p $d/$d:t.fa /san/sanvol1/scratch/canFam2/maskedContigs/ end # copy over 11.ooc file to the san cp -p /cluster/bluearc/canFam2/11.ooc /san/sanvol1/scratch/canFam2/ # make output directory, run directory and bed/ directory mkdir -p /san/sanvol1/scratch/canFam2/bacendsRun/psl mkdir /cluster/data/canFam2/bed/bacends ssh pk cd /san/sanvol1/scratch/canFam2/bacendsRun echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub ls -1S /san/sanvol1/scratch/canFam2/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst gensub2 contigs.lst bacends.lst gsub jobList para make jobList para time #Completed: 49005 of 49005 jobs #Average job time: 7s 0.12m 0.00h 0.00d #Longest finished job: 105s 1.75m 0.03h 0.00d #Submission to last job: 1884s 31.40m 0.52h 0.02d # back on kkstore01, retrieve pitakluster results cd /cluster/data/canFam2/bed/bacends rsync -av /san/sanvol1/scratch/canFam2/bacendsRun/{*.lst,batch*,g*,j*,p*} . 
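# (Illustration only, not captured from the run: gensub2 expands the #LOOP
#  template in gsub once per (contig, bacends file) pair, substituting
#  $(path1)/$(path2) with the full paths and $(root1)/$(root2) with the file
#  names minus directory and extension.  The file names below are hypothetical
#  examples, so one jobList line looks roughly like:)
# /cluster/bin/x86_64/blat /san/sanvol1/scratch/canFam2/maskedContigs/chr1_1.fa /san/sanvol1/scratch/canFam1/bacends/bacends1.fa -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/chr1_1.bacends1.psl}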
# lift alignments ssh kolossus cd /cluster/data/canFam2/bed/bacends pslSort dirs raw.psl tmp psl # started 08:15, PID: 16561 pslCheck raw.psl >& check.log wc -l check.log #4379 check.log grep '< previous block' check.log | wc -l #2194 grep -v 'Error: invalid PSL' check.log | grep -v '< previous block' | wc -l #0 wc -l raw.psl #27352955 raw.psl -- 2194 / 27352955 = 0.000080 = 0.008% overlapping pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null # Processed 27147454 alignments wc -l bacEnds.psl #769149 bacEnds.psl liftUp bacEnds.lifted.psl /cluster/data/canFam2/jkStuff/liftAll.lft \ warn bacEnds.psl awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l #317042 faSize /san/sanvol1/scratch/canFam1/bacends/bacends*.fa #290482800 bases (10817785 N's 279665015 real 279665015 upper 0 lower) in 393408 sequences in 99 files calc 317042 / 393408 #317042 / 393408 = 0.805886 # Make BAC end pairs track: mkdir pairs cd pairs set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1 /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds wc -l * # 40 bacEnds.long # 460 bacEnds.mismatch # 44366 bacEnds.orphan # 133248 bacEnds.pairs # 478 bacEnds.short # 816 bacEnds.slop # 179408 total # Filter by score and sort by {chrom,chromStart}: awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed cat bacEnds.{slop,short,long,mismatch,orphan} \ | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed wc -l *.bed # 133241 bacEndPairs.bed # 46057 bacEndPairsBad.bed # 179298 total extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \ bacEndPairsBad.bed \ | sorttbl tname tstart | headchg -del \ > bacEnds.load.psl # load into database ssh hgwdev cd /cluster/data/canFam2/bed/bacends/pairs hgLoadBed canFam2 bacEndPairs bacEndPairs.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql #Loaded 133241 elements of size 11 # note - this next track isn't pushed to RR, just used for assembly QA hgLoadBed canFam2 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql #Loaded 46057 elements of size 11 hgLoadPsl canFam2 -table=all_bacends bacEnds.load.psl #load of all_bacends did not go as planned: 748544 record(s), 0 row(s) skipped, 928 warning(s) loading psl.tab # Diagnose... echo select \* from all_bacends | hgsql -N canFam2 > /tmp/1 diff psl.tab /tmp/1 | less # Looks like some rows of psl.tab have negative numbers in the # qBaseInsert column! 
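# (Hedged follow-up sketch, not from the original session: count and eyeball
#  the offending rows.  This assumes psl.tab carries hgLoadPsl's leading bin
#  column, putting qBaseInsert in field 7 (qName in 11, tName in 15); if the
#  table was built without a bin column, shift each field down by one.)
    awk '$7 < 0' psl.tab | wc -l
    awk '$7 < 0 {print $11, $15, $7}' psl.tab | head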
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
    mkdir -p /gbdb/canFam2/bacends
    ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
        /gbdb/canFam2/bacends/canFamBacends.fa
    hgLoadSeq canFam2 /gbdb/canFam2/bacends/canFamBacends.fa
    #393408 sequences
    # featureBits canFam1 all_bacends
    #211644790 bases of 2359845093 (8.969%) in intersection
    featureBits canFam2 all_bacends
    #218709219 bases of 2384996543 (9.170%) in intersection
    # featureBits canFam1 bacEndPairs
    #2334084046 bases of 2359845093 (98.908%) in intersection
    featureBits canFam2 bacEndPairs
    #2353239742 bases of 2384996543 (98.668%) in intersection
    # featureBits canFam1 bacEndPairsBad
    # 548130287 bases of 2359845093 (23.227%) in intersection
    featureBits canFam2 bacEndPairsBad
    #534195657 bases of 2384996543 (22.398%) in intersection
    # add trackDb entry and html
    # Clean up
    ssh pk
    rm -r /san/sanvol1/scratch/canFam2/bacendsRun

# ACCESSIONS FOR CONTIGS (DONE 2006-01-04 kate)
    # Found WGS project entry in Genbank for canFam2:
    # LOCUS AAEX02000000 35696 rc DNA linear MAM 13-DEC-2005
    # DEFINITION Canis familiaris whole genome shotgun sequencing project.
    # Project has a total of 35696 contigs, accessioned as
    # AAEX02000001-AAEX02035696, also named cont2.0--cont2.35695
    # Poked around after submitting "canis[ORGN] AND WGS[KYWD]" search
    # to Entrez Nucleotide
    cd /cluster/data/canFam2/broad
    grep contig Dog2.0.agp | wc -l
    # 35696
    grep contig Dog2.0.agp | head -1
    # chr01 3000001 3024025 2 W contig_30884 ...
    hgsql canFam2 -Ns -e "select frag from chr1_gold limit 1"
    # contig_30884
    grep contig_0 Dog2.0.agp
    # chrUn 57343783 57365839 7071 W contig_0
    grep contig_35695 Dog2.0.agp
    # chrUn 73893245 73902697 11515 W contig_35695
    grep contig_35696 Dog2.0.agp
    # nothing
    # (contig_* numbers in the AGP run 0-35695; the accessions are 1-based,
    #  ending at AAEX02035696 in Genbank entries)
    # Looked at summary file of contig entries from Genbank by
    # clicking the project entry (AAEX02000000), then clicking on
    # the WGS contig accession list at the end of the page
    # the AGP and our Gap tables frag names contig_* correspond to the
    # cont2.* names and AAEX02* names (-1) in the Genbank entries,
    # so we don't need to parse out the comments from the summary
    # entries.
    # create file mapping our contig names to Genbank accessions
    cd /cluster/data/canFam2/bed
    mkdir -p contigAcc
    cd contigAcc
    sed -n 's/.*contig_\([0-9][0-9]*\).*/\1/p' \
        /cluster/data/canFam2/broad/Dog2.0.agp | \
        sort -un | \
        awk '{printf ("contig_%d\tAAEX020%05d.1\n", $1, $1+1)}' > contigAcc.tab
    wc -l contigAcc.tab
    # 35696 contigAcc.tab
    # load into database
    hgsql canFam2 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql canFam2
    hgsql canFam2 -Nse "SELECT COUNT(*) FROM contigAcc"
    # 35696

############################################################################
# QA NOTE (2007-08-16 brooke):
# rhesus nets on canFam2 were requested by Merck (see the following
# thread: http://www.soe.ucsc.edu/pipermail/genome/2007-August/014414.html).
# We may want to consider adding rhesus nets/chains to next dog release.
############################################################################
# BLASTZ BOSTAU2 (DONE - 2006-01-17 - 2006-01-18 - Hiram)
    ssh kk
    mkdir /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
    cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_M=50
# dog vs.
cow # TARGET: Dog SEQ1_DIR=/iscratch/i/canFam2/nib SEQ1_LEN=/iscratch/i/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow (bosTau2) SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/cluster/data/canFam2/bed/blastzBosTau2.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # running 2006-01-17 13:58 # done 2005-01-18 03:40 # real 823m27.169s ssh hgwdev cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17 time featureBits canFam2 chainBosTau2Link \ > fb.canFam2.chainBosTau2Link 2>&1 & # real 32m3.460s # 1376066425 bases of 2384996543 (57.697%) in intersection # SWAP CHAINS FROM RN4, BUILD NETS ETC. (DONE 2/25/06 angie) mkdir /cluster/data/canFam2/bed/blastz.rn4.swap cd /cluster/data/canFam2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.canFam2/DEF \ >& do.log echo "check /cluster/data/canFam2/bed/blastz.rn4.swap/do.log" \ | mail -s "check do.log" $USER ln -s blastz.rn4.swap /cluster/data/canFam2/bed/blastz.rn4 ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzCanFam2.2006-02-18 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits canFam2 chainMm8Link # 816262344 bases of 2384996543 (34.225%) in intersection ############################################################################ ## BLASTZ swap from panTro2 alignments (DONE 2006-05-04 markd) # FINISHED (2006-07-21 kate) ssh hgwdev mkdir /cluster/data/canFam2/bed/blastz.panTro2.swap ln -s blastz.panTro2.swap /cluster/data/canFam2/bed/blastz.panTro2 cd /cluster/data/canFam2/bed/blastz.panTro2.swap time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.out& # create the net files ssh hgwdev cd /cluster/data/canFam2/bed/blastz.panTro2.swap/axtChain nice netClass -verbose=0 -noAr noClass.net canFam2 panTro2 canFam2.panTro2.net nice gzip canFam2.panTro2.net # Mark stopped here time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=load \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.2.out& ########################################################################## # NSCAN track - (2006-06-01 markd) cd /cluster/data/canFam2/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv http://genes.cs.wustl.edu/jeltje/canFam2/canFam2.nscan.gtf gzip canFam2.nscan.gtf wget -nv -r -np http://genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/ cat genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/*.fa >canFam2.nscan.pep.fa gzip canFam2.nscan.pep.fa rm -rf genes.cs.wustl.edu chmod a-w *.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. 
ldHgGene -bin -gtf -genePredExt canFam2 nscanGene canFam2.nscan.gtf.gz # add .a suffix to match transcript id WRONG, see below: hgPepPred -suffix=.a canFam2 generic nscanPep canFam2.nscan.pep.fa.gz rm *.tab # update trackDb; need a canFam2-specific page to describe informants dog/canFam2/nscanGene.html dog/canFam2/trackDb.ra # 2006-07-24 markd: reloaded without -suffix hgPepPred canFam2 generic nscanPep canFam2.nscan.pep.fa.gz ########################################################################## # MAKE Human Proteins (hg18) track (DONE braney 2006-08-17) ssh pk destDir=/san/sanvol1/scratch/canFam2/blastDb mkdir -p /cluster/data/canFam2/bed/tblastn.hg18KG cd /cluster/data/canFam2/bed/tblastn.hg18KG find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\) # 36727/(500000/6149) = 451.668646 split -l 452 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3 liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' exit chmod +x blastSome gensub2 target.lst kg.lst blastGsub blastSpec para create blastSpec para push # Completed: 504218 of 504218 jobs # CPU time in finished jobs: 28268903s 471148.38m 7852.47h 327.19d 0.896 y # IO & Wait Time: 5565118s 92751.97m 1545.87h 64.41d 0.176 y # Average job time: 67s 1.12m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 710s 11.83m 0.20h 0.01d # Submission to last job: 91362s 1522.70m 25.38h 1.06d ssh kki cd /cluster/data/canFam2/bed/tblastn.hg18KG tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) $(path2) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`bas ename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst gensub2 chain.lst ../../chrom.lst chainGsub chainSpec para create chainSpec para push # Completed: 3335 of 3335 jobs # CPU time in finished jobs: 1056945s 17615.76m 293.60h 12.23d 0.034 y # IO & Wait Time: 75548s 1259.13m 20.99h 0.87d 0.002 y # Average job time: 340s 5.66m 0.09h 0.00d # Longest finished job: 25716s 428.60m 7.14h 0.30d # Submission to last job: 61450s 1024.17m 17.07h 0.71d ssh kkstore04 cd /cluster/data/canFam2/bed/tblastn.hg18KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* | uniq > /cluster/data/canFam2/bed/tblastn.hg18KG/blastHg18KG.psl cd .. wc blastHg18KG.psl # 62741 1317561 12491782 blastHg18KG.psl # hg17 64343 1351203 10913814 blastHg18KG.psl pslUniq blastHg18KG.psl stdout | wc # 36272 761712 8427975 # hg17 18827 395367 2992653 cat kgfa/*fa | grep ">" | wc # 303516 303516 4739313 # hg17 309494 309494 4810327 ssh hgwdev cd /cluster/data/canFam2/bed/tblastn.hg18KG hgLoadPsl canFam2 blastHg18KG.psl featureBits canFam2 blastHg18KG # 32565727 bases of 2384996543 (1.365%) in intersection # hg17 35781547 bases of 2384996543 (1.500%) in intersection exit # back to kksilo rm -rf blastOut # End tblastn ######################################################################### # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 16 2006 heather) # working in /cluster/data/felCat3 because /cluster/data/canFam2 is 94% full mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 /cluster/data/canFam2/bed/blastz.felCat3 cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/hg/canFam2/nib SEQ1_LEN=/scratch/hg/canFam2/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk -chainMinScore=3000 -chainLinearGap=medium -blastzOutRoot /cluster/bluearc/felCat3/blastz.canFam2 >& do.log & tail -f do.log nice featureBits -chrom=chr1 canFam2 chainFelCat3Link # 65223887 bases of 121613690 (53.632%) in intersection ######################################################################### # BLASTZ/CHAIN/NET DOG AND HORSE (DONE 2/19/07 Fan) ssh kkstore05 mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 cat << '_EOF_' > DEF # Horse vs. Dog BLASTZ_M=50 # TARGET: Horse equCab1 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ1_LIMIT=500 SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Dog canFam2 SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log & tail -f do.log # 6 jobs failed. 
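# (General parasol recovery pattern, a hedged aside rather than the exact
#  commands used for those 6 jobs; run these from whichever run directory
#  holds the para batch file.  "para problems" summarizes the failures and
#  "para -retries=5 push" re-queues only what has not yet completed.)
    para problems
    para -retries=5 push
    para time
    # expect "Completed: N of N jobs" once the stragglers finish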
# Robert re-ran those 6 jobs. (2/22/07) doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \ -continue cat -bigClusterHub pk \ -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log & tail -f do2.log ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2 ssh hgwdev nice featureBits equCab1 -chrom=chrI chainCanFam2Link # 128747357 bases of 177498097 (72.534%) in intersection bash time nice -n 19 featureBits equCab1 chainCanFam2Link \ > fb.equCab1.chainCanFam2Link.txt 2>&1 # 1717664968 bases of 2421923695 (70.922%) in intersection ssh kkstore04 mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap cd /cluster/data/canFam2/bed/blastz.equCab1.swap bash time doBlastzChainNet.pl \ /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 & # real 115m55.880s cat *.txt # 1673177547 bases of 2384996543 (70.154%) in intersection ######################################################################### ######################################################################### # BLASTZ OPOSSUM monDom4 (DONE 2007-05-27 braney) ssh kolossus mkdir /cluster/data/canFam2/bed/blastz.monDom4 cd /cluster/data/canFam2/bed/blastz.monDom4 cat << '_EOF_' > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Dog (canFam2) SEQ1_DIR=/san/sanvol1/scratch/canFam2/nib SEQ1_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 #SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit #SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_LEN=/cluster/data/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/canFam2/bed/blastz.monDom4 TMPDIR=/scratch/tmp '_EOF_' # << emacs doBlastzChainNet.pl DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ >& blastz.out 2>&1 & ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-28 - 30 - Hiram) ssh kkstore04 screen # use screen to control this job # managing disk space on full filesystem mkdir -p /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 ln -s /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 \ /cluster/data/canFam2/bed cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28 cat << '_EOF_' > DEF # Dog vs. 
Marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
        > do.log 2>&1 &
    # real 864m49.290s
    # Completed: 266999 of 267000 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs: 17058860s 284314.33m 4738.57h 197.44d 0.541 y
    # IO & Wait Time: 1031855s 17197.59m 286.63h 11.94d 0.033 y
    # Average job time: 68s 1.13m 0.02h 0.00d
    # Longest finished job: 1302s 21.70m 0.36h 0.02d
    # Submission to last job: 47998s 799.97m 13.33h 0.56d
    # one job ran out of memory on the kk nodes. It was finished on kolossus
    /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
        /scratch/data/canFam2/nib/chr21.nib:chr21:20000000-40010000 qParts/part238.lst \
        ../DEF \
        ../psl/chr21.nib:chr21:20000000-40010000/chr21.nib:chr21:20000000-40010000_part238.lst.psl
    # continuing
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -continue=cat -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
        > cat.log 2>&1 &
    # had a problem with files missing from /scratch/data/ on
    # the Iservers - rsync that directory /iscratch/data/ full on
    # kkr1u00 from /cluster/bluearc/scratch/data/ and push it to
    # the other Iservers
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
        -continue=chainMerge -chainLinearGap=medium -bigClusterHub=kk \
        -verbose=2 > chainMerge.log 2>&1 &
    # real 79m39.909s
    cat fb.canFam2.chainCalJac1Link.txt
    # 1369690756 bases of 2384996543 (57.429%) in intersection

    mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap
    cd /cluster/data/calJac1/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
        -chainMinScore=3000 -chainLinearGap=medium \
        -swap -bigClusterHub=kk > swap.log 2>&1 &
    # encountered difficulties with /scratch/data/ on kolossus
    # had to finish the netChains.csh script manually, then continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
        -continue=load -chainMinScore=3000 -chainLinearGap=medium \
        -swap -bigClusterHub=kk > load.log 2>&1 &
    # real 56m44.375s
    cat fb.calJac1.chainCanFam2Link.txt
    # 1451345669 bases of 2929139385 (49.549%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
    ssh kkstore04
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
    cd /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
    cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl
\ `pwd`/DEF -verbose=2 \ -bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \ -syntenicNet > do.log 2>&1 # the pk became free as this job had 3 days to go. So, chill out # the batch on memk, let it finish its running jobs, then finish # the batch manually on pk. Continuing: time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -smallClusterHub=memk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -syntenicNet > cat.log 2>&1 # real 175m41.674s cat fb.canFam2.chainBosTau4Link.txt # 1367017426 bases of 2384996543 (57.317%) in intersection mkdir /cluster/data/bosTau4/bed/blastz.canFam2.swap cd /cluster/data/bosTau4/bed/blastz.canFam2.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11/DEF \ -verbose=2 -smallClusterHub=memk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -syntenicNet > swap.log 2>&1 # real 388m46.213s cat fb.bosTau4.chainCanFam2Link.txt # 1447088347 bases of 2731830700 (52.971%) in intersection ############################################################################# # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym) ssh kkstore04 screen # use screen to control this multi-day job mkdir /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 cd /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 cat << '_EOF_' > DEF # Dog vs. Horse BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/cluster/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/cluster/data/equCab2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/canFam2/bed/blastz.equCab2.2008-04-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >& do.log & 0.117u 0.073s 9:20:00.22 0.0% 0+0k 0+0io 1pf+0w six parasol jobs failed; retried with: para -retries=5 push disable bad servers: 1 kkr10u31.kilokluster.ucsc.edu 1 kkr11u16.kilokluster.ucsc.edu 1 kkr12u29.kilokluster.ucsc.edu pushd /cluster/bluearc/equCab2/blastz.canFam2/psl find . -name '*.psl' | wc 64625 64625 6657995 This is the correct number of psl files, so continue with cat step: time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >>& do.log & 0.212u 0.173s 2:49:45.35 0.0% 0+0k 0+0io 62pf+0w ln -s blastz.equCab2.2008-04-18 /cluster/data/canFam2/bed/blastz.equCab2 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. 
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# lastz Poodle canFamPoodle1 (DONE - 2009-06-06,22 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
    cd /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
    cat << '_EOF_' > DEF
# Tasha boxer dog vs Shadow poodle
# parameters for very close alignment from human-chimp example
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Tasha, canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shadow poodle, canFamPoodle1
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=800
BASE=/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        `pwd`/DEF \
        -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > do.log 2>&1
    # real 8250m12.188s
    # Completed: 374825 of 374825 jobs
    # CPU time in finished jobs: 187350504s 3122508.40m 52041.81h 2168.41d 5.941 y
    # IO & Wait Time: 4127960s 68799.33m 1146.66h 47.78d 0.131 y
    # Average job time: 511s 8.51m 0.14h 0.01d
    # Longest finished job: 2339s 38.98m 0.65h 0.03d
    # Submission to last job: 494836s 8247.27m 137.45h 5.73d
    # the lastz run thought it failed, but it didn't, continuing:
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        `pwd`/DEF \
        -continue=cat -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > cat.log 2>&1
    # real 4841m48.581s <- this is actually a fb time on cavPor3
    # this finish step was less than an hour
    fb.canFam2.chainCanFamPoodle1Link.txt
    # 1405528799 bases of 2384996543 (58.932%) in intersection

    mkdir /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
    cd /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06/DEF \
        -swap -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1
    # real 8585m58.789s
    cat fb.canFamPoodle1.chainCanFam2Link.txt
    # 1377478896 bases of 1517497798 (90.773%) in intersection

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
    cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
    cat << '_EOF_' > DEF
# Dog vs.
Horse BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/scratch/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 338m57.973s cat fb.canFam2.chainEquCab2Link.txt # 1676663178 bases of 2384996543 (70.300%) in intersection # this is identical to what went down before ? mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap time doBlastzChainNet.pl \ /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \ -swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 286m51.658s fb.equCab2.chainCanFam2Link.txt # 1721407500 bases of 2428790173 (70.875%) in intersection ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ # BLASTZ FOR ZEBRAFISH (danRer5) (DONE, 2009-08-07, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS mkdir /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 cd /hive/data/genomes/canFam2/bed ln -s blastz.danRer5.2009-08-07 blastz.danRer5 cd /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 cat << '_EOF_' > DEF # dog (canFam2) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/canFam2.2bit SEQ1_LEN=/hive/data/genomes/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/hive/data/genomes/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs chmod +x DEF time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& doBlastz.log & # 0.806u 0.488s 3:58:52.14 0.0% 0+0k 0+0io 0pf+0w cat fb.canFam2.chainDanRer5Link.txt # 29234486 bases of 2384996543 (1.226%) in intersection # DO BLASTZ SWAP TO CREATE CANFAM2 CHAINS, NETS, AXNET, MAFNET AND # ALIGNMENT DOWNLOADS ON DANRER5 - documented in danRer5.txt ########################################################## # monDom5 chains/nets (2009-03-02 Andy) ssh hgwdev cd /hive/data/genomes/monDom5/bed/blastz.canFam2 doBlastzChainNet.pl -swap DEF ## (should have also put in a -noLoadChainSplit... 
#  see below)
##########################################################
# monDom5 chain table unsplit (2009-09-08 Andy)
    ssh hgwdev
    cd /hive/data/genomes/canFam2/bed/blastz.monDom5.swap/axtChain
    hgLoadChain -tIndex canFam2 chainMonDom5 canFam2.monDom5.all.chain.gz
    netFilter -minGap=10 canFam2.monDom5.net.gz | hgLoadNet -verbose=0 canFam2 netMonDom5 stdin
    echo "show tables like 'chr%chainMonDom5%'" | hgsql canFam2 | tail -n +2 \
      | while read -a line; do echo drop table $line | hgsql canFam2; done
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
# ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
    mkdir /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
    cd /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
    cat << '_EOF_' > DEF
# Dog vs. Panda
# parameters from the Panda paper supplemental where they describe
# their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2
# our usual M
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda ailMel1
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -bigClusterHub=swarm -smallClusterHub=memk -workhorse=hgwdev \
        -noLoadChainSplit \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real 360m31.044s
    cat fb.canFam2.chainAilMel1Link.txt
    # 1791212709 bases of 2384996543 (75.103%) in intersection

    mkdir /hive/data/genomes/ailMel1/bed/blastz.canFam2.swap
    cd /hive/data/genomes/ailMel1/bed/blastz.canFam2.swap
    time doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzAilMel1.2010-02-04/DEF \
        -swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=pk \
        -smallClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
        > swap.log 2>&1 &
    # real 128m41.005s
    cat fb.ailMel1.chainCanFam2Link.txt
    # 1788107935 bases of 2245312831 (79.637%) in intersection

############################################################################
# LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram)
    # use a screen to control this job
    screen
    mkdir /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
    cd /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
    cat << '_EOF_' > DEF
# dog vs marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=75
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        -verbose=2 `pwd`/DEF \
        -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        > do.log 2>&1 &
    # real 1988m5.805s
    cat
fb.canFam2.chainCalJac3Link.txt # 1363307334 bases of 2384996543 (57.162%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.canFam2.swap cd /hive/data/genomes/calJac3/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzCalJac3.2010-02-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 129m56.144s cat fb.calJac3.chainHg19Link.txt # 1397333116 bases of 2752505800 (50.766%) in intersection ##################################################################### # LASTZ Dog Swap (DONE 2010-06-01 - Chin) cd /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01 cat fb.canFam2.chainFelCat4Link.txt mkdir /hive/data/genomes/felCat4/bed/blastz.canFam2.swap cd /hive/data/genomes/felCat4/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzFelCat4.2010-06-01/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 327m42.053s cat fb.felCat4.chainCanFam2Link.txt # 1467506008 bases of 1990635005 (73.720%) in intersection ######################################################################### # LASTZ Cow BosTau6 (DONE - 2011-05-25 - Chin) screen mkdir /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 cd /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam2 SEQ1_DIR=/scratch/data/canFam2/nib SEQ1_LEN=/scratch/data/canFam2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 261m49.574 cat fb.canFam2.chainBosTau6Link.txt # 1389712912 bases of 2384996543 (58.269%) in intersection # Create link cd /hive/data/genomes/canFam2/bed ln -s lastzBosTau6.2011-05-25 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.canFam2.swap cd /hive/data/genomes/bosTau6/bed/blastz.canFam2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam2/bed/lastzBosTau6.2011-05-25/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 88m38.555s cat fb.bosTau6.chainCanFam2Link.txt # 1404315725 bases of 2649682029 (52.999%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.canFam2.swap lastz.canFam2 ############################################################################ # SNP131 (DONE 6/13/11 angie) mkdir -p /hive/data/outside/dbSNP/131/dog cd /hive/data/outside/dbSNP/131/dog # Unfortunately, the contigs we have from Broad are not exactly the same # as the NCBI RefSeq assembly (distinct from the NCBI GenBank assembly) # that dbSNP used. So ignoreDbSnpContigs tosses out all the Un's and randoms's. 
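# (Hypothetical sketch, not part of the original run: with two contig name
#  lists in hand -- ours, e.g. from the contigAcc.tab built above, and
#  whatever list dbSNP's contig info provides -- comm shows what only one
#  side has.  dbSnpContigList.txt is a stand-in file name.)
    cut -f2 /cluster/data/canFam2/bed/contigAcc/contigAcc.tab | sort > ourContigs.txt
    sort -u dbSnpContigList.txt > dbSnpContigs.sorted.txt
    comm -23 ourContigs.txt dbSnpContigs.sorted.txt | wc -l   # only in ours
    comm -13 ourContigs.txt dbSnpContigs.sorted.txt | wc -l   # only in dbSNP's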
    cat > config.ra <& do.log &
    tail -f do.log

#########################################################################
# LASTZ Cow BosTau7 (DONE - 2012-01-24 - Chin)
    screen
    mkdir /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
    cd /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
    cat << '_EOF_' > DEF
# dog vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau7
SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real 229m55.587s
    cat fb.canFam2.chainBosTau7Link.txt
    # 1381869475 bases of 2384996543 (57.940%) in intersection
    # Create link
    cd /hive/data/genomes/canFam2/bed
    ln -s lastzBosTau7.2012-01-24 lastz.bosTau7

    # and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24/DEF \
        -swap -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real 86m11.258s
    cat fb.bosTau7.chainCanFam2Link.txt
    # 1458512906 bases of 2804673174 (52.003%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.canFam2.swap lastz.canFam2

#####################################################################
# POLYA-SEQ TRACK (from Adnan Derti, Merck) (DONE, Andy 2012-02-06)
# (see hg19.txt for multi-species build notes)
############################################################################
# construct liftOver to canFam3 (DONE - 2012-05-02 - Hiram)
    screen -S canFam2   # manage this longish running job in a screen
    mkdir /hive/data/genomes/canFam2/bed/blat.canFam3.2012-05-02
    cd /hive/data/genomes/canFam2/bed/blat.canFam3.2012-05-02
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/canFam2/11.ooc \
        -debug -dbHost=hgwdev -workhorse=hgwdev canFam2 canFam3 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
        -ooc=/scratch/data/canFam2/11.ooc \
        -dbHost=hgwdev -workhorse=hgwdev canFam2 canFam3 > do.log 2>&1
    # something odd with three jobs that took forever to complete
    # real 3649m15.243s
    # Completed: 7392 of 7392 jobs
    # CPU time in finished jobs: 997168s 16619.46m 276.99h 11.54d 0.032 y
    # IO & Wait Time: 238352s 3972.54m 66.21h 2.76d 0.008 y
    # Average job time: 167s 2.79m 0.05h 0.00d
    # Longest finished job: 215892s 3598.20m 59.97h 2.50d
    # Submission to last job: 216676s 3611.27m 60.19h 2.51d

    # verify this file exists:
    ls -og -L /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz
    # -rw-rw-r-- 1 583607 May 5 02:16 /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz
    # and try out the conversion on genome-test from canFam2 to canFam3
############################################################################
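# (Hedged spot check, not from the original log: liftOver can also be run by
#  hand on a tiny BED to confirm the new chain behaves.  The interval below is
#  an arbitrary made-up example, not a verified region.)
    printf 'chr1\t1000000\t1001000\ttest1\n' > /tmp/cf2.test.bed
    liftOver /tmp/cf2.test.bed \
        /gbdb/canFam2/liftOver/canFam2ToCanFam3.over.chain.gz \
        /tmp/cf2ToCf3.bed /tmp/cf2ToCf3.unmapped
    cat /tmp/cf2ToCf3.bed /tmp/cf2ToCf3.unmapped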