# for emacs: -*- mode: sh; -*- # Drosophila Melanogaster -- # # Berkeley Drosophila Genome Project (fruitfly.org) release 3.1 (Jan. 2003) # http://www.fruitfly.org/annot/release3.html # # FlyBase (http://flybase.bio.indiana.edu/) last updated 20 Jan 2003 # # DOWNLOAD SEQUENCE (DONE 10/2/03 angie) ssh kksilo mkdir /cluster/store6/dm1 cd /cluster/data ln -s /cluster/store6/dm1 dm1 cd /cluster/data/dm1 wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_genomic_dmel_RELEASE3-1.FASTA faSplit byname whole_genome_genomic_dmel_RELEASE3-1.FASTA dummyArg # Follow FlyBase's lead on the chromosome names, but still use our # "chr" prefix: foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh U) mkdir $c sed -e 's/^>/>chr/' $c.fa > $c/chr$c.fa echo chr$c.fa size: faSize $c.fa echo $c/chr$c.fa size: faSize $c/chr$c.fa echo comparison: faCmp $c.fa $c/chr$c.fa echo "" end # Carefully review output of those commands, then: rm 2L.fa 2R.fa 2h.fa 3L.fa 3R.fa 3h.fa 4.fa X.fa Xh.fa Yh.fa U.fa # put away the big download file mkdir -p downloads/fruitfly mv whole_genome_genomic_dmel_RELEASE3-1.FASTA downloads/fruitfly/ # SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 10/3/03 angie) ssh kksilo cd /cluster/data/dm1 foreach c (?{,?}) faSplit -minGapSize=100 -lift=$c/chr$c.lft \ gap $c/chr$c.fa 2000000 $c/chr${c}_ end foreach ctgFa (?{,?}/chr*_*.fa) set ctg = $ctgFa:r mkdir $ctg mv $ctgFa $ctg end # CREATING DATABASE (DONE 10/3/03 angie) # Create the database. ssh hgwdev echo 'create database dm1' | hgsql '' # Make a semi-permanent read-only alias: alias dm1 "mysql -u hguser -phguserstuff -A dm1" # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql # EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 10/3/03 angie) ssh kksilo cd /cluster/data/dm1 # size up the gap situation -- can we use gaps to extract agp info? faGapSizes downloads/fruitfly/whole_genome_genomic_dmel_RELEASE3-1.FASTA # yup. 
Jim's verdict: # I think that we can probably just show all gaps as bridged # in the non-h chromosomes, and as unbridged in the h chromosomes # and leave it at that. # Extract gaps using scaffoldFaToAgp. It's really meant for a different # purpose, so clean up its output: remove the .lft and .agp, and remove # the last line of .gap (extra gap added at end). Also substitute in # the correct chrom name in .gap. foreach c (?{,?}) set chr = chr$c pushd $c scaffoldFaToAgp -minGapSize=100 $chr.fa rm $chr.{lft,agp} set chrSize = `faSize $chr.fa | awk '{print $1;}'` set origLines = `cat $chr.gap | wc -l` awk '($2 != '$chrSize'+1) {print;}' $chr.gap \ | sed -e "s/chrUn/$chr/" > $chr.gap2 set newLines = `cat $chr.gap2 | wc -l` if ($newLines == ($origLines - 1)) then mv $chr.gap2 $chr.gap else echo "Error: $chr/$chr.gap2 has wrong number of lines." endif popd end # Call the gaps unbridged in chrU and chr*h: foreach c (U ?h) set chr = chr$c sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2 mv $c/$chr.gap2 $c/$chr.gap end ssh hgwdev hgLoadGap dm1 /cluster/data/dm1 # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 10/13/03 angie/donnak) ssh hgwdev # Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm1/description.html # with a description of the assembly and some sample position queries. chmod a+r ~/kent/src/hg/makeDb/trackDb/drosophila/dm1/description.html # Check it in and copy (perhaps via a make in trackDb??) to # /cluster/data/dm1/html. mkdir -p /gbdb/dm1/html ln -s /cluster/data/dm1/html/description.html /gbdb/dm1/html/ # RUN REPEAT MASKER (DONE 10/4/03 angie) # Note: drosophila library ("drosophila.lib") is dated May 27 '03. # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # Split contigs into 500kb chunks: ssh kksilo cd /cluster/data/dm1 foreach d ( */chr*_?{,?} ) cd $d set contig = $d:t faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \ gap $contig.fa 500000 ${contig}_ cd ../.. 
end # make the run directory, output directory, and job list mkdir RMRun cp /dev/null RMRun/RMJobs foreach d ( ?{,?}/chr*_?{,?} ) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/bin/scripts/RMDrosophila \ /cluster/data/dm1/$d $f /cluster/data/dm1/$d \ '{'check out line+ /cluster/data/dm1/$d/$f.out'}' \ >> RMRun/RMJobs end end # do the run ssh kk cd /cluster/data/dm1/RMRun para create RMJobs para try para check para push para check,... #Completed: 288 of 288 jobs #CPU time in finished jobs: 1764726s 29412.10m 490.20h 20.43d 0.056 y #IO & Wait Time: 3392s 56.53m 0.94h 0.04d 0.000 y #Average job time: 6139s 102.32m 1.71h 0.07d #Longest job: 7256s 120.93m 2.02h 0.08d #Submission to last job: 7257s 120.95m 2.02h 0.08d # Lift up the split-contig .out's to contig-level .out's ssh kksilo cd /cluster/data/dm1 foreach d ( ?{,?}/chr*_?{,?} ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end # Lift up the contig-level .out's to chr-level foreach c (?{,?}) cd $c if (-e chr$c.lft && ! -z chr$c.lft) then echo lifting $c /cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \ `awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null else echo Can\'t find $c/chr$c.lft \! endif cd .. end # soft-mask contig .fa's with .out's foreach c (?{,?}) foreach j ($c/chr${c}_?{,?}/chr${c}_?{,?}.fa) maskOutFa $j $j.out $j -soft end echo done $c end # Load the .out files into the database with: ssh hgwdev hgLoadOut dm1 /cluster/data/dm1/?{,?}/*.fa.out # SIMPLE REPEATS (TRF) (DONE 10/5/03 angie) # TRF runs pretty quickly now... 
it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on the # local fileserver ssh kksilo mkdir /cluster/data/dm1/bed/simpleRepeat cd /cluster/data/dm1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach f (/cluster/data/dm1/?{,?}/chr*_*/chr?{,?}_?{,?}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l # When job is done do: mkdir /cluster/data/dm1/jkStuff liftUp simpleRepeat.bed /cluster/data/dm1/jkStuff/liftAll.lft warn \ trf/*.bed # Load this into the database as so ssh hgwdev hgLoadBed dm1 simpleRepeat \ /cluster/data/dm1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 10/5/03 angie) # make a filtered version # of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/dm1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) echo "filtering $f" awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/dm1 mkdir bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \ `awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft` end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 10/5/03 angie) ssh kksilo cd /cluster/data/dm1 foreach c (?{,?}) echo repeat- and trf-masking chr$c.fa /cluster/home/kent/bin/i386/maskOutFa -soft \ $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa /cluster/home/kent/bin/i386/maskOutFa -softAdd \ $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa end foreach c (?{,?}) echo repeat- and trf-masking contigs of chr$c foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa) set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed /cluster/home/kent/bin/i386/maskOutFa -soft 
$ctgFa $ctgFa.out $ctgFa /cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa end end # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 10/5/03 angie) # Translate to nib ssh kksilo cd /cluster/data/dm1 mkdir nib foreach c (?{,?}) faToNib -softMask $c/chr$c.fa nib/chr$c.nib end # Make symbolic links from /gbdb/dm1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/dm1/nib foreach f (/cluster/data/dm1/nib/chr*.nib) ln -s $f /gbdb/dm1/nib end # Load /gbdb/dm1/nib paths into database and save size info. hgsql dm1 < ~/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib dm1 /gbdb/dm1/nib /cluster/data/dm1/?{,?}/chr?{,?}.fa echo "select chrom,size from chromInfo" | hgsql -N dm1 \ > /cluster/data/dm1/chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/5/03 angie) # Copy all the data from the table "grp" # in the existing database "rn1" to the new database ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql dm1 # MAKE GCPERCENT (DONE 10/5/03 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/gcPercent cd /cluster/data/dm1/bed/gcPercent # create and load gcPercent table hgsql dm1 < ~/src/hg/lib/gcPercent.sql hgGcPercent dm1 ../../nib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 10/8/03 angie) # Warning: must genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("dm1", "Jan. 2003", "/gbdb/dm1/nib", "Fruitfly", \ "chr2L:827700-845800", 1, 55, "Fruitfly", \ "Drosophila melanogaster", "/gbdb/dm1/html/description.html", \ 0, 0, "BDGP v. 
3");' \ | hgsql -h genome-testdb hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("Fruitfly", "dm1");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add dm1 in all the right places and do make update # go public on genome-test #make alpha cvs commit makefile # Add trackDb directories mkdir drosophila mkdir drosophila/dm1 cvs add drosophila cvs add drosophila/dm1 cvs commit drosophila # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 10/17/03 angie) ssh hgwdev # Get appropriate hostname from cluster admins # 8/26/04: set canPcr=1 for untranslated blat server. echo 'insert into blatServers values("dm1", "blat10", 17787, 1, 0); \ insert into blatServers values("dm1", "blat10", 17786, 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # LOAD UP BDGP 3.1 ANNOTATIONS (DONE 10/8/03 angie) # fruitfly.org is the Berkeley Drosophila Genome Project. # Their GFF annotations contain FlyBase IDs useful for cross-linking. 
ssh kksilo mkdir /cluster/data/dm1/bed/bdgpAnnotations cd /cluster/data/dm1/bed/bdgpAnnotations # Download all available annotations: wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_3_UTR_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_5_UTR_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_CDS_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_annotation_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_exon_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_intron_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_splice_site_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_tRNA_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_transcript_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_translation_dmel_RELEASE3-1.GFF.gz wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_transposable_element_dmel_RELEASE3-1.GFF.gz gunzip *.gz # Protein-coding genes... perl -wpe 's/^/chr/; s/translation/CDS/; \ s/genegrp.*transgrp=(\S+);.*$/$1/; \ s/genegrp.*transgrp=(\S+)$/$1/;' \ whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF \ > bdgpGene.gff # Loading a .tab file caused some lines with super-long go fields # to be skipped. Generating a .sql file with INSERTs works. 
perl -we 'while (<>) { \ chop; @fields = split("\t"); \ if ($fields[2] eq "gene") { \ @vars = split("; ", $fields[8]); \ $go = ""; $cdna_clone = ""; \ foreach $v (@vars) { \ @vv = split("=", $v); \ if ($vv[0] eq "name") { \ $bdgpName = $vv[1]; \ } elsif ($vv[0] eq "dbxref") { \ if ($vv[1] =~ /^GO:(\d+)/) { \ $go .= "$1,"; \ } elsif ($vv[1] =~ /FlyBase:(\w+)/) { \ $flybase = $1; \ } else { \ die "unrecognized dbxref $vv[1]"; \ } \ } elsif ($vv[0] eq "symbol") { \ $symbol = $vv[1]; \ } elsif ($vv[0] eq "cytorange") { \ $cytorange = $vv[1]; \ } elsif ($vv[0] eq "cdna_clone") { \ $cdna_clone .= "$vv[1],"; \ } elsif ($vv[0] eq "genegrp") { \ } else { \ die "unrecognized var $v" \ } \ } \ print "INSERT INTO bdgpGeneInfo VALUES ( \"$bdgpName\", \"$flybase\", \"$go\", \"$symbol\", \"$cytorange\", \"$cdna_clone\");\n"; \ } \ }' \ whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF \ > bdgpGeneInfo.sql # Proteins for coding genes: wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz gunzip -c whole_genome_translation_dmel_RELEASE3-1.FASTA.gz \ | perl -wpe 's/^>(pp-)*(\w+)-\w(\w).*/>$2-R$3/' \ > bdgpGenePep.fa # load up coding genes, proteins and assoc. info: ssh hgwdev ldHgGene dm1 bdgpGene /cluster/data/dm1/bed/bdgpAnnotations/bdgpGene.gff hgPepPred dm1 generic bdgpGenePep \ /cluster/data/dm1/bed/bdgpAnnotations/bdgpGenePep.fa hgsql dm1 < $HOME/src/hg/lib/bdgpGeneInfo.sql hgsql dm1 < /cluster/data/dm1/bed/bdgpAnnotations/bdgpGeneInfo.sql # Non-coding genes... 
perl -wpe 's/^/chr/; \ s/genegrp.*transgrp=(\S+);.*$/$1/; \ s/genegrp.*transgrp=(\S+)$/$1/;' \ whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF \ > bdgpNonCoding.gff sed -e 's/bdgpGeneInfo/bdgpNonCodingInfo/' \ ~/kent/src/hg/lib/bdgpGeneInfo.sql \ > bdgpNonCodingInfo.sql perl -we 'while (<>) { \ chop; @fields = split("\t"); \ if (($fields[2] ne "exon") && \ ($fields[2] ne "transcript") && \ ($fields[2] ne "translation")) { \ @vars = split("; ", $fields[8]); \ $go = ""; $cdna_clone = ""; \ foreach $v (@vars) { \ @vv = split("=", $v); \ if ($vv[0] eq "name") { \ $bdgpName = $vv[1]; \ } elsif ($vv[0] eq "dbxref") { \ if ($vv[1] =~ /^GO:(\d+)/) { \ $go .= "$1,"; \ } elsif ($vv[1] =~ /FlyBase:(\w+)/) { \ $flybase = $1; \ } else { \ die "unrecognized dbxref $vv[1]"; \ } \ } elsif ($vv[0] eq "symbol") { \ $symbol = $vv[1]; \ } elsif ($vv[0] eq "cytorange") { \ $cytorange = $vv[1]; \ } elsif ($vv[0] eq "cdna_clone") { \ $cdna_clone .= "$vv[1],"; \ } elsif ($vv[0] eq "genegrp") { \ } else { \ die "unrecognized var $v" \ } \ } \ print "INSERT INTO bdgpNonCodingInfo VALUES ( \"$bdgpName\", \"$flybase\", \"$go\", \"$symbol\", \"$cytorange\", \"$cdna_clone\");\n"; \ } \ }' \ whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF \ >> bdgpNonCodingInfo.sql ssh hgwdev ldHgGene dm1 bdgpNonCoding \ /cluster/data/dm1/bed/bdgpAnnotations/bdgpNonCoding.gff hgsql dm1 < /cluster/data/dm1/bed/bdgpAnnotations/bdgpNonCodingInfo.sql # FLYBASE GENES (REDONE 6/28/04 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/flyBase cd /cluster/data/dm1/bed/flyBase wget -O genes.txt.040617 \ ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt # Had to edit genes.txt.040617 to get around these two lines that didn't # start with acode symbols: #line 258894 #Allele class: hypom #line 459025 (459026 of original) #uncertain *u FBan0017679; annotated data are available for this gene. 
#line 3128725 (3128726 of original), for Trn-SR / FBgn0031456: #%f nu hgFlyBase dm1 genes.txt.040617 # BDGP GENE DISRUPTION PROJECT/PSCREEN (DONE 7/29/04 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/pscreen cd /cluster/data/dm1/bed/pscreen # Robin Hiesinger emailed 2 table dumps that need to be joined on ID; # saved as atbloom.dump, genetag_gdp.dump. # Turns out that this file has a more complete mapping than atbloom -- # use it to fill in stock numbers missing from atbloom: wget http://flybase.bio.indiana.edu/stocks/stock-centers/bloomington/lk/bloomington.csv dos2unix atbloom.dump # one-shot Perl script: chmod a+x mkPscreen.pl mkPscreen.pl genetag_gdp.dump atbloom.dump bloomington.csv > pscreen.bed hgLoadBed dm1 pscreen -sqlTable=$HOME/kent/src/hg/lib/pscreen.sql -tab \ pscreen.bed # 8/13/04: Galt found some duplicate IDs and Robin/Bob(?) suggested # fixing them as follows: hgsql dm1 -e '\ delete from pscreen where name = "KG05017" limit 1; \ delete from pscreen where name = "EY14401" or name = "PA07741"; \ delete from pscreen where name = "EY14474" and chromEnd = "8975785"; \ delete from pscreen where name = "KV00692" and chromEnd = "4702749";' # 8/26/04: Drop geneDeltas column due to incomplete / possibly stale # data. (removed it from pscreen.as & derivatives, hgc.c first) # Also remove 3 items mapped off the end of chrX. hgsql dm1 -e 'alter table pscreen drop column geneDeltas;' hgsql dm1 -e '\ delete from pscreen where name = "KG03740"; \ delete from pscreen where name = "BG01196"; \ delete from pscreen where name = "EY02003";' # 9/16/04: Bob Levis emailed an update... 
a bunch of deletions, # 1 change and 2 additions: # (see email text and UCSC_changes_091604v2.csv) hgsql dm1 -e '\ delete from pscreen where name = "BG02404"; \ delete from pscreen where name = "EY03152"; \ delete from pscreen where name = "EY05239"; \ delete from pscreen where name = "EY05534"; \ delete from pscreen where name = "EY07100"; \ delete from pscreen where name = "EY09238"; \ delete from pscreen where name = "EY11132"; \ delete from pscreen where name = "EY11235"; \ delete from pscreen where name = "EY11805"; \ delete from pscreen where name = "EY12014"; \ delete from pscreen where name = "EY12019"; \ delete from pscreen where name = "EY12603"; \ delete from pscreen where name = "EY13113"; \ delete from pscreen where name = "EY13711"; \ delete from pscreen where name = "EY14401"; \ delete from pscreen where name = "KG03002"; \ delete from pscreen where name = "KG04889";' hgsql dm1 -e 'update pscreen set geneIds = "CG33472," where name = "EY04063";' hgsql dm1 -e 'update pscreen set chrom = "chr2L" where name = "EY11972";' hgsql dm1 -e 'insert into pscreen values(672, "chrX", 11430596, 11430597, "EY12008", 0, "+", 0, 1, "CG1830,");' hgsql dm1 -e 'insert into pscreen values(720, "chr2R", 17764291, 17764292, "EY12526", 0, "-", 0, 1, "CG30092,");' #QA NOTE: [ASZ, 10-1-2008] mytouch dm1 pscreen 200501071700.00 #(because it was not updated after a bdgpGene update (pscreen is outdated) # FLYBASE IN SITU IMAGES / EXPRESSION (REDONE 6/17/04 angie) # FlyBase has downloadable in situ images for BACs: # ftp://flybase.net/flybase/images/bac_insitu_pic/* # and FBti's: # ftp://flybase.net/flybase/images/in-situ-images/insitus.zip # but what Jim is interested in is the insitus for expression data. # Don't see that in the ftp listing, but they do make it easy to link in. 
ssh hgwdev cd /cluster/data/dm1/bed/flyBase wget -O summary.txt.040617 \ 'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext' hgsql dm1 < ~/kent/src/hg/lib/bdgpExprLink.sql hgsql dm1 -e \ 'load data local infile "summary.txt.040617" into table bdgpExprLink' # SWISSPROT-FLYBASE CROSS-REFERENCING (REDONE 6/17/04 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/flyBaseSwissProt cd /cluster/data/dm1/bed/flyBaseSwissProt echo "select extAcc1,acc from extDbRef,extDb where extDbRef.extDb = extDb.id and extDb.val = 'flybase'" \ | hgsql -N sp040515 \ > fbSpAcc.tab ssh kksilo cd /cluster/data/dm1/bed/flyBaseSwissProt wget ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/fasta_files/proteomes/7227.FASTAC # Some of those SwissProt "names" are > 255 chars! trim the few long ones. perl -we 'open(F, "fbSpAcc.tab") || die; \ %sp2fb = (); \ while (<F>) { \ chop; @words = split("\t"); \ $sp2fb{$words[1]} = $words[0]; \ } \ close(F); \ while (<>) { \ if (/^>(\w+)\s+\((\w+)\)\s+(.*)/) { \ $fbAcc = $sp2fb{$2}; \ $spAcc = $3; \ $spAcc = substr($3, 0, 250) . "..." if (length($3) > 255); \ print "$fbAcc\t$2\t$spAcc\t$1\n" if (defined $fbAcc); \ } \ }' \ 7227.FASTAC \ > flyBaseSwissProt.tab rm 7227.FASTAC ssh hgwdev hgsql dm1 < $HOME/src/hg/lib/flyBaseSwissProt.sql echo 'load data local infile "/cluster/data/dm1/bed/flyBaseSwissProt/flyBaseSwissProt.tab" into table flyBaseSwissProt' \ | hgsql dm1 # AUTO UPDATE GENBANK MRNA RUN (DONE 10/9/03 angie) # Put the nib's on /cluster/bluearc: ssh kksilo mkdir /cluster/bluearc/drosophila mkdir /cluster/bluearc/drosophila/dm1 cp -pR /cluster/data/dm1/nib /cluster/bluearc/drosophila/dm1 # Instructions for setting up incremental genbank updates are here: # http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html # This time around, Markd handled adding the new species to gbGenome.c # because it's not yet in the kent tree.
# Edit /cluster/data/genbank/etc/genbank.conf and add: # dm1 dm1.genome = /cluster/bluearc/drosophila/dm1/nib/chr*.nib dm1.lift = /cluster/data/dm1/jkStuff/liftAll.lft dm1.genbank.mrna.xeno.load = yes dm1.genbank.est.xeno.load = no dm1.downloadDir = dm1 ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \ -srcDb=genbank -type=mrna -verbose=1 -initial dm1 # Load the results from the above ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dm1 ssh eieio # To get this next one started, the work directory of the initial run # needs to be moved out of the way. rm -r /cluster/bluearc/genbank/work/initial.dm1 # Now align refseqs: cd /cluster/data/genbank nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \ -srcDb=refseq -type=mrna -verbose=1 -initial dm1 # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 dm1 ssh eieio # To get this next one started, the work directory of the initial run # needs to be moved out of the way. rm -r /cluster/bluearc/genbank/work/initial.dm1 # Now align ESTs: nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \ -srcDb=genbank -type=est -verbose=1 -initial dm1 # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 dm1 # Clean up: rm -r /cluster/bluearc/genbank/work/initial.dm1 # PUT NIBS ON ISCRATCH (DONE 10/9/03 angie) ssh kkr1u00 cd /iscratch/i/dm1 cp -pR /cluster/data/dm1/nib . iSync # PRODUCING FUGU FISH ALIGNMENTS (DONE 10/9/03 angie) # Assumes masked NIBs have been prepared as above # and Fugu pieces are already on kluster /iscratch/i. 
# next machine ssh kk mkdir -p /cluster/data/dm1/bed/blatFugu cd /cluster/data/dm1/bed/blatFugu mkdir psl ls -1S /iscratch/i/fugu/*.fa > fugu.lst ls -1S /iscratch/i/dm1/nib/chr*.nib > fly.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 fly.lst fugu.lst gsub spec para create spec para try para check para push para check #Completed: 1408 of 1408 jobs #Average job time: 217s 3.61m 0.06h 0.00d #Longest job: 1431s 23.85m 0.40h 0.02d #Submission to last job: 1433s 23.88m 0.40h 0.02d # When cluster run is done, sort alignments # append _blatFugu to chrom for .psl file names. ssh kksilo cd /cluster/data/dm1/bed/blatFugu foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh U) echo -n "chr${c} " pslCat psl/chr${c}_*.psl \ | pslSortAcc nohead chrom temp stdin rm -f chrom/chr${c}_blatFugu.psl mv chrom/chr${c}.psl chrom/chr${c}_blatFugu.psl end # Load alignments ssh hgwdev cd /cluster/data/dm1/bed/blatFugu/chrom hgLoadPsl -noTNameIx dm1 chr*_blatFugu.psl # Make fugu /gbdb/ symlink and load Fugu sequence data. mkdir /gbdb/dm1/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/dm1/fuguSeq # ! ! ! DO NOT RUN hgLoadSeq in /gbdb - it leaves .tab files cd /cluster/data/dm1/bed/blatFugu hgLoadSeq dm1 /gbdb/dm1/fuguSeq/fugu_v3_mask.fasta # BLASTZ D.PSEUDOOBSCURA (DONE 8/3/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.dp2.2004-08-03 cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03 ln -s blastz.dp2.2004-08-03 /cluster/data/dm1/bed/blastz.dp2 cat << '_EOF_' > DEF # D.melanogaster vs. D.pseudoobscura export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. 
melanogaster SEQ1_DIR=/iscratch/i/dm1/nib # unused: SEQ1_RMSK= SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp2/nib # unused: SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.dp2.2004-08-03 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run ~angie/hummus/make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run para create j para try, check, push, check, .... #Completed: 15939 of 15939 jobs #Average job time: 13s 0.21m 0.00h 0.00d #Longest job: 294s 4.90m 0.08h 0.00d #Submission to last job: 1118s 18.63m 0.31h 0.01d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1 cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 18s 0.30m 0.01h 0.00d #Longest job: 25s 0.42m 0.01h 0.00d #Submission to last job: 55s 0.92m 0.02h 0.00d cd .. 
rm -r raw # Translate .lav to axt, with dp2 in scaffold coords for collaborators: ssh kksilo cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03 mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib /cluster/data/dp2/nib stdout \ | axtSort stdin ../../$out popd end # CHAIN PSEUDOOBSCURA BLASTZ (DONE 8/3/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/dp2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 8s 0.13m 0.00h 0.00d #Longest job: 14s 0.23m 0.00h 0.00d #Submission to last job: 14s 0.23m 0.00h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDp2 $i end # NET PSEUDOOBSCURA BLASTZ (DONE 8/3/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain netClass -noAr noClass.net dm1 dp2 pseudoobscura.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain rm noClass.net netFilter -syn pseudoobscura.net > pseudoobscuraSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain netFilter -minGap=10 pseudoobscura.net | hgLoadNet dm1 netDp2 stdin netFilter -minGap=10 pseudoobscuraSyn.net \ | hgLoadNet dm1 netSyntenyDp2 stdin # MAKE VSDP2 DOWNLOADABLES (DONE 8/17/04 angie) ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDp2 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDp2 gzip -c \ /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/all.chain \ > pseudoobscura.chain.gz gzip -c \ /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/pseudoobscura.net \ > pseudoobscura.net.gz mkdir axtNet foreach f 
(/cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtNet/chr*axt) gzip -c $f > axtNet/$f:t.gz end md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. PRODUCING GENSCAN PREDICTIONS (TODO 11/14/03 angie) # Run on small cluster -- genscan needs big mem. ssh kkr1u00 mkdir /cluster/data/dm1/bed/genscan cd /cluster/data/dm1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Make hard-masked contigs foreach f (/cluster/data/dm1/?{,?}/chr*/chr?{,?}_?{,?}.fa) maskOutFa $f hard $f.masked end # Generate a list file, contigs.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) rm -f contigs.list touch contigs.list foreach f ( `ls -1S /cluster/data/dm1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> contigs.list end cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 contigs.list single gsub jobList para create jobList para try para check para push #Completed: 79 of 79 jobs #Average job time: 136s 2.26m 0.04h 0.00d #Longest job: 229s 3.82m 0.06h 0.00d #Submission to last job: 1718s 28.63m 0.48h 0.02d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". 
# chr14_21, chr16_4 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/dm1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/dm1/bed/genscan ldHgGene dm1 genscan genscan.gtf hgPepPred dm1 generic genscanPep genscan.pep hgLoadBed dm1 genscanSubopt genscanSubopt.bed # EXTEND BDGPGENE AND CREATE BDGPNEAR FOR HGNEAR (REDONE 6/18/04 angie) ssh hgwdev cd /cluster/data/dm1/bed/bdgpAnnotations.3.2 cp ~/kent/src/hg/lib/bdgpSwissProt.sql . perl -we '%bName2all = (); \ %bName2fb = (); \ open(P, "echo \"select * from bdgpGeneInfo\" | hgsql -N dm1|") \ || die; \ while (
<P>
) { \ chop; my @words = split("\t"); \ $bName2fb{$words[0]} = \@words; \ } \ close(P); \ open(P, "echo \"select bdgpGeneInfo.*,flyBaseSwissProt.* from bdgpGeneInfo,flyBaseSwissProt where bdgpGeneInfo.flyBaseId = flyBaseSwissProt.flyBaseId\" | hgsql -N dm1|") || die; \ while (
<P>
) { \ chop; my @words = split("\t"); \ $bName2all{$words[0]} = \@words; \ } \ close(P); \ open(P, "echo select name from bdgpGene | hgsql -N dm1|") ||die; \ while (
<P>
) { \ chop; $name = $_; \ $bName = $name; $bName =~ s/-R.*//; \ if (exists($bName2all{$bName})) { \ ($bName, $fbID, $go, $symbol, $cyto, undef, \ undef, $spID, $spDesc, $spSymb) = @{$bName2all{$bName}}; \ print "INSERT INTO bdgpSwissProt VALUES ( \"$name\", \"$fbID\", \"$go\", \"$symbol\", \"$cyto\", \"$spID\", \"$spDesc\", \"$spSymb\");\n"; \ } elsif (exists($bName2fb{$bName})) { \ ($bName, $fbID, $go, $symbol, $cyto, undef) = @{$bName2fb{$bName}}; \ $go = "" if (not defined $go); \ $cyto = "" if (not defined $cyto); \ print "INSERT INTO bdgpSwissProt VALUES ( \"$name\", \"$fbID\", \"$go\", \"$symbol\", \"$cyto\", \"n/a\", \"n/a\", \"n/a\");\n"; \ } else { die "No info for $name."; } \ } \ close(P); ' \ >> bdgpSwissProt.sql hgsql dm1 < bdgpSwissProt.sql # Re-ran the following steps 1/7/05 after reloading bdgpGene # without -genePredExt, see 1/6/05 note below...: # Use the above table to add a proteinID field to bdgpGene. hgsql dm1 -e 'create table bdgpGene2 \ select bdgpGene.*, bdgpSwissProt.swissProtId as proteinID \ from bdgpGene, bdgpSwissProt \ where bdgpGene.name = bdgpSwissProt.bdgpName' # Now examine bdgpGene2 vs. bdgpGene manually, carefully. # Do they have the same # rows? hgsql dm1 -N -e 'select count(*) from bdgpGene' hgsql dm1 -N -e 'select count(*) from bdgpGene2' # Are most proteinID fields non-"n/a"? hgsql dm1 -N -e 'select count(*) from bdgpGene2 where proteinID = "n/a"' # Spot-check some genes... are the fields of bdgpGene2 identical to # the fields of bdgpGene, except for the new proteinIDs? hgsql dm1 -N -e 'select * from bdgpGene limit 3' hgsql dm1 -N -e 'select * from bdgpGene2 limit 3' # If so, then go ahead: hgsql dm1 -e 'drop table bdgpGene; rename table bdgpGene2 to bdgpGene;' # and check the new bdgpGene manually. 
# MYTOUCH FIX - Jen - 2006-01-24 sudo mytouch dm1 arbExpDistance 0501081700.00 sudo mytouch dm1 arbExpDistance 0501081700.00 sudo mytouch dm1 arbExpDistance 0501081700.00 sudo mytouch dm1 bdgpBlastTab 0501081700.00 sudo mytouch dm1 bdgpBlastTab 0501081700.00 sudo mytouch dm1 bdgpCanonical 0501081700.00 sudo mytouch dm1 bdgpIsoforms 0501081700.00 sudo mytouch dm1 bdgpGenePep 0501081700.00 sudo mytouch dm1 bdgpSwissProt 0501081700.00 sudo mytouch dm1 bdgpToCanonical 0501081700.00 sudo mytouch dm1 bdgpToLocusLink 0501081700.00 sudo mytouch dm1 bdgpToPfam 0501081700.00 sudo mytouch dm1 bdgpToRefSeq 0501081700.00 sudo mytouch dm1 ceBlastTab 0501081700.00 sudo mytouch dm1 drBlastTab 0501081700.00 sudo mytouch dm1 hgBlastTab 0501081700.00 sudo mytouch dm1 mmBlastTab 0501081700.00 sudo mytouch dm1 scBlastTab 0501081700.00 sudo mytouch dm1 fbTranscript 0501081700.00 sudo mytouch dm1 hgBlastTab 0501081700.00 sudo mytouch dm1 blastFBPep00 0501081700.00 ############################################################################# # MAKE HGNEAR # adapted from makeHgNear.doc; split into sections. See makeHgFixed.doc # for how Arbeitman et al's fly lifecycle expression data were loaded into # hgFixed. # BLASTP SELF, CLUSTER GENES, MAP TO EXP.DATA FOR HGNEAR (REDONE 6/18/04 angie) ssh hgwdev # Now that bdgpGene has proteinID, use hgClusterGenes to cluster # together various alt-splicing isoforms, creating the tables # bdgpIsoforms and bdgpCanonical. hgClusterGenes dm1 bdgpGene bdgpIsoforms bdgpCanonical # Extract peptides from bdgpGenes into fasta file # and create a blast database out of them. 
mkdir /cluster/data/dm1/bed/blastp cd /cluster/data/dm1/bed/blastp pepPredToFa dm1 bdgpGenePep bdgp.faa formatdb -i bdgp.faa -t bdgp -n bdgp # Copy over database to iscratch/i ssh kkr1u00 if (-e /iscratch/i/dm1/blastp) then rm -r /iscratch/i/dm1/blastp endif mkdir -p /iscratch/i/dm1/blastp cp /cluster/data/dm1/bed/blastp/bdgp.* /iscratch/i/dm1/blastp # Load up iscratch/i with blastp and related files # if necessary if (! -e /iscratch/i/blast/blastall) then mkdir -p /iscratch/i/blast cp /projects/compbio/bin/i686/blastall /iscratch/i/blast mkdir -p /iscratch/i/blast/data cp /projects/compbio/bin/i686/data/* /iscratch/i/blast/data endif iSync # Split up fasta file into bite sized chunks for cluster ssh kksilo cd /cluster/data/dm1/bed/blastp mkdir split faSplit sequence bdgp.faa 6000 split/bg # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/self/run/out cd /cluster/data/dm1/bed/blastp/self/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... #Completed: 5821 of 5821 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest job: 326s 5.43m 0.09h 0.00d #Submission to last job: 655s 10.92m 0.18h 0.01d # Load into database. This took only ~3 minutes for dm1. 
ssh hgwdev cd /cluster/data/dm1/bed/blastp/self/run/out time hgLoadBlastTab dm1 bdgpBlastTab *.tab # Create table that maps between bdgp genes and RefSeq hgMapToGene dm1 refGene bdgpGene bdgpToRefSeq # Create table that maps between bdgp genes and LocusLink echo "select mrnaAcc,locusLinkId from refLink" | hgsql -N dm1 > refToLl.txt ### NOT DONE (LocusLink info temporarily missing from genbank) hgMapToGene dm1 refGene bdgpGene bdgpToLocusLink -lookup=refToLl.txt ### # Create table that maps between known genes and Pfam domains hgMapViaSwissProt dm1 bdgpGene name proteinID Pfam bdgpToPfam # Create a table that maps BDGP root names to canonical transcripts: cd /cluster/data/dm1/bed/blastp cat > bdgpToCanonical.sql <) { \ chop; $name = $value = $_; $value =~ s/-R.$//; \ print "INSERT INTO bdgpToCanonical VALUES (\"$name\", \"$value\");\n"; \ } \ close(P);' \ >> bdgpToCanonical.sql hgsql dm1 < bdgpToCanonical.sql # Create a table that maps between bdgp genes and the # Stanford Microarray Project expression data. (see makeHgFixed.doc) hgExpDistance -lookup=bdgpToCanonical \ dm1 hgFixed.arbFlyLifeMedianRatio dummyArg arbExpDistance # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. # C.ELEGANS BLASTP FOR HGNEAR (REDONE 6/17/04 angie) # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to iscratch/i # if it doesn't exist already: ssh eieio mkdir /cluster/data/ce2/bed/blastp cd /cluster/data/ce2/bed/blastp # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/ # to find out the latest version. Then use that in place of 126 below. wget -O wormPep126.faa \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep126/wormpep formatdb -i wormPep126.faa -t wormPep126 -n wormPep126 ssh kkr1u00 if (-e /iscratch/i/ce2/blastp) then rm -r /iscratch/i/ce2/blastp endif mkdir -p /iscratch/i/ce2/blastp cp /cluster/data/ce2/bed/blastp/wormPep126.p?? 
/iscratch/i/ce2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/ce2/run/out cd /cluster/data/dm1/bed/blastp/ce2/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... #Completed: 5821 of 5821 jobs #Average job time: 8s 0.14m 0.00h 0.00d #Longest job: 156s 2.60m 0.04h 0.00d #Submission to last job: 156s 2.60m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/dm1/bed/blastp/ce2/run/out hgLoadBlastTab dm1 ceBlastTab -maxPer=1 *.tab # MOUSE BLASTP FOR HGNEAR (REDONE 6/17/04 angie) # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to iscratch/i # if it doesn't exist already: ssh hgwdev mkdir /cluster/data/mm4/bed/blastp cd /cluster/data/mm4/bed/blastp pepPredToFa mm4 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/mm4/blastp) then rm -r /iscratch/i/mm4/blastp endif mkdir -p /iscratch/i/mm4/blastp cp -p /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/mm4/run/out cd /cluster/data/dm1/bed/blastp/mm4/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... #Completed: 5821 of 5821 jobs #Average job time: 12s 0.20m 0.00h 0.00d #Longest job: 267s 4.45m 0.07h 0.00d #Submission to last job: 267s 4.45m 0.07h 0.00d # Load into database. ssh hgwdev cd /cluster/data/dm1/bed/blastp/mm4/run/out hgLoadBlastTab dm1 mmBlastTab -maxPer=1 *.tab # HUMAN BLASTP FOR HGNEAR (REDONE 6/17/04 angie) # Make human ortholog column using blastp on human known genes. 
# First make human protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/hg16/bed/blastp cd /cluster/data/hg16/bed/blastp pepPredToFa hg16 knownGenePep known.faa formatdb -i known.faa -t known -n known ssh kkr1u00 if (-e /iscratch/i/hg16/blastp) then rm -r /iscratch/i/hg16/blastp endif mkdir -p /iscratch/i/hg16/blastp cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/hg16/run/out cd /cluster/data/dm1/bed/blastp/hg16/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... #Completed: 5821 of 5821 jobs #Average job time: 14s 0.24m 0.00h 0.00d #Longest job: 327s 5.45m 0.09h 0.00d #Submission to last job: 327s 5.45m 0.09h 0.00d # Load into database. ssh hgwdev cd /cluster/data/dm1/bed/blastp/hg16/run/out hgLoadBlastTab dm1 hgBlastTab -maxPer=1 *.tab # ZEBRAFISH BLASTP FOR HGNEAR (REDONE 6/17/04 angie) # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/i # if it doesn't exist already: ssh kkstore mkdir /cluster/data/danRer1/bed/blastp cd /cluster/data/danRer1/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer1/blastp) then rm -r /iscratch/i/danRer1/blastp endif mkdir -p /iscratch/i/danRer1/blastp cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/danRer1/run/out cd /cluster/data/dm1/bed/blastp/danRer1/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... 
#Completed: 5821 of 5821 jobs #Average job time: 12s 0.20m 0.00h 0.00d #Longest job: 273s 4.55m 0.08h 0.00d #Submission to last job: 273s 4.55m 0.08h 0.00d # Load into database. ssh hgwdev cd /cluster/data/dm1/bed/blastp/danRer1/run/out hgLoadBlastTab dm1 drBlastTab -maxPer=1 *.tab # YEAST BLASTP FOR HGNEAR (REDONE 6/17/04 angie) # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on # RefSeq. First make protein database and copy it to iscratch/i # if it doesn't exist already: mkdir /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz zcat orf_trans.fasta.gz > sgdPep.faa formatdb -i sgdPep.faa -t sgdPep -n sgdPep ssh kkr1u00 # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, # fortunately we won't be looking for homologs there. :) if (-e /iscratch/i/sacCer1/blastp) then rm -r /iscratch/i/sacCer1/blastp endif mkdir -p /iscratch/i/sacCer1/blastp cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/dm1/bed/blastp/sacCer1/run/out cd /cluster/data/dm1/bed/blastp/sacCer1/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... #Completed: 5821 of 5821 jobs #Average job time: 4s 0.07m 0.00h 0.00d #Longest job: 37s 0.62m 0.01h 0.00d #Submission to last job: 42s 0.70m 0.01h 0.00d # Load into database. 
ssh hgwdev cd /cluster/data/dm1/bed/blastp/sacCer1/run/out hgLoadBlastTab dm1 scBlastTab -maxPer=1 *.tab # MAKE ORGANISM-SPECIFIC HGNEARDATA FILES (DONE 10/17/03 angie) cd ~/kent/src/hg/near/hgNear/hgNearData # The directory name is the dbDb.genome field, processed by # hdb.c's hgDirForOrg(): mkdir D_melanogaster cp C_elegans/*.{html,ra} D_melanogaster/ cd D_melanogaster mv kimLifeCycleFull.html arbLifeCycleFull.html mv kimLifeCycleMedian.html arbLifeCycleMedian.html # edit all .ra and .html files as appropriate for D. melanogaster/dm1... # cvs add and check in D_melanogaster/ and all .ra and .html files. # The "representatives" lines in columnDb.ra are tricky. # For the median reps, I used this, edited to include an extra "-1," # at each boundary between evelopmental stages (embryo -> larva, etc): # 29->30, 41->42, 59->60 echo "select id from arbFlyLifeMedianExps;" | hgsql -N hgFixed \ | perl -we '$i=0; \ while (<>) { \ chop; print "$_,"; \ $i++; if (($i % 5) == 0) { print "-1," }; \ } \ print "\n";' # For the full reps, I used the output of this command minus the # initial ",-1,". -1 separators are inserted between each group # that was lumped together in arbMed.ra: awk '-F ' '{print $3;}' \ ~/kent/src/hg/makeDb/hgMedianMicroarray/arbMed.ra \ | perl -we 'while (<>) { \ @w=split(" "); \ print ",-1," . join(",", @w); \ } \ print "\n";' # Figure out what values to use for expMulti defn's absoluteMax. 
# Report on the max and 99.5%ile absolute score: echo select expScores from arbFlyLifeAll | hgsql hgFixed -N \ | perl -we '$max = 0.0; \ @all = (); \ while (<>) { \ chop; @nums = split(","); \ foreach $num (@nums) { \ $max = $num if ($num > $max); \ push @all, $num if (defined $num && $num ne ""); \ } \ } \ $n = scalar(@all); \ print "max is $max, N is $n\n"; \ @all = sort { $a <=> $b } @all; \ $useMax = $all[$n * 0.995 + 1]; \ print "99.5%ile is $useMax -- use this for max\n";' # max is 65535.000, N is 797364 # 99.5%ile is 15433.000 -- use this for max # N is the product of count(*) of arbFlyLifeAll{,Exps} . # Now repeat the above perl in-liner, but on arbFlyLifeAllRatio, # to determine ratioMax: echo select expScores from arbFlyLifeAllRatio | hgsql hgFixed -N \ # # max is 9.862, N is 797364 # 99.5%ile is 2.915 -- use this for max # ENABLE HGNEAR FOR DM1 IN HGCENTRALTEST (DONE 10/17/03 angie) echo "update dbDb set hgNearOk = 1 where name = 'dm1';" \ | hgsql -h genome-testdb hgcentraltest # END OF HGNEAR STUFF ############################################################################# # reload refseqs to pickup a change that uses locus_tag if gene name isn't # available. 
2003/10/16 markd # First remove refseqs from databases: drop table refSeqStatus; drop table refLink; drop table refSeqSummary; drop table refGene; drop table refSeqAli; drop table refFlat; delete from gbSeq where srcDb = 'RefSeq'; delete from gbStatus where srcDb = 'RefSeq'; delete from gbExtFile where path like '%/refseq%'; delete from gbLoaded where srcDb = 'RefSeq'; delete from mrna where acc like 'NM\_%'; delete from imageClone where acc like 'NM\_%'; delete from mrna where acc like 'NR\_%'; delete from imageClone where acc like 'NR\_%'; # now reload: cd /cluster/data/genbank ./bin/gbDbLoadStep -type=mrna dm1 # MAKE DOWNLOADABLE FILES (DONE 10/29/03 angie) ssh kksilo cd /cluster/data/dm1 mkdir zips zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa foreach f (?{,?}/chr?{,?}.fa) maskOutFa $f hard $f.masked end zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked cd bed/simpleRepeat zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed zip ../../zips/contigTrf.zip trfMask/N{T,G}*.bed cd ../.. # Make a starter mrna.zip -- it will get updated regularly on the RR. /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \ -db=dm1 -native genbank mrna mrna.fa zip zips/mrna.zip mrna.fa rm mrna.fa foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1 cd /usr/local/apache/htdocs/goldenPath/dm1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. 
cp -p /cluster/data/dm1/zips/*.zip bigZips # ANOPHELES ECORES FROM GENOSCOPE (DONE 11/10/03 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/anophelesEcores cd /cluster/data/dm1/bed/anophelesEcores # save attachment from Olivier Jaillon's email 11/10/03 to # ecotig.6.4ucsc perl -wpe 'if (/^(\w+)\:\d+ (\d+) (\d+) (\w+)\:\d+ (\d+) (\d+)$/) { \ if ($1 ne $4) { die "diff chr: $1 $4"; } \ $name = "chr$1:$2-$6"; $start = $2 - 1; \ $sz1 = $3 - $start; $sz2 = $6 - ($5 - 1); $st2 = $5 - $2; \ $_ = "chr$1\t$start\t$6\t$name\t0\t+\t$start\t$6\t0\t2\t$sz1,$sz2,\t0,$st2,\n"; \ } elsif (/^(\w+)\:\d+ (\d+) (\d+)\s*$/) { \ $name = "chr$1:$2-$3"; $start = $2 - 1; $sz1 = $3 - $start; \ $_ = "chr$1\t$start\t$3\t$name\t0\t+\t$start\t$3\t0\t1\t$sz1,\t0,\n"; \ } else { chop; die "cant parse line $.:\n|$_|"; }' \ < ecotig.6.4ucsc > anophelesEcores.bed hgLoadBed -tab dm1 anophelesEcores anophelesEcores.bed # MASKED-QUERY XENO RNA ALIGNMENTS (DONE 10/22/03 angie) # Experimental track... if this TRF-masking works out, then find out # what it would take to add it as an option to Mark's genbank stuff. ssh kksilo mkdir /cluster/data/dm1/bed/xenoMrnaMasked cd /cluster/data/dm1/bed/xenoMrnaMasked # Grab the latest full-release (137) genbank mRNA's. Strip the # .version suffixes so we can use accessions from mrna.gbidx to # pick out the non-D.mel. mrnas. perl -wpe 's/^>(\w+).\d+/>$1/' \ /cluster/data/genbank/data/processed/genbank.137.0/full/mrna.fa \ > allMrna.fa grep -v "Drosophila melanogaster" \ /cluster/data/genbank/data/processed/genbank.137.0/full/mrna.gbidx \ | awk '{print $1;}' | grep -v "^#" \ > xenoAcc.lst faSomeRecords allMrna.fa xenoAcc.lst flyXenoRna.fa # Split up the sequences into manageably sized files. mkdir flyXenoRnaSplit faSplit about flyXenoRna.fa 10000000 flyXenoRnaSplit/xenoRna # Now TRF-mask the sequences... use repeats with period <= 9 to mask. mkdir trf rm -f trf.log; touch trf.log foreach f (flyXenoRnaSplit/*.fa) set b = trf/$f:t:r.bed echo $f to $b... 
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null \ -bedAt=$b -tempDir=/tmp -maxPeriod=9 >& trf.log maskOutFa -soft $f $b $f end # distribute masked xenoRna sequence on i-servers mkdir /cluster/bluearc/dm1/mrna.137 cp -p /cluster/data/dm1/bed/xenoMrnaMasked/flyXenoRnaSplit/*.fa \ /cluster/bluearc/dm1/mrna.137/ ssh kk cd /cluster/data/dm1/bed/xenoMrnaMasked mkdir psl ls -1S /iscratch/i/dm1/nib/chr*.nib > fly.lst ls -1S /cluster/bluearc/dm1/mrna.137/xenoRna*.fa > mrna.lst echo '#LOOP \ /cluster/bin/i386/blat -maxIntron=50000 $(path1) {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl} \ #ENDLOOP' > gsub gensub2 fly.lst mrna.lst gsub spec para create spec para try, check, push, check, ... #Completed: 814 of 814 jobs #Average job time: 459s 7.65m 0.13h 0.01d #Longest job: 1756s 29.27m 0.49h 0.02d #Submission to last job: 2145s 35.75m 0.60h 0.02d ssh kksilo cd /cluster/data/dm1/bed/xenoMrnaMasked pslSort dirs raw.psl /cluster/store2/temp psl pslReps raw.psl cooked.psl /dev/null -minAli=0.25 # pslFilter -minMatch=60 -gapSizeLogMod=2 -minScore=30 cooked.psl filt.psl pslFilter -minAli=250 -minUniqueMatch=15 cooked.psl filt.psl pslSortAcc nohead chrom /cluster/store2/temp filt.psl pslCat -dir chrom > xenoMrnaMasked.psl rm -r chrom raw.psl cooked.psl filt.psl # Load into database as so: ssh hgwdev cd /cluster/data/dm1/bed/xenoMrnaMasked hgLoadPsl dm1 xenoMrnaMasked.psl # Looks like no need to hgLoadSeq -- seqs are already loaded by Mark's # stuff. 
# BLAT HONEYBEE (apiMel0) (DONE 1/13/04 angie) ssh kk mkdir /cluster/data/dm1/bed/blatApiMel0 cd /cluster/data/dm1/bed/blatApiMel0 mkdir psl ls -1S /iscratch/i/dm1/nib/*.nib > fly.lst ls -1S /iscratch/i/apiMel0/chunks/*.fa > bee.lst cat << 'EOF' > gsub #LOOP /cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/dm1/bed/blatApiMel0/psl/$(root1)_$(root2).psl} #ENDLOOP 'EOF' # << this line makes emacs coloring happy gensub2 fly.lst bee.lst gsub spec para create spec para try, check, push, check, ... #Completed: 495 of 495 jobs #Average job time: 802s 13.37m 0.22h 0.01d #Longest job: 26590s 443.17m 7.39h 0.31d #Submission to last job: 27028s 450.47m 7.51h 0.31d # postprocess pslCat -dir psl > blatApiMel0.psl # load ssh hgwdev cd /cluster/data/dm1/bed/blatApiMel0 hgLoadPsl dm1 blatApiMel0.psl mkdir /gbdb/dm1/apiMel0 foreach f (/cluster/data/apiMel0/groups/*.fa) ln -s $f /gbdb/dm1/apiMel0/$f:t end hgLoadSeq dm1 /gbdb/dm1/apiMel0/*.fa # miRNA track (DONE - 2004-05-04 - Hiram) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them if this assembly updates to renew this track ssh hgwdev mkdir /cluster/data/dm1/bed/miRNA cd /cluster/data/dm1/bed/miRNA wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/dme_bdgp3.*" grep -v "^track " dme_bdgp3.bed | sed -e "s/ /\t/g" > dm1.bed hgLoadBed dm1 miRNA dm1.bed # entry in trackDb/trackDb.ra already there # featureBits dm1 miRNA # 6845 bases of 126527731 (0.005%) in intersection # BLASTZ D.YAKUBA (DONE 5/22/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 cat << '_EOF_' > DEF # D.melanogaster vs. D.yakuba export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. 
melanogaster SEQ1_DIR=/cluster/bluearc/drosophila/dm1/nib # unused: SEQ1_RMSK= SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. yakuba SEQ2_DIR=/iscratch/i/droYak1/nib # unused: SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.droYak1.2004-05-22 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run ~angie/hummus/make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run para create j para try, check, push, check, .... #Completed: 672 of 672 jobs #Average job time: 164s 2.74m 0.05h 0.00d #Longest job: 1836s 30.60m 0.51h 0.02d #Submission to last job: 2433s 40.55m 0.68h 0.03d # back on kksilo... mkdir /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1 cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1 ~angie/hummus/do.out2lav ../DEF > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 453s 7.54m 0.13h 0.01d #Longest job: 650s 10.83m 0.18h 0.01d #Submission to last job: 685s 11.42m 0.19h 0.01d cd .. 
rm -r raw # third run: lav -> axt ssh kki cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /cluster/bluearc/drosophila/dm1/nib /iscratch/i/droYak1/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 11 of 11 jobs #Average job time: 1516s 25.27m 0.42h 0.02d #Longest job: 2903s 48.38m 0.81h 0.03d #Submission to last job: 2903s 48.38m 0.81h 0.03d # CHAIN YAKUBA BLASTZ (DONE 5/27/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn_random $1 \ | axtChain -verbose=0 stdin \ /cluster/bluearc/drosophila/dm1/nib \ /iscratch/i/droYak1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 18s 0.30m 0.01h 0.00d #Longest job: 35s 0.58m 0.01h 0.00d #Submission to last job: 35s 0.58m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDroYak1 $i end # NET YAKUBA BLASTZ (DONE 5/27/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain netClass -noAr noClass.net dm1 droYak1 yakuba.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain rm noClass.net netFilter -syn yakuba.net > yakubaSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain netFilter -minGap=10 yakuba.net | hgLoadNet dm1 netDroYak1 stdin netFilter -minGap=10 yakubaSyn.net | hgLoadNet dm1 netSyntenyDroYak1 stdin # MAKE VSDROYAK1 DOWNLOADABLES (DONE 10/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain gzip -c all.chain > ../../../zips/yakuba.chain.gz gzip -c yakuba.net > ../../../zips/yakuba.net.gz # axtNet/* already compressed... ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroYak1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroYak1 mv /cluster/data/dm1/zips/yakuba*.gz . 
cp -pR /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtNet . md5sum *.gz */*.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 5/27/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain netSplit yakuba.net net ssh kolossus cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/dm1/nib \ /cluster/data/droYak1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.my.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/droYak1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=droYak1. end # GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 8/9/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain netSplit pseudoobscura.net net cd .. mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/dp2/nib stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/dp2/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=dp2. 
end # MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA (DONE 8/9/04 angie) # put the MAFs on bluearc ssh kksilo mkdir -p /cluster/bluearc/multiz.fly/my cp /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \ /cluster/bluearc/multiz.fly/my mkdir -p /cluster/bluearc/multiz.fly/mp cp /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \ /cluster/bluearc/multiz.fly/mp ssh kki mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2 cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2 mkdir myp # Wrapper script required because of stdout redirect: cat << '_EOF_' > doMultiz #!/bin/csh -ef /cluster/bin/penn/multiz $1 $2 - > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz rm -f jobList foreach file (/cluster/bluearc/multiz.fly/my/*.maf) set root=$file:t:r:r echo "doMultiz /cluster/bluearc/multiz.fly/mp/${root}.maf $file /cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp/${root}.maf" >> jobList end para create jobList para try, check, push, check #Completed: 11 of 11 jobs #Average job time: 52s 0.87m 0.01h 0.00d #Longest job: 144s 2.40m 0.04h 0.00d #Submission to last job: 144s 2.40m 0.04h 0.00d # clean up bluearc (these are big files!) rm -r /cluster/bluearc/multiz.fly # no downloadables/db-loading -- just go on and do the 4-way multiz. # BDGP 3.2 ANNOTATIONS (DONE 6/18/04 angie) ssh kksilo cd /cluster/data/dm1/bed mv bdgpAnnotations bdgpAnnotations.3.1 mkdir bdgpAnnotations.3.2 cd /cluster/data/dm1/bed/bdgpAnnotations.3.2 # No annotations for chrU or h's! probably better that way... foreach c (2L 2R 3L 3R 4 X) set f = dmel_${c}_translation_r3.2.0.fasta.gz wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/fasta/$f set f = dmel_${c}_r3.2.0.gff.gz wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/gff/$f end gunzip dmel*.gff.gz # This new set is almost-but-not-quite GFF3, and all features (genes, # pseudogenes, ESTs, regulatory regions, kitchen sink) are included # in one .gff file. 
Perl the gene and non-coding gene parts of it into # bdgp{Gene,NonCoding}.gtf and bdgp{Gene,NonCoding}Info.tab. chmod a+x ./extractGenes.pl ./extractGenes.pl *.gff # Sort GTF by position. sort -k1,1 -k4n,5n bdgpGene.gtf > tmp mv tmp bdgpGene.gtf sort -k1,1 -k4n,5n bdgpNonCoding.gtf > tmp mv tmp bdgpNonCoding.gtf # Proteins: zcat dmel*trans*.gz | perl -wpe 's/^>(CG\d+)(-\S+)?-P(\w)\s.*/>$1-R$3/' \ > bdgpGenePep.fa # Load into test table for now -- don't want to mess up Gene Sorter # stuff that depends on bdgpGene* tables. ssh hgwdev cd /cluster/data/dm1/bed/bdgpAnnotations.3.2 # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf dm1 bdgpGene2 bdgpGene.gtf featureBits dm1 bdgpGene #28262131 bases of 126527731 (22.337%) in intersection featureBits dm1 bdgpGene2 #28178241 bases of 126527731 (22.270%) in intersection featureBits dm1 bdgpGene bdgpGene2 #27813359 bases of 126527731 (21.982%) in intersection # OK, looks OK. Replace bdgpGene with bdgpGene2, reload bdgpGeneInfo, # then go back up and quickly rebuild all the hgNear (Gene Sorter) # tables that depend on bdgpGene. 
hgsql dm1 -e 'drop table bdgpGene; alter table bdgpGene2 rename bdgpGene' hgsql dm1 -e 'delete from bdgpGeneInfo' hgsql dm1 -e 'load data local infile "bdgpGeneInfo.tab" into table bdgpGeneInfo' # load bdgpNonCoding* # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf dm1 bdgpNonCoding bdgpNonCoding.gtf hgsql dm1 -e 'delete from bdgpNonCodingInfo' hgsql dm1 -e 'load data local infile "bdgpNonCodingInfo.tab" into table bdgpNonCodingInfo' # load proteins hgPepPred dm1 generic bdgpGenePep bdgpGenePep.fa # CYTOBANDS (DONE 6/15/04 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/cytoband cd /cluster/data/dm1/bed/cytoband foreach c (2L 2R 3L 3R 4 X) wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/gnomap/cytomap-$c.tsv end cp /dev/null cytoBand.tab foreach c (2L 2R 3L 3R 4 X) grep -v '^#' cytomap-$c.tsv \ | awk '$3 > 0 && $2 != $3 {print "chr'$c'\t" $2 "\t" $3 "\t" $1 "\t";}' \ >> cytoBand.tab end hgsql dm1 < ~/kent/src/hg/lib/cytoBand.sql hgsql dm1 -e 'load data local infile "cytoBand.tab" into table cytoBand' # Normally we would just create cytoBandIdeo by select * from cytoBand -- # but that gives too high of a resolution for any bands to be visible! # So make a boiled-down cytoBandIdeo that just has {number, letter} # instead of {number, letter, number}. perl -we 'while (<>) { \ chomp; @w=split(" "); \ if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \ $b = $1; \ } else { die "doh!" } \ if (! defined $lastB) { \ $start = $w[1]; \ } elsif ($lastB ne $b) { \ print "$chrom\t$start\t$end\t$lastB\n"; \ $start = $w[1]; \ } \ ($chrom, $end, $lastB) = ($w[0], $w[2], $b); \ } \ print "$chrom\t$start\t$end\t$lastB\n";' cytoBand.tab \ > cytoBandIdeo.tab hgsql dm1 -e 'create table cytoBandIdeo select * from cytoBand where 0' hgsql dm1 -e 'load data local infile "cytoBandIdeo.tab" into table \ cytoBandIdeo' # 6/23/04: trim the entries that fall off the ends of their respective # chromosomes. 
foreach c (`echo select chrom from chromInfo | hgsql -N dm1`) set size = `echo select size from chromInfo where chrom = '"'$c'"' \ | hgsql -N dm1` echo $c $size foreach table (cytoBand cytoBandIdeo) echo delete from $table where chrom = \"$c\" and chromStart \> $size \ | hgsql -N dm1 echo update $table set chromEnd = $size where chrom = \"$c\" and \ chromEnd \> $size \ | hgsql -N dm1 end end # BLASTZ ANOPHELES (DONE 6/10/04 angie) # Will give human-fugu params a try... but without abridging repeats # since I don't know which are lin-spec for fly vs. mosquito, and don't # want to bother Arian or speculate. ssh kksilo mkdir /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 cat << '_EOF_' > DEF # D.melanogaster vs. A. gambiae export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/cluster/bluearc/drosophila/dm1/nib # unused: SEQ1_RMSK= SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - A. gambiae SEQ2_DIR=/cluster/bluearc/anoGam1/nib # unused: SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-anopheles SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run ~angie/hummus/make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run -- use rack 9 to avoid getting in the way of hg17. ssh kk9 cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run para create j para try, check, push, check, .... 
#Completed: 693 of 693 jobs #Average job time: 208s 3.47m 0.06h 0.00d #Longest job: 675s 11.25m 0.19h 0.01d #Submission to last job: 1884s 31.40m 0.52h 0.02d # back on kksilo... mkdir /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1 cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1 ~angie/hummus/do.out2lav ../DEF > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 5s 0.09m 0.00h 0.00d #Longest job: 12s 0.20m 0.00h 0.00d #Submission to last job: 14s 0.23m 0.00h 0.00d cd .. rm -r raw # third run: lav -> axt ssh kki cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /cluster/bluearc/drosophila/dm1/nib /iscratch/i/anoGam1/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 11 of 11 jobs #Average job time: 9s 0.16m 0.00h 0.00d #Longest job: 18s 0.30m 0.01h 0.00d #Submission to last job: 18s 0.30m 0.01h 0.00d # CHAIN ANOPHELES BLASTZ (DONE 6/10/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties: cat << '_EOF_' > 
../../chickenHumanTuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap \ -verbose=0 $1 \ /cluster/bluearc/drosophila/dm1/nib \ /iscratch/i/anoGam1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 11 of 11 jobs #Average job time: 13s 0.22m 0.00h 0.00d #Longest job: 19s 0.32m 0.01h 0.00d #Submission to last job: 19s 0.32m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainAnoGam1 $i end # NET ANOPHELES BLASTZ (DONE 6/10/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain netClass -noAr noClass.net dm1 anoGam1 anopheles.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain rm 
noClass.net netFilter -syn anopheles.net > anophelesSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain netFilter -minGap=10 anopheles.net | hgLoadNet dm1 netAnoGam1 stdin netFilter -minGap=10 anophelesSyn.net | hgLoadNet dm1 netSyntenyAnoGam1 stdin # MAKE VSANOGAM1 DOWNLOADABLES (DONE 10/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain gzip -c all.chain > ../../../zips/anopheles.chain.gz gzip -c anopheles.net > ../../../zips/anopheles.net.gz cd .. mkdir ../../zips/axtNet foreach f (axtNet/chr*.axt) gzip -c $f > ../../zips/axtNet/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsAnoGam1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsAnoGam1 mv /cluster/data/dm1/zips/anopheles*.gz . mv /cluster/data/dm1/zips/axtNet . md5sum *.gz */*.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 6/10/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain netSplit anopheles.net net ssh kolossus cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/dm1/nib \ /cluster/data/anoGam1/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.ma.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/anoGam1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=anoGam1. 
end # MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 8/9/04 angie) # put the MAFs on bluearc ssh kksilo mkdir -p /cluster/bluearc/multiz.flymo/myp cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp/*.maf \ /cluster/bluearc/multiz.flymo/myp mkdir -p /cluster/bluearc/multiz.flymo/ma cp /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \ /cluster/bluearc/multiz.flymo/ma ssh kki mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1 cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1 mkdir mypa # Wrapper script required because of stdout redirect: cat << '_EOF_' > doMultiz #!/bin/csh -fe /cluster/bin/penn/multiz $1 $2 - > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz rm -f jobList foreach file (/cluster/bluearc/multiz.flymo/myp/*.maf) set root=$file:t:r:r echo "doMultiz /cluster/bluearc/multiz.flymo/ma/${root}.ma.maf $file /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/${root}.maf" >> jobList end para create jobList para try, check, push, check #Completed: 11 of 11 jobs #Average job time: 27s 0.45m 0.01h 0.00d #Longest job: 74s 1.23m 0.02h 0.00d #Submission to last job: 74s 1.23m 0.02h 0.00d du -sh mypa #415M mypa # clean up bluearc rm -r /cluster/bluearc/multiz.flymo # setup external files for database reference ssh hgwdev mkdir /gbdb/dm1/mzDy1Dp2Ag1_phast ln -s /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/chr*.maf \ /gbdb/dm1/mzDy1Dp2Ag1_phast/ # load into database hgLoadMaf -warn dm1 mzDy1Dp2Ag1_phast # The previous Conservation wigMaf had "pairwise mypa" in its trackDb. # Now we need to define a new suffix for the pairwise tables and # /gbdb/ locations, in order to avoid a clash between dp1 and dp2 # d_pseudoobscura_$pairwise tables/gbdb. 
mkdir /gbdb/dm1/{d_yakuba,d_pseudoobscura,a_gambiae}_myp2a cd /tmp ln -s /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \ /gbdb/dm1/d_yakuba_myp2a hgLoadMaf -WARN dm1 d_yakuba_myp2a ln -s /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \ /gbdb/dm1/d_pseudoobscura_myp2a hgLoadMaf -WARN dm1 d_pseudoobscura_myp2a ln -s /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \ /gbdb/dm1/a_gambiae_myp2a hgLoadMaf -WARN dm1 a_gambiae_myp2a # put it out for download (10/5/04) ssh kksilo cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1 mkdir ../../zips/mzMafs foreach f (mypa/*.maf) gzip -c $f > ../../zips/mzMafs/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/multizDroYak1Dp2AnoGam1 cd /usr/local/apache/htdocs/goldenPath/dm1/multizDroYak1Dp2AnoGam1 mv /cluster/data/dm1/zips/mzMafs/* . rmdir /cluster/data/dm1/zips/mzMafs md5sum *.gz > md5sum.txt # PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA (first cut 6/1/04 acs) # make sure /cluster/bin/phast is in your path # commands below use bash shell # watch for typos -- copying from a Makefile and editing a bit # there are three steps to go through: fitting a phylogenetic # model to the data set with phyloFit, then running phastCons for # conservation scores, then running phastCons for predictions of # conserved elements. # step 1: fit phylogenetic model, with rate variation (discrete # gamma model) first we need to extract the sufficient stats for # the phylogenetic model from the MAFs. 
In this case, we don't # care about the order of the tuples (allows for much more compact # representation) ssh eieio mkdir /cluster/data/dm1/bed/phastCons cd /cluster/data/dm1/bed/phastCons MAF=/cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp # first extract chromosome by chromosome for file in ${MAF}/*.maf ; do \ msa_view $file --in-format MAF --out-format SS --order dm1,droYak1,dp2 \ --unordered-ss > `basename $file .maf`.ss ;\ done # now aggregate for whole genome ls chr*.ss > fnames msa_view "*fnames" -i SS --aggregate dm1,droYak1,dp2 -o SS --unordered-ss > all.ss # now estimate parameters; should be very fast in this case phyloFit --msa all.ss --tree "((1,2),3)" --msa-format SS --subst-mod REV --nrates 10 --out-root rev-dg --output-tree # make sure model looks reasonable cat rev-dg.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #NRATECATS: 10 #ALPHA: 1.273963 #TRAINING_LNL: -290963642.026431 #BACKGROUND: 0.279634 0.220483 0.220439 0.279444 #RATE_MAT: # -0.929128 0.196259 0.495954 0.236916 # 0.248910 -1.089402 0.212511 0.627981 # 0.629134 0.212554 -1.090600 0.248913 # 0.237077 0.495481 0.196354 -0.928911 #TREE: ((1:0.057640,2:0.073631):0.166542,3:0.166542); # beware of zero branch lengths (indicates bad topology) or very # large branch lengths (much greater than one; indicates # convergence problems). Can try without --nrates or with # --subst-mod HKY85 to double check (parameter estimates should be # in the same ballpark, REV + dG likelihood should be a bit # better). Also can write a log file to monitor convergence # (--log). Also watch out for alpha much less than one. # Sometimes it's helpful to produce a rendering of the tree using # draw_tree (run on the *.nh file) # step 2: run phastCons. First partition the alignments into # bite-sized chunks. This time we need the ordered version of the # SS format. 
# some vars used below MAF=/cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp FA=/cluster/data/dm1 WINSIZE=1000000 WINOVERLAP=0 for file in ${MAF}/*.maf ; do \ root=`basename $file .maf` ;\ chr=`echo $root | sed 's/chr//'` ;\ echo $file $root $chr ;\ mkdir -p SS/$chr ;\ msa_split $file -i MAF -o SS -O dm1,droYak1,dp2 -M ${FA}/$chr/$root.fa \ -w ${WINSIZE},${WINOVERLAP} -r SS/$chr/$root -I 1000 -d 1 -B 5000 ;\ done # (this is worth doing as a little cluster job with mammalian genomes, # but it's pretty fast with fly) ssh hgwdev cd /cluster/data/dm1/bed/phastCons # now set up cluster job. Make a little wrapper for phastCons cat << '_EOF_' > doPostProbs #!/bin/sh PHAST=/cluster/bin/phast TMP=/scratch/phastCons file=$1 root=`basename $file .ss` chrom=`echo $root | awk -F\. '{print $1}'` mkdir -p $TMP $PHAST/phastCons $file rev-dg.mod --cut-at 2 --nrates 10 --transitions 0.030,0.015 --quiet > ${TMP}/$root.pp mkdir -p POSTPROBS/$chrom gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz rm $TMP/$root.pp '_EOF_' # << this line makes emacs coloring happy chmod 775 doPostProbs # Note: the --cut-params arguments above are approximate # likelihood estimates obtained by running phastCons *without* the # --cut-params argument on four or five different windows (all # gave similar results). They may need to change for different # data sets. Careful, though: the parameter estimation procedure # is a little unstable. Be sure to create a log file and monitor # for convergence. # 2nd note: manually tried various parameter settings and settled # on --cut-params 0.010,0.005. Seems to give best results for # both post probs and viterbi path. At least until we settle on a # better principle for setting the params, these are the # recommended ones to use. # set up a jobs list rm -f jobs.lst for file in `find SS -name "*.ss"` ; do echo doPostProbs $file >> jobs.lst ; done # run cluster job ssh kk cd /cluster/data/dm1/bed/phastCons para create jobs.lst ; para try ; para push ; etc.... 
#Completed: 134 of 134 jobs #CPU time in finished jobs: 2584s 43.07m 0.72h 0.03d 0.000 y #IO & Wait Time: 2090s 34.83m 0.58h 0.02d 0.000 y #Average job time: 35s 0.58m 0.01h 0.00d #Longest job: 46s 0.77m 0.01h 0.00d #Submission to last job: 62s 1.03m 0.02h 0.00d logout # now create wiggle track # NOTE: might want to integrate with the multiz track instead of # keeping separate mkdir -p wib for dir in POSTPROBS/* ; do \ echo $dir ;\ chr=`basename $dir` ;\ zcat `ls $dir/*.pp.gz | sort -t\. -k2,2n` | \ wigAsciiToBinary -chrom=$chr \ -wibFile=wib/${chr}_phastCons stdin ;\ done hgLoadWiggle dm1 phastCons wib/chr*_phastCons.wig mkdir -p /gbdb/dm1/wib rm -f /gbdb/dm1/wib/*phastCons.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/wib chmod 775 . wib chmod 664 wib/*.wib # trackDb.ra entry #track phastCons #shortLabel phastCons #longLabel phastCons Conservation Score, melanogaster/yakuba/pseudoobscura #group compGeno #priority 103 #visibility hide #color 0,10,100 #maxHeightPixels 40 #type wig 0.0 1.0 #autoScale off # step 3: predictions of conserved elements # (could do these at the same time as step 2, but we want to use # different --rates-cut params) cat << '_EOF_' > doViterbi #!/bin/sh PHAST=/cluster/home/acs/phast/bin TMP=/scratch/phastCons file=$1 root=`basename $file .ss` chrom=`echo $root | awk -F\. '{print $1}'` mkdir -p PREDICTIONS/$chrom $PHAST/phastCons $file rev-dg.mod --nrates 10 --viterbi PREDICTIONS/$chrom/$root.bed --score --no-post-probs --transitions 0.030,0.015 --quiet --seqname $chrom '_EOF_' # << this line makes emacs coloring happy chmod 775 doViterbi # see note above regarding --cut-params (0.010,0.005 recommended) rm -f jobs.viterbi.lst for file in `find SS -name "*.ss"` ; do echo doViterbi $file >> jobs.viterbi.lst ; done logout ssh kk cd /cluster/data/dm1/bed/phastCons para create jobs.viterbi.lst ; para try ; para push ; etc.... 
#CPU time in finished jobs: 320s 5.34m 0.09h 0.00d 0.000 y #IO & Wait Time: 427s 7.11m 0.12h 0.00d 0.000 y #Average job time: 6s 0.09m 0.00h 0.00d #Longest job: 10s 0.17m 0.00h 0.00d #Submission to last job: 52s 0.87m 0.01h 0.00d logout # create track; we want to tweak the scores and the names sed 's/id //' PREDICTIONS/*/*.bed | \ awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \ $1, $2, $3, $5, 221.65 * log($5) - 352.64, $6, $7, $8, $9, \ $10, $11, $12}' > all.bed hgLoadBed dm1 phastConsElements all.bed # Scores are transformed as follows, for a reasonable-looking # "spectrum". Let x_max be the maximum (believable) score (here # x_max = 447) and let x_med be the median score (here x_med = # 19). The scores are transformed via the function f(x) = a * # log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000. Solving # for a and b, you obtain b = (300 log x_max - 1000 log x_med) / # (log x_max - log x_med), a = (1000 - b) / log x_max. Here a = # 221.65, b = -352.64. # trackDb.ra file #track phastConsElements #shortLabel phastConsElements #longLabel phastCons Conserved Elements, melanogaster/yakuba/pseudoobscura #group compGeno #priority 105 #visibility hide #spectrum on #color 0,60,120 #altColor 200,220,255 #exonArrows off #type bed 12 . # should gzip or even delete contents of SS directory when done # PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 8/10/04 angie) # NOTE - REPLACED, SEE "NEW PHASTCONS" SECTION BELOW # step 1: fit phylogenetic model, with rate variation (discrete # gamma model) first we need to extract the sufficient stats for # the phylogenetic model from the MAFs. 
In this case, we don't # care about the order of the tuples (allows for much more compact # representation) ssh kksilo mkdir /cluster/data/dm1/bed/phastCons4way cd /cluster/data/dm1/bed/phastCons4way mkdir unordered foreach f (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/chr*.maf) /cluster/bin/phast/msa_view $f --in-format MAF --out-format SS \ --order dm1,droYak1,dp2,anoGam1 \ --unordered-ss > unordered/$f:t:r.ss end # aggregate for whole genome ls unordered/chr*.ss > fnames /cluster/bin/phast/msa_view "*fnames" -i SS \ --aggregate dm1,droYak1,dp2,anoGam1 -o SS --unordered-ss \ > all.ss # estimate parameters; should be very fast in this case /cluster/bin/phast/phyloFit --msa all.ss \ --tree "(((dm1,droYak1),dp2),anoGam1)" \ --msa-format SS --subst-mod REV --nrates 10 --out-root rev-dg # make sure model looks reasonable cat rev-dg.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #NRATECATS: 10 #ALPHA: 1.343144 #TRAINING_LNL: -311520966.123671 #BACKGROUND: 0.276938 0.223190 0.223142 0.276730 #RATE_MAT: # -0.926281 0.197688 0.496369 0.232223 # 0.245294 -1.090671 0.230605 0.614772 # 0.616038 0.230655 -1.092265 0.245573 # 0.232398 0.495831 0.198018 -0.926247 #TREE: (((dm1:0.056418,droYak1:0.074515):0.111186,dp2:0.219098):0.288564,anoGam1:0.288564); # Since anopheles is at such a large distance (est. divergence of # branches that became drosophila & anopheles: ~250mya, as opposed to # 35-40mya for mel-pseudo and 10-15mya for mel-yakuba), the branch length # between anopheles and pseudoobscura is underestimated above. Adam # scaled the lengths according to the published est. 
mya's, so # edit rev-dg.mod's TREE line to this: #TREE: (((dm1:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:2.66); # A new way to arrive at that: put Mya estimates in 4way.nh, and # merge with the (more reliable) 3-way estimated tree: echo "(((dm1:10,droYak1:10):30,dp2:40):210,anoGam1:250);" \ > 4wayMya.nh /cluster/bin/phast/tree_doctor /cluster/data/dm1/bed/phastCons/rev-dg.mod \ --rename '1->dm1; 2->droYak1; 3->dp2; 4->anoGam1' \ | /cluster/bin/phast/tree_doctor - --merge 4wayMya.nh #TREE: (((dm1:0.057640,droYak1:0.073631):0.142750,dp2:0.190334):1.083495,anoGam1:1.28987 # Very similar to Adam's tree with a little manual redistribution of # weights from ancestors to outgroups at the outermost 2 nodes. # beware of zero branch lengths (indicates bad topology) or very # large branch lengths (much greater than one; indicates # convergence problems). Can try without --nrates or with # --subst-mod HKY85 to double check (parameter estimates should be # in the same ballpark, REV + dG likelihood should be a bit # better). Also can write a log file to monitor convergence # (--log). Also watch out for alpha much less than one. # Sometimes it's helpful to produce a rendering of the tree using # draw_tree (run on the *.nh file) # step 2: run phastCons. First partition the alignments into # bite-sized chunks. This time we need the ordered version of the # SS format. 
set WINSIZE=1000000 set WINOVERLAP=0 mkdir SS foreach f (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf) set chr=$f:t:r set c=`echo $chr | sed 's/chr//'` echo $f $chr $c mkdir SS/$c /cluster/bin/phast/msa_split $f -i MAF -o SS \ -O dm1,droYak1,dp2,anoGam1 -M /cluster/data/dm1/$c/$chr.fa \ -w ${WINSIZE},${WINOVERLAP} -r SS/$c/$chr -I 1000 -d 1 -B 5000 end # save some space and I/O time: gzip SS/*/*.ss ssh kk cd /cluster/data/dm1/bed/phastCons4way cat << '_EOF_' > doPostProbs #!/bin/csh -ef set PHAST=/cluster/bin/phast set TMP=/scratch/phastCons set file=$1 set root = $file:t:r:r set chrom = $root:r mkdir -p $TMP cp $file $TMP/$root.ss.gz zcat $TMP/$root.ss.gz \ | $PHAST/phastCons - rev-dg.mod --cut-at 2 --nrates 10 \ --transitions 0.030,0.015 --quiet \ > ${TMP}/$root.pp mkdir -p POSTPROBS/$chrom gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz rm $TMP/$root.pp rm $TMP/$root.ss.gz '_EOF_' # << this line makes emacs coloring happy chmod 775 doPostProbs # Note: the --cut-params arguments above are approximate # likelihood estimates obtained by running phastCons *without* the # --cut-params argument on four or five different windows (all # gave similar results). They may need to change for different # data sets. Careful, though: the parameter estimation procedure # is a little unstable. Be sure to create a log file and monitor # for convergence. rm -f jobs.lst foreach f (`ls -1S SS/*/*.ss.gz`) echo './doPostProbs {check in exists+ '$f'}' \ >> jobs.lst end para create jobs.lst para try ; para push ; etc.... #Completed: 134 of 134 jobs #Average job time: 23s 0.38m 0.01h 0.00d #Longest job: 30s 0.50m 0.01h 0.00d #Submission to last job: 187s 3.12m 0.05h 0.00d # now create wiggle track # NOTE: might want to integrate with the multiz track instead of # keeping separate ssh kksilo cd /cluster/data/dm1/bed/phastCons4way mkdir wib foreach dir (POSTPROBS/*) echo $dir set chr=$dir:t zcat `ls -1 $dir/*.pp.gz | sort -t\. 
-k2,2n` | \ wigAsciiToBinary -chrom=$chr \ -wibFile=wib/${chr}_phastCons4way stdin end ssh hgwdev mkdir -p /gbdb/dm1/wib/mzDy1Dp2Ag1_phast cd /cluster/data/dm1/bed/phastCons4way chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phast/ hgLoadWiggle dm1 mzDy1Dp2Ag1_phast_wig \ -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phast wib/*.wig # step 3: predictions of conserved elements # (could do these at the same time as step 2, but we want to use # different --rates-cut params) ssh kk cd /cluster/data/dm1/bed/phastCons4way cat << '_EOF_' > doElements #!/bin/csh -fe set PHAST=/cluster/home/acs/phast/bin set TMP=/scratch/phastCons set file=$1 set root = $file:t:r:r set chrom = $root:r mkdir -p $TMP mkdir -p PREDICTIONS/$chrom cp $file $TMP/$root.ss.gz zcat $TMP/$root.ss.gz \ | $PHAST/phastCons - rev-dg.mod --nrates 10 \ --viterbi PREDICTIONS/$chrom/$root.bed --score --no-post-probs \ --transitions 0.030,0.015 --quiet --seqname $chrom rm $TMP/$root.ss.gz '_EOF_' # << this line makes emacs coloring happy chmod 775 doElements rm -f jobs.els.lst foreach f (SS/*/*.ss.gz) echo 'doElements {check in exists+ '$f'}' >> jobs.els.lst end para create jobs.els.lst para try ; para push ; etc.... #Completed: 134 of 134 jobs #Average job time: 5s 0.09m 0.00h 0.00d #Longest job: 9s 0.15m 0.00h 0.00d #Submission to last job: 287s 4.78m 0.08h 0.00d # Reloaded 8/24/04 with corrected transform / new script: # Create bed track: tweak scores and names, trim the unused strand column. 
cat PREDICTIONS/*/*.bed \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore \ > all.bed hgLoadBed dm1 phastConsElements all.bed featureBits dm1 phastConsElements #22859943 bases of 126527731 (18.067%) in intersection # make top-5000 list and launcher on Adam's home page: sort -k5,5nr raw.bed | head -5000 > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-4way \ "top 5000 conserved elements (4way)" dm1 # NEW PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/24/04 angie) ssh kksilo # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/dm1/chrom cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/ ssh kki mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.split cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.split set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm1/chrom set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm1,droYak1,dp2,anoGam1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 25s 0.42m 0.01h 0.00d #Longest job: 60s 1.00m 0.02h 0.00d #Submission to last job: 60s 1.00m 0.02h 0.00d cd .. # use the model previously estimated (see phastCons4way) as a # starting model. cp /cluster/data/dm1/bed/phastCons4way/rev-dg.mod starting-tree.mod # -- Because of the very long branch length to anoGam1 being pretty # much impossible to estimate from alignment data, edit that file to # reduce the anoGam1 branch length from 2.66 to 0.5. Otherwise # estimation process blows up. So our starting tree becomes #TREE: (((dm1:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5); # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.276938 0.223190 0.223142 0.276730 # add up the C and G: awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod #0.446 # Great, use 0.446 as the --gc parameter in phastCons below. # Now set up cluster job to estimate model parameters. # Use parameters iteratively determined for dm2 -- see makeDm2.doc: # --target-coverage 0.20 --expected-lengths 25 # Parameters will be estimated separately for each alignment fragment # then will be combined across fragments. 
mkdir run.estimate cd run.estimate cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing --expected-lengths 25 \ --target-coverage 0.20 --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job ssh kk9 cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 135s 2.26m 0.04h 0.00d #Longest job: 258s 4.30m 0.07h 0.00d #Submission to last job: 350s 5.83m 0.10h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kksilo cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod #ave.cons.mod:TREE: (((dm1:0.019916,droYak1:0.025549):0.038378,dp2:0.068605):0.085662,anoGam1:0.085662); #ave.noncons.mod:TREE: (((dm1:0.081971,droYak1:0.105403):0.161271,dp2:0.290165):0.361849,anoGam1:0.361849); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. 
Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. It's all downhill # from here. ssh kk9 mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.phast cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \ --idpref $pref \ --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... 
#Completed: 134 of 134 jobs #Average job time: 26s 0.43m 0.01h 0.00d #Longest job: 53s 0.88m 0.01h 0.00d #Submission to last job: 55s 0.92m 0.02h 0.00d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons featureBits dm1 all.bed #37105867 bases of 126527731 (29.326%) in intersection # OK, close enough. hgLoadBed dm1 phastConsElements all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.wib cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.wib rm -rf /cluster/bluearc/dm1/phastCons/wib mkdir -p /cluster/bluearc/dm1/phastCons/wib cat << 'EOF' > doWigAsciiToBinary #!/bin/csh -ef set chr = $1 zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigAsciiToBinary -chrom=$chr \ -wibFile=/cluster/bluearc/dm1/phastCons/wib/${chr}_phastCons stdin 'EOF' # << for emacs chmod a+x doWigAsciiToBinary rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigAsciiToBinary $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 11 of 11 jobs #Average job time: 19s 0.31m 0.01h 0.00d #Longest job: 47s 0.78m 0.01h 0.00d #Submission to last job: 47s 0.78m 0.01h 0.00d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm1/phastCons/wib . rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS . 
# load wiggle component of Conservation track ssh hgwdev mkdir -p /gbdb/dm1/wib/mzDy1Dp2Ag1_phast cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phast/ hgLoadWiggle dm1 mzDy1Dp2Ag1_phast_wig \ -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phast wib/*.wig # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-4way \ "top 5000 conserved elements (4way)" dm1 # and clean up bluearc. rm -r /cluster/bluearc/dm1/phastCons rm -r /cluster/bluearc/dm1/chrom # LOAD ENSEMBL GENES (NEVER MIND 6/16/04 angie) # OK, never mind, Ensembl just imported the BDGP / Flybase genes. # BLAT FlyBase predicted DM proteins against DM ssh hgwdev cd /cluster/data/dm1/bed mkdir blat.dm1FB cd blat.dm1FB pepPredToFa dm1 bdgpGenePep dm1FB.fa hgPepPred dm1 generic blastFBPep00 dm1FB.fa ssh kk cd /cluster/data/dm1/bed/blat.dm1FB cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /iscratch/i/dm1/nib/*.nib > bug.lst mkdir fbfas cd fbfas faSplit sequence ../dm1FB.fa 5000 kg cd .. ls -1S fbfas/*.fa > fb.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 bug.lst fb.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../bug.lst`) mkdir `basename $i .nib` end cd .. 
para create blatSpec para push 53361 jobs in batch Completed: 53361 of 53361 jobs CPU time in finished jobs: 640171s 10669.52m 177.83h 7.41d 0.020 y IO & Wait Time: 144105s 2401.75m 40.03h 1.67d 0.005 y Average job time: 15s 0.24m 0.00h 0.00d Longest job: 1924s 32.07m 0.53h 0.02d Submission to last job: 3506s 58.43m 0.97h 0.04d ssh eieio cd /cluster/data/dm1/bed/blat.dm1FB pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null sort -rn cov90.psl | pslUniq stdin dm1FB.psl pslxToFa dm1FB.psl dm1FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft fbName dm1 dm1FB.psl blastFBRef00 ssh hgwdev cd /cluster/data/dm1/bed/blat.dm1FB hgsql dm1 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastFBRef00" | hgsql dm1 echo "load data local infile 'blastFBRef00' into table blastFBRef00" | hgsql dm1 # MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie) ssh hgwdev mkdir /cluster/data/dm1/bed/mitopred cd /cluster/data/dm1/bed/mitopred wget http://mitopred.sdsc.edu/data/fly_30.out perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' fly_30.out > mitopred.tab cat > mitopred.sql << '_EOF_' # Prediction of nuclear-encoded mito. 
proteins from http://mitopred.sdsc.edu/ CREATE TABLE mitopred ( name varchar(10) not null, # SwissProt ID confidence varchar(8) not null, # Confidence level #Indices PRIMARY KEY(name(6)) ); '_EOF_' # << this line makes emacs coloring happy hgsql dm1 < mitopred.sql hgsql dm1 -e 'load data local infile "mitopred.tab" into table mitopred' # MAKE Human Proteins track mkdir -p /cluster/data/dm1/bed/tblastn.hg16KG cd /cluster/data/dm1/bed/tblastn.hg16KG ls -1S /iscratch/i/dm1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/dm1/bed/tblastn.hg16KG mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 41117/(150000/578) = 158.437507 split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/dm1/bed/tblastn.hg16KG para create blastSpec para push cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat 
q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/dm1/bed/tblastn.hg16KG para create chainSpec para push exit # back to kksilo cd /cluster/data/dm1/bed/tblastn.hg16KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastHg16KG.psl cd .. blatDir=/cluster/data/hg16/bed/blat.hg16KG protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl ssh hgwdev cd /cluster/data/dm1/bed/tblastn.hg16KG hgLoadPsl dm1 blastHg16KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # BLASTZ DM2 (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.dm2.2004-09-15 cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15 cat << '_EOF_' > DEF # dm1 vs. dm2 (for liftOver) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=/cluster/bin/penn/blastz-run BLASTZ=/cluster/bin/penn/blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ2_DIR=/iscratch/i/dm2/nib SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.dm2.2004-09-15 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh # cluster run -- use rack 9 to avoid getting in the way of hg17. 
ssh kk9 cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run para create j para try, check, push, check, .... #Completed: 483 of 483 jobs #Average job time: 119s 1.99m 0.03h 0.00d #Longest job: 1238s 20.63m 0.34h 0.01d #Submission to last job: 1625s 27.08m 0.45h 0.02d # back on kksilo... mkdir /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1 cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > jobList # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1 para create jobList para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 8s 0.13m 0.00h 0.00d #Longest job: 23s 0.38m 0.01h 0.00d #Submission to last job: 35s 0.58m 0.01h 0.00d cd .. rm -r raw # third run: lav -> axt ssh kki cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15 mkdir axtChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /cluster/bluearc/drosophila/dm1/nib /iscratch/i/dm2/nib stdout \ | axtSort stdin ../../axtChrom/$chr.axt '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 11 of 11 jobs #Average job time: 41s 0.68m 0.01h 0.00d #Longest job: 62s 1.03m 0.02h 0.00d #Submission to last job: 62s 1.03m 0.02h 0.00d # CHAIN DM2 BLASTZ (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ 
/iscratch/i/dm2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 11 of 11 jobs #Average job time: 20s 0.33m 0.01h 0.00d #Longest job: 62s 1.03m 0.02h 0.00d #Submission to last job: 62s 1.03m 0.02h 0.00d # now on the file server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's [looks like could use some filtering, # didn't abridge repeats] foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Don't load chains into database -- just doing this for liftOver. # NET DM2 BLASTZ (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Don't bother with classification and db-loading -- just make # liftOver-suitable (single-coverage from net) chains: netSplit noClass.net net mkdir subset foreach f (chain/*.chain) echo subsetting net/$f:t:r.net, $f to subset/$f:t netChainSubset net/$f:t:r.net $f subset/$f:t end cat subset/*.chain > dm1ToDm2.over.chain # clean up rm -r net subset # MAKE AN 11.OOC FILE FOR BLAT (DONE 9/16/04 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is 126527731 / 2866216770 = ~4.4% of human judging by # gapless genome size from featureBits -- we would use 45, but # bump that up a bit to be more conservative) ssh kolossus mkdir /cluster/data/dm1/bed/ooc cd /cluster/data/dm1/bed/ooc ls -1 /cluster/data/dm1/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dm1/11.ooc 
-repMatch=100 #Wrote 2917 overused 11-mers to /cluster/bluearc/dm1/11.ooc ssh kkr1u00 cp -p /cluster/bluearc/dm1/*.ooc /iscratch/i/dm1/ iSync # BLAT DM2 TO GET LIFTOVER CHAINS (DONE 9/17/04 angie) # Note: this process makes /cluster/data/dm1/bed/blat.dm2.2004-09-17/ . # Previously I made a blat.dm2.2004-09-16/ using this process (mostly), # except for the split and lift steps. I used the dm2 2Mb chunks and # dm2/jkStuff/liftAll.lft. Blat run was much slower... liftOver # performance was about the same, so use the quicker smaller chunks. ssh kkr1u00 ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \ dm2 /iscratch/i/dm2/nib # Do what its output says to do next: ssh kk (or kk9) ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \ dm1 /iscratch/i/dm1/nib dm2 /iscratch/i/dm2/split3k \ /iscratch/i/dm1/11.ooc # Do what its output says to do next: cd /cluster/data/dm1/bed/blat.dm2.2004-09-17/run para try, check, push, check, ... #Completed: 143 of 143 jobs #Average job time: 37s 0.62m 0.01h 0.00d #Longest job: 282s 4.70m 0.08h 0.00d #Submission to last job: 283s 4.72m 0.08h 0.00d ssh kksilo ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh dm1 dm2 \ /iscratch/i/dm2/split3k # Do what its output says to do next: ssh kki ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \ dm1 /iscratch/i/dm1/nib dm2 /iscratch/i/dm2/nib # Do what its output says to do next: cd /cluster/data/dm1/bed/blat.dm2.2004-09-17/chainRun para try, check, push, check, ... 
#Completed: 13 of 13 jobs #Average job time: 7s 0.12m 0.00h 0.00d #Longest job: 11s 0.18m 0.00h 0.00d #Submission to last job: 11s 0.18m 0.00h 0.00d ssh kksilo ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh dm1 dm2 # Do what its output says to do next: ssh hgwdev ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh dm1 dm2 # BLASTZ D.ANANASSAE (DONE 11/4/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.droAna1.2004-11-03 cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03 ln -s blastz.droAna1.2004-11-03 /cluster/data/dm1/bed/blastz.droAna1 cat << '_EOF_' > DEF # D.melanogaster vs. D.ananassae export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. ananassae SEQ2_DIR=/iscratch/i/droAna1/chunksUnsplit SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.droAna1.2004-11-03 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run para create j para try, check, push, check, .... #Completed: 9723 of 9723 jobs #Average job time: 22s 0.36m 0.01h 0.00d #Longest job: 832s 13.87m 0.23h 0.01d #Submission to last job: 1145s 19.08m 0.32h 0.01d # back in the bash shell on kksilo... 
mkdir /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1 cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 32s 0.54m 0.01h 0.00d #Longest job: 42s 0.70m 0.01h 0.00d #Submission to last job: 68s 1.13m 0.02h 0.00d cd .. rm -r raw # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03 mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib \ /cluster/data/droAna1/droAna1.2bit stdout \ | axtSort stdin ../../$out popd end # CHAIN ANANASSAE BLASTZ (DONE 11/4/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/droAna1/droAna1.2bit stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droAna1/droAna1.2bit \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 19s 0.32m 0.01h 0.00d #Submission to last job: 19s 0.32m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDroAna1 $i end # NET ANANASSAE BLASTZ (DONE 11/4/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain netClass -noAr noClass.net dm1 droAna1 ananassae.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain rm noClass.net netFilter -syn ananassae.net > ananassaeSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain netFilter -minGap=10 ananassae.net | hgLoadNet dm1 netDroAna1 stdin netFilter -minGap=10 ananassaeSyn.net \ | hgLoadNet dm1 netSyntenyDroAna1 stdin # GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 11/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1/axtChain netSplit ananassae.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/droAna1/droAna1.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/droAna1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=droAna1. end # MAKE VSDROANA1 DOWNLOADABLES (DONE 11/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droAna1 nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroAna1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroAna1 cp -p /cluster/data/dm1/bed/blastz.droAna1/axtChain/all.chain.gz \ ananassae.chain.gz cp -p /cluster/data/dm1/bed/blastz.droAna1/axtChain/ananassae.net.gz . cp -pR /cluster/data/dm1/bed/blastz.droAna1/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # BLASTZ D.MOJAVENSIS (DONE 11/4/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04 cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04 ln -s blastz.droMoj1.2004-11-04 /cluster/data/dm1/bed/blastz.droMoj1 cat << '_EOF_' > DEF # D.melanogaster vs. D.mojavensis export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. 
mojavensis SEQ2_DIR=/iscratch/i/droMoj1/chunks SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.droMoj1.2004-11-04 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run para create j para try, check, push, check, .... # I should have split these a little coarser -- avg job time too small. #Completed: 18396 of 18396 jobs #Average job time: 20s 0.33m 0.01h 0.00d #Longest job: 472s 7.87m 0.13h 0.01d #Submission to last job: 629s 10.48m 0.17h 0.01d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1 cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 38s 0.63m 0.01h 0.00d #Longest job: 43s 0.72m 0.01h 0.00d #Submission to last job: 75s 1.25m 0.02h 0.00d cd .. 
rm -r raw # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04 mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib \ /cluster/data/droMoj1/droMoj1.2bit stdout \ | axtSort stdin ../../$out popd end # CHAIN MOJAVENSIS BLASTZ (DONE 11/4/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/droMoj1/droMoj1.2bit stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droMoj1/droMoj1.2bit \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 6s 0.10m 0.00h 0.00d #Longest job: 11s 0.18m 0.00h 0.00d #Submission to last job: 11s 0.18m 0.00h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDroMoj1 $i end # NET MOJAVENSIS BLASTZ (DONE 11/4/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain netClass -noAr noClass.net dm1 droMoj1 mojavensis.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain rm noClass.net netFilter -syn mojavensis.net > mojavensisSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain netFilter -minGap=10 mojavensis.net | hgLoadNet dm1 netDroMoj1 stdin netFilter -minGap=10 mojavensisSyn.net \ | hgLoadNet dm1 netSyntenyDroMoj1 stdin # GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 11/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1/axtChain netSplit mojavensis.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/droMoj1/droMoj1.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/droMoj1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=droMoj1. end # MAKE VSDROMOJ1 DOWNLOADABLES (DONE 11/5/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droMoj1 nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroMoj1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroMoj1 cp -p /cluster/data/dm1/bed/blastz.droMoj1/axtChain/all.chain.gz \ mojavensis.chain.gz cp -p /cluster/data/dm1/bed/blastz.droMoj1/axtChain/mojavensis.net.gz . cp -pR /cluster/data/dm1/bed/blastz.droMoj1/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # BLASTZ D.VIRILIS (DONE 11/7/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.droVir1.2004-11-07 cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07 ln -s blastz.droVir1.2004-11-07 /cluster/data/dm1/bed/blastz.droVir1 cat << '_EOF_' > DEF # D.melanogaster vs. D.virilis export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. 
virilis SEQ2_DIR=/iscratch/i/droVir1/chunks SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.droVir1.2004-11-07 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run para create j para try, check, push, check, .... #Completed: 2121 of 2121 jobs #Average job time: 81s 1.34m 0.02h 0.00d #Longest job: 796s 13.27m 0.22h 0.01d #Submission to last job: 1412s 23.53m 0.39h 0.02d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1 cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest job: 14s 0.23m 0.00h 0.00d #Submission to last job: 21s 0.35m 0.01h 0.00d cd .. 
rm -r raw # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07 mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib \ /cluster/data/droVir1/droVir1.2bit stdout \ | axtSort stdin ../../$out popd end # CHAIN VIRILIS BLASTZ (DONE 11/7/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/droVir1/droVir1.2bit stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droVir1/droVir1.2bit \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest job: 16s 0.27m 0.00h 0.00d #Submission to last job: 16s 0.27m 0.00h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDroVir1 $i end # NET VIRILIS BLASTZ (DONE 11/7/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain netClass -noAr noClass.net dm1 droVir1 virilis.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain rm noClass.net netFilter -syn virilis.net > virilisSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain netFilter -minGap=10 virilis.net | hgLoadNet dm1 netDroVir1 stdin netFilter -minGap=10 virilisSyn.net \ | hgLoadNet dm1 netSyntenyDroVir1 stdin # GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 11/7/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1/axtChain netSplit virilis.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/droVir1/droVir1.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/droVir1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=droVir1. end # MAKE VSDROVIR1 DOWNLOADABLES (DONE 11/7/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droVir1 nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroVir1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroVir1 cp -p /cluster/data/dm1/bed/blastz.droVir1/axtChain/all.chain.gz \ virilis.chain.gz cp -p /cluster/data/dm1/bed/blastz.droVir1/axtChain/virilis.net.gz . cp -pR /cluster/data/dm1/bed/blastz.droVir1/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # BLASTZ A.MELLIFERA (DONE 11/8/04 angie) # Use human-fugu sensitivity settings, as with mosquito. ssh kksilo mkdir /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08 cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08 ln -s blastz.apiMel1.2004-11-08 /cluster/data/dm1/bed/blastz.apiMel1 cat << '_EOF_' > DEF # D.melanogaster vs. A.mellifera export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - A. 
mellifera SEQ2_DIR=/iscratch/i/apiMel1/chunksUnsplit SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.apiMel1.2004-11-08 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run para create j para try, check, push, check, .... #Completed: 6867 of 6867 jobs #Average job time: 87s 1.45m 0.02h 0.00d #Longest job: 833s 13.88m 0.23h 0.01d #Submission to last job: 3109s 51.82m 0.86h 0.04d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1 cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1 para create j para try, check, push, check, .... #Completed: 21 of 21 jobs #Average job time: 12s 0.20m 0.00h 0.00d #Longest job: 25s 0.42m 0.01h 0.00d #Submission to last job: 85s 1.42m 0.02h 0.00d cd .. 
rm -r raw # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08 mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib \ /cluster/data/apiMel1/apiMel1.2bit stdout \ | axtSort stdin ../../$out popd end # CHAIN MELLIFERA BLASTZ (DONE 11/8/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/apiMel1/apiMel1.2bit stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/apiMel1/apiMel1.2bit \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest job: 69s 1.15m 0.02h 0.00d #Submission to last job: 69s 1.15m 0.02h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainApiMel1 $i end # NET MELLIFERA BLASTZ (DONE 11/8/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain netClass -noAr noClass.net dm1 apiMel1 mellifera.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain rm noClass.net netFilter -syn mellifera.net > melliferaSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain netFilter -minGap=10 mellifera.net | hgLoadNet dm1 netApiMel1 stdin netFilter -minGap=10 melliferaSyn.net \ | hgLoadNet dm1 netSyntenyApiMel1 stdin # GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 11/8/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1/axtChain netSplit mellifera.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/apiMel1/apiMel1.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/apiMel1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=apiMel1. end # MAKE VSAPIMEL1 DOWNLOADABLES (DONE 11/8/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.apiMel1 nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsApiMel1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsApiMel1 cp -p /cluster/data/dm1/bed/blastz.apiMel1/axtChain/all.chain.gz \ mellifera.chain.gz cp -p /cluster/data/dm1/bed/blastz.apiMel1/axtChain/mellifera.net.gz . cp -pR /cluster/data/dm1/bed/blastz.apiMel1/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # DATAMINE FOR PHYLOGENETIC TREE INFO FOR 12-FLY PROJECT (DONE 11/9/04 angie) # Google... -> Entrez "Tamura K" -> #Temporal patterns of fruit fly (Drosophila) evolution revealed by mutation clocks. #Tamura K, Subramanian S, Kumar S. #http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=12949132 #Our analysis of 2977 pairwise sequence comparisons from 176 nuclear #genes reveals a long-term fruit fly mutation clock ticking at a rate #of 11.1 mutations per kilobase pair per Myr. Genomic mutation #clock-based timings of the landmark speciation events leading to the #evolution of D. melanogaster show that it shared most recent common #ancestry # 5.4 MYA with D. simulans, #12.6 MYA with D. erecta+D. orena, #12.8 MYA with D. yakuba+D. teisseri, #35.6 MYA with the takahashii subgroup, #41.3 MYA with the montium subgroup, #44.2 MYA with the ananassae subgroup, #54.9 MYA with the obscura group, #62.2 MYA with the willistoni group, and #62.9 MYA with the subgenus Drosophila. 
# 12-species project whitepaper (June 2003): # http://genome.gov/Pages/Research/Sequencing/SeqProposals/Drosophila.pdf # Eventual 12-fly tree from page 3: (((((((melanogaster (simulans sechellia)) yakuba) erecta) ananassae) (pseudoobscura persimilis)) willistoni) (grimshawi (mojavensis virilis))) # Rough MYA from page 3 (Some 2-4x shorter estimates than Tamura et al!): (((((((melanogaster (simulans sechellia):1):4 yakuba:5) erecta:7) ananassae:13) (pseudoobscura persimilis):1):27 willistoni):37 (grimshawi (mojavensis virilis):25):7) # chainLink coverage of species aligned to date: ssh hgwdev foreach db (DroYak1 DroAna1 Dp2 DroVir1 DroMoj1 AnoGam1 ApiMel1) echo -n "$db: " nice featureBits dm1 chain${db}Link end #DroYak1: 108256417 bases of 126527731 (85.559%) in intersection #DroAna1: 82740388 bases of 126527731 (65.393%) in intersection #Dp2: 73920914 bases of 126527731 (58.423%) in intersection #DroVir1: 53742261 bases of 126527731 (42.475%) in intersection #DroMoj1: 49844756 bases of 126527731 (39.394%) in intersection #AnoGam1: 19437860 bases of 126527731 (15.363%) in intersection #ApiMel1: 16555940 bases of 126527731 (13.085%) in intersection # VAR_MULTIZ INSECTS TO DATE (DONE 11/9/04 angie) # Today's tree (8-way): # ((((((dm1 droYak1) droAna1) dp2) (droVir1 droMoj1)) anoGam1) apiMel1) ssh kksilo mkdir /cluster/data/dm1/bed/multiz8way.2004-11-09 ln -s /cluster/data/dm1/bed/multiz8way.2004-11-09 \ /cluster/data/dm1/bed/multiz8way cd /cluster/data/dm1/bed/multiz8way # Setup: Copy pairwise MAF to /cluster/bluearc: mkdir /cluster/bluearc/flyVarMultiz8way foreach db (droYak1 droAna1 dp2 droVir1 droMoj1 anoGam1 apiMel1) cp -pR /cluster/data/dm1/bed/blastz.$db/mafNet \ /cluster/bluearc/flyVarMultiz8way/$db end # Make output dir: mkdir maf # Create script to run var_multiz in the right order: cat << '_EOF_' > doVarMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/flyVarMultiz8way.$chr mkdir $tmp set REF = dm1.$chr set YAK = 
/cluster/bluearc/flyVarMultiz8way/droYak1/$chr.maf set ANA = /cluster/bluearc/flyVarMultiz8way/droAna1/$chr.maf set PSE = /cluster/bluearc/flyVarMultiz8way/dp2/$chr.maf set VIR = /cluster/bluearc/flyVarMultiz8way/droVir1/$chr.maf set MOJ = /cluster/bluearc/flyVarMultiz8way/droMoj1/$chr.maf set ANO = /cluster/bluearc/flyVarMultiz8way/anoGam1/$chr.maf set API = /cluster/bluearc/flyVarMultiz8way/apiMel1/$chr.maf set DEST = /cluster/data/dm1/bed/multiz8way/maf/$chr.maf set VMZ = /cluster/bin/penn/var_multiz set PROJECT = /cluster/bin/penn/maf_project if ( -s $YAK && -s $ANA ) then echo "Aligning $YAK $ANA..." $VMZ $YAK $ANA 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAna.maf else if ( -s $YAK ) then cp $YAK $tmp/$chr.YakAna.maf else if ( -s $ANA ) then cp $ANA $tmp/$chr.YakAna.maf endif if ( -s $PSE && -s $tmp/$chr.YakAna.maf ) then echo "Adding $PSE..." $VMZ $tmp/$chr.YakAna.maf $PSE 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPse.maf else if ( -s $PSE ) then cp $PSE $tmp/$chr.YakAnaPse.maf else if ( -s $tmp/$chr.YakAna.maf ) then cp $tmp/$chr.YakAna.maf $tmp/$chr.YakAnaPse.maf endif # droVir1 and droMoj1 are a subtree -- run on just them, then fold in: if ( -s $VIR && -s $MOJ ) then echo "Aligning $VIR $MOJ..." $VMZ $VIR $MOJ 0 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf else if ( -s $VIR ) then cp $VIR $tmp/$chr.VirMoj.maf else if ( -s $MOJ ) then cp $MOJ $tmp/$chr.VirMoj.maf endif if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.YakAnaPse.maf ) then echo "Adding $tmp/$chr.VirMoj.maf..." $VMZ $tmp/$chr.YakAnaPse.maf $tmp/$chr.VirMoj.maf 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMoj.maf else if ( -s $tmp/$chr.VirMoj.maf ) then cp $tmp/$chr.VirMoj.maf $tmp/$chr.YakAnaPseVirMoj.maf else if ( -s $tmp/$chr.YakAnaPse.maf ) then cp $tmp/$chr.YakAnaPse.maf $tmp/$chr.YakAnaPseVirMoj.maf endif if ( -s $ANO && -s $tmp/$chr.YakAnaPseVirMoj.maf ) then echo "Adding $ANO..." $VMZ $tmp/$chr.YakAnaPseVirMoj.maf $ANO 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMojAno.maf else if ( -s $ANO ) then cp $ANO $tmp/$chr.YakAnaPseVirMojAno.maf else if ( -s $tmp/$chr.YakAnaPseVirMoj.maf ) then cp $tmp/$chr.YakAnaPseVirMoj.maf $tmp/$chr.YakAnaPseVirMojAno.maf endif if ( -s $API && -s $tmp/$chr.YakAnaPseVirMojAno.maf ) then echo "Adding $API..." $VMZ $tmp/$chr.YakAnaPseVirMojAno.maf $API 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf cp $tmp/$chr.final.maf $DEST else if ( -s $API ) then cp $API $DEST else if ( -s $tmp/$chr.YakAnaPseVirMojAno.maf ) then cp $tmp/$chr.YakAnaPseVirMojAno.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doVarMultiz.csh awk '{print "./doVarMultiz.csh " $1;}' /cluster/data/dm1/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /cluster/data/dm1/bed/multiz8way para create jobs.lst para try, check, push, check, ... 
#Completed: 11 of 11 jobs #Average job time: 1272s 21.19m 0.35h 0.01d #Longest job: 3747s 62.45m 1.04h 0.04d #Submission to last job: 3747s 62.45m 1.04h 0.04d # make /gbdb/ links to 8way and pairwise maf files: ssh hgwdev mkdir -p /gbdb/dm1/multiz8way/maf/multiz8way ln -s /cluster/data/dm1/bed/multiz8way/maf/chr*.maf \ /gbdb/dm1/multiz8way/maf/multiz8way/ # pairwise setting from trackDb.ra entry: set pairwise = 041109 foreach orgName (d_yakuba d_ananassae d_pseudoobscura d_virilis \ d_mojavensis a_gambiae a_mellifera) mkdir /gbdb/dm1/multiz8way/maf/${orgName}_$pairwise end ln -s /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/d_yakuba_$pairwise ln -s /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/d_ananassae_$pairwise ln -s /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/d_pseudoobscura_$pairwise ln -s /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/d_virilis_$pairwise ln -s /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/d_mojavensis_$pairwise ln -s /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/a_gambiae_$pairwise ln -s /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/mafNet/*.maf \ /gbdb/dm1/multiz8way/maf/a_mellifera_$pairwise # load into database cd /tmp hgLoadMaf -warn dm1 multiz8way \ -pathPrefix=/gbdb/dm1/multiz8way/maf/multiz8way foreach orgName (d_yakuba d_ananassae d_pseudoobscura d_virilis \ d_mojavensis a_gambiae a_mellifera) hgLoadMaf -WARN dm1 ${orgName}_$pairwise \ -pathPrefix=/gbdb/dm1/multiz8way/maf/${orgName}_$pairwise end # put 8way MAF out for download ssh kksilo cd /cluster/data/dm1/bed/multiz8way mkdir ../../zips/mzMafs foreach f (maf/*.maf) gzip -c $f > ../../zips/mzMafs/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/multiz8way cd /usr/local/apache/htdocs/goldenPath/dm1/multiz8way mv 
/cluster/data/dm1/zips/mzMafs/* . rmdir /cluster/data/dm1/zips/mzMafs md5sum *.gz > md5sum.txt # make a README.txt # Cleanup rm -rf /cluster/bluearc/flyVarMultiz8way/ # PHASTCONS 8WAY (DONE 11/10/04 angie) ssh kksilo # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/dm1/chrom cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/ ssh kki mkdir /cluster/data/dm1/bed/multiz8way/phastCons mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.split cd /cluster/data/dm1/bed/multiz8way/phastCons/run.split set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm1/chrom set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir # apiMel1-specific tweak for msa_split: perl -wpe 's/(apiMel1\.Group\w+)\.(\d+)/$1_$2/' $1 > $tmpDir/$c.maf ${PHAST}/msa_split $tmpDir/$c.maf -i MAF -M ${FA_SRC}/$c.fa \ -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1,apiMel1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.maf rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm1/bed/multiz8way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... #Completed: 11 of 11 jobs #Average job time: 50s 0.83m 0.01h 0.00d #Longest job: 130s 2.17m 0.04h 0.00d #Submission to last job: 130s 2.17m 0.04h 0.00d cd .. # use the model previously estimated as a starting model... cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/starting-tree.mod . # then add the new species with scaled distance estimates. 
# As long as they're reasonable, the estimation process should be # able to converge on the right values. $EDITOR starting-tree.mod #TREE: ((((((dm1:0.058,droYak1:0.074):0.1,droAna1:0.17):0.33,dp2:0.200):0,(droVir1:0.3,droMoj1:0.3):0):0.2,anoGam1:0.5):0,apiMel1:0.5); # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.276938 0.223190 0.223142 0.276730 # add up the C and G: awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod #0.446 # Great, use 0.446 as the --gc parameter in phastCons below. # Now set up cluster job to estimate model parameters. # Use parameters iteratively determined for dm2 -- see makeDm2.doc: # --target-coverage 0.20 --expected-lengths 25 # Parameters will be estimated separately for each alignment fragment # then will be combined across fragments. mkdir run.estimate cd run.estimate cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing --expected-lengths 25 \ --target-coverage 0.20 --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job ssh kk9 cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 2054s 34.23m 0.57h 0.02d #Longest job: 4051s 67.52m 1.13h 0.05d #Submission to last job: 64611s 1076.85m 17.95h 0.75d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. 
This must be done separately for the conserved # and nonconserved models ssh kksilo cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod #ave.cons.mod:TREE: ((((((dm1:0.024149,droYak1:0.028681):0.037845,droAna1:0.074610):0.020246,dp2:0.078803):0.018879,(droVir1:0.046349,droMoj1:0.051929):0.065674):0.063543,anoGam1:0.171043):0.118046,apiMel1:0.118046); #ave.noncons.mod:TREE: ((((((dm1:0.084284,droYak1:0.100215):0.137342,droAna1:0.270012):0.074910,dp2:0.285747):0.071341,(droVir1:0.165827,droMoj1:0.186367):0.245918):0.239605,anoGam1:0.624082):0.439708,apiMel1:0.439708); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. 
It's all downhill # from here. ssh kk9 mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.phast cd /cluster/data/dm1/bed/multiz8way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \ --idpref $pref \ --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 15s 0.26m 0.00h 0.00d #Longest job: 22s 0.37m 0.01h 0.00d #Submission to last job: 48s 0.80m 0.01h 0.00d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args cd /cluster/data/dm1/bed/multiz8way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev cd /cluster/data/dm1/bed/multiz8way/phastCons featureBits dm1 all.bed #38825802 bases of 126527731 (30.686%) in intersection # OK, close enough. 
hgLoadBed dm1 phastConsElements8way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.wib cd /cluster/data/dm1/bed/multiz8way/phastCons/run.wib rm -rf /cluster/bluearc/dm1/phastCons/wib mkdir -p /cluster/bluearc/dm1/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /cluster/bluearc/dm1/phastCons/wib zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 11 of 11 jobs #Average job time: 10s 0.16m 0.00h 0.00d #Longest job: 22s 0.37m 0.01h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/dm1/bed/multiz8way/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm1/phastCons/wib . rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm1/multiz8way/wib cd /cluster/data/dm1/bed/multiz8way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz8way/wib/ hgLoadWiggle dm1 phastCons8way \ -pathPrefix=/gbdb/dm1/multiz8way/wib wib/*.wig # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-8way \ "top 5000 conserved elements (8way)" dm1 # and clean up bluearc. rm -r /cluster/bluearc/dm1/phastCons # NOT DONE (I see more phastCons in the future :) rm -r /cluster/bluearc/dm1/chrom # BLASTZ D. 
PSEUDOOBSCURA (DP3) (DONE 11/16/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.dp3.2004-11-16 cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16 ln -s blastz.dp3.2004-11-16 /cluster/data/dm1/bed/blastz.dp3 cat << '_EOF_' > DEF # D.melanogaster vs. D.pseudoobscura export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib # unused: SEQ1_RMSK= SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. pseudoobscura SEQ2_DIR=/iscratch/i/dp3/nib # unused: SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.dp3.2004-11-16 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run ~angie/hummus/make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run para create j para try, check, push, check, .... #Completed: 504 of 504 jobs #Average job time: 196s 3.26m 0.05h 0.00d #Longest job: 931s 15.52m 0.26h 0.01d #Submission to last job: 1285s 21.42m 0.36h 0.01d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1 cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1 para create j para try, check, push, check, .... 
#Completed: 21 of 21 jobs #Average job time: 11s 0.18m 0.00h 0.00d #Longest job: 21s 0.35m 0.01h 0.00d #Submission to last job: 27s 0.45m 0.01h 0.00d # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16 rm -r raw mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib /cluster/data/dp3/nib stdout \ | axtSort stdin ../../$out popd end # CHAIN PSEUDOOBSCURA BLASTZ (DONE 11/17/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/dp3/nib stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/dp3/nib \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 13s 0.22m 0.00h 0.00d #Longest job: 25s 0.42m 0.01h 0.00d #Submission to last job: 25s 0.42m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDp3 $i end # NET PSEUDOOBSCURA BLASTZ (DONE 11/17/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain netClass -noAr noClass.net dm1 dp3 pseudoobscura.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain rm noClass.net netFilter -syn pseudoobscura.net > pseudoobscuraSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain netFilter -minGap=10 pseudoobscura.net | hgLoadNet dm1 netDp3 stdin netFilter -minGap=10 pseudoobscuraSyn.net \ | hgLoadNet dm1 netSyntenyDp3 stdin # GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 11/17/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.dp3/axtChain netSplit pseudoobscura.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/dp3/dp3.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/dp3/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=dp3. end # MAKE VSDP3 DOWNLOADABLES (DONE 11/17/04 angie) ssh kksilo mkdir /cluster/data/dm1/zips/axtNet gzip -c \ /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/all.chain \ > /cluster/data/dm1/zips/pseudoobscura.chain.gz gzip -c \ /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/pseudoobscura.net \ > /cluster/data/dm1/zips/pseudoobscura.net.gz foreach f (/cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtNet/chr*axt) gzip -c $f > /cluster/data/dm1/zips/axtNet/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDp3 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDp3 mv /cluster/data/dm1/zips/pseu*gz . mv /cluster/data/dm1/zips/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # VAR_MULTIZ FLIES + MOSQUITO (NO HONEYBEE) FOR PHASTCONS PAPER (DONE 11/18/04 angie) # Tree of genomes safe to use in paper (7-way): # (((((dm1 droYak1) droAna1) dp2) (droVir1 droMoj1)) anoGam1) ssh kksilo mkdir /cluster/data/dm1/bed/multiz7way.2004-11-18 ln -s /cluster/data/dm1/bed/multiz7way.2004-11-18 \ /cluster/data/dm1/bed/multiz7way cd /cluster/data/dm1/bed/multiz7way # Normally would copy pairwise MAF to /cluster/bluearc, but I haven't # cleaned up /cluster/bluearc/flyVarMultiz8way so just reuse it. 
# Make output dir: mkdir maf # Create script to run var_multiz in the right order: cat << '_EOF_' > doVarMultiz.csh #!/bin/csh -fe set chr = $1 set tmp = /scratch/flyVarMultiz7way.$chr mkdir $tmp set REF = dm1.$chr set YAK = /cluster/bluearc/flyVarMultiz8way/droYak1/$chr.maf set ANA = /cluster/bluearc/flyVarMultiz8way/droAna1/$chr.maf set PSE = /cluster/bluearc/flyVarMultiz8way/dp2/$chr.maf set VIR = /cluster/bluearc/flyVarMultiz8way/droVir1/$chr.maf set MOJ = /cluster/bluearc/flyVarMultiz8way/droMoj1/$chr.maf set ANO = /cluster/bluearc/flyVarMultiz8way/anoGam1/$chr.maf set DEST = /cluster/data/dm1/bed/multiz7way/maf/$chr.maf set VMZ = /cluster/bin/penn/var_multiz set PROJECT = /cluster/bin/penn/maf_project if ( -s $YAK && -s $ANA ) then echo "Aligning $YAK $ANA..." $VMZ $YAK $ANA 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAna.maf else if ( -s $YAK ) then cp $YAK $tmp/$chr.YakAna.maf else if ( -s $ANA ) then cp $ANA $tmp/$chr.YakAna.maf endif if ( -s $PSE && -s $tmp/$chr.YakAna.maf ) then echo "Adding $PSE..." $VMZ $tmp/$chr.YakAna.maf $PSE 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPse.maf else if ( -s $PSE ) then cp $PSE $tmp/$chr.YakAnaPse.maf else if ( -s $tmp/$chr.YakAna.maf ) then cp $tmp/$chr.YakAna.maf $tmp/$chr.YakAnaPse.maf endif # droVir1 and droMoj1 are a subtree -- run on just them, then fold in: if ( -s $VIR && -s $MOJ ) then echo "Aligning $VIR $MOJ..." $VMZ $VIR $MOJ 0 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf else if ( -s $VIR ) then cp $VIR $tmp/$chr.VirMoj.maf else if ( -s $MOJ ) then cp $MOJ $tmp/$chr.VirMoj.maf endif if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.YakAnaPse.maf ) then echo "Adding $tmp/$chr.VirMoj.maf..." $VMZ $tmp/$chr.YakAnaPse.maf $tmp/$chr.VirMoj.maf 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." 
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMoj.maf else if ( -s $tmp/$chr.VirMoj.maf ) then cp $tmp/$chr.VirMoj.maf $tmp/$chr.YakAnaPseVirMoj.maf else if ( -s $tmp/$chr.YakAnaPse.maf ) then cp $tmp/$chr.YakAnaPse.maf $tmp/$chr.YakAnaPseVirMoj.maf endif if ( -s $ANO && -s $tmp/$chr.YakAnaPseVirMoj.maf ) then echo "Adding $ANO..." $VMZ $tmp/$chr.YakAnaPseVirMoj.maf $ANO 1 0 > $tmp/$chr.tmp.maf echo "Projecting on $REF..." $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMojAno.maf cp $tmp/$chr.YakAnaPseVirMojAno.maf $DEST else if ( -s $ANO ) then cp $ANO $DEST else if ( -s $tmp/$chr.YakAnaPseVirMoj.maf ) then cp $tmp/$chr.YakAnaPseVirMoj.maf $DEST endif rm $tmp/$chr.*.maf rmdir $tmp '_EOF_' # << keep emacs coloring happy chmod 755 doVarMultiz.csh awk '{print "./doVarMultiz.csh " $1;}' /cluster/data/dm1/chrom.sizes \ > jobs.lst # Run on small cluster ssh kki cd /cluster/data/dm1/bed/multiz7way para create jobs.lst para try, check, push, check, ... #Completed: 11 of 11 jobs #Average job time: 1098s 18.29m 0.30h 0.01d #Longest job: 3281s 54.68m 0.91h 0.04d #Submission to last job: 3281s 54.68m 0.91h 0.04d # make /gbdb/ links to 7way maf files: ssh hgwdev mkdir -p /gbdb/dm1/multiz7way/maf/multiz7way ln -s /cluster/data/dm1/bed/multiz7way/maf/chr*.maf \ /gbdb/dm1/multiz7way/maf/multiz7way/ # -- just reuse pairwise tables/files from 8way, instead of duplicating. # load into database cd /tmp hgLoadMaf -warn dm1 multiz7way \ -pathPrefix=/gbdb/dm1/multiz7way/maf/multiz7way # -- again, normally we would load pairwise maf tables -- reuse 8way's. # put 7way MAF out for download ssh kksilo cd /cluster/data/dm1/bed/multiz7way mkdir ../../zips/mzMafs foreach f (maf/*.maf) gzip -c $f > ../../zips/mzMafs/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/multiz7way cd /usr/local/apache/htdocs/goldenPath/dm1/multiz7way mv /cluster/data/dm1/zips/mzMafs/* . 
rmdir /cluster/data/dm1/zips/mzMafs md5sum *.gz > md5sum.txt # make a README.txt # Cleanup rm -rf /cluster/bluearc/flyVarMultiz8way/ # MEASURE MAF CDS COVERAGE FOR PHASTCONS PAPER (DONE 11/30/04 angie) # 11/19, before running phastCons: ssh kolossus mkdir /cluster/data/dm1/bed/multiz7way/coverage cd /cluster/data/dm1/bed/multiz7way/coverage cat ../maf/chr*.maf \ | nice mafRanges -notAllOGap stdin dm1 dm1.7way.mafRanges.bed ssh hgwdev cd /cluster/data/dm1/bed/multiz7way/coverage # Use the alignment coverage of CDS in dm1 vs. hg17 as a scaling # factor for phastCons target coverage, according to this equation: # targetCovCds_fly = targetCovCds_vert * alCovCds_fly / alCovCds_vert # alCovCds_fly is the "cover" figure: nice featureBits -enrichment dm1 bdgpGene:cds dm1.7way.mafRanges.bed #bdgpGene:cds 17.224%, dm1.7way.mafRanges.bed 87.811%, both 17.159%, cover 99.62%, enrich 1.13x # Per Adam, targetCovCds_vert = ~2/3, alCovCds_vert = 0.964 calc 2/3 \* 0.996 / 0.964 #2/3 * 0.996 / 0.964 = 0.688797 # OK, we will aim for a fly phastConsElements CDS coverage of 68.9% below. # We may have to ask for something different in the parameters, but we # will keep trying out different parameters until we get an actual # phastConsElements CDS coverage of 68.9%. # So how did 8way fare? nice featureBits -enrichment dm1 bdgpGene:cds phastConsElements8way #bdgpGene:cds 17.224%, phastConsElements8way 30.686%, both 9.712%, cover 56.39%, enrich 1.84x # 56% is smaller than 68.9% so we need to increase the --target-coverage # param. It was 0.20 before... let's try 0.30. 
# 11/30: Now get pairwise: ssh kolossus cd /cluster/data/dm1/bed/multiz7way/coverage foreach db (dp2 anoGam1 droAna1 droMoj1 droVir1 droYak1) cat /cluster/data/dm1/bed/blastz.$db/mafNet/*.maf \ | mafRanges -notAllOGap stdin dm1 dm1.$db.mafRanges.bed end ssh hgwdev cd /cluster/data/dm1/bed/multiz7way/coverage foreach db (dp2 anoGam1 droAna1 droMoj1 droVir1 droYak1) echo $db nice featureBits dm1 -enrichment bdgpGene:cds dm1.$db.mafRanges.bed \ -bed=dm1.$db.cds.bed end #dp2 #bdgpGene:cds 17.224%, dm1.dp2.mafRanges.bed 58.121%, both 15.791%, cover 91.68%, enrich 1.58x #anoGam1 #bdgpGene:cds 17.224%, dm1.anoGam1.mafRanges.bed 14.587%, both 11.003%, cover 63.88%, enrich 4.38x #droAna1 #bdgpGene:cds 17.224%, dm1.droAna1.mafRanges.bed 65.118%, both 16.157%, cover 93.80%, enrich 1.44x #droMoj1 #bdgpGene:cds 17.224%, dm1.droMoj1.mafRanges.bed 39.094%, both 15.033%, cover 87.28%, enrich 2.23x #droVir1 #bdgpGene:cds 17.224%, dm1.droVir1.mafRanges.bed 42.200%, both 15.258%, cover 88.59%, enrich 2.10x #droYak1 #bdgpGene:cds 17.224%, dm1.droYak1.mafRanges.bed 85.119%, both 16.935%, cover 98.32%, enrich 1.16x # PHASTCONS 7WAY (DONE 11/19/04 angie) ssh kksilo # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/dm1/chrom cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/ ssh kki mkdir /cluster/data/dm1/bed/multiz7way/phastCons mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.split cd /cluster/data/dm1/bed/multiz7way/phastCons/run.split set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm1/chrom set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \ -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 
cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.maf rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm1/bed/multiz7way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... Completed: 11 of 11 jobs Average job time: 42s 0.69m 0.01h 0.00d Longest job: 96s 1.60m 0.03h 0.00d Submission to last job: 96s 1.60m 0.03h 0.00d ssh kksilo cd /cluster/data/dm1/bed/multiz7way/phastCons/ # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.276938 0.223190 0.223142 0.276730 # add up the C and G: awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod #0.446 # Great, use 0.446 as the --gc parameter in phastCons below:. ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # use the model previously estimated as a starting model... cp ../../multiz8way/phastCons/run.estimate/ave.noncons.mod \ starting-tree.mod # Remove apiMel1 to get 7-way tree. $EDITOR starting-tree.mod #TREE: (((((dm1:0.058,droYak1:0.074):0.1,droAna1:0.17):0.33,dp2:0.200):0,(droVir1:0.3,droMoj1:0.3):0):0.2,anoGam1:0.5); # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. 
Run that on the previously # estimated models, with the params used to estimate them: /cluster/bin/phast/consEntropy --NH 9.7834 0.2 25 \ ../../multiz8way/phastCons/run.estimate/ave.{cons,noncons}.mod # #( Solving for new omega: 25.000000 14.959772 12.585290 12.339417 12.336344 ) # #Transition parameters: gamma=0.200000, omega=25.000000, mu=0.040000, nu=0.010000 #Relative entropy: H=1.319603 bits/site #Required length: N=8.794103 sites #Total entropy: NH=11.604726 bits #Recommended expected length: omega=12.336344 sites (for NH=9.783400) # OK, we will try --expected-lengths 12.3, although since we're also # changing --target-coverage (see above) we probably won't get it # right the first time. mkdir run.estimate ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.4 18 ave.{cons,noncons}.mod #Transition parameters: gamma=0.400000, omega=18.000000, mu=0.055556, nu=0.037037 #Relative entropy: H=1.355321 bits/site #Required length: N=6.620854 sites #Total entropy: NH=8.973382 bits #Recommended expected length: omega=23.837201 sites (for NH=9.783400) # OK, bump up --expected-lengths to 24 for second iteration. # THIRD ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.5 24 ave.{cons,noncons}.mod #Transition parameters: gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667 #Relative entropy: H=1.235921 bits/site #Required length: N=7.320146 sites #Total entropy: NH=9.047124 bits #Recommended expected length: omega=30.685782 sites (for NH=9.783400) # OK, bump up --expected-lengths to 31 for third iteration. 
# FOURTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.6 31 ave.{cons,noncons}.mod #Transition parameters: gamma=0.600000, omega=31.000000, mu=0.032258, nu=0.048387 #Relative entropy: H=1.227887 bits/site #Required length: N=7.351105 sites #Total entropy: NH=9.026325 bits #Recommended expected length: omega=39.549314 sites (for NH=9.783400) # OK, --expected-lengths 40. # FIFTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.57 40 ave.{cons,noncons}.mod #Transition parameters: gamma=0.570000, omega=40.000000, mu=0.025000, nu=0.033140 #Relative entropy: H=1.223351 bits/site #Required length: N=8.217344 sites #Total entropy: NH=10.052700 bits #Recommended expected length: omega=36.620015 sites (for NH=9.783400) # OK, --expected-lengths 36.6. # SIXTH ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.56 36.6 ave.{cons,noncons}.mod #Transition parameters: gamma=0.560000, omega=36.600000, mu=0.027322, nu=0.034774 #Relative entropy: H=1.227349 bits/site #Required length: N=8.033142 sites #Total entropy: NH=9.859466 bits #Recommended expected length: omega=35.696766 sites (for NH=9.783400) # OK, --expected-lengths 35.7. # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 12.3 --target-coverage 0.30 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. 
cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 35.7 --target-coverage 0.56 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job ssh kk9 cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 662s 11.03m 0.18h 0.01d #Longest job: 1477s 24.62m 0.41h 0.02d #Submission to last job: 1574s 26.23m 0.44h 0.02d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kksilo cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: (((((dm1:0.020863,droYak1:0.024051):0.033187,droAna1:0.063531):0.018324,dp2:0.066908):0.019698,(droVir1:0.039583,droMoj1:0.044682):0.053433):0.106049,anoGam1:0.106049); #ave.noncons.mod:TREE: (((((dm1:0.090685,droYak1:0.105448):0.150077,droAna1:0.288625):0.084164,dp2:0.304556):0.092083,(droVir1:0.176988,droMoj1:0.200279):0.253431):0.490731,anoGam1:0.490731); # SECOND ITERATION: #ave.cons.mod:TREE: (((((dm1:0.024527,droYak1:0.028187):0.039154,droAna1:0.074652):0.021897,dp2:0.078698):0.023959,(droVir1:0.047008,droMoj1:0.053078):0.062285):0.126590,anoGam1:0.126590); #ave.noncons.mod:TREE: 
(((((dm1:0.093917,droYak1:0.108899):0.155960,droAna1:0.298758):0.088657,dp2:0.315328):0.098722,(droVir1:0.185255,droMoj1:0.209600):0.259606):0.516221,anoGam1:0.516221); # THIRD ITERATION: #ave.cons.mod:TREE: (((((dm1:0.026164,droYak1:0.030017):0.041812,droAna1:0.079557):0.023492,dp2:0.083908):0.025849,(droVir1:0.050196,droMoj1:0.056691):0.066111):0.135440,anoGam1:0.135440); #ave.noncons.mod:TREE: (((((dm1:0.097274,droYak1:0.112628):0.161761,droAna1:0.309265):0.092448,dp2:0.326497):0.103541,(droVir1:0.192162,droMoj1:0.217414):0.267607):0.536500,anoGam1:0.536500); # FOURTH ITERATION: #ave.cons.mod:TREE: (((((dm1:0.027759,droYak1:0.031804):0.044422,droAna1:0.084321):0.025065,dp2:0.088941):0.027674,(droVir1:0.053244,droMoj1:0.060132):0.069757):0.143307,anoGam1:0.143307); #ave.noncons.mod:TREE: (((((dm1:0.100831,droYak1:0.116583):0.167801,droAna1:0.320024):0.096332,dp2:0.337862):0.108245,(droVir1:0.199102,droMoj1:0.225211):0.275450):0.553926,anoGam1:0.553926); # FIFTH ITERATION: #ave.cons.mod:TREE: (((((dm1:0.027267,droYak1:0.031253):0.043621,droAna1:0.082864):0.024584,dp2:0.087408):0.027120,(droVir1:0.052313,droMoj1:0.059084):0.068661):0.141041,anoGam1:0.141041); #ave.noncons.mod:TREE: (((((dm1:0.099838,droYak1:0.115485):0.166143,droAna1:0.317118):0.095263,dp2:0.334808):0.106969,(droVir1:0.197211,droMoj1:0.223103):0.273475):0.549842,anoGam1:0.549842); # SIXTH ITERATION: #ave.cons.mod:TREE: (((((dm1:0.027105,droYak1:0.031070):0.043350,droAna1:0.082377):0.024423,dp2:0.086892):0.026930,(droVir1:0.052002,droMoj1:0.058734):0.068290):0.140237,anoGam1:0.140237); #ave.noncons.mod:TREE: (((((dm1:0.099464,droYak1:0.115066):0.165490,droAna1:0.315987):0.094859,dp2:0.333609):0.106465,(droVir1:0.196484,droMoj1:0.222286):0.272655):0.548007,anoGam1:0.548007); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. 
# The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off: # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.30 12.3 ave.{cons,noncons}.mod #Transition parameters: gamma=0.300000, omega=12.300000, mu=0.081301, nu=0.034843 #Relative entropy: H=1.355321 bits/site #Required length: N=6.455682 sites #Total entropy: NH=8.749521 bits #Recommended expected length: omega=17.959820 sites (for NH=9.783400) # OK, bump up --expected-lengths to 18 below (and for next iteration). # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.40 24 ave.{cons,noncons}.mod #Transition parameters: gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778 #Relative entropy: H=1.235921 bits/site #Required length: N=7.943664 sites #Total entropy: NH=9.817744 bits #Recommended expected length: omega=23.714197 sites (for NH=9.783400) # OK, looks like 24 is still about right for --expected-lengths having # bumped up --target-coverage. 
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.50 31 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.500000, omega=31.000000, mu=0.032258, nu=0.032258
#Relative entropy: H=1.227887 bits/site
#Required length: N=7.992415 sites
#Total entropy: NH=9.813781 bits
#Recommended expected length: omega=30.685778 sites (for NH=9.783400)
    # Cool, 31 is still about right.
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.60 40 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.600000, omega=40.000000, mu=0.025000, nu=0.037500
#Relative entropy: H=1.223351 bits/site
#Required length: N=8.025354 sites
#Total entropy: NH=9.817829 bits
#Recommended expected length: omega=39.556330 sites (for NH=9.783400)
    # 40 still OK, good.
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.57 36.6 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.570000, omega=36.600000, mu=0.027322, nu=0.036218
#Relative entropy: H=1.227349 bits/site
#Required length: N=7.970096 sites
#Total entropy: NH=9.782087 bits
#Recommended expected length: omega=36.615726 sites (for NH=9.783400)
    # Yup, --expected-lengths 36.6.
    # SIXTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.7 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.560000, omega=35.700000, mu=0.028011, nu=0.035651
#Relative entropy: H=1.227599 bits/site
#Required length: N=7.969781 sites
#Total entropy: NH=9.783694 bits
#Recommended expected length: omega=35.696550 sites (for NH=9.783400)
    # We probably don't need to do this, since it always has agreed, but it
    # only takes a second.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  Then we measure the
    # conserved elements coverage, and if that's not satisfactory then we
    # adjust parameters and repeat.
ssh kk9 mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.phast cd /cluster/data/dm1/bed/multiz7way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 35.7 --target-coverage 0.56 \ --quiet --seqname $chr --idpref $pref \ --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 17s 0.29m 0.00h 0.00d #Longest job: 28s 0.47m 0.01h 0.00d #Submission to last job: 34s 0.57m 0.01h 0.00d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm1/bed/multiz7way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm1/bed/multiz7way/phastCons featureBits -enrichment dm1 bdgpGene:cds all.bed # FIRST ITERATION: too low; increase --target-coverage and re-estimate. #bdgpGene:cds 17.224%, all.bed 31.047%, both 9.866%, cover 57.28%, enrich 1.85x # SECOND ITERATION: still too low; increase --target-coverage, re-estimate. #bdgpGene:cds 17.224%, all.bed 35.450%, both 10.948%, cover 63.56%, enrich 1.79x # THIRD ITERATION: close! 
Still a little low, repeat. #bdgpGene:cds 17.224%, all.bed 38.729%, both 11.542%, cover 67.01%, enrich 1.73x # FOURTH ITERATION: OK, a bit high now; back off and repeat. #bdgpGene:cds 17.224%, all.bed 42.087%, both 12.076%, cover 70.11%, enrich 1.67x # FIFTH ITERATION: back off just a smidge more. #bdgpGene:cds 17.224%, all.bed 41.098%, both 11.927%, cover 69.25%, enrich 1.68x # SIXTH ITERATION: woohoo! #bdgpGene:cds 17.224%, all.bed 40.753%, both 11.871%, cover 68.92%, enrich 1.69x # Having met the CDS coverage target, load up the results. hgLoadBed dm1 phastConsElements7way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.wib cd /cluster/data/dm1/bed/multiz7way/phastCons/run.wib rm -rf /cluster/bluearc/dm1/phastCons/wib mkdir -p /cluster/bluearc/dm1/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /cluster/bluearc/dm1/phastCons/wib zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 11 of 11 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 22s 0.37m 0.01h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/dm1/bed/multiz7way/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm1/phastCons/wib . rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm1/multiz7way/wib cd /cluster/data/dm1/bed/multiz7way/phastCons chmod 775 . 
wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz7way/wib/ hgLoadWiggle dm1 phastCons7way \ -pathPrefix=/gbdb/dm1/multiz7way/wib wib/*.wig # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-7way \ "top 5000 conserved elements (7way)" dm1 # and clean up bluearc. rm -r /cluster/bluearc/dm1/phastCons # NOT DONE (I see more phastCons in the future :) rm -r /cluster/bluearc/dm1/chrom # FLYREG (DONE 12/13/04 angie) # Flyreg loaded 11/22/04; MEME motifs added 12/13/04 ssh kksilo mkdir /cluster/data/dm1/bed/flyreg cd /cluster/data/dm1/bed/flyreg wget http://www.gen.cam.ac.uk/casey/Data/Footprint.GFF # This is not GTF; it should really be bed +. The contributor, # Casey Bergman, says that coords are 0-based half-open, so # translation will be even easier. grep -v '^#' Footprint.GFF \ | perl -wpe 'if (! 
s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6/) { die "Cant parse line $.:\n$_\n"; }' \ > flyreg.bed ssh hgwdev hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg.sql \ dm1 flyreg /cluster/data/dm1/bed/flyreg/flyreg.bed # 12/13/04: Add Dan Pollard's MEME motif data for the footprints: ssh kksilo cd /cluster/data/dm1/bed/flyreg wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt ./extractMatrices.pl footprint_matrices.txt > flyregMotif.tab ssh hgwdev cd /cluster/data/dm1/bed/flyreg sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \ > flyregMotif.sql hgsql dm1 < flyregMotif.sql hgsql dm1 -e 'load data local infile "flyregMotif.tab" into table flyregMotif' # BLASTZ D.ERECTA (DONE 11/24/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/blastz.droEre1.2004-11-24 cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24 ln -s blastz.droEre1.2004-11-24 /cluster/data/dm1/bed/blastz.droEre1 cat << '_EOF_' > DEF # D.melanogaster vs. D.erecta export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_SMSK= SEQ1_FLAG=-drosophila SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - D. 
erecta SEQ2_DIR=/iscratch/i/droEre1/chunks SEQ2_SMSK= SEQ2_FLAG=-drosophila SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/dm1/bed/blastz.droEre1.2004-11-24 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # run bash shell if you don't already: bash source DEF mkdir run /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j sh ./xdir.sh cd run sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 mv j2 j # cluster run ssh kk cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run para create j para try, check, push, check, .... #Completed: 5460 of 5460 jobs #Average job time: 46s 0.76m 0.01h 0.00d #Longest job: 1230s 20.50m 0.34h 0.01d #Submission to last job: 1401s 23.35m 0.39h 0.02d # back in the bash shell on kksilo... mkdir /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1 cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1 /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j # small cluster run ssh kki cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1 para create j para try, check, push, check, .... 
#Completed: 21 of 21 jobs #Average job time: 15s 0.26m 0.00h 0.00d #Longest job: 33s 0.55m 0.01h 0.00d #Submission to last job: 49s 0.82m 0.01h 0.00d # Translate .lav to axt: ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24 rm -r raw mkdir axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /cluster/data/dm1/nib \ /cluster/data/droEre1/droEre1.2bit stdout \ | axtSort stdin ../../$out popd end # CHAIN ERECTA BLASTZ (DONE 11/24/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dm1/nib \ /iscratch/i/droEre1/droEre1.2bit stdout \ | chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droEre1/droEre1.2bit \ stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 8s 0.14m 0.00h 0.00d #Longest job: 23s 0.38m 0.01h 0.00d #Submission to last job: 23s 0.38m 0.01h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain dm1 ${c}_chainDroEre1 $i end # NET ERECTA BLASTZ (DONE 11/24/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain netClass -noAr noClass.net dm1 droEre1 erecta.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain rm noClass.net netFilter -syn erecta.net > erectaSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain netFilter -minGap=10 erecta.net | hgLoadNet dm1 netDroEre1 stdin netFilter -minGap=10 erectaSyn.net \ | hgLoadNet dm1 netSyntenyDroEre1 stdin # GENERATE DROERE1 AXTNET AND MAF FOR MULTIZ (DONE 11/24/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1/axtChain netSplit erecta.net net cd .. 
mkdir axtNet foreach f (axtChain/net/chr*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /cluster/data/dm1/nib /cluster/data/droEre1/droEre1.2bit stdout \ | axtSort stdin axtNet/$f:t:r.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.maf axtToMaf $f \ /cluster/data/dm1/chrom.sizes /cluster/data/droEre1/chrom.sizes \ $maf -tPrefix=dm1. -qPrefix=droEre1. end # MAKE VSDROERE1 DOWNLOADABLES (DONE 11/24/04 angie) ssh kksilo cd /cluster/data/dm1/bed/blastz.droEre1 nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroEre1 cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroEre1 cp -p /cluster/data/dm1/bed/blastz.droEre1/axtChain/all.chain.gz \ erecta.chain.gz cp -p /cluster/data/dm1/bed/blastz.droEre1/axtChain/erecta.net.gz . cp -pR /cluster/data/dm1/bed/blastz.droEre1/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # TWINSCAN (DONE 11/29/04 angie) ssh kksilo mkdir /cluster/data/dm1/bed/twinscan cd /cluster/data/dm1/bed/twinscan foreach c (2L 2R 3L 3R 4 X) wget http://genes.cs.wustl.edu/predictions/Drosophila/dm1_11-24-04/chr_gtf/chr$c.gtf wget http://genes.cs.wustl.edu/predictions/Drosophila/dm1_11-24-04/chr_ptx/chr$c.ptx end # Add '.a' to end of protein fasta id's, to match gtf transcript_id's: perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa # load. 
ssh hgwdev cd /cluster/data/dm1/bed/twinscan ldHgGene -gtf -genePredExt dm1 twinscan chr*.gtf hgPepPred dm1 generic twinscanPep twinscanPep.fa featureBits -enrichment dm1 refGene twinscan #refGene 22.303%, twinscan 16.410%, both 15.755%, cover 70.64%, enrich 4.30x # MAF COVERAGE, MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES, FOR # REVISED PHASTCONS PAPER (DONE 1/5/05 acs) mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/coverage cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/coverage cat ../mypa/chr*.maf \ | nice mafRanges -notAllOGap stdin dm1 dm1.4way.mafRanges.bed nice featureBits -enrichment dm1 bdgpGene:cds dm1.4way.mafRanges.bed # bdgpGene:cds 17.224%, dm1.4way.mafRanges.bed 86.879%, both 17.142%, cover 99.53%, enrich 1.15x # FEEP EXPRESSION DATA (DONE 1/28/05 angie) # Full Euchromatic Expression Profile data (Stolc et al. 2004, White lab) # Raw data files from http://genome.med.yale.edu/FEEP/ downloaded to # /projects/compbio/data/microarray/flyFEEP/ . # Used ad hoc perl scripts to massage the data files into hgFixed # expression tables -- see # /projects/compbio/data/microarray/flyFEEP/README . # See makeHgFixed.doc for how *MedianRatio table was extracted. 
ssh hgwdev mkdir /cluster/data/dm1/bed/flyFeep cd /cluster/data/dm1/bed/flyFeep # More perling to get bed coords; then hgMapMicroarray to make bed 15: ./probes2bed.pl /projects/compbio/data/microarray/flyFEEP/probes3.1-* \ > flyFeepProbesBed12.bed hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \ flyFeepProbesBed12.bed hgLoadBed dm1 flyFeep flyFeep.bed # "Significant" set of probes: # (see http://bussemaker.bio.columbia.edu/papers/Science2004/peab_discussion.html) wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_EPs_NEPs_GC10-11_usedinpaper/EPNEP_GC10-11_GC3-8crxn_FDR05_none_pool24_all_EP wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_EPs_NEPs_GC10-11_usedinpaper/EPNEP_GC10-11_GC3-8crxn_FDR05_none_pool24_all_NEP wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_SJPs/SJP_WSJP_GC3-8crxn_FDR05_none_pool24_all # ANOVA set (no splice junction probes): wget http://bussemaker.bio.columbia.edu/papers/Science2004/anova/EP_diff_expressed wget http://bussemaker.bio.columbia.edu/papers/Science2004/anova/NEP_diff_expressed # Distill those files to lists of IDs; then use the IDs to winnow # flyFeep.bed. awk '{print $1;}' *FDR05* > probesExpressedAboveBackground.txt awk '{print $1;}' *_diff_expressed > probesAnovaDiffExpressed.txt ./winnowBed.pl probesExpressedAboveBackground.txt flyFeep.bed \ > flyFeepPEAB.bed ./winnowBed.pl probesAnovaDiffExpressed.txt flyFeep.bed \ > flyFeepAnova.bed hgLoadBed dm1 flyFeepPEAB flyFeepPEAB.bed hgLoadBed dm1 flyFeepAnova flyFeepAnova.bed # PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 1/26/05 angie) # Same procedure as for 7way above -- using the param estimation methods # described in Adam's Genome Research paper. 
# Move aside the previous phastCons run dir: ssh hgwdev mv /cluster/data/dm1/bed/multiz8way/phastCons \ /cluster/data/dm1/bed/multiz8way/phastCons.2004-11-29 # That breaks /gbdb/links -- fix temporarily, we'll overwrite when done: rm /gbdb/dm1/multiz8way/wib/* ln -s /cluster/store6/dm1/bed/multiz8way.2004-11-09/phastCons.2004-11-29/wib/chr*.wib /gbdb/dm1/multiz8way/wib # chrom fa is already in /cluster/bluearc/dm1/chrom/, no need to copy. # Split chrom fa into smaller windows for phastCons: ssh kki mkdir /cluster/data/dm1/bed/multiz8way/phastCons mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.split cd /cluster/data/dm1/bed/multiz8way/phastCons/run.split set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm1/chrom set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir # apiMel1-specific tweak for msa_split: perl -wpe 's/(apiMel1\.Group\w+)\.(\d+)/$1_$2/' $1 > $tmpDir/$c.maf ${PHAST}/msa_split $tmpDir/$c.maf -i MAF -M ${FA_SRC}/$c.fa \ -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1,apiMel1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.maf rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm1/bed/multiz8way/maf/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... 
#Completed: 11 of 11 jobs #Average job time: 49s 0.81m 0.01h 0.00d #Longest job: 147s 2.45m 0.04h 0.00d #Submission to last job: 158s 2.63m 0.04h 0.00d ssh kksilo cd /cluster/data/dm1/bed/multiz8way/phastCons/ # Use the previous 8way run's ave.noncons.mod as starting model: cp ../phastCons.2004-11-29/run.estimate/ave.noncons.mod starting-tree.mod # Get genome-wide all-species average GC content from starting-tree.mod: # [when starting fresh, can use msa_view --aggregate --summary-only # as in makeHg17.doc but that's slower] awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod #0.446 # Use 0.446 as the --gc parameter in phastCons below:. ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY ############# # Use consEntropy --NH to make it suggest a --expected-lengths param # that we should try next. Adam ran this on hg17 to find out the # total entropy of the hg17 model: # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod #Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045 #Relative entropy: H=0.608216 bits/site #Required length: N=16.085437 sites #Total entropy: NH=9.783421 bits # Our target is that NH result: 9.7834 bits. # Use the values of --target-coverage and --expected-lengths from the # last iteration of the 7way run, 0.56 and 35.7. # Use the 7-way estimated models. Should be a real headstart relative # to starting from scratch (in which case we would use phyloFit). /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.7 \ ../../multiz8way/phastCons.2004-11-29/run.estimate/ave.{cons,noncons}.mod #( Solving for new omega: 35.700000 35.623093 ) #Transition parameters: gamma=0.560000, omega=35.700000, mu=0.028011, nu=0.035651 #Relative entropy: H=1.319603 bits/site #Required length: N=7.418872 sites #Total entropy: NH=9.789966 bits #Recommended expected length: omega=35.623093 sites (for NH=9.783400) # OK, we will try --expected-lengths 35.6. 
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ########### # We're here because the actual target coverage was not satisfactory, # so we're changing the --target-coverage param. Given that we're # changing that, take a guess at how we should change --expected-lengths # in order to also hit the total entropy target. cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.565 35.5 ave.{cons,noncons}.mod #Recommended expected length: omega=35.982083 sites (for NH=9.783400) # OK, --expected-lengths 36. # Now set up cluster job to estimate model parameters given free params # --target-coverage and --expected-lengths and the data. ssh kk9 mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate # FIRST ITERATION: Use ../starting-tree.mod: cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 35.6 --target-coverage 0.56 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model. cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing \ --expected-lengths 36 --target-coverage 0.565 \ --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 1076s 17.93m 0.30h 0.01d #Longest job: 2118s 35.30m 0.59h 0.02d #Submission to last job: 2791s 46.52m 0.78h 0.03d # Now combine parameter estimates. 
We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kksilo cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # FIRST ITERATION: #ave.cons.mod:TREE: ((((((dm1:0.028206,droYak1:0.033232):0.044076,droAna1:0.086199):0.023901,dp2:0.091206):0.023263,(droVir1:0.053709,droMoj1:0.060420):0.074761):0.080532,anoGam1:0.202889):0.143734,apiMel1:0.143734); #ave.noncons.mod:TREE: ((((((dm1:0.096925,droYak1:0.114585):0.159114,droAna1:0.310607):0.088457,dp2:0.329123):0.088473,(droVir1:0.190784,droMoj1:0.215082):0.280416):0.306475,anoGam1:0.738306):0.535906,apiMel1:0.535906); # SECOND ITERATION: #ave.cons.mod:TREE: ((((((dm1:0.028359,droYak1:0.033409):0.044355,droAna1:0.086718):0.024081,dp2:0.091763):0.023499,(droVir1:0.054045,droMoj1:0.060801):0.075211):0.081326,anoGam1:0.204280):0.144740,apiMel1:0.144740); #ave.noncons.mod:TREE: ((((((dm1:0.097342,droYak1:0.115064):0.159904,droAna1:0.312067):0.088999,dp2:0.330694):0.089234,(droVir1:0.191751,droMoj1:0.216177):0.281661):0.309045,anoGam1:0.742389):0.538846,apiMel1:0.538846); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. 
If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Check the total entropy figure to see if we're way off. # We probably don't need to do this, since it has always been very close # if not the same as what we used above, but it only takes a second. # FIRST ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.6 ave.{cons,noncons}.mod #Recommended expected length: omega=35.540632 sites (for NH=9.783400) # OK, tweak --expected-lengths to 35.5 below (and for next iteration). # SECOND ITERATION: /cluster/bin/phast/consEntropy --NH 9.7834 0.565 36 ave.{cons,noncons}.mod #Recommended expected length: omega=35.981000 sites (for NH=9.783400) # ==> keep at 36. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. Then we measure the # conserved elements coverage, and if that's not satisfactory then we # adjust parameters and repeat. ssh kk9 mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.phast cd /cluster/data/dm1/bed/multiz8way/phastCons/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. 
'{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 36 --target-coverage 0.565 \ --quiet --seqname $chr --idpref $pref \ --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... #Completed: 134 of 134 jobs #Average job time: 16s 0.27m 0.00h 0.00d #Longest job: 23s 0.38m 0.01h 0.00d #Submission to last job: 47s 0.78m 0.01h 0.00d # back on kksilo: # combine predictions and transform scores to be in 0-1000 interval cd /cluster/data/dm1/bed/multiz8way/phastCons awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \ | /cluster/bin/scripts/lodToBedScore > all.bed ssh hgwdev # Now measure coverage of CDS by conserved elements. # We want the "cover" figure to be close to 68.9%. cd /cluster/data/dm1/bed/multiz8way/phastCons featureBits -enrichment dm1 bdgpGene:cds all.bed # FIRST ITERATION: a bit too low; increase --target-coverage, re-estimate. #bdgpGene:cds 17.224%, all.bed 42.260%, both 11.796%, cover 68.49%, enrich 1.62x # SECOND ITERATION: 68.85%'ll do! #bdgpGene:cds 17.224%, all.bed 42.608%, both 11.859%, cover 68.85%, enrich 1.62x # Having met the CDS coverage target, load up the results. 
hgLoadBed dm1 phastConsElements8way all.bed # Create wiggle on the small cluster ssh kki mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.wib cd /cluster/data/dm1/bed/multiz8way/phastCons/run.wib rm -rf /cluster/bluearc/dm1/phastCons/wib mkdir -p /cluster/bluearc/dm1/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /cluster/bluearc/dm1/phastCons/wib zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastCons.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end para create jobList para try, check, push, check, ... #Completed: 11 of 11 jobs #Average job time: 20s 0.34m 0.01h 0.00d #Longest job: 114s 1.90m 0.03h 0.00d #Submission to last job: 114s 1.90m 0.03h 0.00d # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want # the raw scores) from bluearc cd /cluster/data/dm1/bed/multiz8way/phastCons rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm1/phastCons/wib . rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS . # load wiggle component of Conservation track ssh hgwdev mkdir /gbdb/dm1/multiz8way/wib cd /cluster/data/dm1/bed/multiz8way/phastCons chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz8way/wib/ hgLoadWiggle dm1 phastCons8way \ -pathPrefix=/gbdb/dm1/multiz8way/wib wib/*.wig # make top-5000 list and launcher on Adam's home page: sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \ > top5000.bed /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \ /cse/grads/acs/public_html/dm-top5000-8way \ "top 5000 conserved elements (8way)" dm1 # and clean up bluearc. 
rm -r /cluster/bluearc/dm1/phastCons/{ELEMENTS,POSTPROBS,wib} # NOT DONE (I see more phastCons in the future :) rm -r /cluster/bluearc/dm1/chrom # SELF ALIGNMENTS (DONE 2/24/05 angie) # Doing this partly as a test of doBlastzChainNet.pl... ssh kksilo mkdir /cluster/data/dm1/bed/blastz.dm1.2005-02-24 cd /cluster/data/dm1/bed/blastz.dm1.2005-02-24 cat << '_EOF_' > DEF # D. melanogaster vs. self # TARGET - D. melanogaster SEQ1_DIR=/iscratch/i/dm1/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/cluster/data/dm1/chrom.sizes # QUERY - D. melanogaster SEQ2_DIR=/iscratch/i/dm1/nib SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dm1/chrom.sizes BASE=/cluster/data/dm1/bed/blastz.dm1.2005-02-24 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/dm1SelfOut >& do.log & tail -f do.log rmdir /panasas/store/dm1SelfOut ln -s blastz.dm1.2005-02-24 /cluster/data/dm1/bed/blastz.dm1 # chainSelf is already in top-level trackDb.ra, so no need to add. 
# Add /usr/local/apache/htdocs/goldenPath/dm1/vsSelf/README.txt # So, similar to above, we'll shoot for 2/3 * 0.995 / 0.964 = # 68.8% coverage in coding regions # NOTE: the pairwise numbers are the same as before # STANDARDIZED VERSION OF PHASTCONS # MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES, FOR PHASTCONS PAPER # (DONE 1/5/05 acs) # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces mkdir -p /cluster/bluearc/dm1/chrom cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/ mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.split cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.split set WINDOWS = /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/csh -ef set PHAST=/cluster/bin/phast set FA_SRC=/cluster/bluearc/dm1/chrom set WINDOWS=/cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS set maf=$1 set c = $maf:t:r set tmpDir = /scratch/msa_split/$c rm -rf $tmpDir mkdir -p $tmpDir ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm1,droYak1,dp2,anoGam1 \ -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000 cd $tmpDir foreach file ($c.*.ss) gzip -c $file > ${WINDOWS}/$file.gz end rm -f $tmpDir/$c.*.ss rmdir $tmpDir 'EOF' # << for emacs chmod a+x doSplit.sh rm -f jobList foreach file (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf) if (-s $file) then echo "doSplit.sh {check in line+ $file}" >> jobList endif end para create jobList para try, check, push, check... cd .. # now estimate parameters. we have to iterate until we get # coverage of 0.688 in exons and relative entropy 9.78, as above. 
# we can start the process by seeing how close we were with our # previous estimates (target coverage 0.2, expected length 25) featureBits -enrichment dm1 bdgpGene:cds ../../phastCons/all.bed # bdgpGene:cds 17.224%, ../../phastCons/all.bed 29.326%, both 7.519%, cover 43.65%, enrich 1.49x consEntropy 0.2 25 ../../phastCons/run.estimate/ave.{cons,noncons}.mod # Transition parameters: gamma=0.200000, omega=25.000000, mu=0.040000, nu=0.010000 # Relative entropy: H=0.623983 bits/site # Required length: N=19.348735 sites # Total entropy: NH=12.073289 bits # so, we need to crank the coverage up quite a bit and reduce the # entropy. We'll try target coverage 0.4 and we'll leave the # expected length as it is because the entropy will tend to # decrease with a large increase in coverage mkdir run.estimate cd run.estimate # use previous results for starting model and GC content cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate/ave.noncons.mod starting-tree.mod #(GC content is 0.446) cat << '_EOF_' > doEstimate.sh #!/bin/csh -ef zcat $1 \ | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.446 --nrates 1,1 \ --no-post-probs --ignore-missing --expected-lengths 25 \ --target-coverage 0.40 --quiet --log $2 --estimate-trees $3 '_EOF_' # << for emacs chmod a+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobList foreach f ($WINDOWS/*.ss.gz) set root = $f:t:r:r echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList end # run cluster job ssh kk9 cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.estimate para create jobList para try, check, push, check, ... # Completed: 134 of 134 jobs # CPU time in finished jobs: 10970s 182.84m 3.05h 0.13d 0.000 y # IO & Wait Time: 361s 6.01m 0.10h 0.00d 0.000 y # Average job time: 85s 1.41m 0.02h 0.00d # Longest job: 171s 2.85m 0.05h 0.00d # Submission to last job: 228s 3.80m 0.06h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. 
This must be done separately for the conserved # and nonconserved models ls -1 TREES/*.cons.mod > cons.txt /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls -1 TREES/*.noncons.mod > noncons.txt /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt grep TREE ave*.mod # ave.cons.mod:TREE: (((dm1:0.022875,droYak1:0.029296):0.045785,dp2:0.080314):0.111872,anoGam1:0.111872); # ave.noncons.mod:TREE: (((dm1:0.093309,droYak1:0.119704):0.192065,dp2:0.337507):0.475498,anoGam1:0.475498); cat cons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. See notes above. # Now set up the cluster job for computing the conservation scores # and predicted elements. mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.phast cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.phast cat << 'EOF' > doPhastCons.sh #!/bin/csh -ef set pref = $1:t:r:r set chr = `echo $pref | awk -F\. '{print $1}'` set tmpfile = /scratch/phastCons.$$ zcat $1 \ | /cluster/bin/phast/phastCons - \ ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \ --expected-lengths 25 --target-coverage 0.40 --quiet --seqname $chr \ --idpref $pref \ --viterbi /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/ELEMENTS/$pref.bed --score \ --require-informative 0 \ > $tmpfile gzip -c $tmpfile > /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' # << for emacs chmod a+x doPhastCons.sh rm -fr /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS} mkdir -p /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS} rm -f jobList foreach f (/cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobList end para create jobList para try, check, push, check, ... 
# Completed: 134 of 134 jobs # CPU time in finished jobs: 1530s 25.50m 0.42h 0.02d 0.000 y # IO & Wait Time: 390s 6.50m 0.11h 0.00d 0.000 y # Average job time: 14s 0.24m 0.00h 0.00d # Longest job: 20s 0.33m 0.01h 0.00d # Submission to last job: 42s 0.70m 0.01h 0.00d # check coverage and entropy cat /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/ELEMENTS/*.bed > all.raw.bed featureBits -enrichment dm1 bdgpGene:cds all.raw.bed # bdgpGene:cds 17.224%, all.raw.bed 40.474%, both 10.610%, cover 61.60%, enrich 1.52x consEntropy --NH 9.78 0.4 25 ../run.estimate/ave.{cons,noncons}.mod # Relative entropy: H=0.702265 bits/site # Required length: N=14.324859 sites # Total entropy: NH=10.059845 bits # Recommended expected length: omega=22.594418 sites (for NH=9.780000) # now iterate until convergence # second try: --target-coverage 0.55 --expected-lengths 25 # third try: --target-coverage 0.50 --expected-lengths 30 # fourth try: --target-coverage 0.47 --expected-lengths 30 # fifth try: --target-coverage 0.468 --expected-lengths 28.1 # final results: # bdgpGene:cds 17.224%, all.raw.bed 45.385%, both 11.805%, cover 68.54%, enrich 1.51x # Transition parameters: gamma=0.468000, omega=28.100000, mu=0.035587, nu=0.031306 # Relative entropy: H=0.725136 bits/site # Required length: N=13.512072 sites # Total entropy: NH=9.798094 bits # ave.cons.mod:TREE: (((dm1:0.024397,droYak1:0.031171):0.049856,dp2:0.086316):0.125701,anoGam1:0.125701); # ave.noncons.mod:TREE: (((dm1:0.098596,droYak1:0.126138):0.207382,dp2:0.359019):0.531038,anoGam1:0.531038); # transform scores to be in 0-1000 interval then load track awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ all.raw.bed | /cluster/bin/scripts/lodToBedScore > all.bed hgLoadBed dm1 phastConsElementsPaper all.bed # Create wiggle on the small cluster mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/phastCons/run.wib cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/phastCons/run.wib 
rm -rf /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib mkdir -p /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib cat << 'EOF' > doWigEncode #!/bin/csh -ef set chr = $1 cd /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib zcat `ls -1 /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS/$chr.*.pp.gz \ | sort -t\. -k2,2n` \ | wigEncode stdin ${chr}_phastConsPaper.wi{g,b} 'EOF' # << for emacs chmod a+x doWigEncode rm -f jobList foreach chr (`ls -1 /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS \ | awk -F\. '{print $1}' | sort -u`) echo doWigEncode $chr >> jobList end ssh kk9, para create jobList, etc. # copy wibs, wigs and POSTPROBS from bluearc cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd rm -rf wib POSTPROBS rsync -av /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib . rsync -av /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS . # load wiggle component of Conservation track mkdir /gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper chmod 775 . wib chmod 664 wib/*.wib ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper/ hgLoadWiggle dm1 phastConsPaper \ -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper wib/*.wig # create rest of Conservation track (duplicate of current 4-way) # setup external files for database reference # (reuse links in /gbdb/dm1/mzDy1Dp2Ag1_phast) # load into database hgLoadMaf -warn -pathPrefix=/gbdb/dm1/mzDy1Dp2Ag1_phast dm1 multizForPhastConsPaper # (we'll also just reuse the pairwise tables) # clean up bluearc. 
Leave WINDOWS dir for now rm -r /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS,wib} ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/dm1/bed/genomicSuperDups cd /cluster/data/dm1/bed/genomicSuperDups sed -e 's/\t_\t/\t-\t/' dm1_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed dm1 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql