# for emacs: -*- mode: sh; -*- # Caenorhabditis elegans # Washington University School of Medicine GSC and Sanger Institute WS170 # $Id: ce4.txt,v 1.34 2008/06/24 23:00:53 hiram Exp $ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2007-02-23 - Hiram) ssh kkstore02 mkdir /cluster/store5/worm/ce4 ln -s /cluster/store5/worm/ce4 /cluster/data/ce4 cd /cluster/data/ce4 mkdir WS170 cd WS170 wget --timestamping -m -np -nd \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/CHROMOSOMES/" ######################################################################### ## Translate Sanger/Wormbase chrom names into UCSC names ## (DONE - 2007-02-23 - Hiram) ## bash syntax, this sequence captured in toUCSC_Fa.sh WSxxx=WS170 export WSxxx if [ -d downloads ]; then mv downloads downloads.0 rm -fr downloads.0 & fi mkdir downloads mkdir downloads/fasta mkdir downloads/agp for C in I II III IV V X do echo -n "${C} " zcat ${WSxxx}/CHROMOSOME_${C}.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_/chr/" > downloads/fasta/chr${C}.fa sed -e "s/^/chr/" ${WSxxx}/CHROMOSOME_${C}.agp > downloads/agp/chr${C}.agp done C=M echo -n "${C} " zcat ${WSxxx}/CHROMOSOME_MtDNA.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_MTDNA/chrM/" > downloads/fasta/chr${C}.fa echo "chrM 1 13794 1 F X54252.1 1 13794 +" | sed -e "s/ */\t/g" > \ downloads/agp/chr${C}.agp gzip downloads/fasta/chr*.fa gzip downloads/agp/chr*.agp ######################################################################### ## Create config.ra file, and run makeGenomeDb.pl (DONE - 2007-02-23 - Hiram) ssh kkstore02 cd /cluster/data/ce4 cat << '_EOF_' > ce4.config.ra # Config parameters for makeGenomeDb.pl: db ce4 clade worm genomeCladePriority 10 scientificName Caenorhabditis elegans commonName C. elegans assemblyDate Jan. 
2007 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS170 orderKey 858 mitoAcc none fastaFiles /cluster/data/ce4/downloads/fasta/chr*.fa.gz agpFiles /cluster/data/ce4/downloads/agp/chr*.agp.gz # qualFiles /dev/null dbDbSpeciesDir worm '_EOF_' # << happy emacs nice -n +19 makeGenomeDb.pl ce4.config.ra > makeGenomeDb.out 2>&1 ########################################################################### ## RepeatMasker (DONE - 2007-02-23 - Hiram) ssh kkstore02 cd /cluster/data/ce4 time nice -n +19 doRepeatMasker.pl ce4 > doRepeatMasker.out 2>&1 & mv doRepeatMasker.out bed/RepeatMasker.2007-02-23 # real 48m37.395s twoBitToFa ce4.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt # 100281244 bases (0 N's 100281244 real 87184653 upper 13096591 lower) # in 7 sequences in 1 files # %13.06 masked total, %13.06 masked real ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram) # Use -repMatch=40 (based on size -- for human we use 1024, and # C. elegans size is ~3.4% of human judging by gapless ce4 vs. hg18 # genome sizes from featureBits. ssh kolossus cd /cluster/data/ce4 blat ce4.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/11.ooc -repMatch=36 # Wrote 43729 overused 11-mers to jkStuff/11.ooc cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/ce4 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add ce4 just before ce3 # ce4 (C. 
elegans) ce4.serverGenome = /cluster/data/ce4/ce4.2bit ce4.clusterGenome = /san/sanvol1/scratch/worms/ce4/ce4.2bit ce4.ooc = /san/sanvol1/scratch/worms/ce4/11.ooc ce4.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} ce4.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} ce4.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} ce4.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} ce4.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} ce4.maxIntron = 100000 ce4.lift = no ce4.genbank.mrna.xeno.load = no ce4.downloadDir = ce4 cvs ci -m "Added ce4." etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh kkstore02 cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial ce4 & # logFile: var/build/logs/2007.03.27-15:39:56.ce4.initalign.log # real 61m33.930s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce4 # logFile: var/dbload/hgwdev/logs/2007.03.27-16:43:00.dbload.log # real 14m41.601s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add ce4 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added ce4 - C. 
elegans WS170" \ etc/align.dbs etc/hgwdev.dbs make etc-update ######################################################################### ## Create goldenPath files for hgdownload (DONE - 2007-04-05 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/goldenPath cd /cluster/data/ce4/goldenPath mkdir bigZips chromosomes database for C in I II III IV V X M do maskOutFa ${C}/chr${C}.fa hard ${C}/chr${C}.fa.masked done tar cvzf goldenPath/bigZips/chromFaMasked.tar.gz */chr*.fa.masked cd chromosomes for C in I II III IV V X M do twoBitToFa -seq=chr${C} ../../ce4.2bit stdout | gzip -c > chr${C}.fa.gz echo chr${C} done done # check the sequence faSize *.fa.gz # 100281244 bases (0 N's 100281244 real 87008909 # upper 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real twoBitToFa ../../ce4.2bit stdout | faSize stdin # 100281244 bases (0 N's 100281244 real 87008909 # upper 13272335 lower) in 7 sequences in 1 files # %13.24 masked total, %13.24 masked real cd ../bigZips splitFileByColumn -ending=agp ../../ce4.agp . tar cvzf ./chromAgp.tar.gz ./chr*.agp rm chr*.agp for C in I II III IV V X M do zcat ../chromosomes/chr${C}.fa.gz > chr${C}.fa echo chr${C} done done tar cvzf ./chromFa.tar.gz chr*.fa rm chr*.fa cp -p ../../*/chr*.fa.out . tar cvzf ./chromOut.tar.gz chr*.fa.out rm chr*.fa.out splitFileByColumn -ending=bed ../../bed/simpleRepeat/trfMask.bed . 
tar cvzf ./chromTrf.tar.gz chr*.bed rm chr*.bed ssh hgwdev cd /cluster/data/genbank # get GenBank native mRNAs cd /cluster/data/genbank ./bin/x86_64/gbGetSeqs -db=ce4 -native GenBank mrna \ /cluster/data/ce4/goldenPath/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/x86_64/gbGetSeqs -db=ce4 -xeno GenBank mrna \ /cluster/data/ce4/goldenPath/bigZips/xenoMrna.fa.gz # get native GenBank ESTs ./bin/x86_64/gbGetSeqs -db=ce4 -native GenBank est \ /cluster/data/ce4/goldenPath/bigZips/est.fa # get native RefSeq mRNAs ./bin/x86_64/gbGetSeqs -db=ce4 -native RefSeq mrna \ /cluster/data/ce4/goldenPath/bigZips/refMrna.fa ssh kkstore02 cd /cluster/data/ce4/goldenPath/bigZips gzip mrna.fa est.fa refMrna.fa md5sum *.gz > md5sum.txt ########################################################################### ## Simple Repeats (DONE - 2007-02-24 - Hiram) ## Documented 31 May 2007 ssh kolossus mkdir /cluster/data/ce4/bed/simpleRepeat cd /cluster/data/ce4/bed/simpleRepeat twoBitToFa ../../ce4.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed ssh hgwdev cd /cluster/data/ce4/bed/simpleRepeat nice -n +19 hgLoadBed ce4 simpleRepeat \ /cluster/data/ce4/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits ce4 simpleRepeat > fb.ce4.simpleRepeat.txt 2>&1 # 3801183 bases of 100281244 (3.791%) in intersection ############################################################################ ## Add TRF to RMSK (DONE - 2007-03-04 - Hiram) ssh kkstore02 cd /cluster/data/ce4 cat bed/simpleRepeat/trfMask.bed \ | twoBitMask -add -type=.bed ce4.rmsk.2bit stdin ce4.rmskTrf.2bit twoBitToFa ce4.rmskTrf.2bit stdout | faSize stdin \ > faSize.ce4.rmskTrf.txt # 100281244 bases (0 N's 100281244 real 87008909 upper 13272335 lower) # in 7 sequences in 1 files # %13.24 masked total, %13.24 masked real 
######################################################################### ######################################################################### ### The sequences below were an experiment beginning early 2006 # FETCH acedb database for construction of Sanger Links table ## OBSOLETE procedure attempted on WS150 # DONE - 2006-01-09 - Hiram ssh kkstore02 mkdir /cluster/store5/worm/ce4/bed/acedb cd /cluster/store5/worm/ce4/bed/acedb wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=3 --no-parent \ --no-host-directories --cut-dirs=4 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS150/ # Then, to install this acedb database: # started 2006-01-09 09:48 chmod +x INSTALL ./INSTALL ######################################################################### # Create sangerLinks data and table ## OBSOLETE procedure attempted on WS150 ssh kkstore02 mkdir /cluster/store5/worm/ce4/bed/sangerLinks cd /cluster/store5/worm/ce4/bed/sangerLinks /cluster/bin/acedb/tace /cluster/store5/worm/ce4/bed/acedb # At prompt, can type ? for help, type the following queries # wbGeneClass.txt - associates 3 letter first part of worm gene # name with a brief description. acedb> AQL -o wbGeneClass.txt select l,l->Description from l in class Gene_Class # it says: # // 1578 Active Objects # wbLongDescr.txt - associates worm gene name with a several # sentence description. 
acedb> AQL -o wbLongDescr.txt select l,l->Gene_info[Provisional_description] from l in class Gene # it says: # // 5640 Active Objects # ctrl-D to exit the acedb monitor # remove header lines and "" from those two files: grep -v "^Format" wbGeneClass.txt | \ sed -e 's/Gene_class://; s/Text://; s/"//g;' \ > wbGeneClass.new grep -v "^Format" wbLongDescr.txt | \ sed -e 's/Gene://; s/Text://; s/"//g;' \ > wbLongDescr.new mv wbGeneClass.new wbGeneClass.txt mv wbLongDescr.new wbLongDescr.txt # Check how we compare to previous build, there will probably be a # few more with each new build: wc wbGeneClass.txt wbLongDescr.txt # 1625 7178 52423 wbGeneClass.txt # 51111 445663 3390455 wbLongDescr.txt wc /cluster/data/ce3/bed/sangerLinks/wbGeneClass.txt \ /cluster/data/ce3/bed/sangerLinks/wbLongDescr.txt # 1515 6589 47870 /cluster/data/ce3/bed/sangerLinks/wbGeneClass.txt # 50417 421524 3220249 /cluster/data/ce3/bed/sangerLinks/wbLongDescr.txt tar xvzf ../acedb/wormpep150.tar.gz zcat ../acedb/geneIDs.WS150.gz > geneIDs.WS150.txt # perl script is from previous build, this same directory cp -p /cluster/data/ce3/bed/sangerLinks/mkSangerLinks.pl . ./mkSangerLinks.pl wormpep150/wormpep.table150 geneIDs.WS150.txt \ wbLongDescr.txt wbGeneClass.txt > sangerLinks.txt 2> dbg.out # should be relatively few names without descriptions: grep "no descr" dbg.out | wc # 63 441 3477 ############################################################################### ## SANGER GENE TRACK 2nd attempt (DONE - 2007-04-27 - Hiram) ## There is a tremendous amount of extraneous notations in the gff ## files. 
Filter them down to a manageable set, change the chrom ## names, eliminate duplicates, select only what will work in ## ldHgGene ssh kkstore02 mkdir /cluster/data/ce4/bed/ws170Genes cd /cluster/data/ce4/bed/ws170Genes for C in I II III IV V X do echo -n "${C} " zcat ../../WS170/CHROMOSOME_${C}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff done C=M echo -n "${C} " zcat ../../WS170/CHROMOSOME_MtDNA.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff for C in I II III IV V X M do echo "chr${C}.gff -> filtered.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' BEGIN { IGNORECASE=1 } { if (match($2,"curated|Coding_transcript")) { if (match($3,"intron|coding_exon|cds|three_prime_UTR|five_prime_UTR")) { gsub("Transcript ","",$9) gsub(" .*","",$9) gsub("three_prime_UTR","3utr",$3) gsub("five_prime_UTR","5utr",$3) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > filtered.chr${C}.gff done ssh hgwdev cd /cluster/data/ce4/bed/ws170Genes time nice -n +19 ldHgGene ce4 sangerGene filtered.*.gff # Read 52047 transcripts in 980039 lines in 7 files # 52047 groups 7 seqs 3 sources 5 feature types # 30497 gene predictions # real 0m2.569s ## this didn't work with the bin column, so alter that table hgsql -e "alter table sangerGene drop column bin;" ce4 ## done 2007-07-10 ## found when trying to click through on sequence links, mRNA or ## protein when on a gene details page ## This requires the following to also be re-run 
cd /cluster/data/ce4/bed/sangerGene # Add proteinID field to sangerGene table, used by Gene Sorter hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce4 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce4 # Use SwissProt IDs in sangerLinks table to fill proteinID # column in the sangerGene table hgsql -e 'update sangerGene as g, sangerLinks as l \ set g.proteinID = l.protName where g.name = l.orfName;' ce4 # check there are rows with the protName field filled hgsql -N -e "select proteinId from sangerGene" ce4 | sort | uniq -c | wc -l # 22126 hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql orfToGene.tab hgClusterGenes ce4 sangerGene sangerIsoforms sangerCanonical # Got 19965 clusters, from 30497 genes in 7 chromosomes hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \ sangerLinks.tab hgPepPred ce4 generic sangerPep wormpep170/wormpep170 hgMapToGene ce4 refGene sangerGene sangerToRefSeq cd /cluster/data/ce4/bed/blastp/self/run/out hgLoadBlastTab ce4 sangerBlastTab *.tab # Scanning through 986 files # Loading database with 1214556 rows # Running joinerCheck to see what is sane: joinerCheck -identifier=wormBaseId -database=ce4 -times -keys all.joiner # Error: 999 of 27783 elements of ce4.orfToGene.name are not in key # sangerGene.name # indicates the non-coding genes in orfToGene are missing # from sangerGene. 
So, removing those from orfToGene: hgsql -N -e "select name from orfToGene;" ce4 | sort > orfToGene.name hgsql -N -e "select name from sangerGene;" ce4 | sort > sangerGene.name comm -23 orfToGene.name sangerGene.name > rm.orfToGene.name wc -l rm.orfToGene.name hgsql -N -e "select count(*) from orfToGene;" ce4 # 27783 for G in `cat rm.orfToGene.name` do hgsql -e 'delete from orfToGene where name="'"${G}"'";' ce4 echo $G done hgsql -N -e "select count(*) from orfToGene;" ce4 # 26874 ## create a gene name to WBGene ID reference table ## to be used to construct URL mkdir /cluster/data/ce4/bed/WBGeneID cd /cluster/data/ce4/bed/WBGeneID # There was one duplicate entry in the file, the sort -u eliminates it sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \ ../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab hgLoadSqlTab ce4 sangerGeneToWBGeneID \ ~/kent/src/hg/lib/sangerGeneToWBGeneID.sql sangerGeneToWBGeneID.tab ## later, it was discovered that this list needed to also be updated ## with the one-dot names, where only the two-dot names were present. ## all.joiner rule added, but it still is missing gene names. 
############################################################################### ## RNA sangerGenes (DONE - 2007-04-27 - Hiram) ssh kkstore02 cd /cluster/data/ce4/bed/ws170Genes for C in I II III IV V X M do echo "chr${C}.gff -> rna.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' BEGIN { IGNORECASE=1 } { if (match($2,"^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) { if (match($3,"exon")) { gsub("Transcript ","",$9) gsub(" .*","",$9) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > rna.chr${C}.gff done ssh hgwdev cd /cluster/data/ce4/bed/ws170Genes time nice -n +19 ldHgGene ce4 sangerRnaGene rna.*.gff # Read 999 transcripts in 1030 lines in 7 files # 999 groups 7 seqs 9 sources 1 feature types # 999 gene predictions time nice -n +19 ldHgGene ce4 sangerPseudoGene pseudoGene.*.gff # Read 1154 transcripts in 1154 lines in 7 files # 1154 groups 6 seqs 1 sources 1 feature types # 1154 gene predictions ############################################################################### ## SANGER GENE TRACK 1st attempt - was reloaded (DONE - 2007-03-28 - Hiram) mkdir /cluster/data/ce4/bed/sangerGene cd /cluster/data/ce4/bed/sangerGene # this tar file package was prepared at WormBase at Jim's request # to ease our loading task here. From Anthony Rogers, ar2@sanger.ac.uk # URL in email to Jim 09 Feb 2007: # ftp://ftp.sanger.ac.uk/pub2/wormbase/WS170/wormbase_UCSC_WS170.tar.gz # Was unpacked into /cluster/data/ce4/downloads/tarFile ln -s /cluster/data/ce4/downloads/tarFile/*.gff . time nice -n +19 ldHgGene ce4 sangerGene *.gff # Read 28281 transcripts in 351460 lines in 6 files # 28281 groups 6 seqs 8 sources 2 feature types # 28281 gene predictions ## Create orfToGene table. 
They had one duplicate line in their file: sort -u /cluster/data/ce4/downloads/tarFile/gene_name.txt \ | sed -e "s/ */\t/" > orfToGene.tab ## plus, they didn't include the self with self references, which are ## needed in the gene sorter so it can find its names. hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql orfToGene.tab ## fixed this table in /cluster/data/ce4/bed/fixupOrfToGene ## by taking any name in the second column that was not mentioned in ## the first column, and making it refer to itself. hgLoadSqlTab ce4 orfToGene ~/kent/src/hg/lib/orfToGene.sql \ fixedOrfToGene.tab ln -s /cluster/data/ce4/downloads/tarFile/paragraph.txt ./paragraph.tab ln -s /cluster/data/ce4/downloads/tarFile/uniprot.txt ./uniprot.tab cat << '_EOF_' > mkSangerLinks.pl #!/usr/bin/env perl use strict; use warnings; my %geneToOrf; # key is uc(gene name), value is orf my %orfToGene; # key is uc(orf), value is gene name my $orfCount = 0; open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab"; while (my $line = <FH>) { chomp $line; my ($orf, $gene) = split('\t',$line); die "gene name zero length" if (length($gene) < 1); die "orf name zero length" if (length($orf) < 1); $geneToOrf{uc($gene)} = $orf; $orfToGene{uc($orf)} = $gene; ++$orfCount; } close (FH); printf STDERR "read $orfCount orf designations from orfToGene.tab\n"; open (LS,">sangerLinks.tab") or die "can not write to sangerLinks.tab"; my $geneCount = 0; open (FH, "sort -u paragraph.tab|") or die "can not read paragraph.tab"; while (my $line = <FH>) { chomp $line; my ($orf, $descr) = split('\t',$line); die "gene name zero length" if (length($orf) < 1); die "can not find orfToGene $orf" if (!exists($orfToGene{uc($orf)})); my $orfGene = $orfToGene{uc($orf)}; die "orfGene name zero length" if (length($orfGene) < 1); printf LS "%s\t%s\t%s\n", $orf, $orfGene, $descr; ++$geneCount; } close (FH); printf STDERR "read $geneCount gene descriptions from paragraph.tab\n"; close (LS); '_EOF_' # << happy emacs chmod +x mkSangerLinks.pl ./mkSangerLinks.pl # read 27783 orf designations from orfToGene.tab 
# read 27783 gene descriptions from paragraph.tab hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \ sangerLinks.tab # Add proteinID field to sangerGene table, used by Gene Sorter hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce4 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce4 # Use SwissProt IDs in sangerLinks table to fill proteinID # column in the sangerGene table hgsql -e 'update sangerGene as g, sangerLinks as l \ set g.proteinID = l.protName where g.name = l.orfName;' ce4 # check there are rows with the protName field filled hgsql -N -e "select proteinId from sangerGene" ce4 | sort | uniq -c | wc -l # 22126 wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/wormpep170.tar.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/wormbase/WS170/wormrna170.tar.gz" tar xvzf wormrna170.tar.gz mv README README.wormrna170 tar xvzf wormpep170.tar.gz # creates files in directory wormpep170: # -rw-rw-r-- 1 18238530 Jan 4 07:15 wp.fasta170 # -rw-rw-r-- 1 949522 Jan 4 07:15 wormpep.history170 # -rw-rw-r-- 1 1547 Jan 4 07:15 wormpep.diff170 # -rw-rw-r-- 1 31502715 Jan 9 07:19 wormpep.dna170 # -rw-rw-r-- 1 1510654 Jan 9 07:21 wormpep.table170 # -rw-rw-r-- 1 12658135 Jan 9 07:21 wormpep170 # -rw-rw-r-- 1 508392 Jan 9 07:21 wormpep.accession170 hgPepPred ce4 generic sangerPep wormpep170/wormpep170 ############################################################################### ######## MAKING GENE SORTER TABLES #### (DONE - 2007-04-02 - Hiram) ### re-done 2007-04-27 with new sangerGene table # These are instructions for building the # Gene Sorter browser. Don't start these until # there is a sangerGene table with a proteinID column containing Swiss-Prot # protein IDs, and also Kim lab expression data is required (in hgFixed). # Cluster together various alt-splicing isoforms. 
# Creates the sangerIsoforms and sangerCanonical tables ssh hgwdev hgClusterGenes ce4 sangerGene sangerIsoforms sangerCanonical # Got 20090 clusters, from 30391 genes in 7 chromosomes featureBits ce4 sangerCanonical # 56953095 bases of 100281244 (56.793%) in intersection featureBits ce2 sangerCanonical # 54156601 bases of 100291761 (53.999%) in intersection featureBits ce1 sangerCanonical # 53654286 bases of 100277784 (53.506%) in intersection # Create Self blastp table - sangerBlastTab # Reload blastp data into sangerBlastTab and drop knownBlastTab # - table given the wrong name (DONE, 2004-05-28, hartera) # Get sangerPep peptide sequences fasta file from sangerPep dir # and create a blast database out of them. mkdir -p /cluster/data/ce4/bed/blastp cd /cluster/data/ce4/bed/blastp cat << '_EOF_' > pepTabToFa.pl #!/usr/bin/env perl use strict; use warnings; open (FH,"<../sangerGene/sangerPep.tab") or die "can not open ../sangerGene/sangerPep.tab"; while (my $line = <FH>) { chomp $line; my ($name,$seq) = split('\s+',$line); printf ">%s\n%s\n", $name, $seq; } close (FH); '_EOF_' # << happy emacs chmod +x ./pepTabToFa.pl ./pepTabToFa.pl > wormPep170.faa /cluster/bluearc/blast229/formatdb -i wormPep170.faa \ -t wormPep170 -n wormPep170 # Copy database over to kluster storage mkdir /san/sanvol1/scratch/worms/ce4/blastp rsync -a ./ /san/sanvol1/scratch/worms/ce4/blastp/ cd /cluster/data/ce4/bed/blastp mkdir split faSplit sequence wormPep170.faa 1000 split/wp # Make parasol run directory ssh pk mkdir -p /cluster/data/ce4/bed/blastp/self cd /cluster/data/ce4/bed/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome.csh #!/bin/csh -ef setenv BLASTMAT /cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall -p blastp \ -d /san/sanvol1/scratch/worms/ce4/blastp/wormPep170 \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << happy emacs chmod a+x blastSome.csh # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome.csh {check in line+ $(path1)} 
{check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << Create parasol batch ls ../../split/*.fa > split.lst gensub2 split.lst single gsub jobList para create jobList para try.... etc ... # Completed: 986 of 986 jobs # CPU time in finished jobs: 17327s 288.79m 4.81h 0.20d 0.001 y # IO & Wait Time: 3136s 52.26m 0.87h 0.04d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 69s 1.15m 0.02h 0.00d # Submission to last job: 2155s 35.92m 0.60h 0.02d # Load into database. ssh hgwdev cd /cluster/data/ce4/bed/blastp/self/run/out hgLoadBlastTab ce4 sangerBlastTab *.tab # Scanning through 986 files # Loading database with 1214556 rows # CREATE EXPRESSION DISTANCE TABLES FOR # KIM LAB EXPRESSION DATA hgExpDistance ce4 hgFixed.kimWormLifeMedianRatio \ hgFixed.kimWormLifeMedianExps kimExpDistance # Have 19134 elements in hgFixed.kimWormLifeMedianRatio # Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio # CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES # sangerToRefSeq (DONE, 2007-04-02, hiram) hgMapToGene ce4 refGene sangerGene sangerToRefSeq # SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera) # set hgNearOk to 1 on hgcentraltest to make Ce4 Gene Sorter viewable hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce4";' \ hgcentraltest ########################################################################## ## Blastz SELF (DONE - 2007-03-29 - Hiram ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.ce4.2007-03-29 cd /cluster/data/ce4/bed ln -s blastz.ce4.2007-03-29 blastzSelf cd blastz.ce4.2007-03-29 cat << '_EOF_' > DEF # ce4 vs ce4 - Self alignment BLASTZ_H=2000 BLASTZ_M=200 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: elegans Ce4 SEQ2_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 
BASE=/cluster/data/ce4/bed/blastz.ce4.2007-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4Ce4 > do.log 2>&1 & # real 37m22.066s ssh hgwdev cd /cluster/data/ce4/bed/blastz.ce4.2007-03-29 nice -n +19 featureBits ce4 chainSelfLink > fb.ce4.chainSelfLink.txt 2>&1 # 31186593 bases of 100281244 (31.099%) in intersection ########################################################################## ## BlastZ caeRem2 (DONE - 2007-03-29 - 2007-04-02 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 cd /cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 cat << '_EOF_' > DEF # ce4 vs caeRem2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: briggsae caeRem2, 9,660 contigs, longest 5,925,111 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caeRem2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.sizes SEQ2_LIFT=/san/sanvol1/scratch/worms/caeRem2/caeRem2.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.caeRem2.2007-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4CaeRem2 > do.log 2>&1 & # real 44m41.608s cat fb.ce4.chainCaeRem2Link.txt # 45160539 bases of 100281244 (45.034%) in intersection ln -s blastz.caeRem2.2007-03-29 blastz.caeRem2 ## swap to caeRem2 - also in caeRem2.txt mkdir /cluster/data/caeRem2/bed/blastz.ce4.swap cd /cluster/data/caeRem2/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ 
/cluster/data/ce4/bed/blastz.caeRem2.2007-03-29/DEF \ -swap > swap.log 2>&1 & # real 3m52.831s cat fb.caeRem2.chainCe4Link.txt # 52254587 bases of 146898439 (35.572%) in intersection ########################################################################## ## BlastZ cb3 (DONE - 2007-03-29 - 2007-04-02 - Hiram) ssh kkstore02 mkdir /cluster/data/ce4/bed/blastz.cb3.2007-03-29 cd /cluster/data/ce4/bed/blastz.cb3.2007-03-29 cat << '_EOF_' > DEF # ce4 vs cb3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.rmskTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: briggsae Cb3 SEQ2_DIR=/san/sanvol1/scratch/worms/cb3/cb3.rmskTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/cb3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 BASE=/cluster/data/ce4/bed/blastz.cb3.2007-02-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4Ce3 > do.log 2>&1 & # real 16m5.178s cat fb.ce4.chainCb3Link.txt # 42491022 bases of 100281244 (42.372%) in intersection ln -s blastz.cb3.2007-02-25 blastz.cb3 ## swap to cb3 - also in cb3.txt mkdir /cluster/data/cb3/bed/blastz.ce4.swap cd /cluster/data/cb3/bed/blastz.ce4.swap time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ /cluster/data/ce4/bed/blastz.cb3.2007-02-25/DEF \ -verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 & # real 3m18.164s cat fb.cb3.chainCe4Link.txt # 43181535 bases of 108433446 (39.823%) in intersection ############################################################################ # BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce4", "blat14", "17786", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce4", "blat14", "17787", "0", "1");' \ hgcentraltest # test it with 
some sequence ############################################################################ ## Reset default position to the unc-52 gene complex (DONE - 2004-04-04 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrII:14645680-14667144" where name="ce4";' hgcentraltest ############################################################################ ## BLASTZ caePb1 (DONE - 2007-03-04 - Hiram) ssh kkstore01 mkdir /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 cd /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 cat << '_EOF_' > DEF # ce4 vs caePb1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 caePb1 SEQ2_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl \ DEF -verbose=2 -qRepeats=windowmaskerSdust \ -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/ce4CaePb1 > do.log 2>&1 & ## it is having trouble with the chaining # real 69m6.967s # recovering from Baskin power failure, continuing time nice -n +19 doBlastzChainNet.pl \ DEF -verbose=2 -qRepeats=windowmaskerSdust -continue=chainMerge \ -bigClusterHub=pk -blastzOutRoot /cluster/bluearc/ce4CaePb1 \ > chainMerge.log 2>&1 & cat fb.ce4.chainCaePb1Link.txt # 52594760 bases of 100281244 (52.447%) in intersection # swap, this is also in caePb1.txt mkdir /cluster/data/caePb1/bed/blastz.ce4.swap cd /cluster/data/caePb1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust \ /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 22m40.205s cat fb.caePb1.chainCe4Link.txt # 74758640 bases of 175247318 (42.659%) in 
intersection ############################################################################ ## BLASTZ priPac1 (DONE - 2007-03-04 - Hiram) ssh kkstore01 mkdir /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 cd /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 cat << '_EOF_' > DEF # ce4 vs priPac1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce4 SEQ1_DIR=/san/sanvol1/scratch/worms/ce4/ce4.unmasked.2bit SEQ1_LEN=/san/sanvol1/scratch/worms/ce4/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: Pristionchus pacificus priPac1 SEQ2_DIR=/san/sanvol1/scratch/worms/priPac1/priPac1.unmasked.2bit SEQ2_LEN=/san/sanvol1/scratch/worms/priPac1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LIMIT=50 BASE=/cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \ -bigClusterHub=pk -blastzOutRoot /cluster/bluearc/ce4PriPac1 \ > do.log 2>&1 & # real 110m47.620s cat fb.ce4.chainPriPac1Link.txt # 8671813 bases of 100281244 (8.647%) in intersection ln -s blastz.priPac1.um.2007-04-04 blastz.priPac1 # swap, this is also in priPac1.txt ssh kkstore02 mkdir /cluster/data/priPac1/bed/blastz.ce4.swap cd /cluster/data/priPac1/bed/blastz.ce4.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/ce4/bed/blastz.priPac1.um.2007-04-04/DEF \ -bigClusterHub=pk -swap > swap.log 2>&1 & cat fb.priPac1.chainCe4Link.txt # 9730314 bases of 145948246 (6.667%) in intersection ########################################################################## ## 5-Way multiple alignment (DONE - 2007-04-20,05-01 - Hiram) mkdir /cluster/data/ce4/bed/multiz5way cd /cluster/data/ce4/bed/multiz5way # An additional nematode tree: # http://nematol.unh.edu/tree/tree_des.php # Constructing a tree from the document: # http://www.wormbook.org/chapters/www_phylogrhabditids/phylorhab.html # purely artifical for the last one: :0.2,pristiochus_priPac1:0.8 # since it is not on the graph cat << '_EOF_' > nematodes.nh 
((((((((((briggsae_cb3:0.005,remanei_caeRem2:0.003):0.004, cb5161:0.013):0.002,elegans_ce4:0.003):0.001, japonica:0.023):0.024,ps1010:0.027):0.014,df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038, (df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8) '_EOF_' # << happy emacs # I don't know if this is anywhere near accurate, but I'm going # to call cb5161 our PB2801_caePb1 (it turns out to be true) # Given what comes out of the chainLink measurments, this is not # correct, but it is close ... /cluster/bin/phast/x86_64/tree_doctor \ --prune-all-but briggsae_cb3,remanei_caeRem2,cb5161,elegans_ce4,pristiochus_priPac1 \ --rename "cb5161 -> PB2801_caePb1" nematodes.nh > 5way.nh /cluster/bin/phast/x86_64/all_dists 5way.nh > 5way.distances.txt grep -i ce4 5way.distances.txt | sort -k3,3n # Use this output for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chainCe4Link chain linearGap # distance on Ce4 on other minScore # 1 0.0120 - remanei_caeRem2 (% 45.034) (% 35.572) 1000 loose # 2 0.0140 - briggsae_cb3 (% 42.372) (% 39.823) 1000 loose # 3 0.0180 - PB2801_caePb1 (% 52.447) (% 42.659) 1000 loose # 4 1.1880 - pristiochus_priPac1 (% 8.647) (% 6.667) 1000 loose cd /cluster/data/ce4/bed/multiz5way # bash shell syntax here ... export H=/cluster/data/ce4/bed mkdir mafLinks for G in caeRem2 cb3 caePb1 priPac1 do mkdir mafLinks/$G if [ ! 
-d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/worms/ce4/multiz5way cd /san/sanvol1/scratch/worms/ce4/multiz5way time rsync -a --copy-links --progress --stats \ /cluster/data/ce4/bed/multiz5way/mafLinks/ . # a few seconds to copy 109 Mb # these are x86_64 binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/ce4/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = ce4 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/worms/$db/multiz5way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/worms/$db/multiz5way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/ce4/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << 
happy emacs awk '{print $1}' /cluster/data/ce4/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para try # 7 jobs in batch # Completed: 7 of 7 jobs # CPU time in finished jobs: 1916s 31.93m 0.53h 0.02d 0.000 y # IO & Wait Time: 31s 0.52m 0.01h 0.00d 0.000 y # Average job time: 278s 4.64m 0.08h 0.00d # Longest finished job: 400s 6.67m 0.11h 0.00d # Submission to last job: 404s 6.73m 0.11h 0.00d ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way time nice -n +19 catDir maf > multiz5way.maf # a few seconds to produce: # -rw-rw-r-- 1 312800151 Apr 21 21:53 multiz5way.maf # before getting to the annotation, load this up so we can get # a first view of the track. This will be replaced with the annotated # mafs ssh hgwdev cd /cluster/data/ce4/bed/multiz5way mkdir /gbdb/ce4/multiz5way ln -s /cluster/data/ce4/bed/multiz5way/multiz5way.maf /gbdb/ce4/multiz5way time nice -n +19 hgLoadMaf ce4 multiz5way # a few seconds to load: # Loaded 375859 mafs in 1 files from /gbdb/ce4/multiz5way time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce4 multiz5waySummary multiz5way.maf # a few seconds to load: # Created 108194 summary blocks from 855104 components and 375859 mafs # from multiz5way.maf ############################################################################ # ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-05-03 - Hiram) ssh kolossus # need the nBed stuff on these 5 genomes, if this hasn't been # done before: cd /cluster/data for X in ce4 cb3 caePb1 caeRem2 priPac1 do twoBitInfo -nBed /cluster/data/${X}/${X}.{2bit,N.bed} done mkdir /cluster/data/ce4/bed/multiz5way/anno cd /cluster/data/ce4/bed/multiz5way/anno mkdir maf run cd run rm -f sizes nBeds ln -s /cluster/data/ce4/chrom.sizes ce4.len # This were done before during the 7-way # twoBitInfo -nBed /cluster/data/ce4/ce4.sdTrf.{2bit,N.bed} ln -s /cluster/data/ce4/ce4.N.bed ce4.bed for DB in \ `cat /cluster/data/ce4/bed/multiz5way/species.lst | sed -e "s/ ce4//"` do ln -s 
/cluster/data/${DB}/chrom.sizes ${DB}.len ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds echo ${DB}.len >> sizes echo $DB done echo '#!/bin/csh -ef' > jobs.csh echo date >> jobs.csh # do smaller jobs first so you can see some progress immediately: for F in `ls -1rS ../../maf/*.maf` do echo mafAddIRows -nBeds=nBeds $F \ /cluster/data/ce4/ce4.2bit ../maf/`basename $F` >> jobs.csh echo "echo $F" >> jobs.csh done echo date >> jobs.csh chmod +x jobs.csh time nice -n +19 ./jobs.csh > jobs.log 2>&1 & # to watch progress; tail -f jobs.log # real 0m43.777s # Load anno/maf ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/anno/maf mkdir -p /gbdb/ce4/multiz5way/anno/maf ln -s /cluster/data/ce4/bed/multiz5way/anno/maf/*.maf \ /gbdb/ce4/multiz5way/anno/maf time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/ce4/multiz5way/anno/maf ce4 multiz5way # Loaded 451902 mafs in 7 files from /gbdb/ce4/multiz5way/anno/maf # real 0m11.804s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/ce4/bed/multiz5way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary ce4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz5waySummary stdin # Created 36099 summary blocks from 855104 components # and 451902 mafs from stdin # real 0m13.892s # -rw-rw-r-- 1 1657980 May 3 16:02 multiz5waySummary.tab ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/anno/maf time nice -n +19 hgLoadSqlTab ce4 multiz5waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab # real 0m4.525 # Create per-chrom individual maf files for downloads ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way mkdir mafDownloads time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 cp -p ${M} mafDownloads/${B} nice -n +19 gzip mafDownloads/${B} echo ${B} done done ssh hgwdev cd /cluster/data/ce4/bed/multiz5way # redone 2007-12-21 to fix difficulty in mafFrags when ce4 was not the # first species listed # upstream 
mafs # There isn't any refGene table, trying sangerGene instead #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 ce4 \ sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ce4 multiz5way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/sangerGene.upstream${S}.maf.gz echo "done sangerGene.upstream${S}.maf.gz" done cd mafDownloads md5sum *.gz > md5sum.txt # deliver to downloads ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce4/multiz5way cd /usr/local/apache/htdocs/goldenPath/ce4/multiz5way ln -s /cluster/data/ce4/bed/multiz5way/mafDownloads/* . ####################################################################### # MULTIZ5WAY MAF FRAMES (DONE - 2007-05-03 - Hiram) ssh hgwdev mkdir /cluster/data/ce4/bed/multiz5way/frames cd /cluster/data/ce4/bed/multiz5way/frames mkdir genes # The following is adapted from the oryLat1 5-way sequence #------------------------------------------------------------------------ # get the genes for all genomes # using sangerGene for ce4 # using blastCe4SG for cb3, caePb1, caeRem2, priPac1 for qDB in ce4 cb3 caePb1 caeRem2 priPac1 do if [ $qDB = "cb3" -o $qDB = "caePb1" -o $qDB = "caeRem2" -o $qDB = "priPac1" ]; then geneTbl=blastCe4SG echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ > /scratch/tmp/$qDB.tmp.psl mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm -f /scratch/tmp/$qDB.tmp.psl mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz elif [ $qDB = "ce4" ]; then geneTbl=sangerGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz 
genes/$qDB.gp.gz fi done ssh kkstore02 cd /cluster/data/ce4/bed/multiz5way/frames cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce4 \ stdin stdout ce4 genes/ce4.gp.gz cb3 \ genes/cb3.gp.gz caePb1 genes/caePb1.gp.gz caeRem2 \ genes/caeRem2.gp.gz priPac1 genes/priPac1.gp.gz \ | gzip > multiz5way.mafFrames.gz ssh hgwdev cd /cluster/data/ce4/bed/multiz5way/frames nice -n +19 hgLoadMafFrames ce4 multiz5wayFrames multiz5way.mafFrames.gz ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-20) ssh kkstore02 # bash if not using bash shell already mkdir /cluster/data/ce4/blastDb cd /cluster/data/ce4 twoBitToFa ce4.unmasked.2bit temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/ce4/blastDb cd /cluster/data/ce4/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/ce4/blastDb done mkdir -p /cluster/data/ce4/bed/tblastn.hg18KG cd /cluster/data/ce4/bed/tblastn.hg18KG echo /san/sanvol1/scratch/ce4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 104 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(50000/104) = 76.392160 mkdir -p /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa split -l 76 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/ce4/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/ce4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/ce4/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/ce4/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. 
# Completed: 50336 of 50336 jobs # CPU time in finished jobs: 955647s 15927.45m 265.46h 11.06d 0.030 y # IO & Wait Time: 199548s 3325.80m 55.43h 2.31d 0.006 y # Average job time: 23s 0.38m 0.01h 0.00d # Longest finished job: 59s 0.98m 0.02h 0.00d # Submission to last job: 3074s 51.23m 0.85h 0.04d ssh kkstore02 cd /cluster/data/ce4/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/ce4/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 484 of 484 jobs # CPU time in finished jobs: 485s 8.08m 0.13h 0.01d 0.000 y # IO & Wait Time: 1739s 28.99m 0.48h 0.02d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 16s 0.27m 0.00h 0.00d # Submission to last job: 102s 1.70m 0.03h 0.00d ssh kkstore02 cd /cluster/data/ce4/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/ce4/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/ce4/bed/tblastn.hg18KG hgLoadPsl ce4 blastHg18KG.psl # check coverage featureBits ce4 blastHg18KG # 3424422 bases of 100281244 (3.415%) in intersection featureBits ce4 sangerGene:cds blastHg18KG -enrichment # sangerGene:cds 27.876%, blastHg18KG 3.415%, both 3.231%, cover 11.59%, enrich 3.39x ssh kkstore02 rm -rf /cluster/data/ce4/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/ce4/bed/tblastn.hg18KG/blastOut #end tblastn ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS - 5-WAY # (DONE - 2007-04-23 - Hiram) # Estimate phastCons parameters ssh kkstore02 # We need the fasta sequence for this business cd /cluster/data/ce4 for C in `awk '{print $1}' chrom.sizes | sed -e s/chr//` do echo "twoBitToFa -seq=chr${C} ce4.2bit ${C}/chr${C}.fa" twoBitToFa -seq=chr${C} ce4.2bit ${C}/chr${C}.fa done mkdir /cluster/data/ce4/bed/multiz5way/cons cd /cluster/data/ce4/bed/multiz5way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # 10 minutes # the tree in use cat ../tree-commas.nh # ((((cb3,caeRem2),caePb1),ce4),priPac1) # this s1.*.ss should match only one item time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "((((cb3,caeRem2),caePb1),ce4),priPac1)" \ --out-root starting-tree # real 0m4.926s rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.377 # This 0.377 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this 
time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons/ss cd /san/sanvol1/scratch/worms/ce4/cons/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s /cluster/data/ce4/bed/multiz5way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz5way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done # real 1m11.734s # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/worms/ce4/cons/ss ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Only 13 lines in stdin when the --windows was 10000000 # Only 29 lines in stdin when the --windows was 4000000 # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons/treeRun4 cd /san/sanvol1/scratch/worms/ce4/cons/treeRun4 mkdir tree log # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun4 --expected-lengths 15 --target-coverage 0.60 # and using ave.noncons.mod from treeRun3 # for treeRun3 --expected-lengths 15 --target-coverage 0.55 # and using ave.noncons.mod from treeRun2 in place of # /cluster/data/ce4/bed/multiz5way/cons/starting-tree.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.65 # for treeRun1 --expected-lengths 25 --target-coverage 0.45 # mistakenly: --gc 0.277 cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /san/sanvol1/scratch/worms/ce4/cons/treeRun3/ave.noncons.mod \ --gc 0.377 --nrates 1,1 --no-post-probs --ignore-missing \ 
--expected-lengths 15 --target-coverage 0.60 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 28 of 28 jobs # CPU time in finished jobs: 2264s 37.73m 0.63h 0.03d 0.000 y # IO & Wait Time: 95s 1.59m 0.03h 0.00d 0.000 y # Average job time: 84s 1.40m 0.02h 0.00d # Longest finished job: 156s 2.60m 0.04h 0.00d # Submission to last job: 157s 2.62m 0.04h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons/treeRun4 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/ce4/bed/multiz5way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .60 15 \ ave.cons.mod ave.noncons.mod # for treeRun4 --expected-lengths 15 --target-coverage 0.60 # (Solving for new omega: 15.000000 141.732548 61.619960 42.817579 40.070343 # 39.988928 ) # Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667, # nu=0.100000 # Relative entropy: H=0.977026 bits/site # Expected min. length: L_min=6.777394 sites # Expected max. 
length: L_max=6.565768 sites # Phylogenetic information threshold: PIT=L_min*H=6.621688 bits Recommended expected length: omega=39.988928 sites (for L_min*H=9.780000) # for treeRun3 --expected-lengths 15 --target-coverage 0.55 # (Solving for new omega: 15.000000 77.051147 42.798715 35.637374 35.053109 # 35.048618 ) # Transition parameters: gamma=0.550000, omega=15.000000, # mu=0.066667, nu=0.081481 # Relative entropy: H=0.923015 bits/site # Expected min. length: L_min=7.718135 sites # Expected max. length: L_max=7.015398 sites # Phylogenetic information threshold: PIT=L_min*H=7.123957 bits # Recommended expected length: omega=35.048618 sites (for L_min*H=9.780000) # for treeRun2 --expected-lengths 15 --target-coverage 0.65 # ( Solving for new omega: 15.000000 466.852221 137.016323 64.163096 # 47.673547 45.689998 45.653095 ) # Transition parameters: gamma=0.650000, omega=15.000000, # mu=0.066667, nu=0.123810 # Relative entropy: H=1.050500 bits/site # Expected min. length: L_min=5.807815 sites # Expected max. length: L_max=6.063484 sites # Phylogenetic information threshold: PIT=L_min*H=6.101107 bits # Recommended expected length: omega=45.653095 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 25 --target-coverage 0.45 # mistakenly: --gc 0.277 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .45 25 \ ave.cons.mod ave.noncons.mod # ( Solving for new omega: 25.000000 26.470691 26.427531 ) # Transition parameters: gamma=0.450000, omega=25.000000, # mu=0.040000, nu=0.032727 # Relative entropy: H=0.700176 bits/site # Expected min. length: L_min=13.739278 sites # Expected max. length: L_max=11.506136 sites # Phylogenetic information threshold: PIT=L_min*H=9.619918 bits # Recommended expected length: omega=26.427531 sites (for L_min*H=9.780000) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.700176 bits/site # Expected min. length: L_min=13.739278 sites # Expected max. 
length: L_max=11.506136 sites # Phylogenetic information threshold: PIT=L_min*H=9.619918 bits ssh pk # After some experimentation, going to go with consRun3 at this time # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons/consRun3 cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # using the ave.cons.mod and ave.noncons.mod from treeRun3 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.311500 0.188500 0.188500 0.311500 RATE_MAT: -0.871104 0.170916 0.455562 0.244626 0.282442 -1.213584 0.179324 0.751817 0.752825 0.179324 -1.214011 0.281862 0.244626 0.454952 0.170565 -0.870143 TREE: ((((cb3:0.089669,caeRem2:0.075007):0.023378,caePb1:0.092376):0.038979,ce4:0.082842):0.118524,priPac1:0.118524); ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.311500 0.188500 0.188500 0.311500 RATE_MAT: -0.871104 0.170916 0.455562 0.244626 0.282442 -1.213584 0.179324 0.751817 0.752825 0.179324 -1.214011 0.281862 0.244626 0.454952 0.170565 -0.870143 TREE: ((((cb3:0.333096,caeRem2:0.276036):0.086677,caePb1:0.341433):0.145785,ce4:0.306896):0.447539,priPac1:0.447539); mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 15 --target-coverage 0.55 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # root1 == chrom name, file1 == ss file name without .ss suffix # 
Create gsub file cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive # CPU time in finished jobs: 383s 6.38m 0.11h 0.00d 0.000 y # IO & Wait Time: 89s 1.49m 0.02h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 233s 3.88m 0.06h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements5way.bed # way too many negative # scores with --expected-length 15 --target-coverage 0.60 # OK with--expected-length 15 --target-coverage 0.55 # way too many negative # scores with --expected-length 15 --target-coverage 0.65 cp -p phastConsElements5way.bed /cluster/data/ce4/bed/multiz5way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 
masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # The 100281244 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements5way.bed # after fixing the model arguments: # % 32.37 = 100.0*32459183/100281244 # --expected-length 15 --target-coverage 0.55 # % 2.36 = 100.0*2370557/100281244 # --expected-length 25 --target-coverage 0.45 # % 1.36 = 100.0*1366061/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The lodToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements5way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements5way.bed 2.364%, both 0.668%, # cover 2.64%, enrich 1.12x # --expected-length 25 --target-coverage 0.45 # sangerGene:cds 25.335%, phastConsElements5way.bed 1.362%, both 0.410%, # cover 1.62%, enrich 1.19x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz5way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun3/phastConsElements5way.bed . 
time nice -n +19 hgLoadBed ce4 phastConsElements5way \ phastConsElements5way.bed # Loaded 548755 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements5way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements5way 2.364%, both 0.668%, # cover 2.64%, enrich 1.12x # --expected-length 25 --target-coverage 0.45 # sangerGene:cds 25.335%, phastConsElements5way 1.362%, both 0.410%, # cover 1.62%, enrich 1.19x # Create merged posterior probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons5way.wig phastCons5way.wib # Converted stdin, upper limit 0.98, lower limit 0.00 # real 0m32.137s # -rw-rw-r-- 1 61949779 Apr 30 13:29 phastCons5way.wib # -rw-rw-r-- 1 11279833 Apr 30 13:29 phastCons5way.wig nice -n +19 cp -p phastCons5way.wi? 
/cluster/data/ce4/bed/multiz5way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons/consRun3 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons5wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons5wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz5way/phastCons5wayScores cd /cluster/data/ce4/bed/multiz5way/phastCons5wayScores cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun3/phastCons5wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz5way/phastCons5wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz5way ln -s `pwd`/phastCons5way.wib /gbdb/ce4/wib/phastCons5way.wib nice -n +19 hgLoadWiggle ce4 phastCons5way phastCons5way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz5way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons5way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons5way track" set xlabel " phastCons5way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ########################################################################## #### Blat sangerGene proteins to determine exons # (DONE - 2007-04-27 - hiram) ssh hgwdev cd /cluster/data/ce4/bed mkdir blat.ce4SG.2007-04-27 ln -s blat.ce4SG.2007-04-27 blat.ce4SG cd blat.ce4SG pepPredToFa ce4 sangerPep sangerPep.fa # The kluster run # created san nib directory from goldenPath chromosomes fa files ssh pk cd /cluster/data/ce4/bed/blat.ce4SG cat << '_EOF_' > blatSome #!/bin/csh -fe blat -t=dnax -q=prot -out=pslx /san/sanvol1/scratch/worms/ce4/nib/$1.nib sgfa/$2.fa $3 '_EOF_' # << happy emacs chmod +x blatSome ls -1S /san/sanvol1/scratch/worms/ce4/nib > ce4.lst mkdir sgfa cd sgfa faSplit sequence ../sangerPep.fa 3000 sg ls -1S *.fa > ../sg.lst cd .. 
cat << '_EOF_' > template #LOOP blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 ce4.lst sg.lst template jobList mkdir psl cd psl sed -e "s/.nib//" ../ce4.lst | xargs mkdir cd .. para create jobList para try ... check ... push ... etc # Completed: 20335 of 20335 jobs # CPU time in finished jobs: 64312s 1071.87m 17.86h 0.74d 0.002 y # IO & Wait Time: 53465s 891.08m 14.85h 0.62d 0.002 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 278s 4.63m 0.08h 0.00d # Submission to last job: 788s 13.13m 0.22h 0.01d ssh kkstore02 cd /cluster/data/ce4/bed/blat.ce4SG.2007-04-27 pslSort dirs raw.psl /tmp psl/* # -rw-rw-r-- 1 70076854 Apr 27 16:19 raw.psl pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null # Processed 156107 alignments # -rw-rw-r-- 1 25783288 Apr 27 16:20 cooked.psl pslUniq cooked.psl ce4SG.psl # -rw-rw-r-- 1 41321225 Mar 24 11:14 ce4SG.psl cut -f 10 ce4SG.psl > sgName.lst faSomeRecords sangerPep.fa sgName.lst ce4SG.fa faSize ce4SG.fa # 10183094 bases (8184565 N's 1998529 real 1998529 upper 0 lower) in # 23192 sequences in 1 files faSize sangerPep.fa # 10186194 bases (8187162 N's 1999032 real 1999032 upper 0 lower) in # 23224 sequences in 1 files # You may need to build this pslxToFa - it is not in the standard build pslxToFa ce4SG.psl ce4SG_ex.fa -liftTarget=genome.lft \ -liftQuery=protein.lft # -rw-rw-r-- 1 5128274 Apr 27 16:29 protein.lft # -rw-rw-r-- 1 6545612 Apr 27 16:29 genome.lft # -rw-rw-r-- 1 12409406 Apr 27 16:29 ce4SG_ex.fa wc -l *.psl *.lft *.fa sgName.lst # 23192 ce4SG.psl # 25361 cooked.psl # 156112 raw.psl # 146926 genome.lft # 146926 protein.lft # 238132 ce4SG.fa # 293852 ce4SG_ex.fa # 238243 sangerPep.fa # 23192 sgName.lst # 1291936 total # back on hgwdev ssh hgwdev cd /cluster/data/ce4/bed/blat.ce4SG ## This doesn't work here, it complains about a missing kgXref table # may need to make this command in src/hg/oneShot/kgName kgName ce4 ce4SG.psl 
blastSGRef01
    # After about an hour, it exited with this message:
    # sqlFreeConnection called on cache (ce4) that doesn't contain
    # the given connection
    # This may be a lurking error in this program, because the
    # resulting file seems to have the correct number of lines:

    # Need a reference table loaded, perl script makes up relationships
    cat << '_EOF_' >> mkRef.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

sub usage() {
    print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n";
    print STDERR "reads ../sangerGene/genePred.tab and ",
        "../sangerGene/orfToGene.tab\n";
    print STDERR "writes references to the given input file, ",
        "e.g. blastSGRef01.tab\n";
}

if (1 != $argc) { usage; exit 255; }

my $outFile = shift;

my %orfToGene;  # key is all lowercase orf name, value is gene name
my $inFile="../sangerGene/orfToGene.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $gene) = split('\s+',$line);
    $orfToGene{lc($orf)} = $gene
}
close (FH);

my %links;  # key is all lowercase orf name, value is SwissProt name
$inFile="../sangerGene/sangerLinks.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $SP, $comments) = split('\s+',$line);
    $links{lc($orf)} = $SP
}
close (FH);

my %ce4Position;  # key is all lowercase orf name, value is ce4 position
$inFile="../sangerGene/genePred.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($bin, $orf, $chr, $strand, $start, $end, $other) = split('\s+',$line);
    $ce4Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end);
}
close (FH);

open (OF,">$outFile") or die "can not write to $outFile";
$inFile="sgName.lst";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $orf=<FH>) {
    chomp $orf;
    if (!exists($orfToGene{lc($orf)})) { $orfToGene{lc($orf)} = ""; }
    if (!exists($links{lc($orf)})) { $links{lc($orf)} = ""; }
    if (!exists($ce4Position{lc($orf)})) { $ce4Position{lc($orf)} = ""; }
    printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)},
        $ce4Position{lc($orf)}, $links{lc($orf)}, "";
}
close (FH);
close (OF);
'_EOF_'
    # << happy emacs
    chmod +x ./mkRef.pl
    ./mkRef.pl blastSGRef01.tab
    hgLoadSqlTab ce4 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \
        blastSGRef01.tab
    wc -l sgName.lst blastSGRef01.tab ce4SG.psl
    #   23192 sgName.lst
    #   23192 blastSGRef01.tab
    #   23192 ce4SG.psl
    # And load the protein sequences
    hgPepPred ce4 generic blastSGPep01 ce4SG.fa
    # end blat proteins

##########################################################################
## 4-Way multiple alignment (DONE - 2007-05-01 - Hiram)
    mkdir /cluster/data/ce4/bed/multiz4way
    cd /cluster/data/ce4/bed/multiz4way
    # Taking the tuning tree ave.noncons.mod from the 5-Way experiment
    tail -1 /cluster/data/ce4/bed/multiz5way/cons/ave.noncons.mod
# TREE: ((((cb3:0.333096,caeRem2:0.276036):0.086677,caePb1:0.341433):0.145785,
# ce4:0.306896):0.447539,priPac1:0.447539);
    cat << '_EOF_' > nematodes.nh
((((briggsae_cb3:0.333096,remanei_caeRem2:0.276036):0.086677,
brenneri_caePb1:0.341433):0.145785,elegans_ce4:0.306896):0.447539,
priPac1:0.447539)
'_EOF_'
    # << happy emacs
    /cluster/bin/phast/x86_64/tree_doctor \
        --prune-all-but briggsae_cb3,remanei_caeRem2,brenneri_caePb1,elegans_ce4 \
        nematodes.nh > 4way.nh
    /cluster/bin/phast/x86_64/all_dists 4way.nh > 4way.distances.txt
    grep -i ce4 4way.distances.txt | sort -k3,3n
    # Use this output for reference, and use the calculated
    # distances in the table below to order the organisms and check
    # the button order on the browser.
    # And if you can fill in the table below entirely, you have
    # succeeded in finishing all the alignments required.
# # featureBits chainLink measures # chainCe4Link chain linearGap # distance on Ce4 on other minScore # 1 0.7941 - brenneri_caePb1 (% 52.447) (% 42.659) 1000 loose # 2 0.8154 - remanei_caeRem2 (% 45.034) (% 35.572) 1000 loose # 3 0.8725 - briggsae_cb3 (% 42.372) (% 39.823) 1000 loose cd /cluster/data/ce4/bed/multiz4way # bash shell syntax here ... export H=/cluster/data/ce4/bed mkdir mafLinks for G in caeRem2 cb3 caePb1 do mkdir mafLinks/$G if [ ! -d ${H}/blastz.${G}/mafNet ]; then echo "missing directory blastz.${G}/mafNet" exit 255 fi ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # Copy MAFs to some appropriate NFS server for kluster run ssh kkstore02 mkdir /san/sanvol1/scratch/worms/ce4/multiz4way cd /san/sanvol1/scratch/worms/ce4/multiz4way time rsync -a --copy-links --progress --stats \ /cluster/data/ce4/bed/multiz4way/mafLinks/ . # a few seconds to copy 99 Mb # these are x86_64 binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # the autoMultiz cluster run ssh pk cd /cluster/data/ce4/bed/multiz4way/ # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf run cd run # NOTE: you need to set the db properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = ce4 set c = $1 set maf = $2 set binDir = /san/sanvol1/scratch/worms/$db/multiz4way/penn set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/worms/$db/multiz4way rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e 
$in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP autoMultiz $(root1) {check out line+ /cluster/data/ce4/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /cluster/data/ce4/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try # 7 jobs in batch # Completed: 7 of 7 jobs # CPU time in finished jobs: 1708s 28.47m 0.47h 0.02d 0.000 y # IO & Wait Time: 34s 0.57m 0.01h 0.00d 0.000 y # Average job time: 249s 4.15m 0.07h 0.00d # Longest finished job: 361s 6.02m 0.10h 0.00d # Submission to last job: 464s 7.73m 0.13h 0.01d ssh kkstore02 cd /cluster/data/ce4/bed/multiz4way nice -n +19 catDir maf > multiz4way.maf # -rw-rw-r-- 1 285201701 May 1 13:54 multiz4way.maf # before getting to the annotation, load this up so we can get # a first view of the track. 
This will be replaced with the annotated # mafs ssh hgwdev cd /cluster/data/ce4/bed/multiz4way mkdir /gbdb/ce4/multiz4way ln -s /cluster/data/ce4/bed/multiz4way/multiz4way.maf /gbdb/ce4/multiz4way nice -n +19 hgLoadMaf ce4 multiz4way # Loaded 314717 mafs in 1 files from /gbdb/ce4/multiz4way nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce4 multiz4waySummary multiz4way.maf # Created 88583 summary blocks from 631555 components and 314717 mafs # from multiz4way.maf ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS 4-WAY # (DONE - 2007-05-01 - Hiram) # Estimate phastCons parameters ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz4way/cons cd /cluster/data/ce4/bed/multiz4way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # the tree in use cat ../tree-commas.nh # (((cb3,caeRem2),caePb1),ce4) # this s1.*.ss should match only one item nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(((cb3,caeRem2),caePb1),ce4)" \ --out-root starting-tree rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.375 # This 0.375 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons.4way/ss cd /san/sanvol1/scratch/worms/ce4/cons.4way/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s 
/cluster/data/ce4/bed/multiz4way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz4way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 1m1.269s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/worms/ce4/cons.4way/ss ls -1l chr*/chr*.ss | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Only 28 lines in stdin # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 cd /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun1 --expected-lengths 15 --target-coverage 0.60 # and using # /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.55 # and using # /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/ce4/bed/multiz4way/cons.4way/starting-tree.mod \ --gc 0.375 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 15 --target-coverage 0.55 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # Make cluster job and run it mkdir tree log gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 28 of 28 jobs # CPU time in finished jobs: 3337s 55.62m 0.93h 0.04d 0.000 y # IO & Wait Time: 77s 1.28m 0.02h 0.00d 
0.000 y # Average job time: 122s 2.03m 0.03h 0.00d # Longest finished job: 193s 3.22m 0.05h 0.00d # Submission to last job: 193s 3.22m 0.05h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons.4way/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/ce4/bed/multiz4way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .55 15 \ ave.cons.mod ave.noncons.mod # for treeRun2 --expected-lengths 15 --target-coverage 0.55 # (Solving for new omega: 15.000000 78.545736 43.417367 36.104783 # 35.511720 35.507213 ) # Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667, # nu=0.081481 # Relative entropy: H=0.656639 bits/site # Expected min. length: L_min=10.742794 sites # Expected max. length: L_max=9.457029 sites # Phylogenetic information threshold: PIT=L_min*H=7.054137 bits # Recommended expected length: omega=35.507213 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 15 --target-coverage 0.60 # (Solving for new omega: 15.000000 143.489247 62.468129 43.654450 40.993458 # 40.920278 ) # Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667, # nu=0.100000 # Relative entropy: H=0.697543 bits/site # Expected min. length: L_min=9.302913 sites # Expected max. 
length: L_max=8.908666 sites # Phylogenetic information threshold: PIT=L_min*H=6.489180 bits # Recommended expected length: omega=40.920278 sites (for L_min*H=9.780000) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons/consRun1 cd /san/sanvol1/scratch/worms/ce4/cons/consRun1 # using the ave.cons.mod from treeRun2 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.312500 0.187500 0.187500 0.312500 RATE_MAT: -0.870936 0.168820 0.457329 0.244786 0.281367 -1.216280 0.173422 0.761491 0.762216 0.173422 -1.215832 0.280194 0.244786 0.456895 0.168116 -0.869797 TREE: (((cb3:0.088481,caeRem2:0.073770):0.023475,caePb1:0.091021):0.059375,ce4:0.059375); mkdir ppRaw bed # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 15 --target-coverage 0.55 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. 
# These jobs are very fast and very I/O intensive # Completed: 29 of 29 jobs # CPU time in finished jobs: 375s 6.25m 0.10h 0.00d 0.000 y # IO & Wait Time: 88s 1.46m 0.02h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 21s 0.35m 0.01h 0.00d # Submission to last job: 32s 0.53m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements4way.bed # way too many negative # scores with --expected-length 15 --target-coverage 0.60 # OK with--expected-length 15 --target-coverage 0.55 cp -p phastConsElements4way.bed /cluster/data/ce4/bed/multiz4way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons/consRun2 # The 100281244 comes from the non-n genome as counted above. 
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements4way.bed # after fixing the two-model argument in the phastCons run # --expected-length 15 --target-coverage 0.55 # % 31.95 = 100.0*32036617/100281244 # --expected-length 15 --target-coverage 0.55 # % 2.46 = 100.0*2466194/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements4way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way.bed 31.947%, both # 15.830%, cover 62.60%, enrich 1.96x # the broken single model phastCons run: # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way.bed 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz4way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons/consRun2/phastConsElements4way.bed . 
time nice -n +19 hgLoadBed ce4 phastConsElements4way \ phastConsElements4way.bed # Loaded 535009 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements4way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way 31.947%, both 15.830%, # cover 62.60%, enrich 1.96x # the broken single model phastCons run: # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements4way 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Create merged posterier probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons4way.wig phastCons4way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m30.016s # -rw-rw-r-- 1 61537017 May 3 15:41 phastCons4way.wib # -rw-rw-r-- 1 11279406 May 3 15:41 phastCons4way.wig nice -n +19 cp -p phastCons4way.wi? 
/cluster/data/ce4/bed/multiz4way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons4wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons4wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz4way/phastCons4wayScores cd /cluster/data/ce4/bed/multiz4way/phastCons4wayScores cp -p \ /san/sanvol1/scratch/worms/ce4/cons.4way/consRun2/phastCons4wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz4way/phastCons4wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz4way ln -s `pwd`/phastCons4way.wib /gbdb/ce4/wib/phastCons4way.wib nice -n +19 hgLoadWiggle ce4 phastCons4way phastCons4way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz4way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons4way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons4way track" set xlabel " phastCons4way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # CREATE CONSERVATION WIGGLE WITH PHASTCONS just with brenneri # (DONE - 2007-05-03 - Hiram) # Estimate phastCons parameters ssh kkstore02 mkdir /cluster/data/ce4/bed/multiz2way cd /cluster/data/ce4/bed/multiz2way /cluster/bin/phast/x86_64/tree_doctor \ --prune-all-but cb5161,elegans_ce4 \ --rename "cb5161 -> brenneri_caePb1" \ ../multiz5way/nematodes.nh > 2way.nh # the tree in use sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 2way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir maf cp -p ../blastz.caePb1/mafNet/*.maf.gz maf # second attempt with blastz.cb3 cd maf gunzip *.gz mkdir /cluster/data/ce4/bed/multiz2way/cons cd /cluster/data/ce4/bed/multiz2way/cons # Create a starting-tree.mod based on chrV (the largest one) # Set -windows large enough so it only 
makes one piece faSize ../../../V/chrV.fa # 20919398 bases (0 N's 20919398 real 18027882 upper # 2891516 lower) in 1 sequences in 1 files nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../V/chrV.fa --in-format MAF \ --windows 30000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 cat ../tree-commas.nh # first time with (caePb1,ce4) # second time with (ce4,cb3) # this s1.*.ss should match only one item nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(ce4,cb3)" \ --out-root starting-tree # first time with --tree "(caePb1,ce4)" \ rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.365 # This 0.365 is used in the --gc argument below # Create SS files on san filesystem (takes ~ 2h 20m) # Increasing their size this time from 1,000,000 to 4,000,000 to # slow down the phastCons pk jobs ssh kkstore02 mkdir -p /san/sanvol1/scratch/worms/ce4/cons2way/ss cd /san/sanvol1/scratch/worms/ce4/cons2way/ss time for C in `awk '{print $1}' /cluster/data/ce4/chrom.sizes` do if [ -s /cluster/data/ce4/bed/multiz2way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce4/bed/multiz2way/maf/${C}.maf \ --refseq /cluster/data/ce4/${chrN}/${C}.fa \ --in-format MAF --windows 4000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done & # real 0m38.178s # Create a random list of 50 1 mb regions cd /san/sanvol1/scratch/worms/ce4/cons2way/ss ls -1l chr*/chr*.ss | awk '$5 > 4000000 {print $9;}' \ | randomLines stdin 50 ../randomSs.list # Only 28 lines in stdin # Set up parasol directory to calculate trees on these regions ssh pk mkdir /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 cd /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 # Create template file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # << happy emacs # 
Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # expected-lengths and target-coverage taken from ce2 history # for treeRun1 --expected-lengths 25 --target-coverage 0.45 \ # /cluster/data/ce4/bed/multiz2way/cons/starting-tree.mod cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/ce4/bed/multiz2way/cons/starting-tree.mod \ --gc 0.365 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-lengths 25 --target-coverage 0.45 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # << happy emacs chmod +x makeTree.csh # Make cluster job and run it mkdir tree log gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 27 of 27 jobs # CPU time in finished jobs: 1887s 31.45m 0.52h 0.02d 0.000 y # IO & Wait Time: 319s 5.31m 0.09h 0.00d 0.000 y # Average job time: 82s 1.36m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 333s 5.55m 0.09h 0.00d # Submission to last job: 3474s 57.90m 0.96h 0.04d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons2way/treeRun1 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ../ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. 
cp -p ave.*.mod /cluster/data/ce4/bed/multiz2way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .45 25 \ ave.cons.mod ave.noncons.mod # for treeRun1 --expected-lengths 25 --target-coverage 0.45 cb3 # (Solving for new omega: 25.000000 22.541982 22.386815 22.386143 ) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.186326 bits/site # Expected min. length: L_min=53.981235 sites # Expected max. length: L_max=39.938782 sites # Phylogenetic information threshold: PIT=L_min*H=10.058085 bits # Recommended expected length: omega=22.386143 sites (for L_min*H=9.780000) # for treeRun1 --expected-lengths 25 --target-coverage 0.45 caePb1 # (Solving for new omega: 25.000000 22.727583 22.596785 22.596318 ) # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, nu=0.032727 # Relative entropy: H=0.192979 bits/site # Expected min. length: L_min=52.008938 sites # Expected max. 
length: L_max=38.464113 sites # Phylogenetic information threshold: PIT=L_min*H=10.036610 bits # Recommended expected length: omega=22.596318 sites (for L_min*H=9.780000) ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # using the ave.cons.mod,ave.noncons.mod from treeRun2 above: ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.317500 0.182500 0.182500 0.317500 RATE_MAT: -0.861737 0.169946 0.448797 0.242994 0.295659 -1.241318 0.166727 0.778932 0.780784 0.166727 -1.242392 0.294881 0.242994 0.447733 0.169498 -0.860225 TREE: (ce4:0.088042,cb3:0.088042); ALPHABET: A C G T ORDER: 0 SUBST_MOD: REV BACKGROUND: 0.317500 0.182500 0.182500 0.317500 RATE_MAT: -0.861737 0.169946 0.448797 0.242994 0.295659 -1.241318 0.166727 0.778932 0.780784 0.166727 -1.242392 0.294881 0.242994 0.447733 0.169498 -0.860225 TREE: (ce4:0.281698,cb3:0.281698); mkdir ppRaw bed # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << '_EOF_' #LOOP doPhast $(root1) $(file1) #ENDLOOP '_EOF_' # << happy emacs # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 25 --target-coverage 0.45 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.{cons,noncons}.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # << happy emacs chmod a+x doPhast # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 
's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive # Completed: 29 of 29 jobs # CPU time in finished jobs: 312s 5.20m 0.09h 0.00d 0.000 y # IO & Wait Time: 78s 1.30m 0.02h 0.00d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 38s 0.63m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements2way.bed # OK with--expected-length 25 --target-coverage 0.45 cp -p phastConsElements2way.bed /cluster/data/ce4/bed/multiz2way # Figure out how much is actually covered by the bed file as so: # Get the non-n genome size from faSize on all chroms: ssh kkstore02 cd /cluster/data/ce4 faSize */chr*.fa # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # The 100281244 comes from the non-n genome as counted above. 
awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281244\n",100.0*sum/100281244,sum}' \ phastConsElements2way.bed # --expected-length 15 --target-coverage 0.55 # % 0.54 = 100.0*537580/100281244 # Aiming for %70 coverage in # the following featureBits measurement on CDS: # Beware of negative scores when too high. The logToBedScore # will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce4 \ -enrichment sangerGene:cds phastConsElements2way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements2way.bed 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce4/bed/multiz2way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce4/cons2way/consRun2/phastConsElements2way.bed . time nice -n +19 hgLoadBed ce4 phastConsElements2way \ phastConsElements2way.bed # Loaded 51538 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements2way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.286%, phastConsElements2way 2.459%, both 0.872%, # cover 3.45%, enrich 1.40x # Create merged posterier probability file and wiggle track data files ssh kkstore02 cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun1 # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode # This was verified above to be correct time nice -n +19 find ./ppRaw -type f \ | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode -noOverlap stdin \ phastCons2way.wig phastCons2way.wib # Converted stdin, upper limit 0.93, lower limit 0.00 # real 0m26.060s # -rw-rw-r-- 1 49349613 May 3 11:02 phastCons2way.wib # -rw-rw-r-- 1 11327787 May 3 11:02 phastCons2way.wig nice -n +19 cp -p phastCons2way.wi? 
/cluster/data/ce4/bed/multiz2way/ # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce4/cons2way/consRun2 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons2wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons2wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # copy them for downloads ssh kkstore02 # this directory is actually a symlink from store9 to store8 to # avoid the data full problem on store9 mkdir /cluster/data/ce4/bed/multiz2way/phastCons2wayScores cd /cluster/data/ce4/bed/multiz2way/phastCons2wayScores cp -p /san/sanvol1/scratch/worms/ce4/cons2way/consRun2/phastCons2wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce4 ln -s /cluster/data/ce4/bed/multiz2way/phastCons2wayScores . # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/ce4/bed/multiz2way ln -s `pwd`/phastCons2way.wib /gbdb/ce4/wib/phastCons2way.wib nice -n +19 hgLoadWiggle ce4 phastCons2way phastCons2way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce4/bed/multiz2way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce4 phastCons2way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce4 Histogram phastCons2way track" set xlabel " phastCons2way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # ADJUST KIMEXPDISTANCE NAMES TO MATCH SANGERCANONICAL (DONE 7/12/07 angie) # kimExpDistance has all [A-Z]+[0-9]+\.[0-9]+ names, but now sangerCanonical # has some of the new [A-Z]+[0-9]+\.[0-9]+\.[0-9]+ names, so when hgNear # looks for a canonical name in kimExpDistance, it often can't find it. # Canonical also has [A-Z]+[0-9]+\.[0-9]+[a-z] # and [A-Z]+[0-9]+\.[0-9]+[a-z]\.[0-9]+ # kimExpDistance has 4 [A-Z]+[0-9]+\.[0-9]+[A-Z] # Update the names in kimExpDistance to match sangerCanonical. 
ssh hgwdev
    # Look at just the names and how they differ between the two sources:
    hgsql ce4 -N -e 'select transcript from sangerCanonical' \
    | sort > /tmp/canonicalNames
    hgsql ce4 -N -e 'select * from kimExpDistance' \
      > /tmp/kimExpDistance
    awk -F"\t" '{print $1;}' /tmp/kimExpDistance \
    | uniq | sort -u > /tmp/kimExpNames
    sdiff /tmp/canonicalNames /tmp/kimExpNames | less
    comm -23 /tmp/canonicalNames /tmp/kimExpNames | wc -l
#7529
    comm -13 /tmp/canonicalNames /tmp/kimExpNames | wc -l
#6698
    # Pretty significant non-overlap, but I think much of it can be
    # resolved by slight tweaks to kimExpDistance names so that they
    # match the latest, more-specifically-versioned canonical transcripts.
    # Also, kimExpDistance has many names with no match in canonical,
    # which means many thousands of unnecessary rows bloating the table.
    cat > fixExpDistance.pl <<'_EOF_'
#!/bin/env perl
use warnings;
use strict;
my @kimExpNames;
my @kimSpecial;
open(F, "/tmp/kimExpNames") || die;
while (<F>) {
  chomp;
  push @kimExpNames, $_;
  push @kimSpecial, $_ if ($_ =~ /\dA$/);
}
close(F);
my %kim2canon;
open(F, "/tmp/canonicalNames") || die;
while (<F>) {
  chomp;
  my $kim = $_;
  my @special = grep m/^$kim$/i, @kimSpecial;
  if (scalar(@special)) {
    $kim2canon{$special[0]} = $kim;
  } else {
    my $simple = $kim;
    $simple =~ s/^(\w+\.\d+).*$/$1/;
    $kim2canon{$simple} = $kim;
  }
}
close(F);
while (<>) {
  chomp;
  my ($query, $target, $distance) = split('\t');
  my $canonicalQ = $kim2canon{$query};
  my $canonicalT = $kim2canon{$target};
  if (defined $canonicalQ && defined $canonicalT) {
    print "$canonicalQ\t$canonicalT\t$distance\n";
  }
}
'_EOF_'
    # << emacs
    chmod a+x fixExpDistance.pl
    ./fixExpDistance.pl /tmp/kimExpDistance > kimExpDistanceFixedNames.tab
    # Look the file over, grep for some IDs in /tmp/canonicalNames.
    # Load into the db as kimExpDistanceTest.  It will take a while
    # because it's still 12M rows (down from 19M).
time nice hgLoadSqlTab ce4 kimExpDistanceTest \ ~/kent/src/hg/lib/distance.sql kimExpDistanceFixedNames.tab #0.000u 0.000s 2:28.94 0.0% 0+0k 0+0io 0pf+0w # Test hgNear with kimExpDistanceTest instead of kimExpDistance. # Don't delete the original table just yet, but do clean it up soon # because it's quite large. hgsql ce4 -e 'rename table kimExpDistance to kimExpDistanceOrig;' hgsql ce4 -e 'rename table kimExpDistanceTest to kimExpDistance;' runJoiner.csh ce4 kimExpDistance # Test hgNear again with kimExpDistance. ############################################################################ ## fixup the sangerPep names to match sangerCanonical ## this is a dead end, it doesn't actually fix anything ssh hgwdev mkdir /cluster/data/ce4/bed/fixupSangerPep cd /cluster/data/ce4/bed/fixupSangerPep ## current sangerPep names cut -f1 ../sangerGene/sangerPep.tab | sort -u > sangerPep.names.original.txt ## and canonical names hgsql -N -e "select transcript from sangerCanonical;" ce4 \ | sort -u > sangerCanonical.transcript.txt # line counts wc -l *.txt # 19965 sangerCanonical.transcript.txt # 23224 sangerPep.names.original.txt ## unique to canonical comm -23 sangerCanonical.transcript.txt sangerPep.names.original.txt | wc -l # 1630 # add all canonical names to peptides by duplicating the peptides ~/kent/src/hg/utils/wormStuff/oneToTwo.pl newPeptides.tab 2> warnings.txt # verify this is a new superset of peptides cut -f1 newPeptides.tab | sort -u > sangerPep.newNames.txt comm -12 sangerPep.names.original.txt sangerPep.newNames.txt | wc -l # 23224 - same number as was in sangerPep.names.original.txt # new names added: comm -13 sangerPep.names.original.txt sangerPep.newNames.txt | wc -l # 1593 # the missing peptides are for canonical non-coding genes comm -23 sangerCanonical.transcript.txt sangerPep.newNames.txt | wc -l # 37 ############################################################################ ## fixup sangerCanonical names to remove the second dot name ssh hgwdev cd 
/cluster/data/ce4/bed/fixupSangerCanonical
    hgsql -e "select chrom,txStart,txEnd,name from sangerGene;" ce4 \
        | sort -k1,1 -k2,2n > sangerGene.names.txt
    hgsql -N -e "select * from sangerCanonical;" ce4 > sangerCanonical.tab
    cat << '_EOF_' > fixup.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %genePositions;
open (FH,"<sangerGene.names.txt") or die "can not read sangerGene.names.txt";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $name) = split('\s+',$line);
    die "duplicate name in sangerGene $name"
        if (exists($genePositions{$name}));
    $genePositions{$name} = sprintf "%s\t%d\t%d", $chr, $start, $end;
}
close (FH);

my %names;
open (FH,"<sangerCanonical.tab") or die "can not read sangerCanonical.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $score, $name, $prot) = split('\s+',$line);
    my ($a, $b, $c, $d) = split('\.',$name);
    die "found four part name at $line" if (defined($d));
    my $twoPartName = $name;
    if (defined($c)) {
        $twoPartName = sprintf "%s.%s", $a, $b;
    }
    die "ERROR: two part name clash at $line"
        if (exists($names{$twoPartName}));
    $names{$twoPartName} = $line;
    if (!defined($prot)) {
        print STDERR "# no prot: $line\n";
        my $prot = "";
    }
    die "can not find position for $twoPartName"
        if (!exists($genePositions{$twoPartName}));
    my $pos = $genePositions{$twoPartName};
    my ($existChr, $realStart, $realEnd) = split('\s+', $pos);
    die "not on same chrom ? $twoPartName Name $chr and $existChr"
        if ($existChr ne $chr);
    my $startDiff = $start - $realStart;
    $startDiff = - $startDiff if ($startDiff < 0);
    my $endDiff = $end - $realEnd;
    $endDiff = - $endDiff if ($endDiff < 0);
    die "start more than 100000 off $twoPartName $pos" if ($startDiff > 100000);
    die "end more than 20000 off $twoPartName $pos" if ($endDiff > 20000);
    printf "%s\t%d\t%d\t%d\t%s\t%s\n", $chr, $realStart, $realEnd,
        $score, $twoPartName, $prot;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./fixup.pl
    ./fixup.pl > fixed.tab
    hgsql -e 'delete from sangerCanonical where chrom like "chr%";' ce4
    hgsql -e \
        'load data local infile "fixed.tab" into table sangerCanonical;' ce4
    ## reset indexes
    hgsql -e "analyze table sangerCanonical;" ce4

############################################################################
## fixup sangerLinks table (DONE - 2007-07-25 - Hiram)
## it was missing descriptions for the one dot names where only two dot
## names existed
    mkdir /cluster/data/ce4/bed/fixupSangerLinks
    cd /cluster/data/ce4/bed/fixupSangerLinks
    ln -s ../sangerGene/sangerLinks.tab .
cat << '_EOF_' > slFixup.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %newNeeded;
my %existing;
my $newCount = 0;
open (FH,"<sangerLinks.tab") or die "can not read sangerLinks.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($name, $orf, $desc) = split('\t',$line);
    my ($a, $b, $c, $d) = split('\.', $name);
    die "ERROR: found three dot name at $line" if (defined($d));
    if (defined($c)) {
        my $oneDotName = sprintf "%s.%s", $a, $b;
        if (exists($newNeeded{$oneDotName})) {
            my $newString = sprintf "%s\t%s", $orf, $desc;
            die "ERROR: new one dot name, different descr $name"
                if ($newNeeded{$oneDotName} ne $newString)
        } else {
            $newNeeded{$oneDotName} = sprintf "%s\t%s", $orf, $desc;
            ++$newCount;
        }
    }
    die "ERROR: duplicate name $name at $line" if (exists($existing{$name}));
    $existing{$name} = sprintf "%s\t%s", $orf, $desc;
}
close (FH);
foreach my $key (keys %newNeeded) {
    die "ERROR: duplicate new name $key for $newNeeded{$key}"
        if (exists($existing{$key}));
    printf "%s\t%s\n", $key, $newNeeded{$key};
}
foreach my $key (keys %existing) {
    die "ERROR: duplicate existing name $key for $existing{$key}"
        if (exists($newNeeded{$key}));
    printf "%s\t%s\n", $key, $existing{$key};
}
'_EOF_'
    # << happy emacs
    chmod +x ./slFixup.pl
    ./slFixup.pl > fixedSangerLinks.tab
    hgLoadSqlTab ce4 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
        fixedSangerLinks.tab

#############################################################################
## creating xxBlastTab tables (WORKING - 2007-07-31 - Hiram)
    # Make sure there are no joinerCheck errors at this point, because
    # if left here they can spread via doHgNearBlastp:
    ssh hgwdev
    runJoiner.csh ce4 sangerPep
# ce4.orfToGene.name - hits 27812 of 27812 ok
# ce4.sangerBlastTab.query - hits 1214556 of 1214556 ok
# ce4.sangerBlastTab.target - hits 1214556 of 1214556 ok
# ce4.sangerCanonical.transcript - hits 19965 of 19965 ok
# ce4.sangerIsoforms.transcript - hits 30497 of 30497 ok
# ce4.sangerLinks.orfName - hits 29778 of 30777 ok
# ce4.sangerPep.name - hits 23224 of 23224 ok
# ce4.sangerToRefSeq.name - hits 30336 of 30336 ok
    ssh hgwdev
    mkdir /cluster/data/ce4/bed/hgNearBlastp
    cd
/cluster/data/ce4/bed/hgNearBlastp mkdir 070731 cd 070731 # Get the proteins used by the other hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm8 knownGenePep mm8.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer4 ensPep danRer4.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce4 sangerPep ce4.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # sanity check, number of lines in each faa file wc -l *.faa # 238243 ce4.sangerPep.faa # 435532 danRer4.ensPep.faa # 258316 dm3.flyBasePep.faa # 526563 hg18.known.faa # 315528 mm8.known.faa # 86220 rn4.known.faa # 64956 sacCer1.sgdPep.faa # sanity check, count the number of protein sequences in each for F in *.faa do echo -n "${F} " grep "^>" ${F} | wc -l done # ce4.sangerPep.faa 23224 # danRer4.ensPep.faa 36048 # dm3.flyBasePep.faa 20279 # hg18.known.faa 45480 # mm8.known.faa 30716 # rn4.known.faa 8160 # sacCer1.sgdPep.faa 5766 cd /cluster/data/ce4/bed/hgNearBlastp cat << '_EOF_' > config.ra # Latest worm vs. 
other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, yeast
targetGenesetPrefix sanger
targetDb ce4
queryDbs hg18 mm8 rn4 danRer4 dm3 sacCer1
recipBest hg18 mm8 rn4 danRer4 dm3 sacCer1
ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa
hg18Fa /cluster/data/ce4/bed/hgNearBlastp/070731/hg18.known.faa
mm8Fa /cluster/data/ce4/bed/hgNearBlastp/070731/mm8.known.faa
rn4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/rn4.known.faa
danRer4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/danRer4.ensPep.faa
dm3Fa /cluster/data/ce4/bed/hgNearBlastp/070731/dm3.flyBasePep.faa
sacCer1Fa /cluster/data/ce4/bed/hgNearBlastp/070731/sacCer1.sgdPep.faa
buildDir /cluster/data/ce4/bed/hgNearBlastp/070731
scratchDir /san/sanvol1/scratch/ce4HgNearBlastp
'_EOF_'
    # << happy emacs

    # Run this in a screen since it is going to take awhile
    # Run with -noLoad so we can eyeball files, manually load dm3 tables now,
    # and later overload other databases' dmBlastTab tables.
    doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 &
    # watch progress in do.log

#########################################################################
# CE4->CE5 LIFTOVER (DONE 10/11/07 angie)
    ssh kkstore02
    cd /cluster/data/ce4
    doSameSpeciesLiftOver.pl ce4 ce5 -debug
    cd bed/blat.ce5.2007-10-11
    doSameSpeciesLiftOver.pl ce4 ce5 -verbose 3 >& do.log &
    tail -f do.log

#########################################################################
# CE4->CE6 LIFTOVER (DONE - 2008-06-24 - Hiram)
    ssh kkstore02
    cd /cluster/data/ce4
    cp -p /cluster/data/ce6/ce6.2bit /cluster/bluearc/ce6
    doSameSpeciesLiftOver.pl -bigClusterHub=kk -workhorse=hgwdev ce4 ce6 -debug
    cd bed/blat.ce6.2008-06-24
    doSameSpeciesLiftOver.pl -bigClusterHub=kk -workhorse=hgwdev ce4 ce6 \
        -verbose=3 > do.log 2>&1 &
    # this takes about 10 minutes
    tail -f do.log
    rm -f /cluster/bluearc/ce6/ce6.2bit

#########################################################################
# Nucleosome tracks (DONE - 2008-04-08 - Hiram)
    ssh kkstore02
    mkdir
/cluster/data/ce4/bed/monoNucleosomes cd /cluster/data/ce4/bed/monoNucleosomes # download everything into stanford sub-directory mkdir stanford cd stanford # track data was delivered via the following URL: # http://stellar.stanford.edu/C_elegans_mononucleosomes/index.html # Uncollapsed Track files by chromosome for C in I II III IV V X M do wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_chr${C}_forward.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_chr${C}_reverse.bed.gz done # Collapsed nucleosome track files by chromosome: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.nucl.for.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.nucl.rev.bed.gz # Collapsed control track files by chromosome: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.cont.paired.for.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M.cont.paired.rev.bed.gz # Positioning stringency wig track wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/chrI-M_positioning_stringency.wig.gz # encode that zcat chrI-M_positioning_stringency.wig.gz \ | wigEncode stdin stringency.wig stringency.wib # Adjusted nucleosome coverage wig track: wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/adjusted_coverage_chrI-M.wig.gz wigEncode adjusted_coverage_chrI-M.wig.gz nucleosomeAdjustedCoverage.wig \ nucleosomeAdjustedCoverage.wib # Track files for the whole genome wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_all_forward.bed.gz wget --timestamping \ http://stellar.stanford.edu/C_elegans_mononucleosomes/mono_nucleosomes_all_reverse.bed.gz # time for all of the above: # real 2m20.465s # resulting files: # -rw-rw-r-- 1 386656573 Feb 29 12:08 mono_nucleosomes_all_forward.bed.gz # 
-rw-rw-r-- 1 382914445 Feb 29 12:08 mono_nucleosomes_all_reverse.bed.gz # -rw-rw-r-- 1 166189150 Mar 17 15:30 chrI-M.cont.paired.for.bed.gz # -rw-rw-r-- 1 130560514 Mar 17 15:30 chrI-M.nucl.for.bed.gz # -rw-rw-r-- 1 135184112 Mar 17 15:30 chrI-M.nucl.rev.bed.gz # -rw-rw-r-- 1 381477404 Mar 19 15:10 chrI-M_positioning_stringency.wig.gz # -rw-rw-r-- 1 470000654 Mar 24 14:55 adjusted_coverage_chrI-M.wig.gz # -rw-rw-r-- 1 175019114 Apr 3 11:37 chrI-M.cont.paired.rev.bed.gz # shorten the names which indicate counts: cd .. # Collapsed control track files by chromosome: zcat stanford/chrI-M.cont.paired.rev.bed.gz | grep "^chr" | awk ' { gsub("count=","",$4) printf "%s\t%d\t%d\tn = %d\t%d\t-\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$4,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.rev.paired.bed.gz zcat chrI-M.rev.paired.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce4 stdin \ | wigEncode stdin nucleosomeControlAntisenseCoverage.wig \ nucleosomeControlAntisenseCoverage.wib zcat stanford/chrI-M.cont.paired.for.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.for.paired.bed.gz zcat chrI-M.for.paired.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce4 stdin \ | wigEncode stdin nucleosomeControlSenseCoverage.wig \ nucleosomeControlSenseCoverage.wib # Collapsed nucleosome track files by chromosome: zcat stanford/chrI-M.nucl.for.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.for.clean.bed.gz zcat stanford/chrI-M.nucl.rev.bed.gz | grep "^chr" | awk ' { gsub("count=","n = ",$4) printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12 } ' | gzip > chrI-M.rev.clean.bed.gz ########################################################### # loading track data ssh hgwdev cd 
/cluster/data/ce4/bed/monoNucleosomes # Collapsed control track files by chromosome: LOAD="hgLoadBed -tab -noNameIx ce4" zcat stanford/chrI-M.for.paired.bed.gz stanford/chrI-M.rev.paired.bed.gz \ | ${LOAD} nucleosomeControl stdin # Collapsed nucleosome track files by chromosome: zcat chrI-M.for.clean.bed.gz \ | ${LOAD} nucleosomeFragmentsSense stdin zcat chrI-M.rev.clean.bed.gz \ | ${LOAD} nucleosomeFragmentsAntisense stdin # these three tracks grouped into the composite: "nucleosome" # labels: # composite container: Nucleosome predictions from SOLidD core alignments # Control - : mononucleosome control # fragments + : mononucleosomal fragments, sense strand reads # fragments - : mononucleosomal fragments, antisense strand reads # Positioning stringency wig track hgLoadWiggle ce4 nucleosomeStringency stringency.wig # NSome Stringency # Nucleosome Positioning Stringency from SOLiD core alignments # Adjusted nucleosome coverage wig track: hgLoadWiggle ce4 nucleosomeAdjustedCoverage \ nucleosomeAdjustedCoverage.wig # Adj NSome Covrg # Adjusted nucleosome coverage (25bp) # MNase Coverage # Micrococcal nuclease control coverage hgLoadWiggle ce4 nucleosomeControlSenseCoverage \ nucleosomeControlSenseCoverage.wig # Cntl MNase +Covg # Micrococcal nuclease control coverage, sense strand reads hgLoadWiggle ce4 nucleosomeControlAntisenseCoverage \ nucleosomeControlAntisenseCoverage.wig # Cntl MNase -Covg # Micrococcal nuclease control coverage, antisense strand reads # these two sets of files were "cleaned" of references to chrK-12 # and chr positions off the end of chromosomes with the following script: for I in forward reverse do cat ../../../chrom.sizes | while read S do C=`echo ${S} | awk '{print $1}'` SZ=`echo ${S} | awk '{print $2}'` echo "checking $C for $SZ" zcat mono_nucl*_${C}_${I}.bed.gz | grep -w "^${C}" \ | grep -v "chrK-12" | awk '$3 <= '"${SZ}"'' \ | sort -k1,1 -k2,2n | gzip > clean/${C}.${I}.bed.gz done done zcat stanford/clean/chr*.forward.bed.gz \ | 
bedItemOverlapCount ce4 stdin \
        | wigEncode stdin monoNucleosomesSense.wig monoNucleosomesSense.wib
    zcat stanford/mono_nucleosomes_chr*_reverse.bed.gz | grep "^chr" \
        | bedItemOverlapCount ce4 stdin \
        | wigEncode stdin monoNucleosomesAntiSense.wig monoNucleosomesAntiSense.wib
    # NSome Coverage
    # Coverage of nucleosome predictions from SOLiD core alignments
    # Control
    # mononucleosome control
    hgLoadWiggle ce4 monoNucleosomesSense monoNucleosomesSense.wig
    # NSome Core +Covg
    # Coverage of mononucleosomal fragments, sense strand reads
    hgLoadWiggle ce4 monoNucleosomesAntiSense monoNucleosomesAntiSense.wig
    # NSome Core -Covg
    # Coverage of mononucleosomal fragments, antisense strand reads

#########################################################################
# 25mers repeat track (DONE - 2008-04-16 - Hiram)
    # data from Anton Valouev "valouev at stanford.edu"
    # to accompany the nucleosomes business
    mkdir /cluster/data/ce4/bed/25mersRepeats
    cd /cluster/data/ce4/bed/25mersRepeats
    wget --timestamping \
http://stellar.stanford.edu/C_elegans_mononucleosomes/all_chr_repeat.bed.gz
    # add a better score field and a better name
    zcat all_chr_repeat.bed.gz | grep ^chr | awk '
{
printf "%s\t%d\t%d\t%d\t1\t%s\t%d\t%d\t%s\n", $1, $2, $3, NR, $6, $7, $8, $9
}
' > withScore.bed
    hgLoadBed ce4 25mersRepeats withScore.bed
    # create a description page for it.

###########################################################################
# LOAD Transcriptome data (DONE - 2008-04-15 - Hiram)
    # data from Christian Iseli 'Christian.Iseli at licr.org'
    ssh hgwdev
    mkdir /cluster/data/ce4/bed/transcriptome
    cd /cluster/data/ce4/bed/transcriptome
    wget --timestamping ftp://ftp.licr.org/pub/databases/trome/ce4/WTr.gtf.gz
    wget --timestamping ftp://ftp.licr.org/pub/databases/trome/ce4/txg.tar.gz
    gtfToGenePred -genePredExt WTr.gtf.gz WTr.gp
    hgLoadGenePred ce4 transcriptome -genePredExt WTr.gp
    tar xvzf txg.tar.gz
    # Do a little data cleanup and transformation and
    # load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \ > sibTxGraph.sql cat txg/*.txg | txgToAgx stdin stdout \ | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql ce4 sibTxGraph stdin # Loaded 20553 elements of size 18 # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg \ | txgAnalyze stdin /cluster/data/ce4/ce4.2bit stdout \ | awk '$2 >= 0' | sort | uniq > sibAltEvents.bed hgLoadBed ce4 sibAltEvents sibAltEvents.bed # Loaded 18447 elements of size 6 ######################################################################### # LIFTOVER TO ce10 (DONE - 2011-05-24 - Hiram ) mkdir /hive/data/genomes/ce4/bed/blat.ce10.2011-05-24 cd /hive/data/genomes/ce4/bed/blat.ce10.2011-05-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce4/jkStuff/11.ooc -debug ce4 ce10 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce4/jkStuff/11.ooc ce4 ce10 > do.log 2>&1 # about 4 minutes # verify it works on genome-test #############################################################################