# for emacs: -*- mode: sh; -*- # This file describes building the browser database for the archaeal # species Methanosarcina acetivorans. # DOWNLOAD SEQUENCE FROM GENBANK (DONE 8/05) ssh eieio mkdir /cluster/store5/archae/metAce1 ln -s /cluster/store5/archae/metAce1 /cluster/data/metAce1 cd /cluster/data/metAce1 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Methanosarcina_acetivorans/NC_003552.fna mv NC_003552.fna chr.fa # Edit header of chr.fa to '> metAce1' # CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 8/05) ssh hgwdev echo 'create database metAce1' | hgsql '' cd /cluster/data/metAce1 faToNib chr.fa nib/chr.nib hgNibSeq metAce1 /cluster/data/metAce1/nib chr.fa faSize -detailed chr.fa > chrom.sizes mkdir -p /gbdb/metAce1/nib echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql metAce1 echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) values \ ("metAce1", "April 2002", "/gbdb/metAce1/nib", "Methanosarcina acetivorans", \ "chr:500000-550000", 1, 205, "Methanosarcina acetivorans", \ "Methanosarcina acetivorans C2A", "/gbdb/metAce1/html/description.html", \ 0);' \ | hgsql hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("Methanosarcina acetivorans", "metAce1");' \ | hgsql hgcentraltest echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Methanosarcina acetivorans", "archaea",85);' \ | hgsql hgcentraltest cd ~/kent/src/hg/makeDb/trackDb # edit the trackDb makefile # add the trackDb directories mkdir -p archae/metAce1 cvs add archae cvs add archae/metAce1 cvs commit # GC 20 BASE WIGGLE TRACK (DONE 8/13/05) mkdir /cluster/data/metAce1/bed/gc20Base cd /cluster/data/metAce1/bed/gc20Base mkdir wigData20 dataLimits20 hgGcPercent -chr=chr -file=stdout -win=20 -overlap=19 metAce1 ../../nib | grep -w GC | \ awk ' { bases = $3 - $2 perCent = $5/10.0 printf "%d\t%.1f\n", $2+1, perCent }' | wigAsciiToBinary -dataSpan=1 -chrom=chr \ -wibFile=wigData20/gc20Base_1 -name=1 stdin > dataLimits20/chr hgLoadWiggle metAce1 gc20Base wigData20/*.wig mkdir /gbdb/metAce1/wib ln -s `pwd`/wigData20/*.wib /gbdb/metAce1/wib # the trackDb entry # CONTIG TRACK # reformat is a Todd Lowe program for num in `seq 746 946`; do curl -o ${num}.gbk "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?txt=on&val=AE009${num}.1" reformat fasta ${num}.gbk >> contigs.fa done blat /cluster/data/metAce1/chr.fa contigs.fa -minIdentity=100 contigs.psl perfectBlatBed4 contigs.psl contigs.bed mkdir /cluster/data/metAce1/bed/metAce1Contigs cp contigs.bed /cluster/data/metAce1/bed/metAce1Contigs cd /cluster/data/metAce1/bed/metAce1Contigs hgLoadBed metAce1 metAce1Contigs contigs.bed # the trackDb entry: track metAce1Contigs shortLabel Contigs longLabel Contigs deposited in Genbank group map priority 0.5 visibility pack url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=nucleotide&cmd=search&term=$$ type bed 4 . # TANDEM REPEAT MASKER (DONE) ssh hgwdev mkdir -p /cluster/data/metAce1/bed/simpleRepeat cd /cluster/data/metAce1 trfBig chr.fa /dev/null -bedAt=/cluster/data/metAce1/bed/simpleRepeat/chr.bed cd /cluster/data/metAce1/bed/simpleRepeat vi chr.bed and replace metAce1 with chr hgLoadBed metAce1 simpleRepeat *.bed -sqlTable=~kent/src/hg/lib/simpleRepeat.sql # DESCRIPTION PAGE (DONE) ssh hgwdev # Write ~/kent/src/hg/makeDb/trackDb/archae/metAce1/description.html chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/metAce1/description.html # Check it in. mkdir /gbdb/metAce1/html ln -s /cluster/data/metAce1/html/description.html /gbdb/metAce1/html/ # GENBANK PROTEIN-CODING GENES (DONE) ssh hgwdev mkdir /cluster/data/metAce1/genbank cd /cluster/data/metAce1/genbank wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Methanosarcina_acetivorans/NC_003552.gbk mv NC_003552.gbk metAce1.gbk # Create 3 files to assist parsing of the genbank # 1. for a genePred file echo 'locus_tag chr strand start end start end 1 start end lineNumber gene cmpl cmpl 0' > metAce1-params-gp.txt # 2. for the peptide parts echo 'locus_tag translation' > metAce1-params-pep.txt # 3. for the other gene information echo 'locus_tag gene product note protein_id db_xref EC_number db_xref2' > metAce1-params-xra.txt # Now extract the genes and information: gbArchaeGenome metAce1.gbk metAce1-params-gp.txt metAce1-genbank-cds.gp gbArchaeGenome metAce1.gbk metAce1-params-pep.txt metAce1-genbank-cds.pep gbArchaeGenome metAce1.gbk metAce1-params-xra.txt metAce1-genbank-cds.xra hgsql metAce1 < /cluster/home/baertsch/kent/src/hg/lib/pepPred.sql hgsql metAce1 < /cluster/home/baertsch/kent/src/hg/lib/minGeneInfo.sql echo rename table pepPred to gbProtCodePep | hgsql metAce1 echo rename table minGeneInfo to gbProtCodeXra | hgsql metAce1 echo load data local infile \'metAce1-genbank-cds.pep\' into table gbProtCodePep | hgsql metAce1 echo load data local infile \'metAce1-genbank-cds.xra\' into table gbProtCodeXra | hgsql metAce1 ldHgGene metAce1 refSeq metAce1.gp -predTab -genePredExt # COG STUFF awk 'NR>3{OFS="\t";print $6,$8,$7}' /projects/lowelab/db/Bacteria/Methanosarcina_acetivorans/NC_003552.ptt > COG hgsql metAce1 < /cluster/home/baertsch/kent/src/hg/lib/cogs.sql echo "load data local infile 'COG' into table COG" | hgsql metAce1 # load cog codes hgsql metAce1 < /cluster/data/metAce1/genbank/COGXra.sql # GENBANK rRNA GENES (NOT QUITE DONE) ssh hgdev cd /cluster/data/metAce1/genbank gbArchaeGenome -kind=rRNA metAce1.gbk metAce1-params-bed.txt metAce1-rrnas.bed echo 'gene product NA' > metAce1-params-rrna-xra.txt gbArchaeGenome -kind=rRNA metAce1.gbk metAce1-params-rrna-xra.txt metAce1-rrnas-xra.txt hgLoadBed metAce1 gbRRNA metAce1-rrnas.bed hgsql metAce1 < ~/kent/src/hg/lib/minGeneInfo.sql echo rename table minGeneInfo to gbRRNAXra | hgsql metAce1 echo load data local infile \'metAce1-rrnas-xra.txt\' into table gbRRNAXra | hgsql metAce1 # TODD LOWE tRNA GENES () # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd # Lowe. See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields. # **Showing the tRNAScanSE instructions would be nice in the future. ssh hgwdev mkdir /cluster/data/metAce1/bed/loweTrnaGene cd /cluster/data/metAce1/bed/loweTrnaGene hgLoadBed -tab metAce1 loweTrnaGene metAce1-lowe-trnas.bed -sqlTable=~/kent/src/hg/lib/loweTrnaGene.sql # TODD LOWE snoRNA GENES () # This is a bed 6 file created by hand. ssh hgwdev mkdir /cluster/data/metAce1/bed/loweSnoGene cd /cluster/data/metAce1/bed/loweSnoGene hgLoadBed -tab metAce1 loweSnoGene metAce1-snos.bed # TIGR GENES (DONE) # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi # and fill out the web form as follows: # - Pick "Retrieve attributes for the specified DNA feature within a specific # organism and/or a specific role category". # * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs" # from the 1st and 3rd box. # * Select everything from "Choose TIGR Annotation Gene Attributes" # * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes" # * Select everything from "Choose Other Gene Attributes" # - Click submit, and click save as tab-delimited file. ssh hgwdev mkdir /cluster/data/metAce1/bed/tigrCmrORFs cp metAce1-tigr.tab /cluster/data/metAce1/bed/tigrCmrORFs cd /cluster/data/metAce1/bed/tigrCmrORFs /projects/lowelab/users/aamp/bin/i386/tigrCmrToBed metAce1-tigr.tab metAce1-tigr.bed hgLoadBed -tab metAce1 tigrCmrORFs metAce1-tigr.bed -sqlTable=~/kent/src/hg/lib/tigrCmrGene.sql ########################################################### #mulitz pHMM conservation track for 3 Methosarcina species# ########################################################### ## DONE: 8/14/05 (baertsch) cd /cluster/data/metAce1/bed mkdir conservation cd conservation cp /cluster/data/pyrFur2/bed/conservation/HoxD55.q . #get fastas cp ../../chr.fa metAce1.chr cp /projects/lowelab/db/blastdb/Meth_maze.fa metMaz1.fa wget ftp://ftp.jgi-psf.org/pub/JGI_data/Microbial/methanosarcina/031109/2351479_fasta.screen.contigs mv 2351479_fasta.screen.contigs metBak0.chr ftp://ftp.jgi-psf.org/pub/JGI_data/Microbial/methanococcoides_burtonii/041112/2662199.fsa mv 2662199.fsa metBur0.chr #edit > lines to say metAce1.chr, metBak0.contigN, and metMaz1.chr #make nibs and 2bits /cluster/bin/i386/faToNib metAce1.chr metAce1.chr.nib /cluster/bin/i386/faToNib metMaz1.chr metMaz1.chr.nib /cluster/bin/i386/faToTwoBit metBak0.chr metBak0.2bit /cluster/bin/i386/faToTwoBit metBur0.chr metBur0.2bit #chrom sizes faSize -detailed *.chr > chrom.sizes #blastz blastz metAce1.chr metBak0.chr Q=HoxD55.q > metAce1-metBak0.lav blastz metAce1.chr metMaz1.chr Q=HoxD55.q > metAce1-metMaz1.lav blastz metAce1.chr metBur0.chr Q=HoxD55.q > metAce1-metBur0.lav /cluster/bin/i386/lavToAxt metAce1-metBak0.lav . metBak0.2bit metAce1-metBak0.axt /cluster/bin/i386/lavToAxt metAce1-metMaz1.lav . . metAce1-metMaz1.axt /cluster/bin/i386/lavToAxt metAce1-metBur0.lav . metBur0.2bit metAce1-metBur0.axt axtBest metAce1-metBak0.axt metAce1.chr -winSize=500 -minScore=5000 metAce1-metBak0-best.axt axtBest metAce1-metMaz1.axt metAce1.chr -winSize=500 -minScore=5000 metAce1-metMaz1-best.axt axtBest metAce1-metBur0.axt metAce1.chr -winSize=500 -minScore=5000 metAce1-metBur0-best.axt axtToMaf metAce1-metBak0-best.axt chrom.sizes chrom.sizes metAce1-metBak0.maf axtToMaf metAce1-metMaz1-best.axt chrom.sizes chrom.sizes metAce1-metMaz1.maf axtToMaf metAce1-metBur0-best.axt chrom.sizes chrom.sizes metAce1-metBur0.maf #multiz: v10 has new parameter (3rd par), 0=2 seqs not align to ref, else 1. # stick with v8 for now. multiz metAce1-metBak0.maf metAce1-metMaz1.maf - > metAce1-metBak0-metMaz1.maf #phyloHMM /cluster/bin/phast/msa_view -i MAF -M metAce1.chr -o SS metAce1-metBak0-metMaz1.maf > metAce1.ss /cluster/bin/phast/phyloFit -i SS metAce1.ss -t "(metAce1,(metBak0,metMaz1))" /cluster/bin/phast/msa_view -i SS metAce1.ss --summary-only #descrip. A C G T G+C length all_gaps some_gaps #metAce1.ss 0.2892 0.2088 0.2089 0.2931 0.4177 6057454 0 580625 /cluster/bin/phast/phastCons metAce1.ss phyloFit.mod --gc 0.4177 \ --target-coverage 0.5 --estimate-trees met-tree \ --expected-lengths 7 --no-post-probs --ignore-missing \ --nrates 1,1 /cluster/bin/phast/phastCons metAce1.ss \ met-tree.cons.mod,met-tree.noncons.mod \ --target-coverage 0.5 --expected-lengths 75 \ --viterbi metAce-elements.bed --score \ --require-informative 0 --seqname chr > cons.dat wigEncode cons.dat phastCons.wig phastCons.wib #phlyloHmmm new multiz metAce1-metMaz1.maf metAce1-metBak0.maf - > metAce1-metMaz1-metBak0.maf /cluster/bin/phast/msa_view -i MAF -M metAce1.chr -o SS metAce1-metMaz1-metBak0.maf > metAce1new.ss /cluster/bin/phast/phyloFit -i SS metAce1new.ss -t "(metBak0,(metAce1,metMaz1))" -o Ma1Mz1Mb0 /cluster/bin/phast/msa_view -i SS metAce1new.ss --summary-only #descrip. A C G T G+C length all_gaps some_gaps #metAce1new.ss 0.2892 0.2088 0.2090 0.2930 0.4178 6057224 0 579124 /cluster/bin/phast/phastCons metAce1new.ss Ma1Mz1Mb0.mod --gc 0.4177 \ --target-coverage 0.7 --estimate-trees met-tree2 \ --expected-lengths 7 --no-post-probs --ignore-missing \ --nrates 1,1 /cluster/bin/phast/phastCons metAce1new.ss \ met-tree2.cons.mod,met-tree2.noncons.mod \ --target-coverage 0.7 --expected-lengths 75 \ --viterbi metAce-elements2.bed --score \ --require-informative 0 --seqname chr > cons2.dat wigEncode cons2.dat phastCons2.wig phastCons2.wib #move data mkdir wib mv phastCons.wib wib/phastCons.wib mv phastCons.wig wib/phastCons.wig ln -s /cluster/data/metAce1/bed/conservation/wib/phastCons.wib /gbdb/metAce1/wib mkdir /gbdb/metAce1/pwMaf mkdir -p otherMet/metBak0 otherMet/metMaz1 mv metAce1-metBak0.maf otherMet/metBak0/chr.maf mv metAce1-metMaz1.maf otherMet/metMaz1/chr.maf ln -s /cluster/data/metAce1/bed/conservation/otherMet/metBak0 /gbdb/metAce1/pwMaf/metBak0_pwMaf ln -s /cluster/data/metAce1/bed/conservation/otherMet/metMaz1 /gbdb/metAce1/pwMaf/metMaz1_pwMaf mkdir multiz mv metAce1-metBak0-metMaz1.maf multiz/chr.maf ln -s /cluster/data/metAce1/bed/conservation/multiz /gbdb/metAce1/multizMa1Mb1Mz1 #load hgLoadWiggle metAce1 phastCons /cluster/data/metAce1/bed/conservation/wib/phastCons.wig hgLoadMaf -warn metAce1 multizMa1Mb1Mz1 hgLoadMaf -warn metAce1 metBak0_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metBak0_pwMaf hgLoadMaf -warn metAce1 metMaz1_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metMaz1_pwMaf #trackDb.ra entry # track multizMa1Mb1Mz1 # shortLabel Conservation # longLabel Methosarcina 3-way multiz alignments # group compGeno # priority 10.0 # visibility pack # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise pwMaf # speciesOrder metBak0 metMaz1 cd ~/kent/src/hg/makeDb/trackDb/archae/metAce1 cvs commit -m "Added multiz track" trackDb.ra #html page for multizMa1Mb1Mz1 cd ~/kent/src/hg/makeDb/trackDb/archae/metAce1 cvs add multizMa1Mb1Mz1.html cvs commit -m "Details page for multiz track" multizMa1Mb1Mz1.html ## 4 way multiz metAce1-metBur0.maf metAce1-metMaz1-metBak0.maf - > metAce1-metMaz1-metBak0-metBur0.maf /cluster/bin/phast/msa_view -i MAF -M metAce1.chr -o SS metAce1-metMaz1-metBak0-metBur0.maf > metAce4way.ss /cluster/bin/phast/phyloFit -i SS metAce4way.ss -t "(metBur0,(metBak0,(metAce1,metMaz1)))" -o Ma1Mz1Mbk0Mbr0 /cluster/bin/phast/msa_view -i SS metAce4way.ss --summary-only #descrip. A C G T G+C length all_gaps some_gaps #metAce4way.ss 0.2887 0.2092 0.2094 0.2926 0.4186 6100179 0 710386 /cluster/bin/phast/phastCons metAce4way.ss Ma1Mz1Mbk0Mbr0.mod --gc 0.4186 \ --target-coverage 0.7 --estimate-trees met-tree4 \ --expected-lengths 7 --no-post-probs --ignore-missing \ --nrates 1,1 /cluster/bin/phast/phastCons metAce4way.ss \ met-tree4.cons.mod,met-tree4.noncons.mod \ --target-coverage 0.7 --expected-lengths 75 \ --viterbi metAce-elements2.bed --score \ --require-informative 0 --seqname chr > cons4.dat wigEncode cons4.dat phastCons4.wig phastCons4.wib draw_tree Ma1Mz1Mbk0Mbr0.mod > tree4.ps ps2pdf tree4.ps tree4.pdf #move data mv phastCons4.wib wib/phastCons4.wib mv phastCons4.wig wib/phastCons4.wig ln -s /cluster/data/metAce1/bed/conservation/wib/phastCons4.wib /gbdb/metAce1/wib mkdir -p otherMet/metBur0 mv metAce1-metBur0.maf otherMet/metBur0/chr.maf ln -s /cluster/data/metAce1/bed/conservation/otherMet/metBur0 /gbdb/metAce1/pwMaf/metBur0_pwMaf mkdir multiz4way mv metAce1-metMaz1-metBak0-metBur0.maf multiz4way/chr.maf ln -s /cluster/data/metAce1/bed/conservation/multiz4way /gbdb/metAce1/multizMa1Mz1Mbk0Mbr0 #load hgLoadWiggle metAce1 phastCons4 /cluster/data/metAce1/bed/conservation/wib/phastCons4.wig hgLoadMaf -warn metAce1 multizMa1Mz1Mbk0Mbr0 hgLoadMaf -warn metAce1 metBur0_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metBur0_pwMaf #trackDb.ra entry # track multizMa1Mz1Mbk0Mbr0 # shortLabel Cons 4way # longLabel Methosarcina Acetivorans/Mazei/Bakeri 3-way multiz alignments + Methanobacteriodes Burtonii # group compGeno # priority 10.0 # visibility pack # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise pwMaf # speciesOrder metMaz1 metBak0 metBur0 ## re-run with 25 expected length parameter /cluster/bin/phast/phastCons metAce4way.ss Ma1Mz1Mbk0Mbr0.mod --gc 0.4186 \ --target-coverage 0.7 --estimate-trees met-tree4.25 \ --expected-lengths 25 --no-post-probs --ignore-missing \ --nrates 1,1 /cluster/bin/phast/phastCons metAce4way.ss \ met-tree4.25.cons.mod,met-tree4.25.noncons.25.mod \ --target-coverage 0.7 --expected-lengths 25 \ --viterbi metAce-elements.25.bed --score \ --require-informative 0 --seqname chr > cons4.25.dat wigEncode cons4.25.dat phastCons4.25.wig phastCons4.25.wib mkdir wib mv phastCons4.25.wib wib/phastCons4.25.wib mv phastCons4.25.wig wib/phastCons4.25.wig ln -s /cluster/data/metAce1/bed/conservation/wib/phastCons.wib /gbdb/metAce1/wib hgLoadWiggle metAce1 phastCons4_25 /cluster/data/metAce1/bed/conservation/phastCons4.25.wig # 3 states, 2 conserved , 1 nonconserved nice /cluster/bin/phast/phastCons metAce4way.ss Ma1Mz1Mbk0Mbr0.mod --gc 0.4186 \ --target-coverage 0.7 --estimate-trees met-tree4.25.3state \ --expected-lengths 25 --no-post-probs --ignore-missing \ --nrates 2,1 /cluster/bin/phast/phastCons metAce4way.ss \ met-tree4.25.cons1.mod,met-tree4.25.cons2.mod,met-tree4.25.3state.noncons.25.mod \ --target-coverage 0.7 --expected-lengths 25 \ --viterbi metAce-elements.3state.25.bed --score \ --require-informative 0 --seqname chr > cons4.25.3state.dat #FINISH UP MULTIZ TRACK (4way) # DONE (10/9/05), kpollard cd /cluster/data/metAce1/bed/conservation/ #move data cp tree4.jpg /usr/local/apache/htdocs/images/metAce1-tree.jpg cd wib mv phastCons.wib phastCons3.wib mv phastCons.wig phastCons3.wig cp phastCons4.25.wib phastCons.wib cp phastCons4.25.wig phastCons.wig ln -s /cluster/data/metAce1/bed/conservation/wib/phastCons.wib /gbdb/metAce1/wib #drop tables so can reload with new data hgLoadWiggle metAce1 phastCons /cluster/data/metAce1/bed/conservation/wib/phastCons.wig hgLoadMaf -warn metAce1 multizMa1Mz1Mbk0Mbr0 hgLoadMaf -warn metAce1 metBak0_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metBak0_pwMaf hgLoadMaf -warn metAce1 metBur0_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metBur0_pwMaf hgLoadMaf -warn metAce1 metMaz1_pwMaf -pathPrefix=/gbdb/metAce1/pwMaf/metMaz1_pwMaf hgLoadBed metAce1 phastConsElements metAce-elements.25.bed #trackDb cd ~/kent/src/hg/makeDb/trackDb/archae/metAce1 #trackDb.ra entry # track multizMa1Mz1Mbk0Mbr0 # shortLabel Cons 4way # longLabel Methosarcina 4-way multiz alignments # group compGeno # priority 10.0 # visibility pack # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise pwMaf # speciesOrder metMaz1 metBak0 metBur0 cvs commit -m "New multiz track" trackDb.ra #html page mv multizMa1Mb1Mz1.html multizMa1Mz1Mbk0Mbr0.html cvs remove multizMa1Mb1Mz1.html cvs add multizMa1Mz1Mbk0Mbr0.html cvs commit -m "Details page for multiz track" multizMa1Mz1Mbk0Mbr0.html