# for emacs: -*- mode: sh; -*- # This file describes building the browser database for the archaeal # species Methanosarcina barkeri # DOWNLOAD SEQUENCE FROM GENBANK (DONE) ssh eieio mkdir /cluster/store5/archae/methBark1 ln -s /cluster/store5/archae/methBark1 /cluster/data/methBark1 cd /cluster/data/methBark1 cp /projects/lowelab/db/Bacteria/Methanosarcina_barkeri_fusaro/NC_0073*.fna . mv NC_007355.fna chr.fa mv NC_007349.fna plasmid1.fa # Edit header of chr.fa and plasmid1.fa to '> chr and >plasmid1 cat chr.fa plasmid1.fa > methBark1.fa faToTwoBit methBark1.fa methBark1.2bit # CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE) ssh hgwdev echo 'create database methBark1' | hgsql '' cd /cluster/data/methBark1 faSize -detailed methBark1.fa > chrom.sizes echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql methBark1 echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) values \ ("methBark1", "June 2002", "/gbdb/methBark1", "Methanosarcina barkeri", \ "chr:500000-550000", 1, 251, "Methanosarcina barkeri", \ "Methanosarcina barkeri fusaro", "/gbdb/methBark1/html/description.html", \ 0);' \ | hgsql hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("Methanosarcina barkeri", "methBark1");' \ | hgsql hgcentraltest echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Methanosarcina barkeri", "archaea",85);' \ | hgsql hgcentraltest # CREATE CHROMINFO TABLE (DONE) ssh hgwdev cd /cluster/data/methBark1 edit chromInfo.sql; allow fileName to be null cp ~baertsch/kent/src/hg/lib/chromInfo.sql . hgsql methBark1 < chromInfo.sql echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql methBark1 echo "update chromInfo set fileName = '/gbdb/methBark1/rheMac1.2bit'" | hgsql methBark1 cd ~/kent/src/hg/makeDb/trackDb # add the trackDb directories mkdir -p archae/methBark1 cvs add archae/methBark1 cvs commit # GC20BASE (DONE) ssh kkstore02 mkdir -p /cluster/data/methBark1/bed/gc20Base cd /cluster/data/methBark1/bed/gc20Base hgGcPercent -wigOut -doGaps -file=stdout -win=20 methBark1 \ /cluster/data/methBark1/ | wigEncode stdin gc20Base.wig gc20Base.wib ssh hgwdev cd /cluster/data/methBark1/bed/gc20Base mkdir /gbdb/methBark1/wib ln -s `pwd`/gc20Base.wib /gbdb/methBark1/wib hgLoadWiggle -pathPrefix=/gbdb/methBark1/wib methBark1 gc20Base gc20Base.wig # verify index is correct: hgsql methBark1 -e "show index from gc20Base;" # should see good numbers in Cardinality column # TANDEM REPEAT MASKER (DONE) ssh hgwdev mkdir -p /custer/data/methBark1/bed/simpleRepeat cd /cluster/data/methBark1 trfBig methBark1.fa /dev/null -bedAt=/cluster/data/methBark1/bed/simpleRepeat/chr.bed cd /cluster/data/methBark1/bed/simpleRepeat vi chr.bed and replace methBark1 with chr hgLoadBed methBark1 simpleRepeat *.bed -sqlTable=/cluster/home/baertsch/kent/src/hg/lib/simpleRepeat.sql # MULTIZ with other methanogens # DONE (10/15/05), kpollard cd /cluster/data/methBark1/bed/ mkdir conservation cd conservation cp /cluster/data/metAce1/bed/conservation/HoxD55.q . cp /cluster/data/metAce1/bed/conservation/*.chr . cp /cluster/data/metAce1/bed/conservation/*.nib . cp /cluster/data/metAce1/bed/conservation/*.2bit . #chrom sizes faSize -detailed *.chr > chrom.sizes #blastz blastz metBak0.chr metAce1.chr Q=HoxD55.q > metBak0-metAce1.lav blastz metBak0.chr metMaz1.chr Q=HoxD55.q > metBak0-metMaz1.lav blastz metBak0.chr metBur0.2bit Q=HoxD55.q > metBak0-metBur0.lav /cluster/bin/i386/lavToAxt metBak0-metAce1.lav metBak0.2bit metAce1.2bit metBak0-metAce1.axt /cluster/bin/i386/lavToAxt metBak0-metMaz1.lav metBak0.2bit . metBak0-metMaz1.axt /cluster/bin/i386/lavToAxt metBak0-metBur0.lav metBak0.2bit metBur0.2bit metBak0-metBur0.axt axtBest metBak0-metAce1.axt metBak0.2bit -winSize=500 -minScore=5000 metBak0-metAce1-best.axt axtBest metBak0-metMaz1.axt metBak0.2bit -winSize=500 -minScore=5000 metBak0-metMaz1-best.axt axtBest metBak0-metBur0.axt metBak0.2bit -winSize=500 -minScore=5000 metBak0-metBur0-best.axt axtToMaf metBak0-metAce1-best.axt chrom.sizes chrom.sizes metBak0-metAce1.maf axtToMaf metBak0-metMaz1-best.axt chrom.sizes chrom.sizes metBak0-metMaz1.maf axtToMaf metBak0-metBur0-best.axt chrom.sizes chrom.sizes metBak0-metBur0.maf #STOP: SEEMS TO ONLY USE THE FIRST contig... Need to do separately! #TRY AGAIN WITH methBark1 (assembled into 1 chr, 1 plasmid) cd /cluster/data/methBark1/bed mv conservation conservation.metBak0 mkdir conservation cd conservation cp ../conservation.metBak0/HoxD55.q . cp ../../methBark1.fa . sed s/chr/methBark1.chr/ methBark1.fa > temp sed s/plas/methBark1.plas/ temp > methBark1.fa cp /cluster/data/methBurt1/methBurt1.fa . sed s/Contig/methBurt1.Contig/ methBurt1.fa > temp cat temp | gawk '{if(/Contig/){print $1;}else{print toupper($0);}}' > methBurt1.fa cp /cluster/data/metAce1/bed/conservation/metAce1.chr . cp /cluster/data/metAce1/bed/conservation/metMaz1.chr methMaze1.chr sed s/tMaz/thMaze/ methMaze1.chr > temp mv temp methMaze1.chr faToNib metAce1.chr metAce1.chr.nib faToNib methMaze1.chr methMaze1.chr.nib faToTwoBit methBurt1.fa methBurt1.2bit faToTwoBit methBark1.fa methBark1.2bit #chrom sizes faSize -detailed *.chr *.fa > chrom.sizes #split methBark1.fa faSplit byname methBark1.fa ./ echo "chr" > chroms.txt echo "plasmid1" >> chroms.txt #blastz foreach f (`cat chroms.txt`) echo $f blastz methBark1.$f.fa metAce1.chr Q=HoxD55.q > methBark1.${f}-metAce1.lav blastz methBark1.$f.fa methMaze1.chr Q=HoxD55.q > methBark1.${f}-methMaze1.lav blastz methBark1.$f.fa methBurt1.fa Q=HoxD55.q > methBark1.${f}-methBurt1.lav end foreach f (`cat chroms.txt`) echo $f echo "lavToAxt" lavToAxt methBark1.${f}-metAce1.lav methBark1.2bit . methBark1.${f}-metAce1.axt lavToAxt methBark1.${f}-methMaze1.lav methBark1.2bit . methBark1.${f}-methMaze1.axt lavToAxt methBark1.${f}-methBurt1.lav methBark1.2bit methBurt1.2bit methBark1.${f}-methBurt1.axt echo "axtBest" axtBest methBark1.${f}-metAce1.axt all -winSize=500 -minScore=5000 methBark1.${f}-metAce1-best.axt axtBest methBark1.${f}-methMaze1.axt all -winSize=500 -minScore=5000 methBark1.${f}-methMaze1-best.axt axtBest methBark1.${f}-methBurt1.axt all -winSize=500 -minScore=5000 methBark1.${f}-methBurt1-best.axt echo "axtToMaf" axtToMaf methBark1.${f}-metAce1-best.axt chrom.sizes chrom.sizes methBark1.${f}-metAce1.maf axtToMaf methBark1.${f}-methMaze1-best.axt chrom.sizes chrom.sizes methBark1.${f}-methMaze1.maf axtToMaf methBark1.${f}-methBurt1-best.axt chrom.sizes chrom.sizes methBark1.${f}-methBurt1.maf end #multiz foreach f(*.maf) cat $f | gawk 'BEGIN{getline; print $0; getline; getline; getline; getline;}{print $0;}' > temp mv temp $f end foreach f(`cat chroms.txt`) echo $f multiz methBark1.${f}-metAce1.maf methBark1.${f}-methMaze1.maf - > methBark1.${f}-metAce1-methMaze1.maf multiz methBark1.${f}-methBurt1.maf methBark1.${f}-metAce1-methMaze1.maf - > methBark1.${f}-metAce1-methMaze1-methBurt1.maf end #phyloHMM foreach f(`cat chroms.txt`) echo $f msa_view -i MAF -M methBark1.$f.fa -o SS methBark1.${f}-metAce1-methMaze1-methBurt1.maf > methBark1.$f.ss cat methBark1.$f.ss | gawk '{if(/^NAMES/){print "NAMES = methBark1,metAce1,methMaze1,methBurt1";} else{print $0;}}' > temp mv temp methBark1.$f.ss phyloFit -i SS methBark1.$f.ss -t "(methBurt1,(methBark1,(metAce1,methMaze1)))" -o ${f}_MbaMaMmMbu end #chr has the largest NTUPLES (1072) so use it for starting mod # it shows GC=0.207678+0.206065=0.413743 #add GC content to next call foreach f(`cat chroms.txt`) echo $f phastCons methBark1.$f.ss chr_MbaMaMmMbu.mod \ --gc 0.4137 --target-coverage 0.7 --estimate-trees ${f} \ --expected-lengths 25 --no-post-probs --ignore-missing \ --nrates 1,1 end foreach f(*.cons.mod) set b=$f:r:r echo $b phastCons methBark1.$b.ss chr.cons.mod,chr.noncons.mod \ --target-coverage 0.7 --expected-lengths 25 \ --viterbi ${b}_methBark1-elements.bed --score \ --require-informative 0 --seqname $b > ${b}_cons.dat wigEncode ${b}_cons.dat ${b}_phastCons.wig ${b}_phastCons.wib end #combine phastCons elements into 1 bed file cat *elements.bed > phastCons.bed #move data mkdir wib mv chr_phastCons.wib wib/. mv plasmid1_phastCons.wib wib/. mv chr_phastCons.wig wib/. mv plasmid1_phastCons.wig wib/. ln -s /cluster/data/methBark1/bed/conservation/wib/*.wib /gbdb/methBark1/wib mkdir /gbdb/methBark1/pwMaf mkdir -p otherSpp/metAce1 otherSpp/methMaze1 otherSpp/methBurt1 foreach f(`cat chroms.txt`) echo $f mv methBark1.${f}-metAce1.maf otherSpp/metAce1/$f.maf mv methBark1.${f}-methMaze1.maf otherSpp/methMaze1/$f.maf mv methBark1.${f}-methBurt1.maf otherSpp/methBurt1/$f.maf end ln -s /cluster/data/methBark1/bed/conservation/otherSpp/metAce1 /gbdb/methBark1/pwMaf/metAce1_pwMaf ln -s /cluster/data/methBark1/bed/conservation/otherSpp/methBurt1 /gbdb/methBark1/pwMaf/methBurt1_pwMaf ln -s /cluster/data/methBark1/bed/conservation/otherSpp/methMaze1 /gbdb/methBark1/pwMaf/methMaze1_pwMaf mkdir multiz foreach f(`cat chroms.txt`) echo $f mv methBark1.${f}-metAce1-methMaze1-methBurt1.maf multiz/$f.maf end ln -s /cluster/data/methBark1/bed/conservation/multiz /gbdb/methBark1/multizMbaMaMmMbu #load hgLoadWiggle methBark1 phastCons /cluster/data/methBark1/bed/conservation/wib/*_phastCons.wig hgLoadMaf -warn methBark1 multizMbaMaMmMbu hgLoadMaf -warn methBark1 metAce1_pwMaf -pathPrefix=/gbdb/methBark1/pwMaf/metAce1_pwMaf hgLoadMaf -warn methBark1 methBurt1_pwMaf -pathPrefix=/gbdb/methBark1/pwMaf/methBurt1_pwMaf hgLoadMaf -warn methBark1 methMaze1_pwMaf -pathPrefix=/gbdb/methBark1/pwMaf/methMaze1_pwMaf hgLoadBed methBark1 phastConsElements phastCons.bed #trackDb cd ~/kent/src/hg/makeDb/trackDb/archae/ mkdir methBark1 cvs add methBark1 cd methBark1 #trackDb.ra entry # track multizMbaMaMmMbu # shortLabel Conservation # longLabel Methanogen multiz alignments # group compGeno # priority 10.0 # visibility pack # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons # yLineOnOff Off # autoScale Off # pairwise pwMaf # speciesOrder metAce1 methMaze1 methBurt1 cvs add trackDb.ra cvs commit -m "New multiz track" trackDb.ra #html page cvs add multizMbaMaMmMbu.html cvs commit -m "Details page for multiz track" multizMbaMaMmMbu.html # DESCRIPTION PAGE () ssh hgwdev # Write ~/kent/src/hg/makeDb/trackDb/archae/methBark1/description.html chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/methBark1/description.html # Check it in. mkdir /gbdb/methBark1/html ln -s /cluster/data/methBark1/html/description.html /gbdb/methBark1/html/ # GENBANK PROTEIN-CODING GENES () ssh hgwdev mkdir /cluster/data/methBark1/genbank cd /cluster/data/methBark1/genbank wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Methanosarcina_acetivorans/NC_003552.gbk mv NC_003552.gbk methBark1.gbk # Create 3 files to assist parsing of the genbank # 1. for a bed file echo 'chr start end gene 1000 strand' > methBark1-params-bed.txt # 2. for the peptide parts echo 'gene translation' > methBark1-params-pep.txt # 3. for the other gene information echo 'gene product note' > methBark1-params-xra.txt # Now extract the genes and information: gbArchaeGenome methBark1.gbk methBark1-params-bed.txt methBark1-genbank-cds.bed gbArchaeGenome methBark1.gbk methBark1-params-pep.txt methBark1-genbank-cds.pep gbArchaeGenome methBark1.gbk methBark1-params-xra.txt methBark1-genbank-cds.xra hgLoadBed methBark1 gbProtCode methBark1-genbank-cds.bed hgsql methBark1 < ~/kent/src/hg/lib/pepPred.sql hgsql methBark1 < ~/kent/src/hg/lib/minGeneInfo.sql echo rename table pepPred to gbProtCodePep | hgsql methBark1 echo rename table minGeneInfo to gbProtCodeXra | hgsql methBark1 echo load data local infile \'methBark1-genbank-cds.pep\' into table gbProtCodePep | hgsql methBark1 echo load data local infile \'methBark1-genbank-cds.xra\' into table gbProtCodeXra | hgsql methBark1 #genbank to genePred csh tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' methBark1-genbank-cds.bed | bedToGenePred stdin tmp.gp tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,3,4),name2,"cmpl","cmpl",0}' tmp.gp > tmp2.gp join -t " " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.3 1.13 1.14 1.15 tmp2.gp methBark1-genbank-cds.xra > methBark1.gp # GENBANK rRNA GENES () ssh hgdev cd /cluster/data/methBark1/genbank gbArchaeGenome -kind=rRNA methBark1.gbk methBark1-params-bed.txt methBark1-rrnas.bed echo 'gene product NA' > methBark1-params-rrna-xra.txt gbArchaeGenome -kind=rRNA methBark1.gbk methBark1-params-rrna-xra.txt methBark1-rrnas-xra.txt hgLoadBed methBark1 gbRRNA methBark1-rrnas.bed hgsql methBark1 < ~/kent/src/hg/lib/minGeneInfo.sql echo rename table minGeneInfo to gbRRNAXra | hgsql methBark1 echo load data local infile \'methBark1-rrnas-xra.txt\' into table gbRRNAXra | hgsql methBark1 # COG STUFF # Cut and paste http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox into emacs (COG list) # and save as cogpage.txt awk '{printf("%s\t%s\n",$6,$5)}' < cogpage.txt | sed -e 's/\[//' -e 's/\]//' > cogs.txt rm cogpage.txt # Now we have the basic list of cogs and the letter code for each one. # TODD LOWE tRNA GENES () # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd # Lowe. See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields. # **Showing the tRNAScanSE instructions would be nice in the future. ssh hgwdev mkdir /cluster/data/methBark1/bed/loweTrnaGene cd /cluster/data/methBark1/bed/loweTrnaGene hgLoadBed -tab methBark1 loweTrnaGene methBark1-lowe-trnas.bed -sqlTable=~/kent/src/hg/lib/loweTrnaGene.sql # TODD LOWE snoRNA GENES () # This is a bed 6 file created by hand. ssh hgwdev mkdir /cluster/data/methBark1/bed/loweSnoGene cd /cluster/data/methBark1/bed/loweSnoGene hgLoadBed -tab methBark1 loweSnoGene methBark1-snos.bed # TIGR GENES () # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi # and fill out the web form as follows: # - Pick "Retrieve attributes for the specified DNA feature within a specific # organism and/or a specific role category". # * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs" # from the 1st and 3rd box. # * Select everything from "Choose TIGR Annotation Gene Attributes" # * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes" # * Select everything from "Choose Other Gene Attributes" # - Click submit, and click save as tab-delimited file. ssh hgwdev mkdir /cluster/data/methBark1/bed/tigrCmrORFs cp methBark1-tigr.tab /cluster/data/methBark1/bed/tigrCmrORFs cd /cluster/data/methBark1/bed/tigrCmrORFs /projects/lowelab/users/aamp/bin/i386/tigrCmrToBed methBark1-tigr.tab methBark1-tigr.bed hgLoadBed -tab methBark1 tigrCmrORFs methBark1-tigr.bed -sqlTable=~/kent/src/hg/lib/tigrCmrGene.sql