# for emacs: -*- mode: sh; -*-

# This file describes building the browser database for the archaeal
# species Pyrobaculum aerophilum.


# DOWNLOAD SEQUENCE FROM GENBANK (DONE 01/07/04)

ssh eieio
mkdir /cluster/store5/archae/pyrAer1
ln -s /cluster/store5/archae/pyrAer1 /cluster/data/pyrAer1
cd /cluster/data/pyrAer1
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.fna
mv NC_003364.fna chr1.fa
# Edit header of chr1.fa to '> pyrAer1'


# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 01/12/04)

ssh hgwdev
echo 'create database pyrAer1' | hgsql ''
cd /cluster/data/pyrAer1
hgNibSeq pyrAer1 /cluster/data/pyrAer1/nib chr1.fa
faSize -detailed chr1.fa > chrom.sizes
mkdir -p /gbdb/pyrAer1/nib
# Expose the nib under /gbdb, which is where dbDb.nibPath (below) points.
# This is the link that the chr1 -> chr rename step later in this doc
# removes and re-creates.
ln -s /cluster/data/pyrAer1/nib/chr1.nib /gbdb/pyrAer1/nib
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
  | hgsql pyrAer1
# A quoted here-document is used for the multi-line INSERT so the SQL reaches
# hgsql exactly as written (under sh, backslash-newline inside single quotes
# would be passed through literally).
hgsql -h genome-testdb hgcentraltest <<'EOF'
INSERT INTO dbDb
    (name, description, nibPath, organism,
     defaultPos, active, orderKey, genome, scientificName,
     htmlPath, hgNearOk) values
    ("pyrAer1", "Pyrobaculum aerophilum", "/gbdb/pyrAer1/nib", "P. aerophilum",
     "chr1:500000-550000", 1, 85, "Archae",
     "Pyrobaculum aerophilum", "/gbdb/pyrAer1/html/description.html",
     0);
EOF
echo 'INSERT INTO defaultDb (genome, name) values ("Archae", "pyrAer1");' \
  | hgsql -h genome-testdb hgcentraltest
cd ~/kent/src/hg/makeDb/trackDb
# edit the trackDb makefile
# add the trackDb directories
mkdir -p archae/pyrAer1
cvs add archae
cvs add archae/pyrAer1
cvs commit


# GC PERCENT TRACK (DONE 01/12/04)

ssh hgwdev
mkdir -p /cluster/data/pyrAer1/bed/gcPercent
cd /cluster/data/pyrAer1/bed/gcPercent
hgsql pyrAer1 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent -win=2000 pyrAer1 ../../nib
# edit ~/kent/src/hg/makeDb/trackDb/archae/trackDb.ra and add an entry for
# GC percent for 2,000 base windows instead (simply cut and paste from
# ~/kent/src/hg/makeDb/trackDb/trackDb.ra).

# GC 20 BASE WIGGLE TRACK (DONE 8/20)

mkdir /cluster/data/pyrAer1/bed/gc20Base
cd /cluster/data/pyrAer1/bed/gc20Base
mkdir wigData20 dataLimits20
# Keep only the GC rows, then emit "<column 2 + 1> <column 5 / 10>" pairs for
# wigAsciiToBinary.
# NOTE(review): presumably column 2 is a 0-based start and column 5 is GC in
# tenths of a percent -- confirm against hgGcPercent output.
hgGcPercent -chr=chr -file=stdout -win=20 -overlap=19 pyrAer1 ../../nib \
  | grep -w GC \
  | awk '{
      perCent = $5/10.0
      printf "%d\t%.1f\n", $2+1, perCent
    }' \
  | wigAsciiToBinary -dataSpan=1 -chrom=chr \
      -wibFile=wigData20/gc20Base_1 -name=1 stdin > dataLimits20/chr
hgLoadWiggle pyrAer1 gc20Base wigData20/*.wig
mkdir /gbdb/pyrAer1/wib
ln -s "$(pwd)"/wigData20/*.wib /gbdb/pyrAer1/wib

# the trackDb entry
#   track gc20Base
#   shortLabel GC Percent
#   longLabel GC Percent in 20 base windows
#   group map
#   priority 1.7
#   visibility hide
#   autoScale Off
#   maxHeightPixels 128:36:16
#   graphTypeDefault Bar
#   gridDefault OFF
#   windowingFunction Mean
#   color 0,128,255
#   altColor 255,128,0
#   viewLimits 30:70
#   type wig 0 100


# CONTIG TRACK

# reformat is a Todd Lowe program
# Fetch GenBank records AE009746.1 .. AE009946.1 and append each one, converted
# to FASTA, to contigs.fa.
for num in $(seq 746 946); do
  curl -o "${num}.gbk" "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?txt=on&val=AE009${num}.1"
  reformat fasta "${num}.gbk" >> contigs.fa
done
blat /cluster/data/pyrAer1/chr1.fa contigs.fa -minIdentity=100 contigs.psl
perfectBlatBed4 contigs.psl contigs.bed
mkdir /cluster/data/pyrAer1/bed/pyrAer1Contigs
cp contigs.bed /cluster/data/pyrAer1/bed/pyrAer1Contigs
cd /cluster/data/pyrAer1/bed/pyrAer1Contigs
hgLoadBed pyrAer1 pyrAer1Contigs contigs.bed

# the trackDb entry:
#   track pyrAer1Contigs
#   shortLabel Contigs
#   longLabel Contigs deposited in Genbank
#   group map
#   priority 0.5
#   visibility pack
#   url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=nucleotide&cmd=search&term=$$
#   type bed 4 .


# TANDEM REPEAT MASKER (NOT DONE YET)

ssh hgwdev
mkdir -p /cluster/data/pyrAer1/bed/simpleRepeat
cd /cluster/data/pyrAer1
trfBig chr1.fa /dev/null -bedAt=/cluster/data/pyrAer1/bed/simpleRepeat/chr1.bed
cd /cluster/data/pyrAer1/bed/simpleRepeat
# $HOME is spelled out because the shell does not expand a '~' that follows
# '=' inside an option word, so hgLoadBed would receive a literal '~' path.
hgLoadBed pyrAer1 simpleRepeat *.bed -sqlTable="$HOME"/kent/src/hg/lib/simpleRepeat.sql
# weird, I get a seg fault here. need to ask Hiram or somebody else.
# NOTE(review): this step was originally run with -sqlTable=~kent/src/...
# (no slash after '~'), unlike the ~/kent/... paths used elsewhere in this
# doc; the unreadable .sql path may be related to the seg fault -- re-test.

# DESCRIPTION PAGE (DONE 01/12/04)

ssh hgwdev
# Write ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
# Check it in.
mkdir /gbdb/pyrAer1/html
ln -s /cluster/data/pyrAer1/html/description.html /gbdb/pyrAer1/html/


# GENBANK PROTEIN-CODING GENES (DONE 01/14/04)

ssh hgwdev
mkdir /cluster/data/pyrAer1/genbank
cd /cluster/data/pyrAer1/genbank
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.gbk
mv NC_003364.gbk pyrAer1.gbk
# Create 3 files to assist parsing of the genbank
# 1. for a bed file
echo 'chr1 start end gene 1000 strand' > pyrAer1-params-bed.txt
# 2. for the peptide parts
echo 'gene translation' > pyrAer1-params-pep.txt
# 3. for the other gene information
echo 'gene gene product note protein_id db_xref EC_number pseudo' > pyrAer1-params-xra.txt
# Now extract the genes and information: one gbArchaeGenome pass per
# parameter file, writing pyrAer1-genbank-cds.{bed,pep,xra}.
for ext in bed pep xra; do
  gbArchaeGenome pyrAer1.gbk "pyrAer1-params-${ext}.txt" "pyrAer1-genbank-cds.${ext}"
done
# Create the stock pepPred / minGeneInfo tables, rename them to the
# track-specific names, then bulk-load the extracted files into them.
hgsql pyrAer1 < ~/kent/src/hg/lib/pepPred.sql
hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
echo 'rename table pepPred to gbProtCodePep' | hgsql pyrAer1
echo 'rename table minGeneInfo to gbProtCodeXra' | hgsql pyrAer1
echo "load data local infile 'pyrAer1-genbank-cds.pep' into table gbProtCodePep" | hgsql pyrAer1
echo "load data local infile 'pyrAer1-genbank-cds.xra' into table gbProtCodeXra" | hgsql pyrAer1
csh
# Pad each 6-field bed record out to 12 fields (start/end repeated, then
# 0, 1, end-start, 0) before handing it to bedToGenePred.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' pyrAer1-genbank-cds.bed \
  | bedToGenePred stdin tmp.gp
#below substr($1,4,4) must be edited so that it returns a numeric field.
# Extend tmp.gp to 15 columns.  Column 11 is substr($1,4,4), which has to be
# adjusted by hand until it yields a numeric field; column 12 (name2) is an
# unset awk variable, i.e. an empty placeholder that the join below replaces.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,4,4),name2,"cmpl","cmpl",0}' tmp.gp > tmp2.gp
# hard tab between quotes use ctrl-V then press tab
# The -o list is given as a single comma-separated argument; column 12 of the
# output is taken from field 2 of the .xra file.
# NOTE(review): join expects both inputs sorted on their first field -- confirm.
join -t "	" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,2.2,1.13,1.14,1.15 tmp2.gp pyrAer1-genbank-cds.xra > pyrAer1.gp
ldHgGene pyrAer1 refSeq pyrAer1.gp -predTab -genePredExt


# GENBANK rRNA GENES (NOT QUITE DONE)

# hgwdev, as in every other section of this doc (was "hgdev").
ssh hgwdev
cd /cluster/data/pyrAer1/genbank
gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-bed.txt pyrAer1-rrnas.bed
echo 'gene product NA' > pyrAer1-params-rrna-xra.txt
gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-rrna-xra.txt pyrAer1-rrnas-xra.txt
hgLoadBed pyrAer1 gbRRNA pyrAer1-rrnas.bed
hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
echo rename table minGeneInfo to gbRRNAXra | hgsql pyrAer1
echo load data local infile \'pyrAer1-rrnas-xra.txt\' into table gbRRNAXra | hgsql pyrAer1


# COG STUFF

# Cut and paste http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox into emacs (COG list)
# and save as cogpage.txt
# Emit "<field 6><TAB><field 5>" per line, stripping the first '[' and ']'.
awk '{printf("%s\t%s\n",$6,$5)}' < cogpage.txt | sed -e 's/\[//' -e 's/\]//' > cogs.txt
rm cogpage.txt
# Now we have the basic list of cogs and the letter code for each one.


# TODD LOWE tRNA GENES (DONE 01/15/04)

# This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
# Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
# **Showing the tRNAScanSE instructions would be nice in the future.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/loweTrnaGene
cd /cluster/data/pyrAer1/bed/loweTrnaGene
# $HOME is spelled out because the shell does not expand a '~' that follows
# '=' inside an option word.
hgLoadBed -tab pyrAer1 loweTrnaGene pyrAer1-lowe-trnas.bed -sqlTable="$HOME"/kent/src/hg/lib/loweTrnaGene.sql


# TODD LOWE snoRNA GENES (DONE 01/16/04)

# This is a bed 6 file created by hand.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/loweSnoGene
cd /cluster/data/pyrAer1/bed/loweSnoGene
hgLoadBed -tab pyrAer1 loweSnoGene pyrAer1-snos.bed


# TIGR GENES (DONE 02/09/04)

# First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
# and fill out the web form as follows:
#   - Pick "Retrieve attributes for the specified DNA feature within a specific
#     organism and/or a specific role category".
#       * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs"
#         from the 1st and 3rd box.
#       * Select everything from "Choose TIGR Annotation Gene Attributes"
#       * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
#       * Select everything from "Choose Other Gene Attributes"
#   - Click submit, and click save as tab-delimited file.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/tigrCmrORFs
cp pyrAer1-tigr.tab /cluster/data/pyrAer1/bed/tigrCmrORFs
cd /cluster/data/pyrAer1/bed/tigrCmrORFs
tigrCmrToBed pyrAer1-tigr.tab pyrAer1-tigr.bed
# $HOME is spelled out because the shell does not expand a '~' that follows
# '=' inside an option word.
hgLoadBed -tab pyrAer1 tigrCmrGene pyrAer1-tigr.bed -sqlTable="$HOME"/kent/src/hg/lib/tigrCmrGene.sql
# The table is loaded under the name the .sql file defines, then renamed.
echo RENAME TABLE tigrCmrGene to tigrCmrORFs | hgsql pyrAer1


# Lowe Lab Microarrays (DONE 02/09/04)

ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/llaPaePrintA
cp Pae-arrays.{bed,exps} /cluster/data/pyrAer1/bed/llaPaePrintA
cd /cluster/data/pyrAer1/bed/llaPaePrintA
hgLoadBed pyrAer1 llaPaePrintA Pae-arrays.bed
# Create the stock expRecord table, rename it, then bulk-load the .exps file.
hgsql pyrAer1 < ~/kent/src/hg/lib/expRecord.sql
echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql pyrAer1
echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql pyrAer1
# for now I'm using both hgFixed and the pyrAer1 database for this... that will change.
# Same expRecord create / rename / load sequence, this time in hgFixed.
hgsql hgFixed < ~/kent/src/hg/lib/expRecord.sql
echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql hgFixed
echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql hgFixed


# AORF TRACK

# This is another hand-created file, provided originally by Sorel Fitz-Gibbon and massaged to be a bed.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/primAORF
cp aorfs.bed /cluster/data/pyrAer1/bed/primAORF
cd /cluster/data/pyrAer1/bed/primAORF
hgLoadBed pyrAer1 primAORF aorfs.bed


# CHANGE "chr1" to "chr"

# hgwdev, as in every other section of this doc (was "hgdev").
ssh hgwdev
mv /cluster/data/pyrAer1/nib/chr1.nib /cluster/data/pyrAer1/nib/chr.nib
rm /gbdb/pyrAer1/nib/chr1.nib
ln -s /cluster/data/pyrAer1/nib/chr.nib /gbdb/pyrAer1/nib/chr.nib
# a quick script to replace chr1 with chr
# It rewrites EVERY file named on its command line (xargs hands it many at
# once), staging the result next to the target instead of in a shared,
# fixed-name /tmp file.
cd /cluster/data/pyrAer1/bed
cat > changeCh.sh <<'EOF'
#!/bin/bash
for f in "$@"; do
  sed 's/chr1/chr/g' "$f" > "$f.tmp" && mv "$f.tmp" "$f"
done
EOF
chmod +x changeCh.sh
find . -name '*.bed' -print0 | xargs -0 ./changeCh.sh
# Now change the DB
cd /tmp
hgsqldump pyrAer1 | sed 's/chr1/chr/g' > paeNew.sql
echo drop database pyrAer1 | hgsql test
echo create database pyrAer1 | hgsql test
hgsql pyrAer1 < paeNew.sql
rm paeNew.sql
echo 'update dbDb set defaultPos="chr:550000-580000" where name="pyrAer1"' | hgsql -h genome-testdb hgcentraltest


# Pae promoter track

cd /cluster/data/pyrAer1/wiggle
mkdir promoterScanPos
mkdir promoterScanNeg
cd promoterScanPos
# The positive-strand directory needs prom.pos.gz, the file the next command
# reads (this step previously copied prom.neg.gz).
# NOTE(review): the source directory of the prom.*.gz / shine.*.gz files is
# not recorded in this doc -- supply the real path when re-running.
cp prom.pos.gz .
wigAsciiToBinary -chrom=chr -wibFile=promoterScanPos prom.pos.gz
hgLoadWiggle pyrAer1 promoterScanPos promoterScanPos.wig
cd ../promoterScanNeg
cp prom.neg.gz .
wigAsciiToBinary -chrom=chr -wibFile=promoterScanNeg prom.neg.gz
hgLoadWiggle pyrAer1 promoterScanNeg promoterScanNeg.wig
cd /gbdb/pyrAer1/wib
ln -s /cluster/data/pyrAer1/wiggle/promoterScanPos/promoterScanPos.wib promoterScanPos.wib
ln -s /cluster/data/pyrAer1/wiggle/promoterScanNeg/promoterScanNeg.wib promoterScanNeg.wib


# shine DG track

cd /cluster/data/pyrAer1/wiggle
mkdir shineDGPos
mkdir shineDGNeg
cd shineDGPos
cp shine.pos.gz .
# Positive strand: convert shine.pos.gz to .wig/.wib and load the table.
wigAsciiToBinary -chrom=chr -wibFile=shineDGPos shine.pos.gz
hgLoadWiggle pyrAer1 shineDGPos shineDGPos.wig

# Negative strand: same two steps in the sibling directory.
cd ../shineDGNeg
cp shine.neg.gz .
wigAsciiToBinary -chrom=chr -wibFile=shineDGNeg shine.neg.gz
hgLoadWiggle pyrAer1 shineDGNeg shineDGNeg.wig

# Link both .wib files into /gbdb (Pos first, then Neg).
cd /gbdb/pyrAer1/wib
for strand in Pos Neg; do
  ln -s "/cluster/data/pyrAer1/wiggle/shineDG${strand}/shineDG${strand}.wib" "shineDG${strand}.wib"
done


# RNA genes

cd /cluster/data/pyrAer1/bed
mkdir rnaGenes
# NOTE(review): rnaGenes is created but never entered, so the copies and the
# load below all run in bed/ itself -- confirm whether "cd rnaGenes" was meant.
cp paeAllRNA.bed .
cp ~/kent/src/hg/lib/rnaGenes.sql .
hgLoadBed -sqlTable=rnaGenes.sql pyrAer1 rnaGenes paeAllRNA.bed