# for emacs: -*- mode: sh; -*-

# This file describes building the browser database for the archaeal
# species Halobacterium sp. NRC-1 (database name: haloHalo1).
#
# It is a command log, one command per line.  The `ssh <host>` lines
# presumably mark the machine on which the commands that follow were run;
# the quoting conventions (backslash-newline inside quoted strings) are
# those of csh/tcsh.
#
# if this is the first time you are making your own hgwdev browser, need to do
# cd ~/kent/src/, then a make

# DOWNLOAD SEQUENCE FROM GENBANK (DONE)

ssh eieio
mkdir /cluster/store5/archae/haloHalo1
ln -s /cluster/store5/archae/haloHalo1 /cluster/data/haloHalo1
cd /cluster/data/haloHalo1
cp /projects/lowelab/db/Bacteria/Halobacterium_sp/Halo_sp* .
mv Halo_sp.fa haloHalo1.fa
# Edit header of *.fa to '>chr >plasmid_pNRC100 >plasmid_pNRC200'
# (i.e. the three sequences are named chr, plasmid_pNRC100, plasmid_pNRC200)
faToTwoBit haloHalo1.fa haloHalo1.2bit
mkdir /gbdb/haloHalo1
ln -s /cluster/data/haloHalo1/haloHalo1.2bit /gbdb/haloHalo1/haloHalo1.2bit

# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE)

ssh hgwdev
echo 'create database haloHalo1' | hgsql ''
cd /cluster/data/haloHalo1
faSize -detailed haloHalo1.fa > chrom.sizes
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
  | hgsql haloHalo1
# Register the assembly in the central database.
# NOTE(review): defaultPos originally read "chr1:500000-550000", but the
# sequences of this assembly are named chr / plasmid_pNRC100 /
# plasmid_pNRC200, so it is written against "chr" here -- confirm the row
# actually stored in hgcentraltest.
echo 'INSERT INTO dbDb \
  (name, description, nibPath, organism, \
  defaultPos, active, orderKey, genome, scientificName, \
  htmlPath, hgNearOk) values \
  ("haloHalo1", "August 1996", "/gbdb/haloHalo1", "Halobacterium sp.", \
  "chr:500000-550000", 1, 233, "Halobacterium sp.", \
  "Halobacterium sp. NRC-1", "/gbdb/haloHalo1/html/description.html", \
  0);' \
  | hgsql hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Halobacterium sp.", "haloHalo1");' \
  | hgsql hgcentraltest
echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Halobacterium sp.", "archaea",85);' \
  | hgsql hgcentraltest

# CREATE CHROMINFO TABLE (DONE)

ssh hgwdev
cd /cluster/data/haloHalo1
cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
hgsql haloHalo1 < chromInfo.sql
# Load the per-sequence sizes, then point every row at the 2bit file.
echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql haloHalo1
echo "update chromInfo set fileName = '/gbdb/haloHalo1/haloHalo1.2bit'" | hgsql haloHalo1

cd ~/kent/src/hg/makeDb/trackDb
# add the trackDb directories
mkdir -p archae/haloHalo1
cvs add archae/haloHalo1
cvs commit archae/haloHalo1
make DBS=haloHalo1

# GC20BASE (DONE)

mkdir -p /cluster/data/haloHalo1/bed/gc20Base
cd /cluster/data/haloHalo1/bed/gc20Base
hgGcPercent -wigOut -doGaps -file=stdout -win=20 haloHalo1 \
  /cluster/data/haloHalo1/ | wigEncode stdin gc20Base.wig gc20Base.wib
cd /cluster/data/haloHalo1/bed/gc20Base
mkdir /gbdb/haloHalo1/wib
ln -s `pwd`/gc20Base.wib /gbdb/haloHalo1/wib
hgLoadWiggle -pathPrefix=/gbdb/haloHalo1/wib haloHalo1 gc20Base gc20Base.wig
# verify index is correct:
hgsql haloHalo1 -e "show index from gc20Base;"
# should see good numbers in Cardinality column

# TANDEM REPEAT MASKER (DONE)

ssh hgwdev
mkdir -p /cluster/data/haloHalo1/bed/simpleRepeat
cd /cluster/data/haloHalo1
trfBig haloHalo1.fa /dev/null -bedAt=/cluster/data/haloHalo1/bed/simpleRepeat/chr.bed
cd /cluster/data/haloHalo1/bed/simpleRepeat
# NOTE(review): the schema path originally read ".../kent/sr/hg/lib/..."; it
# is written as kent/src here to match the kent/src path used for trackDb
# above -- confirm the file location.
hgLoadBed haloHalo1 simpleRepeat *.bed -sqlTable=/cluster/home/lowe/kent/src/hg/lib/simpleRepeat.sql

# CHAIN TRACK with halMar1
# DONE (10/13/05), kpollard

cd /cluster/data/haloHalo1/bed/
mkdir conservation
cd conservation
# Scoring matrix copied from the pyrFur2 conservation directory.
cp /cluster/data/pyrFur2/bed/conservation/HoxD55.q .
cp ../../haloHalo1.fa haloHalo1.chr
cat /cluster/data/halMar1/*.fa > halMar1.chr
#fix headers
# Header lines mentioning "plasmid" or "chr" are cut down to their first
# whitespace-separated word; all other lines pass through unchanged.
cat haloHalo1.chr | gawk '{if(/plasmid/){print $1;} else{print $0;}}' > temp
cat temp | gawk '{if(/chr/){print $1;} else{print $0;}}' > haloHalo1.chr
# halMar1: rewrite "chrom"/"plas" to ">chr"/">plasmid", then keep only the
# second word of any line that mentions halMar1.
sed s/chrom/\>chr/ halMar1.chr | sed s/plas/\>plasmid/ | gawk '{if(/halMar1/){print $2;} else{print $0;}}' > temp
mv temp halMar1.chr
#ref seq needs separate files for each seq
mv haloHalo1.chr haloHalo1.fa
# Each record is selected by its header name, so the result does not depend
# on the order of the records inside haloHalo1.fa.  (The original split used
# flag toggling that was only correct for the order
# plasmid_pNRC100, chr, plasmid_pNRC200.)
gawk -v want=">plasmid_pNRC100" '/^>/{keep=($1==want)} keep' haloHalo1.fa > haloHalo1.plasmid_pNRC100
gawk -v want=">chr" '/^>/{keep=($1==want)} keep' haloHalo1.fa > haloHalo1.chr
gawk -v want=">plasmid_pNRC200" '/^>/{keep=($1==want)} keep' haloHalo1.fa > haloHalo1.plasmid_pNRC200
faToTwoBit halMar1.chr halMar1.2bit
faToTwoBit haloHalo1.fa haloHalo1.2bit
faToNib haloHalo1.plasmid_pNRC100 plasmid_pNRC100.nib
faToNib haloHalo1.chr chr.nib
faToNib haloHalo1.plasmid_pNRC200 plasmid_pNRC200.nib
faSize -detailed haloHalo1.plasmid_pNRC100 haloHalo1.plasmid_pNRC200 haloHalo1.chr halMar1.chr > chrom.sizes
#blastz
blastz haloHalo1.plasmid_pNRC100 halMar1.chr Q=HoxD55.q > haloHalo1.p1-halMar1.lav
# NOTE(review): the original notes spelled the next command "blaztz".  A
# misspelled command cannot run, yet the redirection still creates an empty
# output file, which may be why the p2 .lav was found empty below.  Re-check
# the p2 alignment before discarding it.
blastz haloHalo1.plasmid_pNRC200 halMar1.chr Q=HoxD55.q > haloHalo1.p2-halMar1.lav
blastz haloHalo1.chr halMar1.chr Q=HoxD55.q > haloHalo1.c-halMar1.lav
lavToAxt haloHalo1.p1-halMar1.lav . halMar1.2bit haloHalo1.p1-halMar1.axt
lavToAxt haloHalo1.p2-halMar1.lav . halMar1.2bit haloHalo1.p2-halMar1.axt
lavToAxt haloHalo1.c-halMar1.lav . halMar1.2bit haloHalo1.c-halMar1.axt
#p2 lav is empty - remove
rm haloHalo1.p2-halMar1.lav haloHalo1.p2-halMar1.axt
axtBest haloHalo1.c-halMar1.axt all -winSize=500 -minScore=5000 haloHalo1.c-halMar1-best.axt
axtBest haloHalo1.p1-halMar1.axt all -winSize=500 -minScore=5000 haloHalo1.p1-halMar1-best.axt
#chain
# Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# Turn every run of one or more spaces into a single tab.  The pattern needs
# two spaces before the "*": with only one, it also matches the empty string
# and a tab would be inserted between every pair of characters.
sed 's/  */\t/g' temp.gap > chicken.gap
rm temp.gap
mkdir chains
# csh loop: for X-best.axt, $f:t:r drops the directory and the .axt extension
# and basename drops the "-best" suffix, so the output is chains/X.chain.
foreach f (*best.axt)
  set a=$f:t:r
  set b=`basename $a -best`
  axtChain -scoreScheme=HoxD55.q -linearGap=chicken.gap \
    -minScore=5000 $f haloHalo1.2bit halMar1.2bit \
    chains/$b.chain
end
#load chains
cd chains
hgLoadChain haloHalo1 chr_chainHalMar1 haloHalo1.c-halMar1.chain
hgLoadChain haloHalo1 plasmid_pNRC100_chainHalMar1 haloHalo1.p1-halMar1.chain
#load axt files for blastz track
cd ..
axtToPsl haloHalo1.c-halMar1-best.axt chrom.sizes chrom.sizes haloHalo1.c-halMar1.psl
axtToPsl haloHalo1.p1-halMar1-best.axt chrom.sizes chrom.sizes haloHalo1.p1-halMar1.psl
hgLoadPsl haloHalo1 -table=chr_blastzHalMar1 haloHalo1.c-halMar1.psl
hgLoadPsl haloHalo1 -table=plasmid_pNRC100_blastzHalMar1 haloHalo1.p1-halMar1.psl
#trackDb
cd ~/kent/src/hg/makeDb/trackDb/archae/
# -p: this directory is already created by an earlier step of these notes.
mkdir -p haloHalo1
cvs add haloHalo1
cd haloHalo1
# Track entries (presumably what was added to trackDb.ra, committed below):
# track chainHalMar1
# shortLabel $o_Organism Chain
# longLabel $o_Organism ($o_date/$o_db) Chained Alignments
# group compGeno
# priority 150.1
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type chain halMar1
# otherDb halMar1
#
# track blastzHalMar1
# shortLabel $o_Organism Blastz
# longLabel $o_Organism ($o_date/$o_db) Blastz Alignments
# group compGeno
# priority 154.1
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno halMar1
# otherDb halMar1
cvs add trackDb.ra
cvs commit -m "halMar1 chains" trackDb.ra
cvs add chainHalMar1.html
cvs commit -m "added halMar1 chains" chainHalMar1.html
cvs add blastzHalMar1.html
cvs commit -m "added halMar1 blastz" blastzHalMar1.html

# DESCRIPTION PAGE (DONE 10/20/05), kpollard

# Write ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/description.html
cd ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/
chmod a+r description.html
cvs add description.html
cvs commit -m "description page" description.html
mkdir -p /cluster/data/haloHalo1/html/
cp ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/description.html \
  /cluster/data/haloHalo1/html/description.html
mkdir -p /gbdb/haloHalo1/html
ln -s /cluster/data/haloHalo1/html/description.html /gbdb/haloHalo1/html/

# GENBANK PROTEIN-CODING GENES (DONE)

ssh hgwdev
mkdir /cluster/data/haloHalo1/genbank
cd /cluster/data/haloHalo1/genbank
cp /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.gbk .
mv NC_002607.gbk haloHalo1.gbk
mv NC_001869.gbk haloHalo1-pNRC100.gbk
mv NC_002608.gbk haloHalo1-pNRC200.gbk
# Create 3 files to assist parsing of the genbank
# NOTE(review): the original notes typed these lists into an interactive
# `cat > file`, each followed by the word "twice".  That word is presumed to
# be the remnant of a "press ctrl-D twice" instruction and is not written to
# the files; the one-field-per-line layout is likewise presumed -- confirm
# against gbArchaeGenome's expected parameter format.
# 1. for a bed file
cat << '_EOF_' > haloHalo1-params-bed.txt
chr
start
end
locus_tag
1000
strand
'_EOF_'
# 2. for the peptide parts
cat << '_EOF_' > haloHalo1-params-pep.txt
locus_tag
translation
'_EOF_'
# 3. for the other gene information
cat << '_EOF_' > haloHalo1-params-xra.txt
locus_tag
gene
product
note
protein_id
db_xref
EC_number
pseudo
'_EOF_'
# Now extract the genes and information:
gbArchaeGenome haloHalo1.gbk haloHalo1-params-bed.txt haloHalo1-genbank-cds.bed
gbArchaeGenome haloHalo1.gbk haloHalo1-params-pep.txt haloHalo1-genbank-cds.pep
gbArchaeGenome haloHalo1.gbk haloHalo1-params-xra.txt haloHalo1-genbank-cds.xra
# Plasmid records come out labelled "chr"; relabel with the plasmid name.
gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-bed.txt haloHalo1-pNRC100-genbank-cds.bed.tmp
sed 's/chr/plasmid_pNRC100/' haloHalo1-pNRC100-genbank-cds.bed.tmp > haloHalo1-pNRC100-genbank-cds.bed
# Check to see if coordinates for first entry in the above bed file are screwed up -- fix if needed
gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-pep.txt haloHalo1-pNRC100-genbank-cds.pep
gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-xra.txt haloHalo1-pNRC100-genbank-cds.xra
gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-bed.txt haloHalo1-pNRC200-genbank-cds.bed.tmp
sed 's/chr/plasmid_pNRC200/' haloHalo1-pNRC200-genbank-cds.bed.tmp > haloHalo1-pNRC200-genbank-cds.bed
# Check to see if coordinates for first entry in the above bed file are screwed up -- fix if needed
gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-pep.txt haloHalo1-pNRC200-genbank-cds.pep
gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-xra.txt haloHalo1-pNRC200-genbank-cds.xra
# Append the plasmid bed and xra records to the chromosome files; the plasmid
# .pep files are loaded separately below.
cat haloHalo1-pNRC100-genbank-cds.bed haloHalo1-pNRC200-genbank-cds.bed >> haloHalo1-genbank-cds.bed
cat haloHalo1-pNRC100-genbank-cds.xra haloHalo1-pNRC200-genbank-cds.xra >> haloHalo1-genbank-cds.xra
hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/pepPred.sql
hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/minGeneInfo.sql
echo rename table pepPred to gbProtCodePep | hgsql haloHalo1
echo rename table minGeneInfo to gbProtCodeXra | hgsql haloHalo1
echo load data local infile \'haloHalo1-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1
echo load data local infile \'haloHalo1-genbank-cds.xra\' into table gbProtCodeXra | hgsql haloHalo1
echo load data local infile \'haloHalo1-pNRC100-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1
echo load data local infile \'haloHalo1-pNRC200-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1

#genbank to genePred (DONE)

csh
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' haloHalo1-genbank-cds.bed | bedToGenePred stdin tmp.gp
# name2 is never assigned in the awk program, so that column is printed empty.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,NR,name2,"cmpl","cmpl",0}' tmp.gp > tmp2.gp
# The join separator must be a hard tab.  The original notes said to type it
# with ctrl-V then tab; printf generates it here so it cannot be silently
# turned into a space.
join -t "`printf '\t'`" -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 1.13 1.14 1.15 tmp2.gp haloHalo1-genbank-cds.xra > haloHalo1.gp
ldHgGene haloHalo1 refSeq haloHalo1.gp -predTab -genePredExt

# COG STUFF (DONE)

grep COG /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.ptt | awk 'NR>3{OFS="\t";print $6,$8,$7}' > COG
hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/cogs.sql
echo "load data local infile 'COG' into table COG" | hgsql haloHalo1
# load cog codes
hgsql haloHalo1 < /cluster/data/metAce1/genbank/COGXra.sql

# GENBANK RNA Genes (listed in *.rnt files -- like *.ptt files)

cd /cluster/data/haloHalo1/genbank
# "chr" must be a quoted string: unquoted it is an unset awk variable, so the
# chromosome column would be missing from the bed output.
# NOTE(review): the NC_00*.rnt glob presumably also matches the plasmid
# files, whose rows would be labelled "chr" as well -- verify.
cat /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.rnt | sed 's/\.\./\t/' | awk 'NR>6{OFS="\t";print "chr", $1-1, $2, $7, 1000, $3}' > haloHalo1-genbank-rna.bed
# TODO: the next command is incomplete in the original notes -- it names a
# directory, not a .sql file -- so it is left commented out until the
# intended file is known.
# hgsql haloHalo1 < /cluster/data/haloHalo1/genbank/

# GENBANK rRNA GENES (NOT QUITE DONE)

# Host was written "hgdev" in the original notes; every other section uses
# hgwdev.
ssh hgwdev
gbArchaeGenome -kind=rRNA haloHalo1.gbk haloHalo1-params-bed.txt haloHalo1-rrnas.bed
# One field per line is presumed, as for the other parameter files -- confirm.
cat << '_EOF_' > haloHalo1-params-rrna-xra.txt
gene
product
NA
'_EOF_'
gbArchaeGenome -kind=rRNA haloHalo1.gbk haloHalo1-params-rrna-xra.txt haloHalo1-rrnas-xra.txt
hgLoadBed haloHalo1 gbRRNA haloHalo1-rrnas.bed
hgsql haloHalo1 < ~/kent/src/hg/lib/minGeneInfo.sql
echo rename table minGeneInfo to gbRRNAXra | hgsql haloHalo1
echo load data local infile \'haloHalo1-rrnas-xra.txt\' into table gbRRNAXra | hgsql haloHalo1

# TODD LOWE tRNA GENES ()

# This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
# Lowe. See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
# **Showing the tRNAScanSE instructions would be nice in the future.
ssh hgwdev
mkdir /cluster/data/haloHalo1/bed/loweTrnaGene
cd /cluster/data/haloHalo1/bed/loweTrnaGene
# $HOME instead of "~": a tilde in the middle of a word (after "=") is
# presumably not expanded by the shell.
hgLoadBed -tab haloHalo1 loweTrnaGene haloHalo1-lowe-trnas.bed -sqlTable=$HOME/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES ()

# This is a bed 6 file created by hand.
ssh hgwdev
mkdir /cluster/data/haloHalo1/bed/loweSnoGene
cd /cluster/data/haloHalo1/bed/loweSnoGene
hgLoadBed -tab haloHalo1 loweSnoGene haloHalo1-snos.bed

# TIGR GENES (DONE)

# First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
# and fill out the web form as follows:
# - Pick "Retrieve attributes for the specified DNA feature within a specific
# organism and/or a specific role category".
# * Pick the entry for this organism (Halobacterium sp. NRC-1), and
# "Primary and TIGR annotation ORFs", from the 1st and 3rd box.
# NOTE(review): the original notes named "Pyrobaculum aerophilum IM2" here,
# apparently carried over from another genome's notes -- confirm.
# * Select everything from "Choose TIGR Annotation Gene Attributes"
# * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
# * Select everything from "Choose Other Gene Attributes"
# - Click submit, and click save as tab-delimited file.
ssh hgwdev
mkdir /cluster/data/haloHalo1/bed/tigrCmrORFs
# haloHalo1-tigr.tab is the tab-delimited file saved from the web form.
cp haloHalo1-tigr.tab /cluster/data/haloHalo1/bed/tigrCmrORFs
cd /cluster/data/haloHalo1/bed/tigrCmrORFs
/projects/lowelab/users/aamp/bin/i386/tigrCmrToBed haloHalo1-tigr.tab haloHalo1-tigr.bed
# $HOME instead of "~": a tilde in the middle of a word (after "=") is
# presumably not expanded by the shell.
hgLoadBed -tab haloHalo1 tigrCmrORFs haloHalo1-tigr.bed -sqlTable=$HOME/kent/src/hg/lib/tigrCmrGene.sql