# for emacs: -*- mode: sh; -*-

 
# This file describes building the browser database for the archaeal
# species Thermoplasma acidophilum.
#
# If this is the first time you are making your own hgwdev browser, you
# first need to cd ~/kent/src/ and run make.

# DOWNLOAD SEQUENCE FROM GENBANK (DONE 10/2/05)
# Stage the genome FASTA under /cluster/data/therAcid1, convert it to 2bit,
# and expose the 2bit file through /gbdb.

    # The data directory is created on store5 and reached through a
    # /cluster/data symlink; all later steps use the /cluster/data path.
    mkdir /cluster/store5/archae/therAcid1
    ln -s /cluster/store5/archae/therAcid1 /cluster/data/therAcid1
    cd /cluster/data/therAcid1
    cp /projects/lowelab/db/Bacteria/Thermoplasma_acidophilum/Ther_acid* .
    mv Ther_acid.fa therAcid1.fa
    # List the FASTA header lines so they can be renamed by hand.
    grep ">" therAcid1.fa
    # Edit the header(s) of therAcid1.fa by hand; later steps in this file
    # refer to the sequence as 'chr'.
    # NOTE(review): the original note read
    #   '>chr >plasmid_pNRC100 >plasmid_pNRC200'
    # but no other step in this file mentions plasmids, so the plasmid names
    # look carried over from another genome's doc -- confirm.
   
    faToTwoBit therAcid1.fa therAcid1.2bit 

    # Make the 2bit file reachable under /gbdb via a symlink.
    mkdir /gbdb/therAcid1
    ln -s /cluster/data/therAcid1/therAcid1.2bit /gbdb/therAcid1/therAcid1.2bit

# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 10/2/05)
# Create the therAcid1 MySQL database, seed its grp table, and register the
# assembly in the hgcentraltest tables dbDb, defaultDb and genomeClade.

    ssh hgwdev
    echo 'create database therAcid1' | hgsql ''
    cd /cluster/data/therAcid1

    # chrom.sizes is loaded into the chromInfo table in the next section.
    faSize -detailed therAcid1.fa > chrom.sizes
    # Copy the track group definitions from hg16, keyed on NAME.
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" | hgsql therAcid1
    # Register the assembly (default position chr:500000-550000, html page
    # under /gbdb/therAcid1/html).
    echo 'INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk) values ("therAcid1", "Sep 2000", "/gbdb/therAcid1", "Thermoplasma acidophilum", "chr:500000-550000", 1, 284, "Thermoplasma acidophilum","Thermoplasma acidophilum DSM 1728", "/gbdb/therAcid1/html/description.html", 0);' | hgsql hgcentraltest
    # Make therAcid1 the default assembly for this genome.
    echo 'INSERT INTO defaultDb (genome, name) values ("Thermoplasma acidophilum", "therAcid1");' | hgsql hgcentraltest
    # Place the genome in the "archaea" clade.
    echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Thermoplasma acidophilum", "archaea",85);'  | hgsql hgcentraltest

# CREATE CHROMINFO TABLE 
# Build the chromInfo table from chrom.sizes, then create the trackDb
# directory for this assembly and build trackDb.
  ssh hgwdev
  cd /cluster/data/therAcid1

   # Table definition is taken from another user's kent source checkout.
   cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
   hgsql therAcid1 < chromInfo.sql
   echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql therAcid1
   # chrom.sizes carries no file name column, so point every row at the 2bit.
   echo "update chromInfo set fileName = '/gbdb/therAcid1/therAcid1.2bit'" | hgsql therAcid1

    cd ~/kent/src/hg/makeDb/trackDb

    # add the trackDb directories
    mkdir -p archae/therAcid1
    cvs add archae/therAcid1
    cvs commit archae/therAcid1

    # Build trackDb for this database only.
    make DBS=therAcid1


# GC20BASE (DONE 10/2/05)
# Compute a GC-percent wiggle track (hgGcPercent run with -win=20
# -overlap=10) and load it as table gc20Base.
    mkdir -p /cluster/data/therAcid1/bed/gc20Base
    cd /cluster/data/therAcid1/bed/gc20Base
    # hgGcPercent writes wiggle text to stdout; wigEncode turns it into the
    # .wig (table rows) and .wib (binary data) pair.
    hgGcPercent -wigOut -doGaps -file=stdout -overlap=10 -win=20 therAcid1 /cluster/data/therAcid1/ | wigEncode stdin gc20Base.wig gc20Base.wib

    cd /cluster/data/therAcid1/bed/gc20Base
    # The .wib file is served from /gbdb via a symlink; the same prefix is
    # given to hgLoadWiggle below.
    mkdir /gbdb/therAcid1/wib
    ln -s `pwd`/gc20Base.wib /gbdb/therAcid1/wib
    hgLoadWiggle -pathPrefix=/gbdb/therAcid1/wib therAcid1 gc20Base gc20Base.wig
    #	verify index is correct:
    hgsql therAcid1 -e "show index from gc20Base;"
    #	should see good numbers in Cardinality column


# TANDEM REPEAT MASKER (DONE 10/2/05)
# Run trfBig over the genome and load the resulting bed file as the
# simpleRepeat table.

    ssh hgwdev
    mkdir -p /cluster/data/therAcid1/bed/simpleRepeat
    cd /cluster/data/therAcid1
    # The second positional argument is /dev/null, so presumably trfBig's
    # main output is discarded; only the -bedAt file is used below.
    trfBig therAcid1.fa /dev/null -bedAt=/cluster/data/therAcid1/bed/simpleRepeat/chr.bed
    cd /cluster/data/therAcid1/bed/simpleRepeat
    # Load every bed file in this directory using the simpleRepeat schema.
    hgLoadBed therAcid1 simpleRepeat *.bed -sqlTable=/cluster/home/lowe/kent/src/hg/lib/simpleRepeat.sql

# TIGR GENES (DONE 10/3/05)
# Convert a tab-delimited TIGR CMR gene attribute download to bed and load
# it as table tigrCmrORFs.
    # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
    # and fill out the web form as follows:
    #   - Pick "Retrieve attributes for the specified DNA feature within a specific 
    #     organism and/or a specific role category".
    #       * Pick genome and "Primary and TIGR annotation ORFs" 
    #         from the 1st and 3rd box.
    #       * Select everything from "Choose TIGR Annotation Gene Attributes"
    #       * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
    #       * Select everything from "Choose Other Gene Attributes"
    #   - Click submit, and click save as tab-delimited file.
   
    # The saved download is expected as therAcid1-tigr.tab in the current
    # directory.
    mkdir /cluster/data/therAcid1/bed/tigrCmrORFs
    cp therAcid1-tigr.tab /cluster/data/therAcid1/bed/tigrCmrORFs
    cd /cluster/data/therAcid1/bed/tigrCmrORFs
    ~aamp/bin/i386/tigrCmrToBed therAcid1-tigr.tab therAcid1-tigr.bed
    # Rename the chromosome from chr1 to chr to match this assembly.  The
    # pattern is anchored to the start of the line (the bed chrom column) so a
    # "chr1" substring in any later column is never rewritten.
    sed -e 's/^chr1/chr/' therAcid1-tigr.bed > temp
    mv temp therAcid1-tigr.bed
    # Load under the name used by the .sql schema, then rename to the final
    # table name.
    hgLoadBed -tab therAcid1 tigrCmrGene therAcid1-tigr.bed -sqlTable=/cluster/home/kpollard/kent/src/hg/lib/tigrCmrGene.sql
    echo "rename table tigrCmrGene to tigrCmrORFs;" | hgsql therAcid1

# DESCRIPTION PAGE (DONE 10/4/05)
# Publish the hand-written description.html at the htmlPath registered in
# dbDb (/gbdb/therAcid1/html/description.html).

    # Write ~/kent/src/hg/makeDb/trackDb/archae/therAcid1/description.html
    chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/therAcid1/description.html
    mkdir -p /cluster/data/therAcid1/html/
    cp ~/kent/src/hg/makeDb/trackDb/archae/therAcid1/description.html /cluster/data/therAcid1/html/description.html
    # /gbdb holds only a symlink to the copy under /cluster/data.
    mkdir -p /gbdb/therAcid1/html
    ln -s /cluster/data/therAcid1/html/description.html /gbdb/therAcid1/html/

# MULTIZ with therVolc, picrTorr, and ferrAcid
# DONE (10/11/05), kpollard
# This part produces the pairwise alignments (blastz -> axt -> best axt ->
# maf) of therAcid1 against therVolc1, picrTorr1 and ferrAcid1.

    cd /cluster/data/therAcid1/bed/
    mkdir conservation
    cd conservation
    # Scoring matrix is borrowed from the metAce1 conservation run.
    cp /cluster/data/metAce1/bed/conservation/HoxD55.q .
    cp ../../therAcid1.fa therAcid1.chr
    cp /cluster/data/therVolc1/therVolc1.fa therVolc1.chr
    cp /cluster/data/picrTorr1/chr.fa picrTorr1.chr
    cp /cluster/data/ferrAcid1/chr.fa ferrAcid1.chr
    #fix names to be therAcid1, therVolc1, picrTorr1 manually
    # ferrAcid1 headers are fixed by script instead: prefix each ">C..." header
    # with "ferrAcid1.", keep only the first word of header lines matching
    # ferrAcid1.Contig, and upper-case every other line.
    sed s/\>C/\>ferrAcid1.C/ ferrAcid1.chr | gawk '{if(/ferrAcid1.Contig/){ print $1;}else{print toupper($0);}}' > temp
    mv temp ferrAcid1.chr
    # Three genomes are converted to nib; ferrAcid1 (headers suggest multiple
    # contigs) goes to a single 2bit file.
    faToNib therAcid1.chr therAcid1.chr.nib
    faToNib therVolc1.chr therVolc1.chr.nib
    faToNib picrTorr1.chr picrTorr1.chr.nib
    faToTwoBit ferrAcid1.chr ferrAcid1.2bit

    #chrom sizes
    # One combined size file for all four genomes; it is passed as both the
    # target and query size file to axtToMaf below.
    faSize -detailed *.chr > chrom.sizes

    #blastz 
    blastz therAcid1.chr therVolc1.chr Q=HoxD55.q > therAcid1-therVolc1.lav
    blastz therAcid1.chr picrTorr1.chr Q=HoxD55.q > therAcid1-picrTorr1.lav
    blastz therAcid1.chr ferrAcid1.chr Q=HoxD55.q > therAcid1-ferrAcid1.lav

    # The two middle arguments locate the target and query sequences: "." for
    # the nib files in the current directory, or the ferrAcid1 2bit file.
    /cluster/bin/i386/lavToAxt therAcid1-therVolc1.lav . . therAcid1-therVolc1.axt
    /cluster/bin/i386/lavToAxt therAcid1-picrTorr1.lav . . therAcid1-picrTorr1.axt
    /cluster/bin/i386/lavToAxt therAcid1-ferrAcid1.lav . ferrAcid1.2bit therAcid1-ferrAcid1.axt

    # Filter each alignment set with axtBest (-winSize=500 -minScore=5000).
    # NOTE(review): the second argument is presumably the target sequence
    # name as it appears in the axt files -- confirm it matches.
    axtBest therAcid1-therVolc1.axt therAcid1.chr -winSize=500 -minScore=5000 therAcid1-therVolc1-best.axt
    axtBest therAcid1-picrTorr1.axt therAcid1.chr -winSize=500 -minScore=5000 therAcid1-picrTorr1-best.axt
    axtBest therAcid1-ferrAcid1.axt therAcid1.chr -winSize=500 -minScore=5000 therAcid1-ferrAcid1-best.axt

    axtToMaf therAcid1-therVolc1-best.axt chrom.sizes chrom.sizes therAcid1-therVolc1.maf
    axtToMaf therAcid1-picrTorr1-best.axt chrom.sizes chrom.sizes therAcid1-picrTorr1.maf
    axtToMaf therAcid1-ferrAcid1-best.axt chrom.sizes chrom.sizes therAcid1-ferrAcid1.maf

    #multiz
    # Merge the three pairwise mafs into one 4-way maf, two at a time.
    #remove extra header lines
    # NOTE(review): no command for removing header lines is recorded here;
    # presumably it was done by hand -- confirm.
    multiz therAcid1-therVolc1.maf therAcid1-picrTorr1.maf - > therAcid1-therVolc1-picrTorr1.maf
    multiz therAcid1-ferrAcid1.maf therAcid1-therVolc1-picrTorr1.maf - > therAcid1-therVolc1-picrTorr1-ferrAcid1.maf

    #phyloHMM
    # Convert the 4-way maf to SS format, fit a tree model (TaTvPtFa.mod) on
    # the given topology, then print an alignment summary.
    /cluster/bin/phast/msa_view -i MAF -M therAcid1.chr -o SS therAcid1-therVolc1-picrTorr1-ferrAcid1.maf > therAcid1.ss
    /cluster/bin/phast/phyloFit -i SS therAcid1.ss -t "(ferrAcid1,(picrTorr1,(therVolc1,therAcid1)))" -o TaTvPtFa
    /cluster/bin/phast/msa_view -i SS therAcid1.ss --summary-only
    #add GC content to next call
    # (the --gc value below presumably comes from the summary just printed)
    # First phastCons pass: estimate the models under the "ther-tree" prefix
    # without writing per-base scores (--no-post-probs).
    /cluster/bin/phast/phastCons therAcid1.ss TaTvPtFa.mod --gc 0.4133 \
    --target-coverage 0.7 --estimate-trees ther-tree \
    --expected-lengths 25 --no-post-probs --ignore-missing \
    --nrates 1,1
    # Second pass: use the two estimated models to write conserved elements
    # (--viterbi bed file) and the scores on stdout (cons.dat).
    /cluster/bin/phast/phastCons therAcid1.ss \
    ther-tree.cons.mod,ther-tree.noncons.mod \
    --target-coverage 0.7 --expected-lengths 25 \
    --viterbi therAcid1-elements.bed --score \
    --require-informative 0 --seqname chr > cons.dat
    wigEncode cons.dat phastCons.wig phastCons.wib
    /cluster/bin/phast/draw_tree TaTvPtFa.mod > ther-tree.ps 
    #make ai and jpg files in Illustrator
    # (manual step: ther-tree.jpg must exist before the copy below)
    cp ther-tree.jpg /usr/local/apache/htdocs/images/therAcid1-tree.jpg

    #move data
    # Arrange the conservation outputs into per-track directories and link
    # them under /gbdb so the load commands below can find them.
    mkdir wib
    mv phastCons.wib wib/phastCons.wib
    mv phastCons.wig wib/phastCons.wig
    # /gbdb/therAcid1/wib was created in the GC20BASE section.
    ln -s /cluster/data/therAcid1/bed/conservation/wib/phastCons.wib /gbdb/therAcid1/wib
    mkdir /gbdb/therAcid1/pwMaf
    mkdir -p otherSpp/therVolc1 otherSpp/picrTorr1 otherSpp/ferrAcid1
    # Each pairwise maf becomes chr.maf inside its species directory.
    mv therAcid1-picrTorr1.maf otherSpp/picrTorr1/chr.maf
    mv therAcid1-therVolc1.maf otherSpp/therVolc1/chr.maf
    mv therAcid1-ferrAcid1.maf otherSpp/ferrAcid1/chr.maf
    # The /gbdb link names match the <species>_pwMaf table names loaded below.
    ln -s /cluster/data/therAcid1/bed/conservation/otherSpp/picrTorr1 /gbdb/therAcid1/pwMaf/picrTorr1_pwMaf
    ln -s /cluster/data/therAcid1/bed/conservation/otherSpp/ferrAcid1 /gbdb/therAcid1/pwMaf/ferrAcid1_pwMaf
    ln -s /cluster/data/therAcid1/bed/conservation/otherSpp/therVolc1 /gbdb/therAcid1/pwMaf/therVolc1_pwMaf
    mkdir multiz
    mv therAcid1-therVolc1-picrTorr1-ferrAcid1.maf multiz/chr.maf
    ln -s /cluster/data/therAcid1/bed/conservation/multiz /gbdb/therAcid1/multizTaTvPtFa

    #load
    hgLoadWiggle therAcid1 phastCons /cluster/data/therAcid1/bed/conservation/wib/phastCons.wig
    # No -pathPrefix is given for the multiz table; the pairwise tables are
    # pointed at their /gbdb/therAcid1/pwMaf/ directories explicitly.
    hgLoadMaf -warn therAcid1 multizTaTvPtFa
    hgLoadMaf -warn therAcid1 picrTorr1_pwMaf -pathPrefix=/gbdb/therAcid1/pwMaf/picrTorr1_pwMaf
    hgLoadMaf -warn therAcid1 ferrAcid1_pwMaf -pathPrefix=/gbdb/therAcid1/pwMaf/ferrAcid1_pwMaf
    hgLoadMaf -warn therAcid1 therVolc1_pwMaf -pathPrefix=/gbdb/therAcid1/pwMaf/therVolc1_pwMaf
    hgLoadBed therAcid1 phastConsElements therAcid1-elements.bed 

    #trackDb
    # Add the track definition and its details page to the assembly's trackDb
    # directory and commit both to CVS.
    cd ~/kent/src/hg/makeDb/trackDb/archae/therAcid1
    #trackDb.ra entry
    # (the commented lines below are the text to put into trackDb.ra)
    # track multizTaTvPtFa
    # shortLabel Conservation
    # longLabel Thermoplasma/Ferroplasma/Picrophilus multiz alignments
    # group compGeno
    # priority 10.0
    # visibility pack
    # type wigMaf 0.0 1.0
    # maxHeightPixels 100:40:11
    # wiggle phastCons
    # yLineOnOff Off
    # autoScale Off
    # pairwise pwMaf
    # speciesOrder therVolc1 picrTorr1 ferrAcid1
    cvs add trackDb.ra
    cvs commit -m "New multiz track" trackDb.ra
    #html page
    # multizTaTvPtFa.html must be written by hand before it is added.
    cvs add multizTaTvPtFa.html
    cvs commit -m "Details page for multiz track" multizTaTvPtFa.html

# GENBANK PROTEIN-CODING GENES
# Parse the GenBank record into a CDS bed track (gbProtCode) plus two side
# tables: peptide sequences (gbProtCodePep) and product/note text
# (gbProtCodeXra).

    ssh hgwdev
    mkdir /cluster/data/therAcid1/genbank
    cd /cluster/data/therAcid1/genbank
    # Copy the GenBank flat file(s).  The source has to be a file glob: cp of
    # a bare directory without -r copies nothing.
    cp /projects/lowelab/db/Bacteria/Thermoplasma_acidophilum/*.gbk .
    
    # NOTE(review): NC_003552 may be a leftover from the doc this file was
    # adapted from; confirm the accession / file name for Thermoplasma
    # acidophilum before running.
    mv NC_003552.gbk therAcid1.gbk
    # Create 3 files to assist parsing of the genbank
    # (each file lists one output field per line)
    # 1. for a bed file
    echo 'chr
start
end
gene
1000
strand' > therAcid1-params-bed.txt
    # 2. for the peptide parts
    echo 'gene
translation' > therAcid1-params-pep.txt
    # 3. for the other gene information
    echo 'gene
product
note' > therAcid1-params-xra.txt
    # Now extract the genes and information:
    gbArchaeGenome therAcid1.gbk therAcid1-params-bed.txt therAcid1-genbank-cds.bed
    gbArchaeGenome therAcid1.gbk therAcid1-params-pep.txt therAcid1-genbank-cds.pep
    gbArchaeGenome therAcid1.gbk therAcid1-params-xra.txt therAcid1-genbank-cds.xra
    hgLoadBed therAcid1 gbProtCode therAcid1-genbank-cds.bed
    # Create the two side tables from the generic schemas, rename them to
    # their track-specific names, then bulk-load the extracted files.
    hgsql therAcid1 < ~/kent/src/hg/lib/pepPred.sql
    hgsql therAcid1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo "rename table pepPred to gbProtCodePep" | hgsql therAcid1
    echo "rename table minGeneInfo to gbProtCodeXra" | hgsql therAcid1
    echo "load data local infile 'therAcid1-genbank-cds.pep' into table gbProtCodePep" | hgsql therAcid1
    echo "load data local infile 'therAcid1-genbank-cds.xra' into table gbProtCodeXra" | hgsql therAcid1

#genbank to genePred
# Convert the GenBank CDS bed file to a genePred-style file (therAcid1.gp)
# whose 12th column is taken from the third column of the .xra file.
# Run from /cluster/data/therAcid1/genbank.

# tawk is not a standard command; it is assumed to be available in the csh
# environment started here (awk with tab field separators) -- confirm.
csh
# Pad the bed6 rows to bed12 (thick = whole feature, one block) so that
# bedToGenePred can read them.
tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' therAcid1-genbank-cds.bed | bedToGenePred stdin tmp.gp
# Append five more columns.  "name2" is an uninitialized awk variable, so
# column 12 is empty here; the join below substitutes the .xra value for it.
tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,3,4),name2,"cmpl","cmpl",0}' tmp.gp  > tmp2.gp
# Join on the first column of both files.  The separator must be exactly one
# character: a run of spaces is rejected by join, so the tab is produced with
# printf (valid in both sh and csh).
# NOTE(review): join expects both inputs sorted on the join field -- confirm
# the two files are in the same sorted order.
join -t "`printf '\t'`" -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.3 1.13 1.14 1.15  tmp2.gp therAcid1-genbank-cds.xra > therAcid1.gp

# GENBANK rRNA GENES
# Extract the rRNA features from the GenBank record and load them as the
# gbRRNA bed table plus the gbRRNAXra side table.
    # Same host as every other section of this file (hgwdev).
    ssh hgwdev
    cd /cluster/data/therAcid1/genbank
    # Reuses the bed parameter file written in the protein-coding section.
    gbArchaeGenome -kind=rRNA therAcid1.gbk therAcid1-params-bed.txt therAcid1-rrnas.bed
    # NOTE(review): the other parameter files list one field per line; this
    # one puts all three on a single line -- confirm gbArchaeGenome accepts it.
    echo 'gene product NA' > therAcid1-params-rrna-xra.txt
    gbArchaeGenome -kind=rRNA therAcid1.gbk therAcid1-params-rrna-xra.txt therAcid1-rrnas-xra.txt
    hgLoadBed therAcid1 gbRRNA therAcid1-rrnas.bed
    # Create a fresh minGeneInfo table, rename it, then load the extra info.
    hgsql therAcid1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table minGeneInfo to gbRRNAXra | hgsql therAcid1
    echo load data local infile \'therAcid1-rrnas-xra.txt\' into table gbRRNAXra | hgsql therAcid1

# COG STUFF
# Build cogs.txt, a two-column tab-separated list, from a saved copy of the
# NCBI COG list page.
    # Input: paste the page at http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox
    # (COG list) into emacs and save it as cogpage.txt
    # Emit whitespace-delimited field 6 then field 5 of every line, and strip
    # the first "[" and the first "]" from each output line.
    awk -v OFS='\t' '{ print $6, $5 }' < cogpage.txt | sed -e 's/\[//;s/\]//' > cogs.txt
    # The saved page is no longer needed once cogs.txt exists.
    rm cogpage.txt
    # cogs.txt now holds the basic list of cogs and the letter code for each.
    

# TODD LOWE tRNA GENES 
# Load the hand-curated tRNA bed file as table loweTrnaGene.

    # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
    # Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
    # **Showing the tRNAScanSE instructions would be nice in the future.  
    ssh hgwdev
    mkdir /cluster/data/therAcid1/bed/loweTrnaGene
    cd /cluster/data/therAcid1/bed/loweTrnaGene
    # therAcid1-lowe-trnas.bed must already be in this directory.
    # The schema path uses $HOME rather than "~": a tilde in the middle of an
    # option word (after "-sqlTable=") is not expanded by the shell.
    hgLoadBed -tab therAcid1 loweTrnaGene therAcid1-lowe-trnas.bed -sqlTable=$HOME/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES 
# Load the hand-curated snoRNA bed file as table loweSnoGene.
    # This is a bed 6 file created by hand.
    ssh hgwdev
    mkdir /cluster/data/therAcid1/bed/loweSnoGene
    cd /cluster/data/therAcid1/bed/loweSnoGene
    # therAcid1-snos.bed must already be in this directory; no -sqlTable is
    # given for this load.
    hgLoadBed -tab therAcid1 loweSnoGene therAcid1-snos.bed