# for emacs: -*- mode: sh; -*-

 
# This file describes building the browser database for the archaeal
# species Halobacterium sp. NRC-1 (database name haloHalo1).
#
# If this is the first time you are making your own hgwdev browser, you first
# need to cd ~/kent/src/ and run make.

# DOWNLOAD SEQUENCE FROM GENBANK (DONE)
# Stages the Halobacterium FASTA under /cluster/data/haloHalo1 and builds the
# .2bit file that /gbdb/haloHalo1 links to.

    ssh eieio
    # The data directory lives on store5; /cluster/data/haloHalo1 is a symlink to it.
    mkdir /cluster/store5/archae/haloHalo1
    ln -s /cluster/store5/archae/haloHalo1 /cluster/data/haloHalo1
    cd /cluster/data/haloHalo1
    cp /projects/lowelab/db/Bacteria/Halobacterium_sp/Halo_sp* .
    mv Halo_sp.fa haloHalo1.fa
    # Edit header of *.fa to '>chr >plasmid_pNRC100 >plasmid_pNRC200'
    # (manual step; later sections refer to the sequences by these names)
   
    faToTwoBit haloHalo1.fa haloHalo1.2bit 

    mkdir /gbdb/haloHalo1
    ln -s /cluster/data/haloHalo1/haloHalo1.2bit /gbdb/haloHalo1/haloHalo1.2bit

# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE)
# Creates the haloHalo1 MySQL database, writes chrom.sizes, copies the grp
# table from hg16, and registers the assembly in hgcentraltest (dbDb,
# defaultDb, genomeClade).

    ssh hgwdev
    echo 'create database haloHalo1' | hgsql ''
    cd /cluster/data/haloHalo1

    faSize -detailed haloHalo1.fa > chrom.sizes
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
	    | hgsql haloHalo1
    # defaultPos has to name a sequence that exists in this assembly.  The
    # FASTA headers are chr, plasmid_pNRC100 and plasmid_pNRC200, so the
    # position is on "chr" (there is no "chr1").
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk) values \
        ("haloHalo1", "August 1996", "/gbdb/haloHalo1", "Halobacterium sp.", \
               "chr:500000-550000", 1, 233, "Halobacterium sp.", \
                "Halobacterium sp. NRC-1", "/gbdb/haloHalo1/html/description.html", \
                0);' \
      | hgsql hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("Halobacterium sp.", "haloHalo1");' \
      | hgsql hgcentraltest
    echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Halobacterium sp.", "archaea",85);'  \
      | hgsql hgcentraltest

# CREATE CHROMINFO TABLE (DONE)
# Creates chromInfo from chromInfo.sql, loads chrom.sizes into it, points the
# rows at the .2bit file, then adds the assembly's trackDb directory to CVS
# and runs the trackDb make for this database.
  ssh hgwdev
  cd /cluster/data/haloHalo1

   cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
   hgsql haloHalo1 < chromInfo.sql
   # chrom.sizes is the faSize -detailed output written in the previous section.
   echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql haloHalo1
   # No WHERE clause: fileName is set to the same .2bit path on every row.
   echo "update chromInfo set fileName = '/gbdb/haloHalo1/haloHalo1.2bit'" | hgsql haloHalo1

    cd ~/kent/src/hg/makeDb/trackDb

    # add the trackDb directories
    mkdir -p archae/haloHalo1
    cvs add archae/haloHalo1
    cvs commit archae/haloHalo1

    # Build trackDb for this database only.
    make DBS=haloHalo1


# GC20BASE (DONE)
# Builds the gc20Base wiggle track: hgGcPercent output (window size 20) is
# piped through wigEncode, the .wib is linked under /gbdb, and the .wig is
# loaded into the gc20Base table.
    mkdir -p /cluster/data/haloHalo1/bed/gc20Base
    cd /cluster/data/haloHalo1/bed/gc20Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=20 haloHalo1 \
        /cluster/data/haloHalo1/ | wigEncode stdin gc20Base.wig gc20Base.wib

    cd /cluster/data/haloHalo1/bed/gc20Base
    mkdir /gbdb/haloHalo1/wib
    # Link the binary .wib into the directory given as -pathPrefix below.
    ln -s `pwd`/gc20Base.wib /gbdb/haloHalo1/wib
    hgLoadWiggle -pathPrefix=/gbdb/haloHalo1/wib haloHalo1 gc20Base gc20Base.wig
    #	verify index is correct:
    hgsql haloHalo1 -e "show index from gc20Base;"
    #	should see good numbers in Cardinality column


# TANDEM REPEAT MASKER (DONE)
# Runs trfBig over the whole-genome FASTA, writing the repeats as BED into
# bed/simpleRepeat, then loads them into the simpleRepeat table.

    ssh hgwdev
    mkdir -p /cluster/data/haloHalo1/bed/simpleRepeat
    cd /cluster/data/haloHalo1
    # The masked-sequence output is discarded (/dev/null); only the -bedAt file is kept.
    trfBig haloHalo1.fa /dev/null -bedAt=/cluster/data/haloHalo1/bed/simpleRepeat/chr.bed
    cd /cluster/data/haloHalo1/bed/simpleRepeat
    # Table definition comes from the kent source tree (kent/src/hg/lib); the
    # path previously read "kent/sr/", which does not match the other
    # kent/src/hg/lib paths used in this file.
    hgLoadBed haloHalo1 simpleRepeat *.bed -sqlTable=/cluster/home/lowe/kent/src/hg/lib/simpleRepeat.sql

# CHAIN TRACK with halMar1
# DONE (10/13/05), kpollard
# Preparation: collect both genomes in bed/conservation, normalise the FASTA
# headers, split haloHalo1 into one file per sequence, and build the
# .2bit/.nib files and a combined chrom.sizes used by the alignment steps.

    cd /cluster/data/haloHalo1/bed/
    mkdir conservation
    cd conservation
    # Scoring matrix is copied from the pyrFur2 build.
    cp /cluster/data/pyrFur2/bed/conservation/HoxD55.q .
    cp ../../haloHalo1.fa haloHalo1.chr
    cat /cluster/data/halMar1/*.fa > halMar1.chr
    #fix headers
    # Lines matching "plasmid" (then "chr") are cut down to their first field.
    cat haloHalo1.chr | gawk '{if(/plasmid/){print $1;} else{print $0;}}' > temp
    cat temp | gawk '{if(/chr/){print $1;} else{print $0;}}' > haloHalo1.chr
    # halMar1: rewrite "chrom" to ">chr" and "plas" to ">plasmid", and on lines
    # containing "halMar1" keep only the second field.
    sed s/chrom/\>chr/ halMar1.chr | sed s/plas/\>plasmid/ | gawk '{if(/halMar1/){print $2;} else{print $0;}}' > temp
    mv temp halMar1.chr
    #ref seq needs separate files for each seq
    mv haloHalo1.chr haloHalo1.fa
    # The three filters below cut the file at the ">chr" and ">plasmid_pNRC200"
    # header lines: everything before ">chr", everything from ">chr" up to
    # ">plasmid_pNRC200", and everything from ">plasmid_pNRC200" on.
    # NOTE(review): this only yields the named sequences if the records are
    # ordered pNRC100, chr, pNRC200 in haloHalo1.fa -- confirm.
    cat haloHalo1.fa | gawk 'BEGIN{ind=0;}{if($1==">chr"){ind=1;} if(ind==0){print $0;}}' > haloHalo1.plasmid_pNRC100
    cat haloHalo1.fa | gawk 'BEGIN{ind=1;}{if($1==">chr"){ind=0;} if($1==">plasmid_pNRC200"){ind=1;} if(ind==0){print $0;}}' > haloHalo1.chr
    cat haloHalo1.fa | gawk 'BEGIN{ind=1;}{if($1==">plasmid_pNRC200"){ind=0;} if(ind==0){print $0;}}' > haloHalo1.plasmid_pNRC200
    faToTwoBit halMar1.chr halMar1.2bit
    faToTwoBit haloHalo1.fa haloHalo1.2bit
    faToNib haloHalo1.plasmid_pNRC100 plasmid_pNRC100.nib
    faToNib haloHalo1.chr chr.nib
    faToNib haloHalo1.plasmid_pNRC200 plasmid_pNRC200.nib
    # This chrom.sizes holds the sizes of BOTH genomes' sequences.
    faSize -detailed haloHalo1.plasmid_pNRC100 haloHalo1.plasmid_pNRC200 haloHalo1.chr halMar1.chr > chrom.sizes

    #blastz 
    # One blastz run per haloHalo1 sequence against all of halMar1, each using
    # the HoxD55.q scoring matrix.
    blastz haloHalo1.plasmid_pNRC100 halMar1.chr Q=HoxD55.q > haloHalo1.p1-halMar1.lav
    blastz haloHalo1.plasmid_pNRC200 halMar1.chr Q=HoxD55.q > haloHalo1.p2-halMar1.lav
    blastz haloHalo1.chr halMar1.chr Q=HoxD55.q > haloHalo1.c-halMar1.lav

    # Convert lav to axt; target nibs are in the current directory, the query
    # sequence comes from halMar1.2bit.
    lavToAxt haloHalo1.p1-halMar1.lav . halMar1.2bit haloHalo1.p1-halMar1.axt
    lavToAxt haloHalo1.p2-halMar1.lav . halMar1.2bit haloHalo1.p2-halMar1.axt
    lavToAxt haloHalo1.c-halMar1.lav . halMar1.2bit haloHalo1.c-halMar1.axt
    #p2 lav was empty in the recorded run - remove
    # NOTE(review): the recorded p2 command was misspelled "blaztz", which on
    # its own leaves an empty .lav.  If the corrected run produces alignments,
    # skip this rm and carry p2 through the steps below the same way as p1.
    rm haloHalo1.p2-halMar1.lav haloHalo1.p2-halMar1.axt

    axtBest haloHalo1.c-halMar1.axt all -winSize=500 -minScore=5000 haloHalo1.c-halMar1-best.axt
    axtBest haloHalo1.p1-halMar1.axt all -winSize=500 -minScore=5000 haloHalo1.p1-halMar1-best.axt

    #chain
    # Reuse gap penalties from chicken run.
    # The table declares tablesize 11, so the position/qGap/tGap/bothGap rows
    # must each carry all 11 values on ONE line (they had been wrapped onto two).
    # The here-doc is written with spaces and converted to tabs by sed below.
    # csh syntax: the terminator line repeats the quoted word, quotes included.
    cat << '_EOF_' > temp.gap
tablesize       11
smallSize       111
position        1       2       3       11      111     2111    12111   32111   72111   152111  252111
qGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
tGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
bothGap 625     660     700     750     900     1400    4000    8000    16000   32000   57000
'_EOF_'
    sed 's/  */\t/g' temp.gap > chicken.gap
    rm temp.gap
    mkdir chains
    # For every *-best.axt: strip the extension and the "-best" suffix to get
    # the chain file name, then chain against the two .2bit files.
    foreach f (*best.axt)
	set a=$f:t:r
	set b=`basename $a -best`
	axtChain -scoreScheme=HoxD55.q -linearGap=chicken.gap \
                      -minScore=5000 $f haloHalo1.2bit halMar1.2bit \
                       chains/$b.chain
    end    
   
    #load chains
    # One chain table per haloHalo1 sequence that has alignments
    # (chr and plasmid_pNRC100).
    cd chains
    hgLoadChain haloHalo1 chr_chainHalMar1 haloHalo1.c-halMar1.chain
    hgLoadChain haloHalo1 plasmid_pNRC100_chainHalMar1 haloHalo1.p1-halMar1.chain

    #load axt files for blastz track
    cd ..
    # chrom.sizes is passed for both size arguments: the copy in this directory
    # was built from the haloHalo1 and halMar1 sequences together.
    axtToPsl haloHalo1.c-halMar1-best.axt chrom.sizes chrom.sizes haloHalo1.c-halMar1.psl
    axtToPsl haloHalo1.p1-halMar1-best.axt chrom.sizes chrom.sizes haloHalo1.p1-halMar1.psl
    hgLoadPsl haloHalo1 -table=chr_blastzHalMar1 haloHalo1.c-halMar1.psl
    hgLoadPsl haloHalo1 -table=plasmid_pNRC100_blastzHalMar1 haloHalo1.p1-halMar1.psl

    #trackDb
    # Add the chainHalMar1 and blastzHalMar1 entries (shown commented below)
    # to archae/haloHalo1/trackDb.ra and commit them with their html pages.
    cd ~/kent/src/hg/makeDb/trackDb/archae/
    # -p: the directory was already created in the CHROMINFO section, and a
    # plain mkdir fails on an existing directory.
    mkdir -p haloHalo1
    cvs add haloHalo1
    cd haloHalo1
    # track chainHalMar1
    # shortLabel $o_Organism Chain    
    # longLabel $o_Organism ($o_date/$o_db) Chained Alignments                
    # group compGeno
    # priority 150.1
    # visibility hide
    # color 100,50,0
    # altColor 255,240,200
    # spectrum on
    # type chain halMar1
    # otherDb halMar1
    #
    # track blastzHalMar1
    # shortLabel $o_Organism Blastz    
    # longLabel $o_Organism ($o_date/$o_db) Blastz Alignments                
    # group compGeno
    # priority 154.1
    # visibility hide
    # color 100,50,0
    # altColor 255,240,200
    # spectrum on
    # type psl xeno halMar1
    # otherDb halMar1
    cvs add trackDb.ra
    cvs commit -m "halMar1 chains" trackDb.ra
    cvs add chainHalMar1.html
    cvs commit -m "added halMar1 chains" chainHalMar1.html
    cvs add blastzHalMar1.html
    cvs commit -m "added halMar1 blastz" blastzHalMar1.html

# DESCRIPTION PAGE (DONE 10/20/05), kpollard
# Commits description.html and publishes a copy under /gbdb/haloHalo1/html,
# which is the htmlPath registered in dbDb.

    # Write ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/description.html
    cd ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/
    chmod a+r description.html
    cvs add description.html
    cvs commit -m "description page" description.html
    # The /gbdb entry is a symlink to the copy kept under /cluster/data.
    mkdir -p /cluster/data/haloHalo1/html/
    cp ~/kent/src/hg/makeDb/trackDb/archae/haloHalo1/description.html \
	/cluster/data/haloHalo1/html/description.html
    mkdir -p /gbdb/haloHalo1/html
    ln -s /cluster/data/haloHalo1/html/description.html /gbdb/haloHalo1/html/

# GENBANK PROTEIN-CODING GENES (DONE)
# Copies the three GenBank records (chromosome and two plasmids) and writes
# the three parameter files that are passed to gbArchaeGenome further down.

    ssh hgwdev
    mkdir /cluster/data/haloHalo1/genbank
    cd /cluster/data/haloHalo1/genbank
    cp /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.gbk .
    # Rename by accession: NC_002607 is used as the main record, NC_001869 as
    # pNRC100 and NC_002608 as pNRC200.
    mv NC_002607.gbk haloHalo1.gbk
    mv NC_001869.gbk haloHalo1-pNRC100.gbk
    mv NC_002608.gbk haloHalo1-pNRC200.gbk

    # Create 3 files to assist parsing of the genbank
    # The file contents are typed on the terminal after each "cat >" command;
    # the "<type control-D> twice" lines are instructions, not file content.
    # 1. for a bed file
    cat >  haloHalo1-params-bed.txt
chr
start
end
locus_tag
1000
strand
<type control-D> twice
    # 2. for the peptide parts
    cat > haloHalo1-params-pep.txt
locus_tag
translation
<type control-D> twice
    # 3. for the other gene information
    cat > haloHalo1-params-xra.txt
locus_tag
gene
product
note
protein_id
db_xref
EC_number
pseudo
<type control-D> twice
    # Now extract the genes and information:
    # Each GenBank record is run three times, once per parameter file, giving
    # a .bed, a .pep and a .xra file.
    gbArchaeGenome haloHalo1.gbk haloHalo1-params-bed.txt haloHalo1-genbank-cds.bed
    gbArchaeGenome haloHalo1.gbk haloHalo1-params-pep.txt haloHalo1-genbank-cds.pep
    gbArchaeGenome haloHalo1.gbk haloHalo1-params-xra.txt haloHalo1-genbank-cds.xra

    # Plasmid BED output is written to a .tmp file first; sed then replaces the
    # first "chr" on each line with the plasmid's sequence name.
    gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-bed.txt haloHalo1-pNRC100-genbank-cds.bed.tmp 
    sed 's/chr/plasmid_pNRC100/' haloHalo1-pNRC100-genbank-cds.bed.tmp > haloHalo1-pNRC100-genbank-cds.bed
    # Check to see if coordinates for first entry in the above bed file are screwed up -- fix if needed
    gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-pep.txt haloHalo1-pNRC100-genbank-cds.pep
    gbArchaeGenome haloHalo1-pNRC100.gbk haloHalo1-params-xra.txt haloHalo1-pNRC100-genbank-cds.xra

    gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-bed.txt haloHalo1-pNRC200-genbank-cds.bed.tmp 
    sed 's/chr/plasmid_pNRC200/' haloHalo1-pNRC200-genbank-cds.bed.tmp > haloHalo1-pNRC200-genbank-cds.bed
    # Check to see if coordinates for first entry in the above bed file are screwed up -- fix if needed
    gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-pep.txt haloHalo1-pNRC200-genbank-cds.pep
    gbArchaeGenome haloHalo1-pNRC200.gbk haloHalo1-params-xra.txt haloHalo1-pNRC200-genbank-cds.xra

    # Append the plasmid .bed and .xra rows to the main files.  The plasmid
    # .pep files are not appended; they are loaded separately below.
    cat haloHalo1-pNRC100-genbank-cds.bed haloHalo1-pNRC200-genbank-cds.bed >>  haloHalo1-genbank-cds.bed
    cat haloHalo1-pNRC100-genbank-cds.xra haloHalo1-pNRC200-genbank-cds.xra >> haloHalo1-genbank-cds.xra

    # Create the tables under their generic names (pepPred, minGeneInfo), then
    # rename them to gbProtCodePep / gbProtCodeXra before loading.
    hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/pepPred.sql
    hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/minGeneInfo.sql
    echo rename table pepPred to gbProtCodePep | hgsql haloHalo1
    echo rename table minGeneInfo to gbProtCodeXra | hgsql haloHalo1
    echo load data local infile \'haloHalo1-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1
    echo load data local infile \'haloHalo1-genbank-cds.xra\' into table gbProtCodeXra | hgsql haloHalo1
    echo load data local infile \'haloHalo1-pNRC100-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1
    echo load data local infile \'haloHalo1-pNRC200-genbank-cds.pep\' into table gbProtCodePep | hgsql haloHalo1

#genbank to genePred (DONE)
# Converts the combined CDS BED file into an extended genePred file and loads
# it as the refSeq table.

csh
# Pad each 6-column BED row out to 12 columns (columns 7-8 repeat start/end,
# followed by 0, 1, end-start, 0) before handing it to bedToGenePred.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' haloHalo1-genbank-cds.bed | bedToGenePred stdin tmp.gp
# Append five columns: the row number, name2 (an unset awk variable, so an
# empty field), "cmpl", "cmpl" and 0.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,NR,name2,"cmpl","cmpl",0}' tmp.gp  > tmp2.gp
# hard tab between quotes use ctrl-V then press tab
# The join output takes column 12 from the second field of the .xra file
# (2.2); every other column comes from tmp2.gp.
join -t "     " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 1.13 1.14 1.15  tmp2.gp haloHalo1-genbank-cds.xra > haloHalo1.gp
ldHgGene haloHalo1 refSeq haloHalo1.gp -predTab -genePredExt

# COG STUFF (DONE)
# Builds a three-column COG file from the .ptt files and loads it into the
# COG table, then loads the COG code table.
  # awk counts grep's output lines, so the first three matching lines are
  # dropped; fields 6, 8 and 7 are written tab-separated, in that order.
  grep COG /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.ptt | awk 'NR>3{OFS="\t";print $6,$8,$7}'  > COG
  hgsql haloHalo1 < /cluster/home/baertsch/kent/src/hg/lib/cogs.sql
  echo "load data local infile 'COG' into table COG" | hgsql haloHalo1
# load cog codes
  # The SQL file is taken from the metAce1 build directory.
  hgsql haloHalo1 < /cluster/data/metAce1/genbank/COGXra.sql

# GENBANK RNA Genes (listed in *.rnt files -- like *.ptt files)
# Turns the "start..end" location column of the .rnt files into a BED file:
# sequence name, start-1, end, field 7, score 1000, field 3.

    cd /cluster/data/haloHalo1/genbank
    # "chr" has to be a quoted string followed by a comma.  As a bare word it
    # is an unset awk variable, so the sequence-name column came out empty.
    # NOTE(review): all NC_00*.rnt files are concatenated, so rows from the
    # plasmid files are labelled "chr" as well, and NR>6 only skips lines at
    # the top of the combined stream -- confirm this is what is wanted.
    cat /projects/lowelab/db/Bacteria/Halobacterium_sp/NC_00*.rnt | sed 's/\.\./\t/' | awk 'NR>6{OFS="\t";print "chr", $1-1, $2, $7,1000, $3}' > haloHalo1-genbank-rna.bed
    # NOTE(review): the command below is incomplete -- it redirects from the
    # genbank directory itself, and no .sql file is named.  Fill in the table
    # definition file before running.
    hgsql haloHalo1 < /cluster/data/haloHalo1/genbank/

# GENBANK rRNA GENES (NOT QUITE DONE)
# Extracts the rRNA features from the main GenBank record as BED plus a
# gene/product table, and loads them as gbRRNA and gbRRNAXra.
    # Host is hgwdev, as in every other section (was misspelled "hgdev").
    ssh hgwdev

    # NOTE(review): there is no cd here; the input files are the ones created
    # in /cluster/data/haloHalo1/genbank by the protein-coding section.
    gbArchaeGenome -kind=rRNA haloHalo1.gbk haloHalo1-params-bed.txt haloHalo1-rrnas.bed
    echo 'gene product NA' > haloHalo1-params-rrna-xra.txt
    gbArchaeGenome -kind=rRNA haloHalo1.gbk haloHalo1-params-rrna-xra.txt haloHalo1-rrnas-xra.txt
    hgLoadBed haloHalo1 gbRRNA haloHalo1-rrnas.bed
    # Create the table as minGeneInfo, then rename it before loading.
    hgsql haloHalo1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table minGeneInfo to gbRRNAXra | hgsql haloHalo1
    echo load data local infile \'haloHalo1-rrnas-xra.txt\' into table gbRRNAXra | hgsql haloHalo1

# TODD LOWE tRNA GENES ()
# Loads the hand-made tRNA BED file into the loweTrnaGene table.

    # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
    # Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
    # **Showing the tRNAScanSE instructions would be nice in the future.  
    ssh hgwdev
    mkdir /cluster/data/haloHalo1/bed/loweTrnaGene
    cd /cluster/data/haloHalo1/bed/loweTrnaGene
    # haloHalo1-lowe-trnas.bed must already be in this directory.
    # $HOME rather than "~": the shell expands a tilde only at the start of a
    # word, so "-sqlTable=~/..." would hand hgLoadBed a literal "~".
    hgLoadBed -tab haloHalo1 loweTrnaGene haloHalo1-lowe-trnas.bed -sqlTable=$HOME/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES ()
# Loads the hand-made snoRNA BED file into the loweSnoGene table.
    # This is a bed 6 file created by hand.
    ssh hgwdev
    mkdir /cluster/data/haloHalo1/bed/loweSnoGene
    cd /cluster/data/haloHalo1/bed/loweSnoGene
    # haloHalo1-snos.bed must already be in this directory; no -sqlTable is
    # given for this load.
    hgLoadBed -tab haloHalo1 loweSnoGene haloHalo1-snos.bed

# TIGR GENES (DONE)
# Converts the tab-delimited TIGR CMR export to BED and loads it into the
# tigrCmrORFs table.
    # First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
    # and fill out the web form as follows:
    #   - Pick "Retrieve attributes for the specified DNA feature within a specific 
    #     organism and/or a specific role category".
    #       * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs" 
    #         from the 1st and 3rd box.
    #         NOTE(review): the organism named here looks carried over from
    #         another build; for this database presumably the Halobacterium sp.
    #         NRC-1 entry should be picked -- confirm.
    #       * Select everything from "Choose TIGR Annotation Gene Attributes"
    #       * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
    #       * Select everything from "Choose Other Gene Attributes"
    #   - Click submit, and click save as tab-delimited file.
    ssh hgwdev
    mkdir /cluster/data/haloHalo1/bed/tigrCmrORFs
    # The saved export is expected as haloHalo1-tigr.tab in the current directory.
    cp haloHalo1-tigr.tab /cluster/data/haloHalo1/bed/tigrCmrORFs
    cd /cluster/data/haloHalo1/bed/tigrCmrORFs
    /projects/lowelab/users/aamp/bin/i386/tigrCmrToBed haloHalo1-tigr.tab haloHalo1-tigr.bed
    # $HOME rather than "~": the shell expands a tilde only at the start of a
    # word, so "-sqlTable=~/..." would hand hgLoadBed a literal "~".
    hgLoadBed -tab haloHalo1 tigrCmrORFs haloHalo1-tigr.bed -sqlTable=$HOME/kent/src/hg/lib/tigrCmrGene.sql