##########################################################################
# CREATE H1N1 DATABASE (STARTED 4/26/09, Fan) 

# for emacs: -*- mode: sh; -*-

# DOWNLOAD SEQUENCE (DONE, Fan, 4/26/09)

    ssh hgwdev
    mkdir /hive/data/genomes/h1n1
    cd /hive/data/genomes/h1n1

# Get H1N1 sequences from GISAID/EpiFluDB
    
    mkdir download
    cd download

# downloaded the following sequences:

    epiflu_0421_dna_sequence.fasta
    epiflu_0425_dna_sequence.fasta
    epiflu_0421_protein_sequence.fasta
    epiflu_0425_protein_sequence.fasta

# translate to nib
    cd ..

    ln -s /hive/data/genomes/h1n1 ~/h1n1
    cd ~/h1n1
    mkdir nib

# use sequence segments in epiflu_0421_dna_sequence.fasta as the base genome

    fgrep -v ">" download/epiflu_0421_dna_sequence.fasta >j.1
    echo ">chr1" >j.0
    cat j.0 j.1 >chr1.fa
    rm j.0 j.1

    faToNib chr1.fa nib/chr1.nib

# CREATING DATABASE (DONE 4/26/09)

    # Create the h1n1 database.

    ssh hgwdev
    echo 'create database h1n1' | hgsql ''

    # make a semi-permanent read-only alias:
    alias h1n1 "mysql -u hguser -phguserstuff -A h1n1"

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/26/09)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp" \
      | hgsql h1n1

    # remove ENCODE groups
    echo "delete from grp where name like 'encode%'" | hgsql h1n1

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 4/26/09)

    # Make symbolic links from /gbdb/h1n1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/h1n1/nib
    ln -s /hive/data/genomes/h1n1/nib/chr1.nib /gbdb/h1n1/nib/chr1.nib 

    # Load /gbdb/h1n1/nib paths into database and save size info.
    hgsql h1n1  < ~/src/hg/lib/chromInfo.sql

    hgNibSeq -preMadeNib h1n1 /gbdb/h1n1/nib chr1.fa
    echo "select chrom,size from chromInfo" | hgsql -N h1n1 > chrom.sizes
    
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR H1N1 (DONE 08/02/06)
    echo 'insert into defaultDb values("A/California/04/2009(EPI_ISL_29573)", "h1n1");' \
      | hgsql -h genome-testdb hgcentraltest

    echo 'insert into dbDb values("h1n1", "Apr. 2009", \
          "/gbdb/h1n1/nib", "Human case of H1N1 swine influenza", "chr1", 1, 99.5, "A H1N1","Human case of H1N1 swine influenza", "/gbdb/h1n1/html/description.html", 0, 0, "GISAID sequence as of Apr. 21st, 2009", 99999);' \
      | hgsql -h genome-testdb hgcentraltest

    echo 'insert into genomeClade values("A H1N1", "other", 100);'\
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add h1n1 in all the right places

    vi makefile

    make update
    make alpha
    cvs commit makefile

# START BLAT SERVERS (on hgwdev for now, DONE 4/26/09, Fan)
    
    cd /hive/data/genomes/h1n1/nib
    gfServer start hgwdev 18891 -trans -mask -log=gfServer.trans.log chr1.nib &
    gfServer start hgwdev 18892 -stepSize=5  -log=gfServer.log       chr1.nib &

    # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR H1N1 

    echo 'insert into blatServers values("h1n1", "hgwdev", "18891", "1", "0"); \
          insert into blatServers values("h1n1", "hgwdev", "18892", "0", "0");' \
      | hgsql -h genome-testdb hgcentraltest

# CREATE h1n1Gene TRACK (DONE. 4/26/09, Fan)

    mkdir /hive/data/genomes/h1n1/bed/h1n1Gene
    cd    /hive/data/genomes/h1n1/bed/h1n1Gene 

    cp ../../download/epiflu_0421_dna_sequence.fasta h1n1Gene.fa
    # manually edit h1n1Gene.fa
    # change line like:
    # >EPI176470 | HA | A/California/04/2009 | EPI_ISL_29573 | 2009712049_seg4 | H1N1
    # into:
    # >HA EPI176470

    gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
    -out=psl -t=dna -q=dna h1n1Gene.fa h1n1GenePsl.psl

    hgLoadPsl h1n1 h1n1GenePsl.psl
    hgsql h1n1 -N -e 'select tName, tStart, tEnd, qName from h1n1GenePsl' >h1n1Gene.bed
    hgLoadBed h1n1  h1n1Gene h1n1Gene.bed

# CREATE h1n1Seq TRACK (DONE.  4/26/09, Fan)

    mkdir –p /hive/data/genomes/h1n1/bed/h1n1Seq
    cd /hive/data/genomes/h1n1/bed/h1n1Seq

# get h1n1Seq sequences
    cat ../../download/*dna*.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >h1n1Seq.fa

# create .fa file

    mkdir -p /gbdb/h1n1/h1n1Seq
    cp -p h1n1Seq.fa /gbdb/h1n1/h1n1Seq/h1n1Seq.fa

    hgLoadSeq –replace h1n1 /gbdb/h1n1/h1n1Seq/h1n1Seq.fa

# BLAT
    gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
    -out=psl -t=dna -q=dna h1n1Seq.fa h1n1Seq.psl

# load the psl result into h1n1Seq table
    hgLoadPsl h1n1 h1n1Seq.psl

# CREATE humanHA TRACK (DONE.  4/26/09, Fan)

    mkdir -p /hive/data/genomes/h1n1/bed/humanHA
    cd /hive/data/genomes/h1n1/bed/humanHA

# get humanHA sequences
    cat ../../download/humanHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >humanHA.fa

# create .fa file

    mkdir -p /gbdb/h1n1/humanHA
    cp -p humanHA.fa /gbdb/h1n1/humanHA/humanHA.fa

    hgLoadSeq -replace h1n1 /gbdb/h1n1/humanHA/humanHA.fa

# BLAT
    gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
    -out=psl -t=dna -q=dna humanHA.fa humanHA.psl

# load the psl result into humanHA table
    hgLoadPsl h1n1 humanHA.psl

# CREATE swineHA TRACK (DONE.  4/26/09, Fan)

    mkdir -p /hive/data/genomes/h1n1/bed/swineHA
    cd /hive/data/genomes/h1n1/bed/swineHA

# get swineHA sequences
    cat ../../download/swineHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >swineHA.fa

# create .fa file

    mkdir -p /gbdb/h1n1/swineHA
    cp -p swineHA.fa /gbdb/h1n1/swineHA/swineHA.fa

    hgLoadSeq -replace h1n1 /gbdb/h1n1/swineHA/swineHA.fa

# BLAT
    gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
    -out=psl -t=dna -q=dna swineHA.fa swineHA.psl

# load the psl result into humanHA table
    hgLoadPsl h1n1 swineHA.psl

#######################################################################################
# CREATE h1n1_0514Seq TRACK (DONE.  5/15/09, Fan)

    mkdir -p /hive/data/genomes/h1n1/bed/h1n1_0514Seq
    cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq

# create .fa files from downloaded .fasta files

    mkdir -p /gbdb/h1n1/h1n1_0514Seq
    cd ../../download/0514
    
    dos2unix < epiflu_0514_dna_sequence.fasta > epiflu_0514_dna_sequence.fa 
    dos2unix < epiflu_0514_protein_sequence.fasta > epiflu_0514_protein_sequence.fa 
    
    cp -p ../../download/0514/epiflu_0514_dna_sequence.fa /gbdb/h1n1/h1n1_0514Seq

    cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq

    hgLoadSeq -replace h1n1 /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa 

# BLAT, make sure the port number is correct.

    gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
    -out=psl -t=dna -q=dna /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa h1n1_0514Seq.psl

# load the psl result into h1n1_0514Seq table
    hgLoadPsl h1n1 h1n1_0514Seq.psl

# update h1n1SeqXref table

    fgrep ">" /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa|\
    sed -e 's/ | /\t/g' |sed -e "s/>//" > h1n1SeqXref.tab

    hgsql h1n1 -e 'delete from h1n1SeqXref'
    hgsql h1n1 -e 'load data local infile "h1n1SeqXref.tab" into table h1n1SeqXref'

# create dna sequences table

    cp -p ~/h1n1/download/0514/epiflu_0514_dna_sequence.fa j1

    ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab
    cut -f 1 j2.tab >jj1
    cut -f 2 j2.tab >jseq

    cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj
    cut -f 1 jj > jDnaId

    paste jDnaId jseq >epiflu_0514_dna_sequence.tab

    hgLoadSqlTab h1n1 dnaSeq ~/src/hg/lib/dnaSeq.sql epiflu_0514_dna_sequence.tab

    rm j1 jj jj1 jseq jDnaId j2.tab

# create protein sequences file with appropriate IDs

    cp -p ~/h1n1/download/0514/epiflu_0514_protein_sequence.fa j1

    ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab
    cut -f 1 j2.tab >jj1
    cut -f 2 j2.tab >jseq

    cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj
    cut -f 1 jj > jDnaId
    cut -f 2 jj > j2
    cut -f 1,2 jj|sed -e 's/\t/_/' |\
    sed -e 's/_PB2//'|\
    sed -e 's/_HA//'|\
    sed -e 's/_NS//'|\
    sed -e 's/_PA//'|\
    sed -e 's/_NA//'|\
    sed -e 's/_NP//'|\
    sed -e 's/-/_/'|\
    sed -e 's/_PB1//' > jProtId

    paste jDnaId jProtId >h1n1DnaProt.tab
    hgsql h1n1 -e 'delete from h1n1DnaProt'
    hgsql h1n1 -e 'load data local infile "h1n1DnaProt.tab" into table h1n1DnaProt'

# remove _F2 sequences, they are too short
    paste jProtId jseq |sort -u  >epiflu_0514_protein_sequence.tab

    tabToFa epiflu_0514_protein_sequence

    mkdir -p /gbdb/h1n1/h1n1_0514ProtSeq
    cp epiflu_0514_protein_sequence.fa /gbdb/h1n1/h1n1_0514ProtSeq

# BLAT the protein sequences, make sure port number is correct.

    gfClient -minScore=50 -minIdentity=60 -nohead hgwdev.cse.ucsc.edu 18891  /gbdb/h1n1/nib \
      -q=prot -out=psl -t=dnax  /gbdb/h1n1/h1n1_0514ProtSeq/epiflu_0514_protein_sequence.fa testPsl0.psl

    hgsql h1n1 -e 'delete from aaSeq'
    hgsql h1n1 -e 'load data local infile "epiflu_0514_protein_sequence.tab" into table aaSeq'

    rm j*

    cat testPsl0.psl |grep -v "_NEP"|grep -v "_M2"|grep -v "_F2"|sed -e 's/_M1//' >testPsl.psl
    hgLoadPsl h1n1 testPsl.psl

# construct CDS info based on corresponding DNA and protein alignments.

    hgsql h1n1 -N -e \
    'select p.qName,  p.tStart-d.tStart+d.qStart+1, "xxx", d.qStart+(d.tEnd-d.tStart)-(d.tEnd-p.tEnd) from testPsl p, h1n1_0514Seq d, h1n1DnaProt n where dnaId=d.qName and d.qName=p.qName' |\
    sed -e 's/\txxx\t/\.\./' >h1n1_0514SeqCds.tab

    hgLoadSqlTab h1n1 h1n1_0514SeqCds  ~/hg/lib/cdsSpec.sql h1n1_0514SeqCds.tab

####################################################################################