# This file describes how the proteins070202 database was
# built.  See also the sp070202 database doc, which needs
# to be built first.

hgsqladmin create proteins070202

cd /cluster/data/proteins
mkdir 070202
cd 070202
mkdir build

# Get HGNC data
~/src/hg/protein/getHgnc.pl >hgnc.tab
hgsql proteins070202 < ~/src/hg/lib/hgnc.sql
hgsql proteins070202 -e 'load data local infile "hgnc.tab" into table hgnc ignore 1 lines'

pbHgnc 070202

hgsql proteins070202 < ~/src/hg/lib/hgncXref.sql
hgsql proteins070202 -e 'load data local infile "hgncXref.tab" into table hgncXref'

# Note: please notify Elspeth Bruford at HGNC (hgnc@genenames.org) that there
# is a new mapping to their names.  Here is the command that produced the file
# Ann gave them:
# hgsql -Ne
#   "SELECT geneSymbol,hgncId,transcript
#    FROM knownCanonical,kgXref,proteome.hgncXref
#    WHERE kgXref.kgID=knownCanonical.transcript
#    AND kgXref.geneSymbol = proteome.hgncXref.symbol"
#   hg18 > hg18.hgncId
# -- b0b 12-26-2007
# -- ASZ updated 01-03-2008
# Actually, they have automated this task, so we no longer need to notify
# them.  (ASZ: 02-04-2008)

# make sure the pfamXref program is available:
cd /cluster/data/proteins/070202/build

# JK - GOT STUCK HERE FROM NO hugo table.
spToProteins 070202

# Continue the proteins070202 DB build (STARTED 3/07/07, DONE 3/9/07, Fan).

# FIRST BUILD spXref2 and spXref3 tables.

mkdir /cluster/data/proteins/070202/build
cd /cluster/data/proteins/070202/build

# spToProteins seems to have trouble finishing (consistent "next row failed" errors).
# I suspect it might be due to the size of UniProt (2.7 million rows grew to 3.9 million rows).
# So, use hgsql instead.

# get records with HUGO symbol and description
hgsql sp070202 -N -e 'select d.acc, d.val, taxon, id.bioentryID, 2-isCurated, des.val, hx.symbol, hx.description from displayId d, info i, accToTaxon t, bioentryID id, description des, proteins070202.hgncXref hx where d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc and hx.uniProt=d.acc' >spXref3a.tab

# get records with HUGO symbol and description for variant splice proteins
hgsql sp070202 -N -e 'select v.varAcc, v.varAcc, taxon, id.bioentryID, 2-isCurated, des.val, hx.symbol, hx.description from varAcc v, displayId d, info i, accToTaxon t, bioentryID id, description des, proteins070202.hgncXref hx where v.parAcc=d.acc and d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc and hx.uniProt=d.acc' >spXref3av.tab

# get all records and set HUGO symbol and description to ""
hgsql sp070202 -N -e 'select d.acc, d.val, taxon, id.bioentryID, 2-isCurated, des.val, "", "" from displayId d, info i, accToTaxon t, bioentryID id, description des where d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc' >spXref3b.tab

# get all records and set HUGO symbol and description to "" for variant splice proteins
hgsql sp070202 -N -e 'select v.varAcc, v.varAcc, taxon, id.bioentryID, 2-isCurated, des.val, "", "" from varAcc v, displayId d, info i, accToTaxon t, bioentryID id, description des where v.parAcc=d.acc and d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc' >spXref3bv.tab
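# (Optional sanity check, not part of the original build: before combining the
# four extracts below, confirm that each one actually produced rows; an empty
# file here usually means a join against hgncXref went wrong.)
wc -l spXref3a.tab spXref3av.tab spXref3b.tab spXref3bv.tab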
# collect all data from the resulting 4 files
cat spXref3a.tab spXref3b.tab spXref3av.tab spXref3bv.tab >spXref3.tab

# load into the spXref3 table
hgLoadSqlTab proteins070202 spXref3 ~/kent/src/hg/lib/spXref3.sql spXref3.tab

# load the same data into a second table
hgLoadSqlTab proteins070202 spXref3B ~/kent/src/hg/lib/spXref3.sql spXref3.tab

# remove records that have an empty HUGO symbol while other records
# of the same accession do have non-empty HUGO symbols
hgsql proteins070202 -e 'delete a from spXref3 a, spXref3B b where a.accession=b.accession and a.hugoSymbol="" and b.hugoSymbol!=""'

# Build spOrganism table
hgsql sp070202 -N -e 'select d.val, taxon from displayId d, accToTaxon t where d.acc=t.acc' >spOrganism.tab
hgLoadSqlTab proteins070202 spOrganism ~/kent/src/hg/lib/spOrganism.sql ./spOrganism.tab

# Build spSecondaryID table
cd /cluster/data/proteins/070202
hgsql -e "select displayId.val, displayId.acc, otherAcc.val from displayId, \
otherAcc where otherAcc.acc = displayId.acc;" sp070202 \
| sed -e "1d" > spSecondaryID.tab
hgLoadSqlTab proteins070202 spSecondaryID ~/kent/src/hg/lib/spSecondaryID.sql ./spSecondaryID.tab

# Build pfamXref and pfamDesc tables
mkdir /san/sanvol1/scratch/proteins/pfam/070202
cd /san/sanvol1/scratch/proteins/pfam/070202
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz"
# 300 MB compressed, over 3 GB uncompressed
gunzip Pfam-A.full.gz

pfamXref proteins070202 Pfam-A.full pfamADesc.tab pfamAXref.tab >pfamXref.log
gzip Pfam-A.full &

hgLoadSqlTab proteins070202 pfamDesc ~/kent/src/hg/lib/pfamDesc.sql pfamADesc.tab
hgLoadSqlTab proteins070202 pfamXref ~/kent/src/hg/lib/proteome_pfamXref.sql pfamAXref.tab

# Build the pdbSP table, new process using extDbRef data from spXXXXXX
cd /san/sanvol1/scratch/proteins/pfam/070202
hgsql sp070202 -N -e 'select extAcc1, d.val from extDbRef x, displayId d, extDb where x.acc=d.acc and extDb.val="PDB" and x.extDb=extDb.id' |sort -u >pdbSP.tab
hgLoadSqlTab proteins070202 pdbSP ~/kent/src/hg/lib/pdbSP.sql pdbSP.tab

# Build the spDisease table
hgsql -N -e \
'select comment.acc, displayId.val, commentVal.val from comment, commentVal, commentType ct, displayId where comment.commentType=ct.id and ct.val="DISEASE" and commentVal.id=comment.commentVal and displayId.acc=comment.acc;' \
sp070202 > spDisease.tab
hgLoadSqlTab proteins070202 spDisease ~/kent/src/hg/lib/spDisease.sql spDisease.tab

# create swInterPro table
wget --timestamping "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"
gzip -d protein2ipr.dat.gz

# rearrange column positions to match the old format
cut -f 1 protein2ipr.dat >j1
cut -f 2,3 protein2ipr.dat >j23
cut -f 4,5,6 protein2ipr.dat >j456
paste j1 j456 j23 > interProXref.tab
rm j1 j456 j23

hgLoadSqlTab proteins070202 interProXref ~/kent/src/hg/lib/interProXref.sql ./interProXref.tab

hgsql proteins070202 -N -e 'select accession, interProId from interProXref;' |sort -u >swInterPro.tab
hgLoadSqlTab proteins070202 swInterPro ~/kent/src/hg/lib/swInterPro.sql ./swInterPro.tab

# BUILD TABLES FOR pbGlobal (PB V1.1)
cd /cluster/data/proteins/070202
mkdir pbGlobal
cd pbGlobal

# Calculate pI values for all proteins
hgsql sp070202 -e "select acc from protein" >acc070202.lis
nice pbCalPi acc070202.lis sp070202 pi070202.tab
/cluster/home/fanhsu/bin/x86_64/pbCalPi acc070202.lis sp070202 pi070202.tab
hgLoadSqlTab proteins070202 pepPi ~/kent/src/hg/lib/pepPi.sql ./pi070202.tab
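# (Optional, not part of the original build: eyeball the pI calculation by
# comparing the number of input accessions with the rows written and loaded.)
wc -l acc070202.lis pi070202.tab
hgsql proteins070202 -N -e 'select count(*) from pepPi'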
# Build pepMwAa table
hgsql sp070202 -N -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
hgLoadSqlTab proteins070202 pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab

# Calculate global protein property distributions
nice pbCalDistGlobal sp070202 proteins070202

# Load the tables
hgLoadSqlTab proteins070202 pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql pepCCntDist.tab
hgLoadSqlTab proteins070202 pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql pepHydroDist.tab
hgLoadSqlTab proteins070202 pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql pepIPCntDist.tab
hgLoadSqlTab proteins070202 pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql pepMolWtDist.tab
hgLoadSqlTab proteins070202 pepPiDist ~/kent/src/hg/lib/pepPiDist.sql pepPiDist.tab
hgLoadSqlTab proteins070202 pepResDist ~/kent/src/hg/lib/pepResDist.sql pepResDist.tab

# Calculate global AA residue distributions
pbCalResStdGlobal 070202

# Load distribution tables:
hgLoadSqlTab proteins070202 pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql ./pbAnomLimit.tab
hgLoadSqlTab proteins070202 pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql ./pbResAvgStd.tab

# Get taxonomy names table from NCBI.
cd /cluster/data/proteins/070202
mkdir taxon
cd taxon
wget --timestamping ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
unzip taxdmp.zip

# Create table taxonNames in proteins070202
# Load from the file names.dmp into taxonNames table.
# hgsql proteins070202 -e 'drop table taxonNames'
hgsql proteins070202 < ~/kent/src/hg/lib/taxonNames.sql
hgsql proteins070202 -e 'load data local infile "names.dmp" into table taxonNames fields terminated by "|" enclosed by "\t";'
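# (Optional sanity check, not part of the original build: confirm the
# names.dmp load parsed cleanly by comparing line and row counts.)
wc -l names.dmp
hgsql proteins070202 -N -e 'select count(*) from taxonNames'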
# Load and edit pbStamp table
cd /cluster/data/proteins/070202
hgsql proteins060115 -N -e "select * from pbStamp" > pbStamp.tab
hgLoadSqlTab proteins070202 pbStamp ~/kent/src/hg/lib/pbStamp.sql ./pbStamp.tab

# Build uniProtAlias table
mkdir alias
cd alias
hgsql proteins070202 -N -e 'select accession, accession, "uAcc", "2006-01-15" from spXref3' >j1.tmp
hgsql proteins070202 -N -e 'select accession, accession2, "u2Ac", "2006-01-15" from spSecondaryID' >j2.tmp
hgsql proteins070202 -N -e 'select accession, displayID, "uDis", "2006-01-15" from spXref3' >j3.tmp
hgsql proteins040515 -N -e 'select accession, displayID, "oDis", "2004-05-15" from spXref3' >j4.tmp
# PLEASE NOTE: proteins041115 was later deleted by Fan when hgwdev mySQL space was at 100%.
# So, use proteins040515 instead.

cat j1.tmp j2.tmp j3.tmp j4.tmp >uniProtAlias.tab
rm j1.tmp j2.tmp j3.tmp j4.tmp

hgLoadSqlTab proteins070202 uniProtAlias ~/kent/src/hg/lib/uniProtAlias.sql ./uniProtAlias.tab
hgsql proteins070202 -e 'create index alias on uniProtAlias(alias)'

# Build spOldNew table
hgsql proteins050415 -N -e 'select * from spOldNew' >spOldNew.tab
hgLoadSqlTab proteins070202 spOldNew ~/kent/src/hg/lib/spOldNew.sql ./spOldNew.tab
hgsql proteins070202 -e 'create index oldDisplayId on spOldNew(oldDisplayId)'
hgsql proteins070202 -e 'create index newDisplayId on spOldNew(newDisplayId)'

# Build spDeleted table
# Download the list of deleted accession numbers from UniProt
cd /cluster/bluearc/fan/proteins/070202/build
wget --timestamping http://ca.expasy.org/txt/delac_tr.txt
cp delac_tr.txt spDeleted.tab

# Edit spDeleted.tab to get rid of all lines except the accession numbers
vi spDeleted.tab
hgLoadSqlTab proteins070202 spDeleted ~/kent/src/hg/lib/spDeleted.sql ./spDeleted.tab

# CREATE spVariant TABLE TO ENABLE UNIPROT VARIANT SPLICE ISOFORM PROCESSING
hgsql proteins070202 -N -e 'select accession,accession from spXref3' >j1
cat j1 |grep "-" |sed -e 's/-/\t/' |cut -f 1,3 >j2
cat j1 |grep -v "-" >j3
cat j2 j3 |sort -u >spVariant.tab
rm j1 j2 j3

hgLoadSqlTab proteins070202 spVariant ~/kent/src/hg/lib/spVariant.sql ./spVariant.tab
hgsql proteins070202 -e 'create index parent on spVariant(parent)'

# EXTEND pfamXref TO COVER VARIANT SPLICE PROTEINS
hgsql proteome -N -e \
'select pfamAC, variant, variant from spVariant, pfamXref where swissAC = parent and variant like "%-%";' \
>pfamXrefVar.tab
hgLoadSqlTab -append proteins070202 pfamXref ~/kent/src/hg/lib/pfamXref.sql ./pfamXrefVar.tab

# SWITCH SYMBOLIC PROTEIN DATABASE LINKS (TBD)
# Ask the system admin to switch the following symbolic database links:

    swissProt --> sp070202
    proteins  --> proteins070202

# Run some simple tests on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal
# to make sure things are running OK.

# First check to see if pbGateway and pbGlobal are working.
# Then edit pbStamp.tab to adjust the maximum y values for various stamps
# and load it into the pbStamp table until all the scales look reasonable.
# For this particular release, no adjustment seems necessary.

#########################################################
# CREATE REACTOME TABLES (DONE, Fan 3/9/07)

# Obtain the ucsc_events20.dat.txt and ucsc_entity20.dat files from Reactome.
# Contact: Gopinathrao, Gopal [gopinath@cshl.edu]

ssh hgwdev
mkdir /cluster/store10/reactome/reactome20
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome20 /cluster/data/reactome
cd /cluster/data/reactome
# save these two .dat files to this subdirectory.

hgsql proteins070202 -e 'drop table spReactomeEvent'
hgsql proteins070202 < ~/hg/lib/spReactomeEvent.sql
hgsql proteins070202 -e 'load data local infile "ucsc_events20.dat" into table spReactomeEvent ignore 4 lines'

hgsql proteins070202 -e 'drop table spReactomeId'
hgsql proteins070202 < ~/hg/lib/spReactomeId.sql
hgsql proteins070202 -e 'load data local infile "ucsc_entity20.dat" into table spReactomeId ignore 6 lines'
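# (Optional, not part of the original build: quick row counts to confirm that
# both Reactome tables loaded and the header-line skipping worked.)
hgsql proteins070202 -e 'select count(*) from spReactomeEvent; select count(*) from spReactomeId'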
# Generate lists for Reactome to link back to the UCSC site.
hgsql hg18 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg18.lis
hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis
hgsql mm8 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm8.lis
hgsql mm7 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm7.lis
hgsql rn4 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn4.lis

# !!! NOTE: Email the above 5 list files to Gopinathrao, Gopal [gopinath@cshl.edu]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES (DONE, Fan 6/5/07)

# Obtain the ucsc_events21.dat.txt and ucsc_entity21.dat files from Reactome.
# Contact: Gopinathrao, Gopal [gopinath@cshl.edu]

ssh hgwdev
mkdir /cluster/store10/reactome/reactome21
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome21 /cluster/data/reactome
cd /cluster/data/reactome
# save these two .dat files to this subdirectory.

cp ucsc_events21.dat ucsc_events21.tab
cp ucsc_entity21.dat ucsc_entity21.tab
# Edit the two .tab files to remove the header lines at the top.

hgsql proteins070202 -e 'drop table spReactomeEvent'
hgsql proteins070202 < ~/hg/lib/spReactomeEvent.sql
hgsql proteins070202 -e 'load data local infile "ucsc_events21.tab" into table spReactomeEvent'

hgsql proteins070202 -e 'drop table spReactomeId'
hgsql proteins070202 < ~/hg/lib/spReactomeId.sql
hgsql proteins070202 -e 'load data local infile "ucsc_entity21.tab" into table spReactomeId'

# Generate lists for Reactome to link back to the UCSC site.
hgsql hg18 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg18.lis
hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis
hgsql mm8 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm8.lis
hgsql mm7 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm7.lis
hgsql rn4 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn4.lis

# !!! NOTE: Email the above 5 list files to Gopinathrao, Gopal [gopinath@cshl.edu]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.
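# (A non-interactive alternative to the hand-editing step above, and to the
# identical step in the release-24 update below.  The line counts assume the
# headers are still 4 lines for the events file and 6 for the entity file, as
# in the release-20 load; check with head before trusting them.)
head ucsc_events21.dat ucsc_entity21.dat
tail -n +5 ucsc_events21.dat > ucsc_events21.tab
tail -n +7 ucsc_entity21.dat > ucsc_entity21.tab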
#########################################################
# UPDATE REACTOME TABLES (DONE, Fan 4/7/08)

# Obtain the ucsc_events24.dat.txt and ucsc_entity24.dat files from Reactome.
# Contact: Gopinathrao, Gopal [gopinath@cshl.edu]

ssh hgwdev
mkdir /cluster/store10/reactome/reactome24
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome24 /cluster/data/reactome
cd /cluster/data/reactome
# save these two .dat files to this subdirectory.

cp ucsc_events24.dat ucsc_events24.tab
cp ucsc_entity24.dat ucsc_entity24.tab
# Edit the two .tab files to remove the header lines at the top.

hgsql proteins070202 -e 'drop table spReactomeEvent'
hgsql proteins070202 < ~/hg/lib/spReactomeEvent.sql
hgsql proteins070202 -e 'load data local infile "ucsc_events24.tab" into table spReactomeEvent'

hgsql proteins070202 -e 'drop table spReactomeId'
hgsql proteins070202 < ~/hg/lib/spReactomeId.sql
hgsql proteins070202 -e 'load data local infile "ucsc_entity24.tab" into table spReactomeId'

# Generate lists for Reactome to link back to the UCSC site.
hgsql hg18 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg18.lis
hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis
hgsql mm9 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm9.lis
hgsql mm8 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm8.lis
hgsql rn4 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId, proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn4.lis

# !!! NOTE: Email the above 5 list files to Gopinathrao, Gopal [gopinath@cshl.edu]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

####################################################################################
# UPDATE HGNC DATA (DONE, Fan, 4/10/08)
#
# Go to http://www.genenames.org/cgi-bin/hgnc_downloads.cgi
# and generate the Perl program by selecting the 25 variables we use,
# then replace ~/src/hg/protein/getHgnc.pl with this new program and check it in.

# Get HGNC data
~/src/hg/protein/getHgnc.pl >hgnc.tab

hgsql proteins070202 -e 'delete from hgnc'
hgsql proteins070202 -e 'load data local infile "hgnc.tab" into table hgnc ignore 1 lines'

pbHgnc 070202

hgsql proteins070202 -e 'delete from hgncXref'
hgsql proteins070202 -e 'load data local infile "hgncXref.tab" into table hgncXref'
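# (Optional, not part of the original build: confirm the refreshed HGNC tables
# are populated before anything downstream uses them.)
hgsql proteins070202 -e 'select count(*) from hgnc; select count(*) from hgncXref'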