#!/bin/tcsh -efx

# uses the following binaries: pbHgnc spToSpXref2

# This file describes how the proteome database was
# built.  See also the uniProt database doc, which needs
# to be built first.

# Build parameters.  $kent is the source tree whose src/hg/lib/*.sql files
# supply the table definitions used below; $date and $spdate are the date
# stamps that form the names of the proteins and sp databases.
set kent = ~/kent
set date = 120806
set spdate = 120323
set db = proteins$date
set spdb = sp$spdate

# Create the proteins database for this build.
hgsqladmin create $db

# Per-build work area; the build/ subdirectory is used later for the
# intermediate spXref3 and interPro files.
cd /hive/data/inside/proteins
mkdir -p $date
cd $date
mkdir build

# Get HGNC data
# The download is a tab-format dump (format=text) of the columns listed in
# the URL, restricted to the "Approved" and "Entry Withdrawn" statuses.

wget -O hgnc.tab "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date2app_or_res&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_mgd_id&col=gd_other_ids&col=gd_other_ids_list&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=gd_gene_fam_name&col=gd_gene_fam_pagename&col=gd_record_type&col=gd_primary_ids&col=gd_secondary_ids&col=gd_ccds_ids&col=gd_vega_ids&col=gd_lsdb_links&col=md_gdb_id&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&col=md_mgd_id&col=md_rgd_id&status=Approved&status=Entry+Withdrawn&status_opt=1&where=&order_by=gd_app_sym_sort&format=text&limit=&submit=submit&.cgifields=&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag"

# Create the hgnc table and load the download, skipping its header line.
hgsql $db < $kent/src/hg/lib/hgnc.sql
hgsql $db -e 'load data local infile "hgnc.tab" into table hgnc ignore 1 lines'
 
# NOTE(review): pbHgnc presumably writes the hgncXref.tab that is loaded
# below -- confirm against the pbHgnc source.
pbHgnc $date
hgsql $db < $kent/src/hg/lib/hgncXref.sql
hgsql $db -e 'load data local infile "hgncXref.tab" into table hgncXref'

# BUILD spXref2 and spXref3 tables.
cd /hive/data/inside/proteins/$date/build

# Create a bioentry table to use as an internal replacement for accession.
# Takes a minute
hgsql $spdb -e "create table bioentryID (acc char(16) NOT NULL, bioentryID int not null auto_increment, primary key (bioentryID), unique (acc))"
hgsql $spdb -e "insert into bioentryID (acc) select acc from $spdb.info order by acc"

# The four queries below all emit the same eight columns:
#   accession, display ID, taxon, bioentryID, 2-isCurated, description,
#   HUGO symbol, HUGO description
# The "a" files carry the symbol/description from $db.hgncXref; the "b"
# files cover every record with both of those columns empty.  The "v"
# files are the same queries for variant splice accessions (varAcc),
# joined to their parent record through varAcc.parAcc.

# get records with HUGO symbol and descriptions
hgsql $spdb -N -e "select d.acc, d.val, taxon, id.bioentryID, 2-isCurated, des.val,hx.symbol, hx.description from displayId d, info i, accToTaxon t, bioentryID id, description des, $db.hgncXref hx where  d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc and hx.uniProt=d.acc " >spXref3a.tab

# get records with HUGO symbol and descriptions for variant splice proteins
hgsql $spdb -N -e "select v.varAcc, v.varAcc, taxon, id.bioentryID, 2-isCurated, des.val,hx.symbol, hx.description from varAcc v, displayId d, info i, accToTaxon t, bioentryID id, description des, $db.hgncXref hx where v.parAcc=d.acc and d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc and hx.uniProt=d.acc " >spXref3av.tab

# get all records and set HUGO symbol and description to ""

hgsql $spdb -N -e 'select d.acc, d.val, taxon, id.bioentryID, 2-isCurated, des.val,"", "" from displayId d, info i, accToTaxon t, bioentryID id, description des where  d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc ' >spXref3b.tab

# get all records and set HUGO symbol and description to "" for variant splice proteins

hgsql $spdb -N -e 'select v.varAcc, v.varAcc, taxon, id.bioentryID, 2-isCurated, des.val,"", "" from varAcc v, displayId d, info i, accToTaxon t, bioentryID id, description des where v.parAcc=d.acc and  d.acc=t.acc and d.acc=i.acc and d.acc=id.acc and d.acc=des.acc ' >spXref3bv.tab

# Clean up temp table.
hgsql $spdb -e "drop table bioentryID"

# collect all data from the resulting 4 files.

cat spXref3a.tab  spXref3b.tab  spXref3av.tab spXref3bv.tab >spXref3.tab

# load into the spXref3 table
hgLoadSqlTab $db spXref3 $kent/src/hg/lib/spXref3.sql spXref3.tab

# load the same data into a second temp table
hgLoadSqlTab $db spXref3B $kent/src/hg/lib/spXref3.sql spXref3.tab

# Remove records that have an empty HUGO symbol when other records
# of the same accession do have a non-empty HUGO symbol.  The second copy
# (spXref3B) is what lets the delete compare the table against itself.
hgsql $db -e 'delete spXref3 from spXref3, spXref3B where spXref3.accession=spXref3B.accession and spXref3.hugoSymbol="" and spXref3B.hugoSymbol!=""'

# Drop temp table
hgsql $db -e "drop table spXref3B"


# This section was NOT run for this build.  It is wrapped in a real
# `if (0) then ... endif` because a line starting with `#ifdef` is only a
# shell comment and would not stop the commands below from executing.
if (0) then # DIDNTDO
######################################################################
# BUILD spXref2 TABLE 

cd /hive/data/inside/proteins/$date/build
# This step takes several hours and produces a table of ~100 million lines.
spToSpXref2 $spdate >spXref2.tab


# create and load the big spXref2 table
hgsql $db < $kent/src/hg/lib/spXref2.sql
cd /hive/data/inside/proteins/$date/build
hgsql $db -e 'load data local infile "spXref2.tab" into table spXref2'

# create the indices, took about 40 minutes for each index. 

hgsql $db -e 'create index displayID on spXref2(displayID)'
hgsql $db -e 'create index extAC on spXref2(extAC)'
hgsql $db -e 'create index accession on spXref2(accession)'


########################################################################
# Build spOrganism table
# One row per record: display ID and taxon.
hgsql $spdb -N -e 'select d.val, taxon from displayId d, accToTaxon t where d.acc=t.acc ' >spOrganism.tab

hgLoadSqlTab $db spOrganism $kent/src/hg/lib/spOrganism.sql ./spOrganism.tab
endif # DIDNTDO

#	Build spSecondaryID table
# Each row is: display ID, primary accession, secondary (other) accession.
# -N suppresses the column-name header, as in the other extractions in this
# file; with no trailing pipe, a failing hgsql also stops the script under -e.
cd /hive/data/inside/proteins/$date
hgsql $spdb -N -e 'select displayId.val, displayId.acc, otherAcc.val from displayId, otherAcc where otherAcc.acc = displayId.acc;' > spSecondaryID.tab

hgLoadSqlTab $db spSecondaryID $kent/src/hg/lib/spSecondaryID.sql ./spSecondaryID.tab

#	Build pfamXref and pfamDesc tables
# First get pfam info into /hive/data/outside/pfam/current/Pfam-A.full.gz somehow.
# Did it this time with
#   wget --timestamping ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz
# Note that the output files below are written into the pfam download
# directory, not into the per-build proteins directory.
cd    /hive/data/outside/pfam/current/

# Stream the uncompressed Pfam file into pfamXref; it is given the two
# output file names pfamADesc.tab and pfamAXref.tab, and its stdout is kept
# in pfamXref.log.
gunzip -c Pfam-A.full.gz | pfamXref $db stdin pfamADesc.tab pfamAXref.tab >pfamXref.log

hgLoadSqlTab $db pfamDesc $kent/src/hg/lib/pfamDesc.sql pfamADesc.tab


hgLoadSqlTab $db pfamXref $kent/src/hg/lib/pfamXref.sql pfamAXref.tab

#	Build the pdbSP table, new process using extDbRef data from spXXXXXX

cd /hive/data/inside/proteins/$date

# Pair each external accession (extAcc1) of the external database named
# "PDB" with the display ID of the record that references it; sort -u drops
# duplicate pairs.
hgsql $spdb -N -e 'select extAcc1, d.val from extDbRef x, displayId d, extDb where x.acc=d.acc and extDb.val="PDB" and x.extDb=extDb.id'|sort -u >pdbSP.tab

hgLoadSqlTab $db pdbSP $kent/src/hg/lib/pdbSP.sql pdbSP.tab

#	Build the spDisease table
cd /hive/data/inside/proteins/$date

# Select accession, display ID and comment text for every comment whose
# comment type is "DISEASE".
hgsql -N -e \
'select comment.acc, displayId.val, commentVal.val from comment, commentVal, commentType ct, displayId where comment.commentType=ct.id and ct.val="DISEASE" and commentVal.id=comment.commentVal and displayId.acc=comment.acc;' \
$spdb > spDisease.tab
# Sanity check on the size of the extract; the counts recorded below are
# from the run that was logged here.
wc  spDisease.tab
#   4909  289325 2100354 spDisease.tab

hgLoadSqlTab $db spDisease $kent/src/hg/lib/spDisease.sql spDisease.tab

# create swInterPro table

cd /hive/data/inside/proteins/$date/build
# Fetch interpro file, last time like so:
wget --timestamping "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz"
gunzip protein2ipr.dat.gz
# Rearrange column positions to match the old format.
# protein2ipr.dat is tab-separated and its name column contains spaces, so
# both the input and output separators must be a tab: splitting on
# whitespace would break the name into several fields, and hgLoadSqlTab
# expects tab-separated input.
awk 'BEGIN {FS = "\t"; OFS = "\t"} {print $1,$4,$5,$6,$2,$3}' protein2ipr.dat > interProXref.tab

hgLoadSqlTab $db  interProXref $kent/src/hg/lib/interProXref.sql ./interProXref.tab

# swInterPro is the distinct (accession, interProId) pairs of interProXref.
hgsql $db  -N -e 'select accession, interProId from interProXref;'|sort -u >swInterPro.tab
hgLoadSqlTab $db  swInterPro $kent/src/hg/lib/swInterPro.sql ./swInterPro.tab

# Build the uniProtAlias and spOldNew tables.
cd /hive/data/inside/proteins/$date
mkdir alias
cd alias

# this is very funky, but I'm leaving it in for the moment (braney)
# Each extract yields four columns: accession, alias, a fixed source tag
# and a fixed date string.  j1-j3 come from the current build's database;
# j4 reads display IDs from the hard-coded older database proteins040515.
hgsql $db -N -e 'select accession, accession,  "uAcc", "2006-01-15" from spXref3' >j1.tmp
hgsql $db -N -e 'select accession, accession2, "u2Ac", "2006-01-15" from spSecondaryID' >j2.tmp
hgsql $db -N -e 'select accession, displayID,  "uDis", "2006-01-15" from spXref3' >j3.tmp
hgsql proteins040515 -N -e 'select accession, displayID,  "oDis", "2004-05-15" from spXref3' >j4.tmp

cat j1.tmp j2.tmp j3.tmp j4.tmp >uniProtAlias.tab
rm j1.tmp j2.tmp j3.tmp j4.tmp

hgLoadSqlTab $db uniProtAlias $kent/src/hg/lib/uniProtAlias.sql ./uniProtAlias.tab
hgsql $db -e 'create index alias on uniProtAlias(alias)'

# spOldNew is copied unchanged from the hard-coded database proteins111004
# rather than being regenerated for this build.
hgsql proteins111004 -N -e 'select * from spOldNew' >spOldNew.tab
hgLoadSqlTab $db spOldNew $kent/src/hg/lib/spOldNew.sql ./spOldNew.tab

hgsql $db -e 'create index oldDisplayId on spOldNew(oldDisplayId)'
hgsql $db -e 'create index newDisplayId on spOldNew(newDisplayId)'

# CREATE spVariant TABLE TO ENABLE UNIPROT VARIANT SPLICE ISOFORM PROCESSING 
cd /hive/data/inside/proteins/$date

# j1: every spXref3 accession, written twice per line.
hgsql $db  -N -e 'select accession,accession from spXref3' >j1
# j2: for accessions containing "-", turn the first "-" into a tab and keep
# fields 1 and 3, i.e. the text before the dash and the full accession.
grep -e "-" j1 | sed -e 's/-/\t/' | cut -f 1,3 >j2
# j3: accessions without "-" are kept as-is.  Written with ">" rather than
# ">>" so that a j3 left behind by an interrupted earlier run cannot leak
# stale rows into the table.
grep -v -e "-" j1 >j3
sort -u j2 j3 >spVariant.tab
rm j1 j2 j3


hgLoadSqlTab $db  spVariant $kent/src/hg/lib/spVariant.sql ./spVariant.tab
hgsql $db  -e 'create index parent on spVariant(parent)'

#this is build of reactome tables
# The Reactome v41 files are loaded into the database named "proteome"
# (not $db).  sed drops the leading header lines of each file -- four for
# events, six for entities -- before the data is piped to hgLoadSqlTab.
cd /hive/data/outside/reactome
cd reactome41/
sed '1,4d' ucsc_events41 | hgLoadSqlTab proteome spReactomeEvent ~/kent/src/hg/lib/spReactomeEvent.sql stdin
sed '1,6d' ucsc_entity41 | hgLoadSqlTab proteome spReactomeId ~/kent/src/hg/lib/spReactomeId.sql stdin

#STOPPED HERE

# BUILD TABLES FOR pbGlobal (PB V1.1).  These tables are used by the Proteome
# Browser.
# NOTE: everything in this section is hard-coded to the 111004 databases and
# directories; it does not use $date / $db / $spdb.

mkdir /hive/data/inside/proteins/111004/pbGlobal
cd /hive/data/inside/proteins/111004/pbGlobal

# Calculate Pi values for all proteins, then load them.
pbCalPi sp111004 pi111004.tab
hgLoadSqlTab proteins111004 pepPi $kent/src/hg/lib/pepPi.sql ./pi111004.tab

# Build pepMwAa table from the molecular weight and size columns of info.
hgsql sp111004 -N -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
hgLoadSqlTab proteins111004 pepMwAa $kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab

# Calculate global protein property distributions (still in pbGlobal/).
nice pbCalDistGlobal sp111004 proteins111004

# Load the six distribution tables; each table <name> is created from
# <name>.sql and filled from <name>.tab in the current directory.
foreach distTable (pepCCntDist pepHydroDist pepIPCntDist pepMolWtDist pepPiDist pepResDist)
    hgLoadSqlTab proteins111004 $distTable $kent/src/hg/lib/${distTable}.sql ${distTable}.tab
end

# Calculate global AA residue distributions
pbCalResStdGlobal 111004

# Load distribution tables:
hgLoadSqlTab proteins111004 pbAnomLimit $kent/src/hg/lib/pbAnomLimit.sql ./pbAnomLimit.tab
hgLoadSqlTab proteins111004 pbResAvgStd $kent/src/hg/lib/pbResAvgStd.sql ./pbResAvgStd.tab

# Get taxonomy names table from NCBI.
# (This section is hard-coded to the 111004 build.)

cd /hive/data/inside/proteins/111004
mkdir taxon
cd taxon
wget --timestamping ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
unzip taxdmp.zip

# Create table taxonNames in proteins111004

# Load from the file names.dmp into taxonNames table.
# The load treats "|" as the field terminator and the tab character as the
# field enclosure.

# hgsql proteins111004 -e 'drop table taxonNames'
hgsql proteins111004 < $kent/src/hg/lib/taxonNames.sql    
hgsql proteins111004 -e 'load data local infile "names.dmp" into table taxonNames fields terminated by "|" enclosed by "\t";'

# Load and edit pbStamp table
# The table contents are copied from the hard-coded older database
# proteins060115 into proteins111004.

cd /hive/data/inside/proteins/111004
hgsql proteins060115 -N -e "select * from pbStamp" > pbStamp.tab

hgLoadSqlTab proteins111004 pbStamp $kent/src/hg/lib/pbStamp.sql ./pbStamp.tab


# Build spDeleted table

# Download list of deleted accession numbers from UniProt
cd /hive/data/inside/proteins/111004

wget --timestamping ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/docs/delac_tr.txt
cp delac_tr.txt spDeleted.tab

# Edit spDeleted.tab to get rid of other lines except accession numbers
# NOTE: this is a manual, interactive step -- vi waits for the operator.
vi spDeleted.tab
hgLoadSqlTab proteins111004 spDeleted $kent/src/hg/lib/spDeleted.sql ./spDeleted.tab

# move this endif statement past business that has successfully been completed
# NOTE(review): no matching `if` for this endif is visible in this file.
endif # BRACKET		


# EXTEND pfamXref TO COVER VARIANT SPLICE PROTEINS
# For every pfamXref row whose swissAC is the parent of a variant accession
# (one containing "-"), emit a row for the variant itself.  The result is
# written to the current directory and appended to the existing pfamXref
# table.  (Hard-coded to proteins111004.)

hgsql proteins111004 -N -e \
'select pfamAC, variant, variant from spVariant, pfamXref where swissAC = parent and variant like "%-%";' \
>pfamXrefVar.tab

hgLoadSqlTab -append proteins111004 pfamXref $kent/src/hg/lib/pfamXref.sql ./pfamXrefVar.tab

# move this exit statement to the end of the section to be done next
exit $status # BRACKET


# SWITCH SYMBOLIC PROTEIN DATABASE LINKS (TBD)

# Ask system admin to switch the following symbolic database links after
# reaching this point in this script, and running the companion script
# sp111004:

#       swissProt --> sp111004
#       proteins  --> proteins111004
#       uniProt   --> proteins111004
#       proteome  --> proteins111004

# Run some simple test on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal  
# to make sure things are running OK.

# First check to see if pbGateway and pbGlobal are working.

# Then edit pbStamp.tab to adjust maximum y values for various stamps 
# and load it to pbStamp tables until all their scales look reasonable.
# For this particular release, no adjustment seems necessary.	

    
#########################################################
# CREATE REACTOME TABLES 

# Obtain the ucsc_events24.dat.txt and ucsc_entity24.dat from Reactome.
# Contact: Gopinathrao, Gopal [gopinath@cshl.edu]
# Note - Gopal is no longer with Reactome, and the format has changed.  For now
# just using previous data.  Should double-check that Reactome links still work
# both on hg18 and with the hg19 build....

# Create the two Reactome tables in proteins111004 from their .sql
# definitions and load the v24 .tab files found in this directory.
cd /hive/data/outside/reactome/reactome24

hgsql proteins111004 < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteins111004 -e 'load data local infile "ucsc_events24.tab" into table spReactomeEvent'

hgsql proteins111004 < $kent/src/hg/lib/spReactomeId.sql
hgsql proteins111004 -e 'load data local infile "ucsc_entity24.tab" into table spReactomeId'

# move this endif statement past business that has been successfully completed

#########################################################
# UPDATE REACTOME TABLES WITH V32 DATA (DONE, Fan 3/19/10)

# Obtain the ucsc_events32.dat.txt and ucsc_entity32.dat from Reactome.
# Contact: Lisa Matthews [lmatthews.nyumc@gmail.com]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# Create the release directory and repoint the "current" and "reactome"
# symlinks at it.
mkdir /hive/data/outside/reactome/reactome32
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome32 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome32 /hive/data/outside/reactome/reactome 
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events32.dat ucsc_events32.tab
cp ucsc_entity32.dat ucsc_entity32.tab

# edit two .tab files to get rid of top head lines.

# Recreate each table from its definition under $kent/src/hg/lib, then load.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events32.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < $kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity32.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# For each database, list (kgId, spId) for kgXref entries whose spID is a
# spVariant.variant with a parent present in spReactomeId; one <db>.lis each.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES WITH V33 DATA (DONE, Fan 6/22/10)

# Obtain the ucsc_events33.dat.txt and ucsc_entity33.dat from Reactome.
# Contact: Robin Haw [Robin.Haw@oicr.on.ca]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# Create the release directory and repoint the "current" and "reactome"
# symlinks at it.
mkdir /hive/data/outside/reactome/reactome33
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome33 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome33 /hive/data/outside/reactome/reactome 
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events33.dat ucsc_events33.tab
cp ucsc_entity33.dat ucsc_entity33.tab

# edit two .tab files to get rid of top head lines.

# Recreate each table from its definition under $kent/src/hg/lib, then load.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events33.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < $kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity33.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# For each database, list (kgId, spId) for kgXref entries whose spID is a
# spVariant.variant with a parent present in spReactomeId; one <db>.lis each.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES WITH V34 DATA (DONE, Fan 10/25/10)

# Obtain the ucsc_events34.dat.txt and ucsc_entity34.dat from Reactome.
# Contact: Robin Haw [Robin.Haw@oicr.on.ca]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# Create the release directory and repoint the "current" and "reactome"
# symlinks at it.
mkdir /hive/data/outside/reactome/reactome34
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome34 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome34 /hive/data/outside/reactome/reactome 
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events34.dat ucsc_events34.tab
cp ucsc_entity34.dat ucsc_entity34.tab

# edit two .tab files to get rid of top head lines.

# Recreate each table from its definition under $kent/src/hg/lib, then load.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events34.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < $kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity34.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# For each database, list (kgId, spId) for kgXref entries whose spID is a
# spVariant.variant with a parent present in spReactomeId; one <db>.lis each.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES WITH V35 DATA (DONE, Fan 01/03/10)

# Obtain the ucsc_events35 and ucsc_entity35 from Reactome.
# Contact: Robin Haw [Robin.Haw@oicr.on.ca]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# Create the release directory and repoint the "current" and "reactome"
# symlinks at it.
mkdir /hive/data/outside/reactome/reactome35
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome35 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome35 /hive/data/outside/reactome/reactome 
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events35 ucsc_events35.tab
cp ucsc_entity35 ucsc_entity35.tab

# edit two .tab files to get rid of top head lines.

# Recreate each table from its definition under $kent/src/hg/lib, then load.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events35.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < $kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity35.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# For each database, list (kgId, spId) for kgXref entries whose spID is a
# spVariant.variant with a parent present in spReactomeId; one <db>.lis each.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES WITH V36 DATA (DONE, Fan 03/18/2011)

# Obtain the ucsc_events36 and ucsc_entity36 from Reactome.
# Contact: Robin Haw [Robin.Haw@oicr.on.ca]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# Create the release directory and repoint the "current" and "reactome"
# symlinks at it.
mkdir /hive/data/outside/reactome/reactome36
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome36 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome36 /hive/data/outside/reactome/reactome 
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events36.dat ucsc_events36.tab
cp ucsc_entity36.dat ucsc_entity36.tab

# edit two .tab files to get rid of top head lines.

# Recreate each table from its definition under $kent/src/hg/lib, then load.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < $kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events36.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < $kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity36.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# For each database, list (kgId, spId) for kgXref entries whose spID is a
# spVariant.variant with a parent present in spReactomeId; one <db>.lis each.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

#########################################################
# UPDATE REACTOME TABLES WITH V38 DATA (DONE, Fan 10/14/2011)

# Obtain the ucsc_events38 and ucsc_entity38 from Reactome.
# Contact: Robin Haw [Robin.Haw@oicr.on.ca]

# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
# New release directory, with the "current" and "reactome" symlinks
# repointed at it.
mkdir /hive/data/outside/reactome/reactome38
rm /hive/data/outside/reactome/current
ln -s /hive/data/outside/reactome/reactome38 /hive/data/outside/reactome/current
rm /hive/data/outside/reactome/reactome
ln -s /hive/data/outside/reactome/reactome38 /hive/data/outside/reactome/reactome
cd /hive/data/outside/reactome/reactome

# save these two .dat files to this subdirectory.

cp ucsc_events38 ucsc_events38.tab
cp ucsc_entity38 ucsc_entity38.tab

# edit two .tab files to get rid of top head lines.

# Drop, recreate and reload the events table, then the entity table.
hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < ~/kent/src/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "ucsc_events38.tab" into table spReactomeEvent'

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < ~/kent/src/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity38.tab" into table spReactomeId'

# Generate lists for Reactome to link back to UCSC site.
# The same query is run against each database and its de-duplicated output
# is written to <db>.lis.

set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# !!! NOTE: Email the above 6 list files to Lisa Matthews [lmatthews.nyumc@gmail.com]
# AFTER THE NEW PROTEIN DBs ARE RELEASED ON RR.

####################################################################
# REACTOME UPDATE v39 (DONE, Andy 2012-01-24)

# Copy the Reactome v39 archive to hgwdev and unpack it there.
scp UCSC_Reactome_v39.zip aamp@hgwdev.cse.ucsc.edu:/hive/data/outside/reactome/
# (from redmine #5425)
# NOTE: `ssh hgwdev` with no command opens a login session; the steps below
# were presumably typed in that session rather than run from this script.
ssh hgwdev
cd /hive/data/outside/reactome
unzip UCSC_Reactome_v39.zip
rm -rf __MACOSX/ UCSC_Reactome_v39.zip
mkdir reactome39
mv ucsc_e* reactome39/
# Repoint the "current" and "reactome" symlinks at the new release.
rm current reactome
ln -s /hive/data/outside/reactome/reactome39 current
ln -s /hive/data/outside/reactome/reactome39 reactome
# Enter the directory created above (was misspelled "reatome39").
cd reactome39/
# Drop the first six lines of each file before loading.
tail -n +7 ucsc_entity39 > ucsc_entity39.tab
tail -n +7 ucsc_events39 > ucsc_events39.tab
hgLoadSqlTab proteome spReactomeEvent ~/kent/src/hg/lib/spReactomeEvent.sql ucsc_events39.tab
hgLoadSqlTab proteome spReactomeId ~/kent/src/hg/lib/spReactomeId.sql ucsc_entity39.tab

# Generate the link-back lists: the same query is run against each database
# and its de-duplicated output is written to <db>.lis.
set reactomeLinkSql = 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent'
foreach asmDb (hg17 hg18 hg19 mm9 mm8 rn4)
    hgsql $asmDb -N -e "$reactomeLinkSql" | sort -u > ${asmDb}.lis
end

# these lists are e-mailed to someone at Reactome for updating their site, presumably.

# NOTE(review): no matching `if` for this endif is visible in this file.
endif # BRACKET