# This file describes how the sp050415 and proteins050415 databases were
# made using UniProt database files 
# and a few other external databases.

# STARTED ON 05/04/21, DONE ON 05/26/21.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp050415 DB.

# Make subdirectories under /cluster/store10/swissProt

	mkdir /cluster/store10/swissprot/050415
	mkdir /cluster/store10/swissprot/050415/build
	mkdir /cluster/store10/swissprot/050415/tabFiles
	ln -s /cluster/store10/swissprot/050415 /cluster/data/swissprot/050415
	
# mkSwissProtDB.sh is updated so that it takes date from command argument
# instead of using today's date and removed uniprot_trembl_new.dat, since
# it is no longer available.

# Run mkSwissProtDB.sh to parse Swiss-Prot raw input files.

	ssh kksilo
	cd /cluster/data/swissprot/050415
	~/src/hg/protein/mkSwissProtDB.sh 050415

# Go to hgwdev and run mkSwissProtDB.sh again to create and load sp050415 database

      ssh hgwdev
	cd /cluster/data/swissprot/050415
	~/src/hg/protein/mkSwissProtDB.sh 050415

# NEXT BUILD proteins050415 DB

# Updated mkProteinsDB.sh script then run it to create proteins050415 DB.

   ~/src/hg/protein/mkProteinsDB.sh 050415

# create indice of the spXref2 table, took about 10 minutes each.

    hgsql proteins050415 -e 'CREATE INDEX i2 ON spXref2(displayID(12))'
    hgsql proteins050415 -e 'CREATE INDEX i3 ON spXref2(extAC(24))'
    hgsql proteins050415 -e 'CREATE INDEX ii2 ON spXref3(displayID(12))'
    hgsql proteins050415 -e 'CREATE INDEX ii3 ON spXref3(hugoSymbol(12))'

# Add variant splice protein data into sp050415 tables.

fgrep ">" uniprot_sprot_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t1\t/g' \
| sed -e 's/) /\t/g' >varProtTemp.tab

fgrep ">" uniprot_trembl_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t2\t/g' \
| sed -e 's/) /\t/g' >>varProtTemp.tab

cat uniprot_sprot_varsplic.fasta |sed -e 's/ (/\n/g' |grep -v iso >j.tmp
cat uniprot_trembl_varsplic.fasta|sed -e 's/ (/\n/g' |grep -v iso >>j.tmp
faToTab -type=protein j.tmp varProtein.tab
rm j.tmp
hgsql sp050415 -e 'load data local infile "varProtein.tab" into table protein'

hgsql sp050415 -e 'drop table varProtTemp'
hgsql sp050415 < ~/src/hg/lib/varProtTemp.sql

hgsql sp050415 -e 'load data local infile "varProtTemp.tab" into table varProtTemp'

spToProteinsVar 050415
hgsql proteins050415 -e 'load data local infile "spXref3Var.tab" into table spXref3'

cut -f 1,2 spXref3Var.tab >j.tmp
hgsql sp050415 -e 'load data local infile "j.tmp" into table displayId'
rm j.tmp

hgsql sp050415 -e 'drop table varProtTemp'

# BUILD TABLES FOR pbGlobal (PB V1.1)

    cd /cluster/data/proteins/050415
    mkdir pbGlobal
    cd pbGlobal

# Calculate Pi values for all proteins

    hgsql sp050415 -e "select acc from protein" >acc050415.lis
    nice pbCalPi acc050415.lis sp050415 pi050415.tab
    hgsql proteins050415 -e 'load data local infile "pi050415.tab" into table pepPi;'
   
# Build pepMwAa table

    hgsql sp050415 -N -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
    hgsql proteins050415 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Calculate global protein property distributions

       nice pbCalDistGlobal sp050415 proteins050415

# This took about 35 minutes.
# Load the tables

       hgsql proteins050415 -e 'load data local infile "pepCCntDist.tab"  into table pepCCntDist'
       hgsql proteins050415 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist'
       hgsql proteins050415 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist'
       hgsql proteins050415 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist'
       hgsql proteins050415 -e 'load data local infile "pepPiDist.tab"    into table pepPiDist'
       hgsql proteins050415 -e 'load data local infile "pepResDist.tab"   into table pepResDist'

# Calculate global AA residue distributions

       pbCalResStdGlobal 050415

# Load distribution tables:

        hgsql proteins050415 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit'
        hgsql proteins050415 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd'

# Get taxonomy names table from NCBI.

      cd /cluster/data/proteins/050415
	mkdir taxon
	cd taxon
	wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
	unzip taxdmp.zip

# Create table taxonNames in proteins050415

# Load from the file names.dmp into taxonNames table.
     
   hgsql proteins050415 -e 'load data local infile "names.dmp" into table taxonNames fields terminated by "|" enclosed by "\t";'

# Load and edit pbStamp table

      cd /cluster/data/proteins/050415
hgsql proteins050201 -e "select * from pbStamp" > pbStamp.tab
	hgsql proteins050415 -e 'load data local infile "pbStamp.tab" into table pbStamp ignore 1 lines'

# Found there are a few records in UniProt that has LF char in description line.
hgsql proteins050415 -N -e 'delete from spXref3 where accession like "%\.%"'

mkdir alias
cd alias

accession, accession,  "uAcc", "2005-04-15" from spXref3' >j1.tmp
hgsql proteins050415 -N -e 'select accession, accession2, "u2Ac", "2005-04-15" from spSecondaryID' >j2.tmp
hgsql proteins050415 -N -e 'select accession, displayID,  "uDis", "2005-04-15" from spXref3' >j3.tmp

hgsql proteins041115 -N -e 'select accession, displayID,  "oDis", "2004-11-15" from spXref3' >j4.tmp

cat j1.tmp j2.tmp j3.tmp j4.tmp >uniProtAlias.tab

hgsql proteins050415 -e 'drop table uniProtAlias'

hgsql proteins050415 <~/src/hg/lib/uniProtAlias.sql
hgsql proteins050415 -e 'load data local infile "uniProtAlias.tab" into table uniProtAlias'
hgsql proteins050415 -e 'create index alias on uniProtAlias(alias)'

# First check to see if pbGateway and pbGlobal are working.

# Then edit pbStamp.tab to adjust maximum y values for various stamps 
# and load it to pbStamp tables until all their scales look reasonable.
# For this particular release, no adjustment seems necessary.	

# SWITCH SYMBOLIC PROTEIN DATABASE LINKS

# Ask system admin to switch the following symbolic database links:

       swissProt --> sp050415
       proteins  --> proteins050415

# Run some simple test on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal  
# to make sure things are running OK.
    
# Release to QA for formal testing.

# CREATE spVariant TABLE TO ENABLE UNIPROT VARIANT SPLICE ISOFORM PROCESSING (DONE, Fan, 7/27/05)

hgsql proteome -N -e 'select accession,accession from spXref3' >j1
cat j1 |grep "-" |sed -e 's/-/\t/'|cut -f 1,3 >j2

cat j1 |grep -v "-" >>j3
cat j2 j3 |sort -u >spVariant.tab

hgsql proteome -e 'drop table spVariant'
hgsql proteome < ~/src/hg/lib/spVariant.sql
hgsql proteome -e 'load data local infile "spVariant.tab" into table spVariant'

hgsql proteome -e 'create index parent on spVariant(parent)'

# CREATE TABLES TO ENABLE LINKS TO REACTOME (DONE, Fan 7/27/05)

# Obtain the Reactome_events.txt and ucsc_entity14.dat from Reactome.
# Contact: Gopinathrao, Gopal [gopinath@cshl.edu]

mkdir /cluster/store10/reactome/reactome050613
rm /cluster/data/reacome
ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome
cd /cluster/data/reactome

hgsql proteome -e 'drop table spReactomeEvent'
hgsql proteome < ~/hg/lib/spReactomeEvent.sql
hgsql proteome -e 'load data local infile "Reactome_events.txt" into table spReactomeEvent ignore 4 lines'

# First remove extra LF or CR that is causing problem.
rmLf ucsc_entity14.dat >ucsc_entity14.cleaned

hgsql proteome -e 'drop table spReactomeId'
hgsql proteome < ~/hg/lib/spReactomeId.sql
hgsql proteome -e 'load data local infile "ucsc_entity14.cleaned" into table spReactomeId ignore 6 lines'

# Add the following into src/hg/hgGene/hgGeneData/Human/links.ra and do the same for Mouse and Rat:

name reactome
shortLabel Reactome
tables proteome.spReactomeEvent kgXref
idSql select spReactomeId.spID from proteome.spReactomeId,proteome.spVariant,kgXref where kgXref.kgID = '%s' and kgXref.spID=variant and spReactomeId.spID=parent;
url http://www.reactome.org/cgi-bin/link?SOURCE=UniProt&ID=%s&
priority 86

cd ~/src/hg/hgGene
make

cd /cluster/data/reactome

hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis

hgsql mm6 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm6.lis

hgsql rn3 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn3.lis

# Email the above 3 list files to Gopinathrao, Gopal [gopinath@cshl.edu]


# EXTEND pfamXref TO COVER VARIANT SPLICE PROTEINS

hgsql proteome -N -e \
'select pfamAC, variant, variant from spVariant, pfamXref where swissAC = parent and variant like "%-%" and swissAC=parent;' \
>pfamXrefVar.tab

hgsql proteome -N -e \
'load data local infile "pfamXrefVar.tab" into table pfamXref'