# This file describes how the sp040915 and proteins040915 databases were
# made using the September 13, 2004 release of the UniProt database files
# from SWISS-PROT and a few other external databases.

# STARTED ON 9/15/04, DONE ON 9/21/04.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp040915 DB.

o Make subdirectories under /cluster/store8/swissprot

	mkdir /cluster/store8/swissprot/040915
	mkdir /cluster/store8/swissprot/040915/build
	mkdir /cluster/store8/swissprot/040915/tabFiles
	ln -s /cluster/store8/swissprot/040915 /cluster/data/swissprot/040915
	
o Update mkSwissProtDB.sh under src/hg/protein to use kksilo instead
  of eieio, since /cluster/store8 is on kksilo.

o run mkSwissProtDB.sh

	ssh kksilo
	cd /cluster/data/swissprot/040915
	~/src/hg/protein/mkSwissProtDB.sh 

  This is supposed to ftp over all Swiss-Prot raw data files,
  parse them using spToDb, create .txt files and load them into
  the sp040915 DB.

  In reality, since Swiss-Prot changed their data format in a 
  few places, spToDb no longer finishes successfully.
  Several changes were made to spToDb.c, and several manual
  steps were used to complete what mkSwissProtDB.sh
  was supposed to finish in one shot.

  The recent Swiss-Prot format changes are listed below:
    
    1. Gene name line is changed into:
    
       GN   Name=CRB; OrderedLocusNames=At1g03880; ORFNames=F21M11.19;
    
    2. A new DOI (Digital Object Identifier) entry is added to the
       citation cross-reference line, e.g.:
    
       RX   MEDLINE=88039002; PubMed=2444886; DOI=10.1016/0166-6851(87)90007-7;
    
       I added a "doi" field at the end of reference table to store
       DOI info.
    
    Jim, the parseNameVals() function in spToDb.c assumes that there is
    always "xxx=...." separated by ";".  This is violated by some DOI lines
    and "GN  " lines, e.g.:
    
        RX   DOI=10.1002/(SICI)1097-0061(199610)12:13<1321::AID-YEA27>3.0.CO;2-6;
    
        GN   Name=CBP1; Synonyms=CXP;1;
    
    In those cases, there are a few extra characters at the end of the line
    after the last ";".  I changed the hard exit logic of parseNameVals()
    so that it stops parsing and thus ignores those extra characters.
    This seems to be working OK for now.


  For record-keeping purposes, the complete mkSwissProtDB.sh
  script is included below.
  Hopefully next time, this script will run successfully
  from start to end.  If not, just follow the steps
  in this script by hand.  They should work (most of the time).

### beginning of mkSwissProtDB.sh
#!/bin/sh
#
#	mkSwissProtDB.sh
#	- currently no arguments but it should be modified to take an
#	- argument of a date stamp instead of generating one below since
#	- you want to have consistent date stamps for this swissprot and
#	- the subsequent proteins database
#
#	What it does depends on the host it is run on:
#	- kksilo: fetch the UniProt .dat.gz files into ${TOP}/<date>/build
#	  and pipe them through spToDb, which is given ../tabFiles as its
#	  output argument.
#	- hgwdev: create database sp<date> from spDb.sql and load each
#	  ${TOP}/<date>/tabFiles/*.txt file into the table of the same name.
#	<date> is today's date (yymmdd), so both runs must happen on the
#	same day unless the script is edited.
#
#	Exit status: 0 on success, 255 on any detected error.
#
#	This script could be improved to do error checking for each step.
#
#	Thu Nov 20 11:31:51 PST 2003 - Created - Hiram
#
#	"$Id: proteinDBs040915.txt,v 1.1 2006/07/25 20:14:50 markd Exp $"

TOP=/cluster/data/swissprot
export TOP
# Everything below uses paths relative to ${TOP}; do not carry on elsewhere.
cd "${TOP}" || { echo "ERROR: can not cd to ${TOP}"; exit 255; }

# spToDb is verified up front, whichever host this runs on.
if ! type spToDb > /dev/null 2>&1; then
    echo "ERROR: can not find required program: spToDb"
    printf '\tYou may need to build it in ~/kent/src/hg/protein/spToDb\n'
    exit 255
fi

MACHINE=$(uname -n)

if [ "${MACHINE}" != "kksilo" ] && [ "${MACHINE}" != "hgwdev" ]; then
    echo "ERROR: must run this script on kksilo or hgwdev.  This is: ${MACHINE}"
    exit 255
fi

DATE=$(date "+%y%m%d")
SP="${DATE}"
SPDB="sp${DATE}"
export SP SPDB

echo "Creating Db: ${SP}"

if [ "${MACHINE}" = "kksilo" ]; then

    # An existing directory for today's date means a previous (possibly
    # partial) run; only continue if the operator explicitly agrees.
    if [ -d "${SP}" ]; then
        echo "WARNING: ${SP} already exists."
        printf 'Do you want to try to use the data here ? (ynq) '
        read -r YN
        if [ "${YN}" = "Y" ] || [ "${YN}" = "y" ]; then
            echo "working with current data in ${SP}"
        else
            echo "Will not recreate at this time."
            exit 255
        fi
    fi

    echo mkdir -p "./${SP}"
    mkdir -p "./${SP}/build" || exit 255
    cd "./${SP}/build" || exit 255

    for db in uniprot_sprot uniprot_trembl new/uniprot_trembl_new
    do
        # wget stores the file in the current directory under its base
        # name; it is then renamed to drop the "uniprot_" prefix
        # (sprot.dat.gz, trembl.dat.gz, trembl_new.dat.gz).  Test for the
        # renamed file so a re-run does not download everything again.
        fetched="${db##*/}.dat.gz"
        short="${fetched#uniprot_}"
        if [ ! -f "${short}" ]; then
            wget --timestamping \
                "ftp://us.expasy.org/databases/uniprot/knowledgebase/${db}.dat.gz" \
                || { echo "ERROR: failed to fetch ${db}.dat.gz"; exit 255; }
            mv "${fetched}" "${short}" || exit 255
        fi
    done

    # The pipeline's status is that of spToDb; stop rather than report
    # success when the parse fails.
    zcat *.dat.gz | spToDb stdin ../tabFiles \
        || { echo "ERROR: spToDb failed"; exit 255; }

else
    if [ ! -d "${TOP}/${DATE}/tabFiles" ]; then
        echo "ERROR: ${TOP}/${DATE}/tabFiles does not exist."
        printf '\tRun this first on kksilo to fetch the data.\n'
        exit 255
    fi

    if [ ! -f ~/kent/src/hg/protein/spToDb/spDb.sql ]; then
        echo "ERROR: can not find ~/kent/src/hg/protein/spToDb/spDb.sql"
        printf '\tto create the database.  Update your source tree.\n'
        exit 255
    fi

    echo "creating the database ${SPDB}"
    # More than one output line from "show tables" (header + at least one
    # table) means the database is already populated.
    EXISTS=$(hgsql -e "show tables;" "${SPDB}" 2> /dev/null | wc -l)
    if [ "${EXISTS}" -gt 1 ]; then
        echo "ERROR: database ${SPDB} already exists"
        printf "\t to drop: hgsql -e 'drop database %s;' %s\n" "${SPDB}" "${SPDB}"
        exit 255
    fi
    # NOTE(review): proteins040515 is presumably only an existing database
    # to connect through while issuing "create database" - confirm it still
    # exists before running.
    hgsql -e "create database ${SPDB}" proteins040515
    hgsql "${SPDB}" < ~/kent/src/hg/protein/spToDb/spDb.sql
    cd "${TOP}/${DATE}/tabFiles" || exit 255
    for i in *.txt
    do
        # With no .txt files the pattern stays literal; skip it.
        [ -f "${i}" ] || continue
        TBL=${i%.txt}
        echo "importing table: ${TBL}"
        echo hgsql -e "load data local infile '${i}' into table ${TBL};" ${SPDB}
        hgsql -e "load data local infile \"${i}\" into table ${TBL};" "${SPDB}"
    done

fi

exit 0
  
### end of mkSwissProtDB.sh


# NEXT BUILD proteins040915 DB

o Update mkProteinsDB.sh script to:
    
    - remove the lines that add additional indices, since those indices
      were already present in previous proteinsYYMMDD DBs and
      the new DB uses the table definitions of an earlier version DB
      to create its tables.

    - add an InterPro section.

   mkProteinsDB.sh 040915

# BUILD TABLES FOR pbGlobal (PB V1.1)

  o Calculate Pi values for all proteins

    hgsql sp040915 -e "select acc from protein" >acc040915.lis
    nice pbCalPi acc040915.lis sp040915 pi040915.tab
    hgsql pbGlobal -e 'load data local infile "pi.tab" into table pepPi;'
   
  o Build pepMwAa table

    hgsql sp040915 -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
    hgsql proteins040915 -e \
    'load data local infile "pepMwAa.tab" into table pepMwAa ignore 1 lines'

  o Calculate global protein property distributions

       pbCalDistGlobal sp040915 proteins040915

    This takes about 20 minutes.

       hgsql proteins040915 -e 'load data local infile "pepCCntDist.tab"  into table pepCCntDist'
       hgsql proteins040915 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist'
       hgsql proteins040915 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist'
       hgsql proteins040915 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist'
       hgsql proteins040915 -e 'load data local infile "pepPiDist.tab"    into table pepPiDist'
       hgsql proteins040915 -e 'load data local infile "pepResDist.tab"   into table pepResDist'

  o Calculate global AA residue distributions

       pbCalResStdGlobal 040915

    Load all distribution tables:

        hgsql proteins040915 -e 'load data local infile "pbAaDistW.tab" into table pbAaDistW'
        hgsql proteins040915 -e 'load data local infile "pbAaDistC.tab" into table pbAaDistC'
        hgsql proteins040915 -e 'load data local infile "pbAaDistM.tab" into table pbAaDistM'
        hgsql proteins040915 -e 'load data local infile "pbAaDistH.tab" into table pbAaDistH'
        hgsql proteins040915 -e 'load data local infile "pbAaDistY.tab" into table pbAaDistY'
        hgsql proteins040915 -e 'load data local infile "pbAaDistN.tab" into table pbAaDistN'
        hgsql proteins040915 -e 'load data local infile "pbAaDistF.tab" into table pbAaDistF'
        hgsql proteins040915 -e 'load data local infile "pbAaDistI.tab" into table pbAaDistI'
        hgsql proteins040915 -e 'load data local infile "pbAaDistD.tab" into table pbAaDistD'
        hgsql proteins040915 -e 'load data local infile "pbAaDistQ.tab" into table pbAaDistQ'
        hgsql proteins040915 -e 'load data local infile "pbAaDistK.tab" into table pbAaDistK'
        hgsql proteins040915 -e 'load data local infile "pbAaDistR.tab" into table pbAaDistR'
        hgsql proteins040915 -e 'load data local infile "pbAaDistT.tab" into table pbAaDistT'
        hgsql proteins040915 -e 'load data local infile "pbAaDistV.tab" into table pbAaDistV'
        hgsql proteins040915 -e 'load data local infile "pbAaDistP.tab" into table pbAaDistP'
        hgsql proteins040915 -e 'load data local infile "pbAaDistG.tab" into table pbAaDistG'
        hgsql proteins040915 -e 'load data local infile "pbAaDistE.tab" into table pbAaDistE'
        hgsql proteins040915 -e 'load data local infile "pbAaDistA.tab" into table pbAaDistA'
        hgsql proteins040915 -e 'load data local infile "pbAaDistL.tab" into table pbAaDistL'
        hgsql proteins040915 -e 'load data local infile "pbAaDistS.tab" into table pbAaDistS'

        hgsql proteins040915 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit'
        hgsql proteins040915 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd'

   o Get taxonomy names table from NCBI.

        cd /cluster/data/proteins/040915
	mkdir taxon
	cd taxon
	wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
	unzip taxdmp.zip

     Create table taxonNames in proteins040915

     	#A table to keep all NCBI taxon names 
     	CREATE TABLE taxonNames (
     	id int not null,                # Taxon NCBI ID
     	name varchar(255) not null,     # Binomial format name
     	info varchar(255),              # other info
     	nameType varchar(255) not null, # name type
     	#Indices
     	INDEX(id)
     	);
     
     Load from the file names.dmp into taxonNames table.
     
        load data local infile "names.dmp" into table taxonNames fields terminated by '|' enclosed by '\t';

   o Load and edit pbStamp table

        cd /cluster/data/proteins/040915
	cp ~/src/hg/proteins/pbTracks/pbStamp.tab .
	hgsql proteins040915 -e 'load data local infile "pbStamp.tab" into table pbStamp'

   o First check to see if pbGateway and pbGlobal are working.

     Then edit pbStamp.tab to adjust maximum y values for various stamps 
     and load it to pbStamp tables until all their scales look reasonable.
	

# SWITCH SYMBOLIC PROTEIN DATABASE LINKS

  o Ask system admin to switch the following symbolic database links:

       swissProt --> sp040915
       proteins  --> proteins040915

    Perform some tests on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal  
    to make sure things are running OK.
    
    Release to QA for formal testing.