# This file describes how the sp040915 and proteins040915 databases were # made using the September 13, 2004 release of UniProt database files # from SWISS-PROT and a few other external databases. # STARTED ON 9/15/04, DONE ON 9/21/04. # FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp040915 DB. o Make subdirectories under /cluster/store8/swissprot mkdir /cluster/store8/swissprot/040915 mkdir /cluster/store8/swissprot/040915/build mkdir /cluster/store8/swissprot/040915/tabFiles ln -s /cluster/store8/swissprot/040915 /cluster/data/swissprot/040915 o Update mkSwissProtDB.sh under src/hg/protein to use kksilo instead of eieio, since /cluster/store8 is on kksilo. o run mkSwissProtDB.sh ssh kksilo cd /cluster/data/swissprot/040915 ~/src/hg/protein/mkSwissProtDB.sh This supposedly will ftp over all Swiss-Prot raw data files, parse them using spToDb, create .txt files and load them into sp040915 DB. In reality, since Swiss-Prot changed their data format in a few places, spToDb no longer finishes successfully. Several changes were made to spToDb.c, and several manual steps were used to complete what mkSwissProtDB.sh was supposed to finish in one shot. The recent Swiss-Prot format changes are listed below: 1. Gene name line is changed into: GN Name=CRB; OrderedLocusNames=At1g03880; ORFNames=F21M11.19; 2. A new type of DOI (Digital Object Identifier) is added to citation cross-reference line, e.g.: RX MEDLINE=88039002; PubMed=2444886; DOI=10.1016/0166-6851(87)90007-7; I added a "doi" field at the end of reference table to store DOI info. Jim, the parseNameVals() function in spToDb.c assumes that there is always "xxx=...." separated by ";". This is violated by some DOI lines and "GN " lines, e.g.: RX DOI=10.1002/(SICI)1097-0061(199610)12:13<1321::AID-YEA27>3.0.CO;2-6; GN Name=CBP1; Synonyms=CXP;1; In those cases, there are a few extra characters at the end of the line after ";". 
I changed the hard exit logic of parseNameVals() to stop parsing and thus ignore those extra characters. This seems to be working OK for now. For record-keeping purposes, the complete script of mkSwissProtDB.sh is included in the following. Hopefully next time, this script will run successfully from start to end. If not, just follow the steps in this script. They should work (most of the time). ### beginning of mkSwissProtDB.sh #!/bin/sh # # mkSwissProtDB.sh # - currently no arguments but it should be modified to take an # - argument of a data stamp instead of generating one below since # - you want to have consistent date stamps for this swissprot and # - the subsequence proteins database # # This script could be improved to do error checking for each step. # # Thu Nov 20 11:31:51 PST 2003 - Created - Hiram # # "$Id: proteinDBs040915.txt,v 1.1 2006/07/25 20:14:50 markd Exp $" TOP=/cluster/data/swissprot export TOP cd ${TOP} type spToDb > /dev/null 2> /dev/null if [ "$?" -ne 0 ]; then echo "ERROR: can not find required program: spToDb" echo -e "\tYou may need to build it in ~/kent/src/hg/protein/spToDb" exit 255 fi MACHINE=`uname -n` if [ ${MACHINE} != "kksilo" -a ${MACHINE} != "hgwdev" ]; then echo "ERROR: must run this script on kksilo or hgwdev. This is: ${MACHINE}" exit 255 fi DATE=`date "+%y%m%d"` SP="${DATE}" SPDB="sp${DATE}" export SP SPDB echo "Creating Db: ${SP}" if [ ${MACHINE} = "kksilo" ]; then if [ -d "${SP}" ]; then echo "WARNING: ${SP} already exists." echo -e "Do you want to try to use the data here ? (ynq) \c" read YN if [ "${YN}" = "Y" -o "${YN}" = "y" ]; then echo "working with current data in ${SP}" else echo "Will not recreate at this time." exit 255 fi fi echo mkdir -p ./${SP} mkdir -p ./${SP} cd ./${SP} mkdir -p ./build cd ./build for db in uniprot_sprot uniprot_trembl new/uniprot_trembl_new do if [ ! 
-f ${db}.dat.gz ]; then wget --timestamping \ ftp://us.expasy.org/databases/uniprot/knowledgebase/${db}.dat.gz fi done mv uniprot_sprot.dat.gz sprot.dat.gz mv uniprot_trembl.dat.gz trembl.dat.gz mv uniprot_trembl_new.dat.gz trembl_new.dat.gz zcat *.dat.gz | spToDb stdin ../tabFiles else if [ ! -d ${TOP}/${DATE}/tabFiles ]; then echo "ERROR: ${TOP}/tabFiles does not exist." echo -e "\tRun this first on kksilo to fetch the data." exit 255 fi if [ ! -f ~/kent/src/hg/protein/spToDb/spDb.sql ]; then echo "ERROR: can not find ~/kent/src/hg/protein/spToDb/spDb.sql" echo "\tto create the database. Update your source tree." exit 255 fi echo "creating the database ${SPDB}" EXISTS=`hgsql -e "show tables;" ${SPDB} 2> /dev/null | wc -l` if [ "${EXISTS}" -gt 1 ]; then echo "ERROR: database ${SPDB} already exists" echo -e "\t to drop: hgsql -e 'drop database ${SPDB};' ${SPDB}" exit 255 fi hgsql -e "create database ${SPDB}" proteins040515 hgsql ${SPDB} < ~/kent/src/hg/protein/spToDb/spDb.sql cd ${TOP}/${DATE}/tabFiles for i in *.txt do TBL=${i%.txt} echo "importing table: ${TBL}" echo hgsql -e "load data local infile '${i}' into table ${TBL};" ${SPDB} hgsql -e "load data local infile \"${i}\" into table ${TBL};" ${SPDB} done fi exit 0 ### end of mkSwissProtDB.sh # NEXT BUILD proteins040915 DB o Update mkProteinsDB.sh script to: - remove lines adding additional indexes, since those indexes were already present in previous proteinsYYMMDD DBs and the new DB uses an earlier version of the DB table definitions to create its tables. - add an InterPro section. 
mkProteinsDB.sh 040915 # BUILD TABLES FOR pbGlobal (PB V1.1) o Calculate Pi values for all proteins hgsql sp040915 -e "select acc from protein" >acc040915.lis nice pbCalPi acc040915.lis sp040915 pi040915.tab hgsql pbGlobal -e 'load data local infile "pi.tab" into table pepPi;' o Build pepMwAa table hgsql sp040915 -e "select acc, molWeight, aaSize from info" >pepMwAa.tab hgsql proteins040915 \ 'load data local infile "pepMwAa.tab" into table pepMwAa ignore 1 lines' o Calculate global protein property distributions pbCalDistGlobal sp040915 proteins040915 This takes about 20 minutes. hgsql proteins040915 'load data local infile "pepCCntDist.tab" into table pepCCntDist' hgsql proteins040915 'load data local infile "pepHydroDist.tab" into table pepHydroDist' hgsql proteins040915 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist' hgsql proteins040915 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist' hgsql proteins040915 'load data local infile "pepPiDist.tab" into table pepPiDist' hgsql proteins040915 'load data local infile "pepResDist.tab" into table pepResDist' o Calculate global AA residue distributions pbCalResStdGlobal 040915 Load all distribution tables: hgsql proteins040915 -e 'load data local infile "pbAaDistW.tab" into table pbAaDistW' hgsql proteins040915 -e 'load data local infile "pbAaDistC.tab" into table pbAaDistC' hgsql proteins040915 -e 'load data local infile "pbAaDistM.tab" into table pbAaDistM' hgsql proteins040915 -e 'load data local infile "pbAaDistH.tab" into table pbAaDistH' hgsql proteins040915 -e 'load data local infile "pbAaDistY.tab" into table pbAaDistY' hgsql proteins040915 -e 'load data local infile "pbAaDistN.tab" into table pbAaDistN' hgsql proteins040915 -e 'load data local infile "pbAaDistF.tab" into table pbAaDistF' hgsql proteins040915 -e 'load data local infile "pbAaDistI.tab" into table pbAaDistI' hgsql proteins040915 -e 'load data local infile "pbAaDistD.tab" into table pbAaDistD' hgsql 
proteins040915 -e 'load data local infile "pbAaDistQ.tab" into table pbAaDistQ' hgsql proteins040915 -e 'load data local infile "pbAaDistK.tab" into table pbAaDistK' hgsql proteins040915 -e 'load data local infile "pbAaDistR.tab" into table pbAaDistR' hgsql proteins040915 -e 'load data local infile "pbAaDistT.tab" into table pbAaDistT' hgsql proteins040915 -e 'load data local infile "pbAaDistV.tab" into table pbAaDistV' hgsql proteins040915 -e 'load data local infile "pbAaDistP.tab" into table pbAaDistP' hgsql proteins040915 -e 'load data local infile "pbAaDistG.tab" into table pbAaDistG' hgsql proteins040915 -e 'load data local infile "pbAaDistE.tab" into table pbAaDistE' hgsql proteins040915 -e 'load data local infile "pbAaDistA.tab" into table pbAaDistA' hgsql proteins040915 -e 'load data local infile "pbAaDistL.tab" into table pbAaDistL' hgsql proteins040915 -e 'load data local infile "pbAaDistS.tab" into table pbAaDistS' hgsql proteins040915 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit' hgsql proteins040915 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd' o Get taxonomy names table from NCBI. cd /cluster/data/proteins/040915 mkdir taxon cd taxon wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip unzip taxdmp.zip Create table taxonNames in proteins040915 #A table to keep all NCBI taxon names CREATE TABLE taxonNames ( id int not null, # Taxon NCBI ID name varchar(255) not null, # Binomial format name info varchar(255), # other info nameType varchar(255) not null, # name type #Indices INDEX(id) ); Load from the file names.dmp into taxonNames table. load data local infile "names.dmp" into table taxonNames fields terminated by '|' enclosed by '\t'; o Load and edit pbStamp table cd /cluster/data/proteins/040915 cp ~/src/hg/proteins/pbTracks/pbStamp.tab . hgsql proteins040915 'load data local infile "pbStamp.tab" into table pbStamp' o First check to see if pbGateway and pbGlobal are working. 
Then edit pbStamp.tab to adjust maximum y values for various stamps and load it to pbStamp tables until all their scales look reasonable. # SWITCH SYMBOLIC PROTEIN DATABASE LINKS o Ask system admin to switch the following symbolic database links: swissProt --> sp040915 proteins --> proteins040915 Perform some tests on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal to make sure things are running OK. Release to QA for formal testing.