# This file describes how the sp041115 and proteins041115 databases were
# made using the November 9th, 2004 release of UniProt database files
# from SWISS-PROT and a few other external databases.

# STARTED ON 11/19/04, DONE ON 11/22/04.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp041115 DB.

# Make subdirectories under /cluster/store8/swissprot
# and link the release directory into /cluster/data/swissprot.

mkdir /cluster/store8/swissprot/041115
mkdir /cluster/store8/swissprot/041115/build
mkdir /cluster/store8/swissprot/041115/tabFiles
ln -s /cluster/store8/swissprot/041115 /cluster/data/swissprot/041115

# mkSwissProtDB.sh was updated so that it takes the date from a command
# argument instead of using today's date, and uniprot_trembl_new.dat was
# removed, since it is no longer available.

# Run mkSwissProtDB.sh to parse Swiss-Prot raw input files.

ssh kksilo
cd /cluster/data/swissprot/041115
~/src/hg/protein/mkSwissProtDB.sh 041115

# Go to hgwdev and run mkSwissProtDB.sh again to create and load the
# sp041115 database.

ssh hgwdev
~/src/hg/protein/mkSwissProtDB.sh 041115

# NEXT BUILD proteins041115 DB

# Updated the mkProteinsDB.sh script, then ran it to create the
# proteins041115 DB.

~/src/hg/protein/mkProteinsDB.sh 041115

# Create indexes on the spXref2 table; each took about 10 minutes.
hgsql proteins041115 -e 'CREATE INDEX i1 ON spXref2(accession(6))'
hgsql proteins041115 -e 'CREATE INDEX i2 ON spXref2(displayID(10))'
hgsql proteins041115 -e 'CREATE INDEX i3 ON spXref2(extAC(24))'

# BUILD TABLES FOR pbGlobal (PB V1.1)

cd /cluster/data/proteins/041115
mkdir pbGlobal
cd pbGlobal

# Calculate Pi values for all proteins.
# NOTE(review): the pepMwAa load below skips one leading line of hgsql
# output, so acc041115.lis presumably also starts with a column-header
# line ("acc") -- confirm that pbCalPi tolerates it.

hgsql sp041115 -e "select acc from protein" >acc041115.lis
nice pbCalPi acc041115.lis sp041115 pi041115.tab
hgsql proteins041115 -e 'load data local infile "pi041115.tab" into table pepPi;'

# Build pepMwAa table.
# "ignore 1 lines" skips the first line of pepMwAa.tab when loading.

hgsql sp041115 -e "select acc, molWeight, aaSize from info" >pepMwAa.tab
hgsql proteins041115 -e 'load data local infile "pepMwAa.tab" into table pepMwAa ignore 1 lines'

# Calculate global protein property distributions

pbCalDistGlobal sp041115 proteins041115

# This took about 35 minutes.

# Load the tables

hgsql proteins041115 -e 'load data local infile "pepCCntDist.tab" into table pepCCntDist'
hgsql proteins041115 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist'
hgsql proteins041115 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist'
hgsql proteins041115 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist'
hgsql proteins041115 -e 'load data local infile "pepPiDist.tab" into table pepPiDist'
hgsql proteins041115 -e 'load data local infile "pepResDist.tab" into table pepResDist'

# Calculate global AA residue distributions

pbCalResStdGlobal 041115

# Load distribution tables:

hgsql proteins041115 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit'
hgsql proteins041115 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd'

# Get taxonomy names table from NCBI.
cd /cluster/data/proteins/041115 mkdir taxon cd taxon wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip unzip taxdmp.zip # Create table taxonNames in proteins041115 CREATE TABLE taxonNames ( id int not null, # Taxon NCBI ID name varchar(255) not null, # Binomial format name info varchar(255), # other info nameType varchar(255) not null, # name type #Indices INDEX(id) ); # Load from the file names.dmp into taxonNames table. load data local infile "names.dmp" into table taxonNames fields terminated by '|' enclosed by '\t'; # Load and edit pbStamp table cd /cluster/data/proteins/041115 hgsql proteins040915 -e "select * from pbStamp" > pbStamp.tab hgsql proteins041115 -e 'load data local infile "pbStamp.tab" into table pbStamp ignore 1 lines' # First check to see if pbGateway and pbGlobal are working. # Then edit pbStamp.tab to adjust maximum y values for various stamps # and load it to pbStamp tables until all their scales look reasonable. # For this particular release, no adjustment seems necessary. # SWITCH SYMBOLIC PROTEIN DATABASE LINKS # Ask system admin to switch the following symbolic database links: swissProt --> sp041115 proteins --> proteins041115 # Run some simple test on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal # to make sure things are running OK. # Release to QA for formal testing.