# for emacs: -*- mode: sh; -*-

# This file describes how we made the proteins database using
# 6/29/03 release of NR database files from SWISS-PROT

# First ftp over raw data files

    mkdir /cluster/store5/swissprot/072003
    cd /cluster/store5/swissprot/072003
    mkdir build

    ftp us.expasy.org
        cd databases
        cd sp_tr_nrdb
        binary
        prompt
        mget *

    gzip -d *.dat.gz

# After decompression, total of about 2.2 GB.

# Create the biosql072003 and proteins072003 MySQL databases on hgwdev from an interactive mysql session

    create database biosql072003;
    create database proteins072003;

    cd /cluster/home/fanhsu/bioperl/bioperl-db/scripts
    createTables 072003

# use bioPerl to parse the data and load them into biosql

    loadAll3 072003 &

# loading will take about 1.5 days.  To check progress, run:

    select count(*) from biosql072003.bioentry;

  Over 1 million rows will be generated.

o make a subdirectory of: /cluster/store5/proteins/072003 and go there.

o dump table definitions from proteins070403

    mysqldump -d proteins070403 -u hgcat -pBigSecret > proteins.sql

o edit proteins.sql to comment out indexes i1, i2, i3, i4 of the spXref2 table

o create tables in proteins

  NOTE(review): text appears to be missing from the next line of this copy
  (probably the input redirection of proteins.sql and the steps that generate
  spOrganism.tab) -- restore it from the original document.

    mysql -u hgcat -pBigSecret -A proteins072003 spOrganism.tab

    rm spOrganismSP.dat spOrganismTR.dat spOrganismTRNEW.dat

  Load spOrganism.tab into MySQL

    LOAD DATA local INFILE 'spOrganism.tab' into table proteins072003.spOrganism;

o Build spSecondaryID table

  spSecondID is a program based on SEQIO
  (under /projects/cc/hg/fanhsu/kent/src/hg/seqio/seqio-1.2.2)

    spSecondID /cluster/store5/swissprot/072003/sprot.dat
    cp all.lis all.tab
    cp 2nd.lis 2nd.tab

    spSecondID /cluster/store5/swissprot/072003/trembl.dat
    cat all.lis >>all.tab
    cat 2nd.lis >>2nd.tab

    spSecondID /cluster/store5/swissprot/072003/trembl_new.dat
    cat all.lis >>all.tab
    cat 2nd.lis >>2nd.tab

    rm all.lis
    rm 2nd.lis

  load into table:

    LOAD DATA local INFILE '2nd.tab' into table proteins072003.spSecondaryID;

o Build pepSequence table

    getallpep 072003

  From mysql:

    LOAD DATA local INFILE 'allPep.tab' into table
proteins072003.pepSequence;

o Build pfamXref and pfamDesc tables

  Ftp over Pfam-A.full.gz from:

    ftp://ftp.sanger.ac.uk/pub/databases/Pfam/Pfam-A.full.gz

  and save it at

    /cluster/store5/proteins/pfam/072803

  Uncompress it with:

    gzip -d Pfam-A.full.gz

  Run pfamXref to generate pfamAXref.tab and pfamADesc.tab files

    pfamXref proteins072003 /cluster/store5/proteins/pfam/072803/Pfam-A.full pfamADesc.tab pfamAXref.tab

  Load them into MySQL

    load data local infile "pfamADesc.tab" into table proteins072003.pfamDesc;
    load data local infile "pfamAXref.tab" into table proteins072003.pfamXref;

o Build pdbSP table

    pdbSP proteins072003

  Load the output file, pdbSP.tab, into MySQL

    load data local infile "pdbSP.tab" into table proteins072003.pdbSP;

o Build spDisease table

  Create the spDisease table in proteins072003.

    CREATE TABLE spDisease (
      accession varchar(40) NOT NULL default '',
      displayID varchar(40) NOT NULL default '',
      diseaseDesc text,
      KEY accession (accession),
      KEY displayID (displayID)
    ) TYPE=MyISAM;

  Use the following SQL and hgsql to retrieve rows and create spDisease.tab.

    select comment.acc, displayId.val, commentVal.val
    from sp092903.comment, sp092903.commentVal, sp092903.displayId
    where comment.commentType=19 and commentVal.id=comment.commentVal
    and displayId.acc=comment.acc;

  Load the table into proteins072003.

    load data local infile "spDisease.tab" into table proteins072003.spDisease;