# This file describes how the sp060115 and proteins060115 databases were # made using UniProt database files # and a few other external databases. # STARTED ON 1/23/06, DONE ON 2/13/06 (Fan). # FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp060115 DB. # Make subdirectories under /cluster/store11/swissProt mkdir /cluster/store11/swissprot/060115 mkdir /cluster/store11/swissprot/060115/build mkdir /cluster/store11/swissprot/060115/tabFiles ln -s /cluster/store11/swissprot/060115 /cluster/data/swissprot/060115 # Run mkSwissProtDB.sh to parse Swiss-Prot raw input files. ssh kkstore02 cd /cluster/data/swissprot/060115 ~/src/hg/protein/mkSwissProtDB.sh 060115 # Answer y. # Go to hgwdev and run mkSwissProtDB.sh again to create and load sp060115 database ssh hgwdev cd /cluster/data/swissprot/060115 ~/src/hg/protein/mkSwissProtDB.sh 060115 # This part process the variant splice proteins. # First build variant splice protein tables. # Get all variant isoform human protein sequences ssh hgwdev cd /cluster/data/swissprot/060115/build gzip -d *varsplic.fasta.gz # for sp060115, uniprot_trembl_varsplic.fasta does not exist # faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab # cat splicTrembl.tab splicSprot.tab >varProtein.tab cat splicSprot.tab >varProtein.tab hgsql sp060115 < ~/src/hg/lib/varProtein.sql hgsql sp060115 -e 'load data local infile "varProtein.tab" into table varProtein' # load data into the protein table too. hgsql sp060115 -e 'load data local infile "varProtein.tab" into table protein' cat varProtein.tab |cut -f 1>j1 paste j1 j1 >varDisplayId.tab hgsql sp060115 -e 'load data local infile "varDisplayId.tab" into table displayId' cut -f 1 j1|sed -e 's/-/\t/g' >j2 paste j1 j2 >splicProt.tab hgsql sp060115 -N -e 'select acc from varProtein' |sort -u >j1 cat j1 |sed -e 's/-/\t/' >j2 paste j1 j2 >varAcc.tab rm j1 j2 hgsql sp060115 -e 'drop table varAcc' hgsql sp060115 < ~/src/hg/lib/varAcc.sql hgsql sp060115 -e 'load data local infile "varAcc.tab" into table varAcc' # Create tableDescriptions table. makeTableDescriptions sp060115 ~/kent/src/hg/protein/spToDb/spDbTables.as # NEXT BUILD proteins060115 DB cd /cluster/data/proteins mkdir 060115 cd 060115 # Get HGNC data ~/src/hg/protein/getHgnc.pl >hgnc.tab hgsql proteins060115 -e "drop table hgnc" hgsql proteins060115 < ~/src/hg/lib/hgnc.sql hgsql proteins060115 -e 'load data local infile "hgnc.tab" into table hgnc ignore 1 lines' pbHgnc 060115 hgsql proteins060115 -e "drop table hgncXref" hgsql proteins060115 < ~/src/hg/lib/hgncXref.sql hgsql proteins060115 -e 'load data local infile "hgncXref.tab" into table hgncXref' # make sure the following programs are available: pfamXref cd /cluster/data/proteins/060115/build spToProteins 060115 # from mySQL prompt do the following: use proteins060115 drop index displayId on spXref3; drop index accession on spXref3; drop index hugoSymbol on spXref3; load data local infile "good/spXref3.tab" into table spXref3; create index accession on spXref3(accession); create index displayId on spXref3(displayId); create index hugoSymbol on spXref3(hugoSymbol); # It took 30 seconds to load and less than 2 minutes to create each index. # load spOrganism table drop index displayID on spOrganism; drop index organism on spOrganism; load data local infile 'spOrganism.tab' into table spOrganism; create index displayID on spOrganism(displayID); create index organism on spOrganism(organism); # Build spXref2 table. spToProteins2 060115 # Build spSecondaryID table cd /cluster/data/proteins/060115 hgsql -e "select displayId.val, displayId.acc, otherAcc.val from displayId, \ otherAcc where otherAcc.acc = displayId.acc;" sp060115 \ | sed -e "1d" > spSecondaryID.tab hgsql -e \ 'LOAD DATA local INFILE "spSecondaryID.tab" into table spSecondaryID;' \ proteins060115 # Build pfamXref and pfamDesc tables mkdir /san/sanvol1/scratch/proteins/pfam/060115 cd /san/sanvol1/scratch/proteins/pfam/060115 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz" # 100 Mb compressed, over 700 Mb uncompressed rm -f Pfam-A.full gunzip Pfam-A.full.gz pfamXref proteins060115 Pfam-A.full pfamADesc.tab pfamAXref.tab >pfamXref.log gzip Pfam-A.full & hgsql -e 'LOAD DATA local INFILE "pfamADesc.tab" into table pfamDesc;' proteins060115 hgsql -e 'LOAD DATA local INFILE "pfamAXref.tab" into table pfamXref;' proteins060115 # Build the pdbSP table, new process using extDbRef data from spXXXXXX cd /san/sanvol1/scratch/proteins/pfam/060115 hgsql sp060115 -N -e 'select extAcc1, d.val from extDbRef x, displayId d, extDb where x.acc=d.acc and extDb.val="PDB" and x.extDb=extDb.id'|sort -u >pdbSP.tab hgsql -e 'LOAD DATA local INFILE "pdbSP.tab" into table pdbSP;' proteins060115 # Build the spDisease table hgsql -N -e \ 'select comment.acc, displayId.val, commentVal.val from comment, commentVal, commentType ct, displayId where comment.commentType=ct.id and ct.val="DISEASE" and commentVal.id=comment.commentVal and displayId.acc=comment.acc;' \ sp060115 > spDisease.tab hgsql -e 'LOAD DATA local INFILE "spDisease.tab" into table spDisease;' proteins060115 # create swInterPro table wget --timestamping "ftp://ftp.ebi.ac.uk/pub/databases/interpro/protein2ipr.dat.gz" gzip -d protein2ipr.dat.gz # rearange col positioin to match the old format cut -f 1 protein2ipr.dat >j1 cut -f 2,3 protein2ipr.dat >j23 cut -f 4,5,6 protein2ipr.dat >j456 paste j1 j456 j23 > protein2interpro.dat rm j1 j456 j23 hgsql proteins060115 -e 'load data local infile "protein2interpro.dat" into table interProXref;' #hgsql proteins060115 -e "drop table proteins060115.swInterPro" hgsql --skip-column-names proteins060115 -e 'select accession, interProId from interProXref;'|sort -u >swInterPro.tab hgsql proteins060115 -e 'load data local infile "swInterPro.tab" into table swInterPro;' # BUILD TABLES FOR pbGlobal (PB V1.1) cd /cluster/data/proteins/060115 mkdir pbGlobal cd pbGlobal # Calculate Pi values for all proteins hgsql sp060115 -e "select acc from protein" >acc060115.lis nice pbCalPi acc060115.lis sp060115 pi060115.tab hgsql proteins060115 -e 'load data local infile "pi060115.tab" into table pepPi;' # Build pepMwAa table hgsql sp060115 -N -e "select acc, molWeight, aaSize from info" >pepMwAa.tab hgsql proteins060115 -e 'load data local infile "pepMwAa.tab" into table pepMwAa' # Calculate global protein property distributions nice pbCalDistGlobal sp060115 proteins060115 # This took about 35 minutes. # Load the tables hgsql proteins060115 -e 'delete from pepCCntDist' hgsql proteins060115 -e 'delete from pepHydroDist' hgsql proteins060115 -e 'delete from pepIPCntDist' hgsql proteins060115 -e 'delete from pepMolWtDist' hgsql proteins060115 -e 'delete from pepPiDist' hgsql proteins060115 -e 'delete from pepResDist' hgsql proteins060115 -e 'load data local infile "pepCCntDist.tab" into table pepCCntDist' hgsql proteins060115 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist' hgsql proteins060115 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist' hgsql proteins060115 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist' hgsql proteins060115 -e 'load data local infile "pepPiDist.tab" into table pepPiDist' hgsql proteins060115 -e 'load data local infile "pepResDist.tab" into table pepResDist' # Calculate global AA residue distributions pbCalResStdGlobal 060115 # Load distribution tables: hgsql proteins060115 -e 'delete from pbAnomLimit' hgsql proteins060115 -e 'delete from pbResAvgStd' hgsql proteins060115 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit' hgsql proteins060115 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd' # Get taxonomy names table from NCBI. cd /cluster/data/proteins/060115 mkdir taxon cd taxon wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip unzip taxdmp.zip # Create table taxonNames in proteins060115 # Load from the file names.dmp into taxonNames table. hgsql proteins060115 -e 'load data local infile "names.dmp" into table taxonNames fields terminated by "|" enclosed by "\t";' # Load and edit pbStamp table cd /cluster/data/proteins/060115 hgsql proteins051015 -e "select * from pbStamp" > pbStamp.tab hgsql proteins060115 -e 'delete from pbStamp' hgsql proteins060115 -e 'load data local infile "pbStamp.tab" into table pbStamp ignore 1 lines' # Found there are a few records in UniProt that has LF char in description line. # This was left over by last build. No such problem this time. # hgsql proteins060115 -N -e 'delete from spXref3 where accession like "%\.%"' mkdir alias cd alias hgsql proteins060115 -N -e 'select accession, accession, "uAcc", "2006-01-15" from spXref3' >j1.tmp hgsql proteins060115 -N -e 'select accession, accession2, "u2Ac", "2006-01-15" from spSecondaryID' >j2.tmp hgsql proteins060115 -N -e 'select accession, displayID, "uDis", "2006-01-15" from spXref3' >j3.tmp hgsql proteins041115 -N -e 'select accession, displayID, "oDis", "2004-11-15" from spXref3' >j4.tmp # PLEASE NOTE: proteins041115 was later deleted by Fan when hgwdev mySQL space was at 100%. cat j1.tmp j2.tmp j3.tmp j4.tmp >uniProtAlias.tab hgsql proteins060115 -e 'drop table uniProtAlias' hgsql proteins060115 <~/src/hg/lib/uniProtAlias.sql hgsql proteins060115 -e 'load data local infile "uniProtAlias.tab" into table uniProtAlias' hgsql proteins060115 -e 'create index alias on uniProtAlias(alias)' hgsql proteins060115 -e 'drop table spOldNew' hgsql proteins060115 <~/src/hg/lib/spOldNew.sql hgsql proteins050415 -N -e 'select * from spOldNew' >spOldNew.tab hgsql proteins060115 -e 'load data local infile "spOldNew.tab" into table spOldNew' hgsql proteins060115 -e 'create index oldDisplayId on spOldNew(oldDisplayId)' hgsql proteins060115 -e 'create index newDisplayId on spOldNew(newDisplayId)' # Build spDeleted table # Download list of deleted accession numbers from UniProt wget --timestamping http://ca.expasy.org/txt/delac_tr.txt cp delac_tr.txt spDeleted.tab # Edit spDeleted.tab to get rid of other lines except accession numbers vi spDeleted.tab hgsql proteins060115 -e 'load data local infile "spDeleted.tab" into table spDeleted' # Copy the ensemblXref3, sfAss, and sfDesc from proteines051015 ssh hgwdev cd /cluster/data/proteins/060115 hgsql proteins051015 -N -e 'select * from ensemblXref3' >ensemblXref3.tab hgsql proteins060115 -e 'delete from ensemblXref3' hgsql proteins060115 -e 'load data local infile "ensemblXref3.tab.old" into table ensemblXref3' # This table should be updated after Ensembl puts out their Human Build 36 release. hgsql proteins051015 -N -e 'select * from sfAssign' > sfAssign.tab hgsql proteins060115 -e 'delete from sfAssign' hgsql proteins060115 -e 'load data local infile "sfAssign.tab" into table sfAssign' hgsql proteins051015 -N -e 'select * from sfDes' >sfDes.tab hgsql proteins060115 -e 'delete from sfDes' hgsql proteins060115 -e 'load data local infile "sfDes.tab" into table sfDes' # The sfAssign and sfDes should be updated after we update our Superfamily track(s). # CREATE spVariant TABLE TO ENABLE UNIPROT VARIANT SPLICE ISOFORM PROCESSING hgsql proteome -N -e 'select accession,accession from spXref3' >j1 cat j1 |grep "-" |sed -e 's/-/\t/'|cut -f 1,3 >j2 cat j1 |grep -v "-" >>j3 cat j2 j3 |sort -u >spVariant.tab hgsql proteome -e 'drop table spVariant' hgsql proteome < ~/src/hg/lib/spVariant.sql hgsql proteome -e 'load data local infile "spVariant.tab" into table spVariant' hgsql proteome -e 'create index parent on spVariant(parent)' # EXTEND pfamXref TO COVER VARIANT SPLICE PROTEINS hgsql proteome -N -e \ 'select pfamAC, variant, variant from spVariant, pfamXref where swissAC = parent and variant like "%-%" and swissAC=parent;' \ >pfamXrefVar.tab hgsql proteome -N -e \ 'load data local infile "pfamXrefVar.tab" into table pfamXref' # First check to see if pbGateway and pbGlobal are working. # Then edit pbStamp.tab to adjust maximum y values for various stamps # and load it to pbStamp tables until all their scales look reasonable. # For this particular release, no adjustment seems necessary. # SWITCH SYMBOLIC PROTEIN DATABASE LINKS # Ask system admin to switch the following symbolic database links: swissProt --> sp060115 proteins --> proteins060115 # Run some simple test on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal # to make sure things are running OK. # Release to QA for formal testing. # Update hgncXref table. # A deficiency in pbHgnc was found and fixed. Reran it again and # load the new data into proteins051015.hgncXref too. pbHgnc 060115 hgsql proteins060115 delete from hgncXref; load data local infile "hgncXref.tab" into table hgncXref; quit; # CREATE REACTOME TABLES (DONE, Fan 2/23/06) # Obtain the ucsc_events14.dat.txt and ucsc_entity14.dat from Reactome. # Contact: Gopinathrao, Gopal [gopinath@cshl.edu] # ...14.dat is actually a typo. They should be 16 (for ver.16). mkdir /cluster/store10/reactome/reactome060223 rm /cluster/data/reactome ln -s /cluster/store10/reactome/reactome060223 /cluster/data/reactome cd /cluster/data/reactome # save these two .dat files to this subdirectory. hgsql proteome -e 'drop table spReactomeEvent' hgsql proteome < ~/hg/lib/spReactomeEvent.sql hgsql proteome -e 'load data local infile "ucsc_events14.dat" into table spReactomeEvent ignore 4 lines' hgsql proteome -e 'drop table spReactomeId' hgsql proteome < ~/hg/lib/spReactomeId.sql hgsql proteome -e 'load data local infile "ucsc_entity14.dat" into table spReactomeId ignore 6 lines' # Generate lists for Reactome to link back to UCSC site. hgsql hg18 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg18.lis hgsql mm7 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm7.lis hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis # Email the above 3 list files to Gopinathrao, Gopal [gopinath@cshl.edu] ######################################################### # UPDATE REACTOME TABLES (DONE, Fan 8/18/06) # Obtain the ucsc_events18.dat.txt and ucsc_entity18.dat from Reactome. # Contact: Gopinathrao, Gopal [gopinath@cshl.edu] ssh hgwdev mkdir /cluster/store10/reactome/reactome060818 rm /cluster/data/reactome ln -s /cluster/store10/reactome/reactome060818 /cluster/data/reactome cd /cluster/data/reactome # save these two .dat files to this subdirectory. # First run rmLf on these two files to remove extra CRs. hgsql proteome -e 'drop table spReactomeEvent' hgsql proteome < ~/hg/lib/spReactomeEvent.sql hgsql proteome -e 'load data local infile "ucsc_events18.dat" into table spReactomeEvent ignore 4 lines' hgsql proteome -e 'drop table spReactomeId' hgsql proteome < ~/hg/lib/spReactomeId.sql hgsql proteome -e 'load data local infile "ucsc_entity18.dat" into table spReactomeId ignore 6 lines' # Generate lists for Reactome to link back to UCSC site. hgsql hg18 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg18.lis hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis hgsql mm8 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm8.lis hgsql mm7 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm7.lis hgsql rn4 -N -e 'select kgId, kgXref.spId from kgXref,proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn4.lis # Email the above 5 list files to Gopinathrao, Gopal [gopinath@cshl.edu]