# This file describes how the sp050415 and proteins050415 databases were # made using UniProt database files # and a few other external databases. # STARTED ON 05/04/21, DONE ON 05/26/21. # FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp050415 DB. # Make subdirectories under /cluster/store10/swissProt mkdir /cluster/store10/swissprot/050415 mkdir /cluster/store10/swissprot/050415/build mkdir /cluster/store10/swissprot/050415/tabFiles ln -s /cluster/store10/swissprot/050415 /cluster/data/swissprot/050415 # mkSwissProtDB.sh is updated so that it takes date from command argument # instead of using today's date and removed uniprot_trembl_new.dat, since # it is no longer available. # Run mkSwissProtDB.sh to parse Swiss-Prot raw input files. ssh kksilo cd /cluster/data/swissprot/050415 ~/src/hg/protein/mkSwissProtDB.sh 050415 # Go to hgwdev and run mkSwissProtDB.sh again to create and load sp050415 database ssh hgwdev cd /cluster/data/swissprot/050415 ~/src/hg/protein/mkSwissProtDB.sh 050415 # NEXT BUILD proteins050415 DB # Updated mkProteinsDB.sh script then run it to create proteins050415 DB. ~/src/hg/protein/mkProteinsDB.sh 050415 # create indice of the spXref2 table, took about 10 minutes each. hgsql proteins050415 -e 'CREATE INDEX i2 ON spXref2(displayID(12))' hgsql proteins050415 -e 'CREATE INDEX i3 ON spXref2(extAC(24))' hgsql proteins050415 -e 'CREATE INDEX ii2 ON spXref3(displayID(12))' hgsql proteins050415 -e 'CREATE INDEX ii3 ON spXref3(hugoSymbol(12))' # Add variant splice protein data into sp050415 tables. fgrep ">" uniprot_sprot_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t1\t/g' \ | sed -e 's/) /\t/g' >varProtTemp.tab fgrep ">" uniprot_trembl_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t2\t/g' \ | sed -e 's/) /\t/g' >>varProtTemp.tab cat uniprot_sprot_varsplic.fasta |sed -e 's/ (/\n/g' |grep -v iso >j.tmp cat uniprot_trembl_varsplic.fasta|sed -e 's/ (/\n/g' |grep -v iso >>j.tmp faToTab -type=protein j.tmp varProtein.tab rm j.tmp hgsql sp050415 -e 'load data local infile "varProtein.tab" into table protein' hgsql sp050415 -e 'drop table varProtTemp' hgsql sp050415 < ~/src/hg/lib/varProtTemp.sql hgsql sp050415 -e 'load data local infile "varProtTemp.tab" into table varProtTemp' spToProteinsVar 050415 hgsql proteins050415 -e 'load data local infile "spXref3Var.tab" into table spXref3' cut -f 1,2 spXref3Var.tab >j.tmp hgsql sp050415 -e 'load data local infile "j.tmp" into table displayId' rm j.tmp hgsql sp050415 -e 'drop table varProtTemp' # BUILD TABLES FOR pbGlobal (PB V1.1) cd /cluster/data/proteins/050415 mkdir pbGlobal cd pbGlobal # Calculate Pi values for all proteins hgsql sp050415 -e "select acc from protein" >acc050415.lis nice pbCalPi acc050415.lis sp050415 pi050415.tab hgsql proteins050415 -e 'load data local infile "pi050415.tab" into table pepPi;' # Build pepMwAa table hgsql sp050415 -N -e "select acc, molWeight, aaSize from info" >pepMwAa.tab hgsql proteins050415 -e 'load data local infile "pepMwAa.tab" into table pepMwAa' # Calculate global protein property distributions nice pbCalDistGlobal sp050415 proteins050415 # This took about 35 minutes. # Load the tables hgsql proteins050415 -e 'load data local infile "pepCCntDist.tab" into table pepCCntDist' hgsql proteins050415 -e 'load data local infile "pepHydroDist.tab" into table pepHydroDist' hgsql proteins050415 -e 'load data local infile "pepIPCntDist.tab" into table pepIPCntDist' hgsql proteins050415 -e 'load data local infile "pepMolWtDist.tab" into table pepMolWtDist' hgsql proteins050415 -e 'load data local infile "pepPiDist.tab" into table pepPiDist' hgsql proteins050415 -e 'load data local infile "pepResDist.tab" into table pepResDist' # Calculate global AA residue distributions pbCalResStdGlobal 050415 # Load distribution tables: hgsql proteins050415 -e 'load data local infile "pbAnomLimit.tab" into table pbAnomLimit' hgsql proteins050415 -e 'load data local infile "pbResAvgStd.tab" into table pbResAvgStd' # Get taxonomy names table from NCBI. cd /cluster/data/proteins/050415 mkdir taxon cd taxon wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip unzip taxdmp.zip # Create table taxonNames in proteins050415 # Load from the file names.dmp into taxonNames table. hgsql proteins050415 -e 'load data local infile "names.dmp" into table taxonNames fields terminated by "|" enclosed by "\t";' # Load and edit pbStamp table cd /cluster/data/proteins/050415 hgsql proteins050201 -e "select * from pbStamp" > pbStamp.tab hgsql proteins050415 -e 'load data local infile "pbStamp.tab" into table pbStamp ignore 1 lines' # Found there are a few records in UniProt that has LF char in description line. hgsql proteins050415 -N -e 'delete from spXref3 where accession like "%\.%"' mkdir alias cd alias accession, accession, "uAcc", "2005-04-15" from spXref3' >j1.tmp hgsql proteins050415 -N -e 'select accession, accession2, "u2Ac", "2005-04-15" from spSecondaryID' >j2.tmp hgsql proteins050415 -N -e 'select accession, displayID, "uDis", "2005-04-15" from spXref3' >j3.tmp hgsql proteins041115 -N -e 'select accession, displayID, "oDis", "2004-11-15" from spXref3' >j4.tmp cat j1.tmp j2.tmp j3.tmp j4.tmp >uniProtAlias.tab hgsql proteins050415 -e 'drop table uniProtAlias' hgsql proteins050415 <~/src/hg/lib/uniProtAlias.sql hgsql proteins050415 -e 'load data local infile "uniProtAlias.tab" into table uniProtAlias' hgsql proteins050415 -e 'create index alias on uniProtAlias(alias)' # First check to see if pbGateway and pbGlobal are working. # Then edit pbStamp.tab to adjust maximum y values for various stamps # and load it to pbStamp tables until all their scales look reasonable. # For this particular release, no adjustment seems necessary. # SWITCH SYMBOLIC PROTEIN DATABASE LINKS # Ask system admin to switch the following symbolic database links: swissProt --> sp050415 proteins --> proteins050415 # Run some simple test on hgTracks, hgNear, hgGene, pbTracks, and pbGlobal # to make sure things are running OK. # Release to QA for formal testing. # CREATE spVariant TABLE TO ENABLE UNIPROT VARIANT SPLICE ISOFORM PROCESSING (DONE, Fan, 7/27/05) hgsql proteome -N -e 'select accession,accession from spXref3' >j1 cat j1 |grep "-" |sed -e 's/-/\t/'|cut -f 1,3 >j2 cat j1 |grep -v "-" >>j3 cat j2 j3 |sort -u >spVariant.tab hgsql proteome -e 'drop table spVariant' hgsql proteome < ~/src/hg/lib/spVariant.sql hgsql proteome -e 'load data local infile "spVariant.tab" into table spVariant' hgsql proteome -e 'create index parent on spVariant(parent)' # CREATE TABLES TO ENABLE LINKS TO REACTOME (DONE, Fan 7/27/05) # Obtain the Reactome_events.txt and ucsc_entity14.dat from Reactome. # Contact: Gopinathrao, Gopal [gopinath@cshl.edu] mkdir /cluster/store10/reactome/reactome050613 rm /cluster/data/reacome ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome cd /cluster/data/reactome hgsql proteome -e 'drop table spReactomeEvent' hgsql proteome < ~/hg/lib/spReactomeEvent.sql hgsql proteome -e 'load data local infile "Reactome_events.txt" into table spReactomeEvent ignore 4 lines' # First remove extra LF or CR that is causing problem. rmLf ucsc_entity14.dat >ucsc_entity14.cleaned hgsql proteome -e 'drop table spReactomeId' hgsql proteome < ~/hg/lib/spReactomeId.sql hgsql proteome -e 'load data local infile "ucsc_entity14.cleaned" into table spReactomeId ignore 6 lines' # Add the following into src/hg/hgGene/hgGeneData/Human/links.ra and do the same for Mouse and Rat: name reactome shortLabel Reactome tables proteome.spReactomeEvent kgXref idSql select spReactomeId.spID from proteome.spReactomeId,proteome.spVariant,kgXref where kgXref.kgID = '%s' and kgXref.spID=variant and spReactomeId.spID=parent; url http://www.reactome.org/cgi-bin/link?SOURCE=UniProt&ID=%s& priority 86 cd ~/src/hg/hgGene make cd /cluster/data/reactome hgsql hg17 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >hg17.lis hgsql mm6 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >mm6.lis hgsql rn3 -N -e 'select kgId, kgXref.spId from kgXref, proteome.spReactomeId,proteome.spVariant where kgXref.spID=proteome.spVariant.variant and spReactomeId.spId=parent' |sort -u >rn3.lis # Email the above 3 list files to Gopinathrao, Gopal [gopinath@cshl.edu] # EXTEND pfamXref TO COVER VARIANT SPLICE PROTEINS hgsql proteome -N -e \ 'select pfamAC, variant, variant from spVariant, pfamXref where swissAC = parent and variant like "%-%" and swissAC=parent;' \ >pfamXrefVar.tab hgsql proteome -N -e \ 'load data local infile "pfamXrefVar.tab" into table pfamXref'