# This file describes how the sp051015 and proteins051015 databases were
# made using UniProt database files and a few other external databases.
#
# NOTE(review): the bare "ssh <host>" lines below appear to mark a switch to
# an interactive session on that host, with the commands that follow typed
# there -- confirm before running this file unattended as a script.

# STARTED ON 10/12/05, sp051015 and proteins051015 DONE ON 10/28/05.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp051015 DB.

# Make subdirectories under /cluster/store11/swissprot and expose the
# release directory under /cluster/data/swissprot via a symlink.
mkdir /cluster/store11/swissprot/051015
mkdir /cluster/store11/swissprot/051015/build
mkdir /cluster/store11/swissprot/051015/tabFiles
ln -s /cluster/store11/swissprot/051015 /cluster/data/swissprot/051015

# mkSwissProtDB.sh was updated so that it takes the date from its command
# argument instead of using today's date, and no longer uses
# uniprot_trembl_new.dat, since that file is no longer available.

# Run mkSwissProtDB.sh to parse Swiss-Prot raw input files.
ssh kkstore02
cd /cluster/data/swissprot/051015
~/src/hg/protein/mkSwissProtDB.sh 051015

# Go to hgwdev and run mkSwissProtDB.sh again to create and load the
# sp051015 database.
ssh hgwdev
cd /cluster/data/swissprot/051015
~/src/hg/protein/mkSwissProtDB.sh 051015

# This part processes the variant splice proteins.
# First build variant splice protein tables.
# Get all variant isoform human protein sequences
ssh hgwdev
cd /cluster/data/swissprot/051015/build
gzip -d *varsplic.fasta.gz

# for sp051015, uniprot_trembl_varsplic.fasta does not exist, so only the
# Swiss-Prot variant file is converted; the TrEMBL steps stay commented out.
# faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab
faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab
# cat splicTrembl.tab splicSprot.tab >varProtein.tab
cat splicSprot.tab >varProtein.tab

# Create the varProtein table from its .sql definition, then load it.
hgsql sp051015 < ~/src/hg/lib/varProtein.sql
hgsql sp051015 -e 'load data local infile "varProtein.tab" into table varProtein'

# load data into the protein table too.
hgsql sp051015 -e 'load data local infile "varProtein.tab" into table protein'

# varDisplayId.tab holds the first column of varProtein.tab twice (two
# identical columns) and is loaded into displayId.
# The field list must be separated from the redirection: a POSIX shell reads
# "cut -f 1>j1" as "cut -f" plus the redirection "1>j1", leaving -f without
# an argument.
cut -f 1 varProtein.tab >j1
paste j1 j1 >varDisplayId.tab
hgsql sp051015 -e 'load data local infile "varDisplayId.tab" into table displayId'

# splicProt.tab: the original name followed by the same name with every '-'
# turned into a tab (presumably splitting ACC-N into parent accession and
# isoform number -- TODO confirm). It is only consumed by the commented-out
# kgHg17FTemp steps further down.
cut -f 1 j1|sed -e 's/-/\t/g' >j2
paste j1 j2 >splicProt.tab

# build varAcc table
# Same idea, but from the distinct acc values already loaded in varProtein
# and splitting on the first '-' only.
hgsql sp051015 -N -e 'select acc from varProtein' |sort -u >j1
cat j1 |sed -e 's/-/\t/' >j2
paste j1 j2 >varAcc.tab
rm j1 j2

# Recreate varAcc from its .sql definition before loading.
hgsql sp051015 -e 'drop table varAcc'
hgsql sp051015 < ~/src/hg/lib/varAcc.sql
hgsql sp051015 -e 'load data local infile "varAcc.tab" into table varAcc'

# hgsql kgHg17FTemp -e 'drop table splicProt'
# hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
# hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'
# hgsql kgHg17FTemp -N -e \
# 'select varAcc, varProtein.val from sp051015.varProtein,splicProt,proteins051015.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
# awk '{print ">" $1;print $2}' >humanVarProt.fa

# Create tableDescriptions table.
makeTableDescriptions sp051015 ~/kent/src/hg/protein/spToDb/spDbTables.as

# NEXT BUILD proteins051015 DB

# make sure the following programs are available: pfamXref

# Updated mkProteinsDB.sh script then run it to create proteins051015 DB.
# Please note that due to external DB changes, this script could not be
# successfully run from start to end. Several manual steps were used.
# mkProteinsDB.sh is updated to reflect the actual manual steps used.
~/src/hg/protein/mkProteinsDB.sh 051015

# create indexes on the spXref2 table, took about 10 minutes each.
# Check first whether any of these spXref2 indexes already exists.
hgsql proteins051015 -e 'CREATE INDEX accession ON spXref2(accession(24))'
hgsql proteins051015 -e 'CREATE INDEX displayID ON spXref2(displayID(24))'
hgsql proteins051015 -e 'CREATE INDEX extAC ON spXref2(extAC(24))'
# hgsql proteins051015 -e 'CREATE INDEX ii2 ON spXref3(displayID(12))'
# hgsql proteins051015 -e 'CREATE INDEX ii3 ON spXref3(hugoSymbol(12))'

# Add variant splice protein data into sp051015 tables.
# Turn each FASTA header line into a tab-separated row: drop the '>',
# replace " (" with <tab>1<tab> and ") " with <tab>.
# (The commented-out TrEMBL variant writes 2 instead of 1 -- presumably a
# source flag; TODO confirm against varProtTemp.sql.)
fgrep ">" uniprot_sprot_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t1\t/g' \
| sed -e 's/) /\t/g' >varProtTemp.tab
# Both lines of the disabled TrEMBL pipeline are commented out; a bare
# continuation line starting with '|' would be a shell syntax error.
# fgrep ">" uniprot_trembl_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t2\t/g' \
# | sed -e 's/) /\t/g' >>varProtTemp.tab

# cat uniprot_sprot_varsplic.fasta |sed -e 's/ (/\n/g' |grep -v iso >j.tmp
# cat uniprot_trembl_varsplic.fasta|sed -e 's/ (/\n/g' |grep -v iso >>j.tmp
# faToTab -type=protein j.tmp varProtein.tab
# rm j.tmp
# hgsql sp051015 -e 'load data local infile "varProtein.tab" into table protein'

# Recreate the scratch table varProtTemp and load the header rows into it.
hgsql sp051015 -e 'drop table varProtTemp'
hgsql sp051015 < ~/src/hg/lib/varProtTemp.sql
hgsql sp051015 -e 'load data local infile "varProtTemp.tab" into table varProtTemp'

# spXref3Var.tab, loaded next, is presumably written by spToProteinsVar --
# TODO confirm.
spToProteinsVar 051015
hgsql proteins051015 -e 'load data local infile "spXref3Var.tab" into table spXref3'

# not needed, done previously
# cut -f 1,2 spXref3Var.tab >j.tmp
# hgsql sp051015 -e 'load data local infile "j.tmp" into table displayId'
# rm j.tmp

# The scratch table is no longer needed.
hgsql sp051015 -e 'drop table varProtTemp'

# Update hgncXref table.
# A deficiency in pbHgnc was found and fixed. Reran it and
# loaded the new data into proteins051015.hgncXref too.
# NOTE(review): the argument is 060115, not 051015 -- presumably the date of
# the rerun; confirm.
pbHgnc 060115
# Non-interactive form of the original mysql session
# (delete ...; load data ...; quit;), matching the other hgsql calls here.
hgsql proteins051015 -e 'delete from hgncXref'
hgsql proteins051015 -e 'load data local infile "hgncXref.tab" into table hgncXref'