The database of biosql072003 and proteins072003 need to be built from SWISS-PROT, TrEMBL, and TrEMBL-NEW first, using bioperl bioperl-db, and other programs. (see /cluster/store4/fan/pb/buildProteins072003.doc for details). o Create a working subdirectory hg16, make symbolic link, and go there mkdir /cluster/store4/fan/pb/hg16 cd /cluster/store4/fan/pb/hg16 ln -s /cluster/store4/fan/pb/hg16 ~/hg16 o Build hg16Temp database by: create database hg16Temp; Get hg16Temp.sql for table definitions dumpdbdef rn3Temp >hg16Temp.sql Create tables in hg16Temp: mysql -u hgcat -pBigSecret -A hg16Temp " mrna.fa > mrna.lis o Process LocusLink data to generate mrnaRefseq table - create a subdirectory 071803 under ~fan/data/ll and cd to there - get the latest LocusLink data from wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2acc wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc - and copy over to ~hg16 cp -p *loc* ~/hg16 - load LocusLink data to this 2 tables using mysql LOAD DATA local INFILE 'loc2acc' into table hg16Temp.locus2Acc0; LOAD DATA local INFILE 'loc2ref' into table hg16Temp.locus2Ref0; - run hgMrnaRefseq to generate mrnaRefseq.tab hgMrnaRefseq hg16 - create table hg16.mrnaRefseq: CREATE TABLE mrnaRefseq ( mrna varchar(40) NOT NULL default '', refseq varchar(40) NOT NULL default '', KEY mrna (mrna), KEY refseq (refseq) ) TYPE=MyISAM; - load data into all appropriate genome databases LOAD DATA local INFILE 'mrnaRefseq.tab' into table hg16.mrnaRefseq; o generate FASTA format protein seuqnce file kgGetPep 072003 > mrnaPep.fa o run pslReps to get tighter mRNAs pslReps -minCover=0.40 -sizeMatters -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null o Run hgKgMrna to build "refGene" tables in hg16Temp database hgKgMrna hg16Temp mrna.fa mrna.ra tight_mrna.psl loc2ref mrnaPep.fa mim2loc proteins072003 >hgKgMrna.out 2>hgKgMrna.err o create the mrnaGene table in hg16Temp DB, by running mrnaGene.sql at mySql prompt Load mrnaGene data into the table LOAD DATA local INFILE 'refGene.tab' into table hg16Temp.mrnaGene; hg16.mrnaGene is needed by spm6 create KG related tables in hg16 mysql -u hgcat -pBigSecret -A hg16 < kgRelated.sql LOAD DATA local INFILE 'refMrna.tab' into table hg16Temp.refMrna; Load pep and mrna data into the knownGenePep and knownGeneMrna tables LOAD DATA local INFILE 'refPep.tab' into table hg16.knownGenePep; LOAD DATA local INFILE 'refMrna.tab' into table hg16.knownGeneMrna; o run spm3 to generate the proteinMrna.tab and protein.lis file spm3 072003 hg16 create table spMrna in hg16Temp and load proteinMrna.tab into hg16Temp.spMrna. load data local infile "proteinMrna.tab" into table hg16Temp.spMrna; o run kgBestMrna create a subdirectory kgBestMrna cd kgBestMrna cp -p ../protein.lis . kgBestMrna 072003 hg16 2>kgBestMrna.err >kgBestMrna.out2 The log file of best picks will be generated by kgBestMrna and stored at kgBestMrna.out. This may take a day and half to finish! The output file is best.lis. cp -p best.lis .. This step could be broken into 3 pieces and run in parallel to leverage hgwdev's 4 CPUs. o Create spMrna table in hg16, by copy and past spMrna.sql at mysql prompt. Load the data by: LOAD DATA local INFILE 'best.lis' into table hg16.spMrna; o Run spm6 to generate sorted.lis and knownGene0.tab for further duplicates processing spm6 072003 hg16 create table knownGene0 in hg16Temp load the knownGene0.tab into the knownGene0 table in hg16Temp LOAD DATA local INFILE 'knownGene0.tab' into table hg16Temp.knownGene0; o Run spm7 to perform duplicates processing spm7 072003 hg16 > spm7.out o create knownGene and dupSpMrna tables in hg16 by using knownGene.sql and dupSpMrna.sql LOAD DATA local INFILE 'knownGene.tab' into table hg16.knownGene; LOAD DATA local INFILE 'duplicate.tab' into table hg16.dupSpMrna; o collect DNA based RefSeq data to create dnaGene.tab and dnaLink.tab dnaGene hg16 proteins072003 o create table knownGeneLink in hg16 LOAD DATA local INFILE 'dnaLink.tab' into table hg16.knownGeneLink; o load the data into tables: LOAD DATA local INFILE 'dnaGene.tab' into table hg16.knownGene; o Remove invalid KG entries in knownGenePep and knownGeneMrna tables: rmKGPepMrna hg16 072003 First, use mysql to delete old knownGenePep and knownGeneMrna table entries: use hg16 delete from hg16.knownGenePep; delete from hg16.knownGeneMrna; Then load in new filtered data: LOAD DATA local INFILE 'knownGenePep.tab' into table hg16.knownGenePep; LOAD DATA local INFILE 'knownGeneMrna.tab' into table hg16.knownGeneMrna; o Use the Genome Browser to check if the "Known Gene" track is functioning correctly. o Now create alias tables to facilitate hgFind. First create tables of kgXref, kgAlias and kgProtAlias in hg16, using kgXref.sql kgAlias.sql kgProtAlias.sql o Build kgXref table Generate xref .tab file for KG kgXref hg16 proteins072003 Load it into mySQL load data local infile "kgXref.tab" into table hg16.kgXref; o Build gene aliases Generate aliases from hugo, etc kgAliasM hg16 proteins072003 Generate aliases from kgXref kgAliasKgXref hg16 Generate gene aliases from SWISS-PROT data kgAliasP hg16 /cluster/store5/swissprot/072003/sprot.dat sp.lis kgAliasP hg16 /cluster/store5/swissprot/072003/trembl.dat tr.lis kgAliasP hg16 /cluster/store5/swissprot/072003/trembl_new.dat new.lis cat sp.lis tr.lis new.lis |sort|uniq >kgAliasP.tab rm sp.lis tr.lis new.lis Generate gene aliases from RefSeq data kgAliasRefseq hg16 Concatenate all 3 files cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab|sort|uniq > kgAlias.tab Load it into mySQL table load data local infile "kgAlias.tab" into table hg16.kgAlias; o Build protein aliases Generate protein aliases kgProtAlias hg16 proteins072003 Generate protein aliases from NCBI data kgProtAliasNCBI hg16 Concatenate both files cat kgProtAliasNCBI.tab kgProtAlias.tab|sort|uniq > kgProtAliasBoth.tab Load it into mySQL tables load data local infile "kgProtAliasBoth.tab" into table hg16.kgProtAlias; o Build tables for BioCyc Create bioCycMapDesc table. CREATE TABLE bioCycMapDesc ( mapID varchar(40) NOT NULL default '', description varchar(255) NOT NULL default '', KEY mapID (mapID) ) TYPE=MyISAM; Crate bioCycPathway table. CREATE TABLE bioCycPathway ( kgID varchar(40) NOT NULL default '', geneID varchar(40) NOT NULL default '', mapID varchar(40) NOT NULL default '', KEY kgID (kgID), KEY geneID (geneID), KEY mapID (mapID) ) TYPE=MyISAM; Save the BioCyc Pathway name and description table as names.txt, make sure there is no extra ^M at end of the line. LOAD DATA local INFILE 'names.txt' into table hg16.bioCycMapDesc;