The database of biosql072003 and proteins072003 need to 
 be built from SWISS-PROT, TrEMBL, and TrEMBL-NEW first, using bioperl 
 bioperl-db, and other programs.
 (see /cluster/store4/fan/pb/buildProteins072003.doc for details).

o Create a working subdirectory hg16, make symbolic link,  and go there

	mkdir /cluster/store4/fan/pb/hg16
	cd /cluster/store4/fan/pb/hg16
	ln -s /cluster/store4/fan/pb/hg16 ~/hg16

o Build hg16Temp database by:

  	create database hg16Temp;

  Get hg16Temp.sql for table definitions

	dumpdbdef rn3Temp >hg16Temp.sql

  Create tables in hg16Temp:

	mysql -u hgcat -pBigSecret -A hg16Temp <hg16Temp.sql

  From mysql prompt:

	drop table hg16Temp.history;

  NOTE: the above step is needed because later hgKgMrna would have
        a seqment fault error if the history table is not dropped.

o Get mrna input data files

  - Get mrna.fa file:

	/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg -gbRoot=/cluster/data/genbank genbank mrna mrna.fa 

    There will be some error messages like:

	warning: AH003062.1 does not appear to be a valid mRNA sequence, skipped: ...

    This is normal.

  - Get mrna.ra file:

    /cluster/data/genbank/bin/i386/gbGetSeqs -get=ra -native -db=hg -gbRoot=/cluster/data/genbank genbank mrna mrna.ra 

  - Get all_mrna.psl file:

    /cluster/data/genbank/bin/i386/gbGetSeqs -get=psl -native -db=hg16 -gbRoot=/cluster/data/genbank genbank mrna all_mrna.psl 
	
o generate a list of all mrna accession numbers

	fgrep ">" mrna.fa > mrna.lis

o Process LocusLink data to generate mrnaRefseq table

        - create a subdirectory 071803 under ~fan/data/ll and cd to there

        - get the latest LocusLink data from

                wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
                wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2acc
                wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc

	- and copy over to ~hg16
	
		cp -p *loc* ~/hg16 

        - load LocusLink data to this 2 tables using mysql

                LOAD DATA local INFILE 'loc2acc' into table hg16Temp.locus2Acc0;
                LOAD DATA local INFILE 'loc2ref' into table hg16Temp.locus2Ref0;

        - run hgMrnaRefseq to generate mrnaRefseq.tab

                hgMrnaRefseq hg16

	- create table hg16.mrnaRefseq:

		CREATE TABLE mrnaRefseq (
		mrna varchar(40) NOT NULL default '',
  		refseq varchar(40) NOT NULL default '',
  		KEY mrna (mrna),
  		KEY refseq (refseq)
		) TYPE=MyISAM;

        - load data into all appropriate genome databases

                LOAD DATA local INFILE 'mrnaRefseq.tab' into table hg16.mrnaRefseq; 

o generate FASTA format protein seuqnce file

	 kgGetPep 072003 > mrnaPep.fa

o run pslReps to get tighter mRNAs

pslReps -minCover=0.40 -sizeMatters -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null

o Run hgKgMrna to build "refGene" tables in hg16Temp database

  hgKgMrna hg16Temp mrna.fa mrna.ra tight_mrna.psl loc2ref mrnaPep.fa mim2loc proteins072003 >hgKgMrna.out 2>hgKgMrna.err

o create the mrnaGene table in hg16Temp DB, by running mrnaGene.sql 
  at mySql prompt

  Load mrnaGene data into the table

	LOAD DATA local INFILE 'refGene.tab' into table hg16Temp.mrnaGene;

  hg16.mrnaGene is needed by spm6

  create KG related tables in hg16

	mysql -u hgcat -pBigSecret -A hg16 < kgRelated.sql

        LOAD DATA local INFILE 'refMrna.tab' into table hg16Temp.refMrna;
  
  Load pep and mrna data into the knownGenePep and knownGeneMrna tables

        LOAD DATA local INFILE 'refPep.tab'  into table hg16.knownGenePep;
        LOAD DATA local INFILE 'refMrna.tab' into table hg16.knownGeneMrna;


o run spm3 to generate the proteinMrna.tab and protein.lis file

	spm3 072003 hg16

  create table spMrna in hg16Temp and load proteinMrna.tab into hg16Temp.spMrna.

	load data local infile "proteinMrna.tab" into table hg16Temp.spMrna;

o run kgBestMrna

  create a subdirectory kgBestMrna
  cd kgBestMrna
  cp -p ../protein.lis .

	kgBestMrna 072003 hg16 2>kgBestMrna.err >kgBestMrna.out2

  The log file of best picks will be generated by kgBestMrna and stored at kgBestMrna.out.

  This may take a day and half to finish!
  The output file is best.lis.  

  	cp -p best.lis ..

  This step could be broken into 3 pieces and run in parallel to leverage hgwdev's 4 CPUs.

o Create spMrna table in hg16, by copy and past spMrna.sql
  at mysql prompt.

    Load the data by:

      LOAD DATA local INFILE 'best.lis' into table hg16.spMrna;
      
o Run spm6 to generate sorted.lis and knownGene0.tab 
  for further duplicates processing

  	spm6 072003 hg16

  create table knownGene0 in hg16Temp

  load the knownGene0.tab into the knownGene0 table in hg16Temp

	LOAD DATA local INFILE 'knownGene0.tab' into table hg16Temp.knownGene0;

o Run spm7 to perform duplicates processing

  	spm7 072003 hg16 > spm7.out

o create knownGene and dupSpMrna tables in hg16 by using knownGene.sql and dupSpMrna.sql

        LOAD DATA local INFILE 'knownGene.tab' into table hg16.knownGene;
        LOAD DATA local INFILE 'duplicate.tab' into table hg16.dupSpMrna;

o collect DNA based RefSeq data to create dnaGene.tab and dnaLink.tab

        dnaGene hg16 proteins072003

o create table knownGeneLink in hg16

        LOAD DATA local INFILE 'dnaLink.tab' into table hg16.knownGeneLink;

o load the data into tables:

        LOAD DATA local INFILE 'dnaGene.tab' into table hg16.knownGene;

o Remove invalid KG entries in knownGenePep and knownGeneMrna tables:

	rmKGPepMrna hg16 072003

  First, use mysql to delete old knownGenePep and knownGeneMrna table entries:

	use hg16
	delete from hg16.knownGenePep;
	delete from hg16.knownGeneMrna;

  Then load in new filtered data:

    LOAD DATA local INFILE 'knownGenePep.tab'  into table hg16.knownGenePep;
    LOAD DATA local INFILE 'knownGeneMrna.tab' into table hg16.knownGeneMrna;

o Use the Genome Browser to check if the "Known Gene" track is functioning
  correctly.


o Now create alias tables to facilitate hgFind.

  First create tables of kgXref, kgAlias and kgProtAlias in hg16, using

	kgXref.sql
	kgAlias.sql  
	kgProtAlias.sql  

o Build kgXref table
 
  Generate xref .tab file for KG

	kgXref hg16 proteins072003

  Load it into mySQL

	load data local infile "kgXref.tab" into table hg16.kgXref;
  
o Build gene aliases

  Generate aliases from hugo, etc

	kgAliasM hg16 proteins072003

  Generate aliases from kgXref

	kgAliasKgXref hg16

  Generate gene aliases from SWISS-PROT data 

    kgAliasP hg16 /cluster/store5/swissprot/072003/sprot.dat      sp.lis
    kgAliasP hg16 /cluster/store5/swissprot/072003/trembl.dat     tr.lis
    kgAliasP hg16 /cluster/store5/swissprot/072003/trembl_new.dat new.lis
    cat sp.lis tr.lis new.lis |sort|uniq >kgAliasP.tab
    rm  sp.lis tr.lis new.lis 

  Generate gene aliases from RefSeq data

    	kgAliasRefseq hg16 

  Concatenate all 3 files

	cat kgAliasM.tab kgAliasRefseq.tab  kgAliasKgXref.tab kgAliasP.tab|sort|uniq > kgAlias.tab

  Load it into mySQL table

	load data local infile "kgAlias.tab" into table hg16.kgAlias;

o Build protein aliases

  Generate protein aliases

	kgProtAlias hg16 proteins072003

  Generate protein aliases from NCBI data

	kgProtAliasNCBI hg16

  Concatenate both files

	cat kgProtAliasNCBI.tab kgProtAlias.tab|sort|uniq > kgProtAliasBoth.tab

  Load it into mySQL tables

	load data local infile "kgProtAliasBoth.tab" into table hg16.kgProtAlias;

o Build tables for BioCyc

  Create bioCycMapDesc table.

	CREATE TABLE bioCycMapDesc (
  	mapID varchar(40) NOT NULL default '',
  	description varchar(255) NOT NULL default '',
  	KEY mapID (mapID)
	) TYPE=MyISAM;

  Crate bioCycPathway table.

	CREATE TABLE bioCycPathway (
  	kgID varchar(40) NOT NULL default '',
  	geneID varchar(40) NOT NULL default '',
  	mapID varchar(40) NOT NULL default '',
  	KEY kgID (kgID),
 	KEY geneID (geneID),
  	KEY mapID (mapID)
	) TYPE=MyISAM;

  Save the BioCyc Pathway name and description table as names.txt, 
  make sure there is no extra ^M at end of the line.

	LOAD DATA local INFILE 'names.txt' into table hg16.bioCycMapDesc;