/** \page orthoPickIntron.c Pick an intron for exploratory PCR using mouse transcript data.
Overview:
Mouse mRNAs and ESTs were summarized in a more
compact format and were mapped to orthologous locations in the human
genome using blastz nets and chains created by Jim Kent. Individual
introns and flanking exons were chosen as candidates for exploratory
PCR by the number of mouse transcripts containing the intron, The
objective is to take advantage of the mouse ESTs and mRNAs to
supplement the human ones in looking for novel genes.
Details:
Objective:
Construct a summary of all mRNA and EST data in mouse and map them to
human genome via whole genome alignments. Select mouse mapped
transcripts that are not overlapped by native human transcripts for
exploratory PCR.
Algorithm:
- Mouse mRNAs and cDNAs are clustered on the mouse genome and
a summary of the cluster (altGraphX) is generated.
- Mouse clusters are mapped via Jim Kent's
net/chain mapping of the mouse and human genomes to the human
genome. By using the net/chain mapping we can take into account more
sequence than just the individual mRNAs and also take into account
synteny
- Each intron and flanking exons are evaluated for the number of mouse
mRNAs and ESTs that mapped over, as well as overlap with exons predicted
by gene finding programs.
- Individual introns are selected and the loci are remembered so
that only one intron is chosen from each loci. Also, no intron is
chosen if it is overlapped by any transcript evidence.
*/
#include "common.h"
#include "hdb.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "psl.h"
#include "bed.h"
#include "borf.h"
#include "dnaseq.h"
#include "fa.h"
#include "portable.h"
#include "dystring.h"
#include "altGraphX.h"
#include "bits.h"
#include "dnautil.h"
#include "orthoEval.h"
#include "rbTree.h"
struct intronEv
/** Data about one intron. */
{
struct intronEv *next; /**< Next in list. */
struct orthoEval *ev; /**< Parent orthoEval structure. */
char *chrom; /**< Chromosome. */
unsigned int /** Exon starts and stops on chromsome. */
e1S, e1E,
e2S, e2E;
char *agxName; /**< Name of best fitting native agx record. Memory not owned here. */
int support; /**< Number of mRNAs supporting this list. */
int orientation; /**< Orientation of intron. */
boolean inCodInt; /**< TRUE if in coding region, FALSE otherwise. */
float borfScore; /**< Score for ev from borf program. */
};
static boolean doHappyDots; /* output activity dots? */
static struct optionSpec optionSpecs[] =
/** Our acceptable options to be called with. */
{
{"help", OPTION_BOOLEAN},
{"db", OPTION_STRING},
{"orthoEvalFile", OPTION_STRING},
{"bedOutFile", OPTION_STRING},
{"orthoBedOut", OPTION_STRING},
{"orthoEvalOut", OPTION_STRING},
{"htmlFile", OPTION_STRING},
{"htmlFrameFile", OPTION_STRING},
{"scoreFile", OPTION_STRING},
{"numPicks", OPTION_INT},
{NULL, 0}
};
static char *optionDescripts[] =
/** Description of our options for usage summary. */
{
"Display this message.",
"Database (i.e. hg15) that bedFile corresponds to.",
"File to input orthoEval records from.",
"File to output final bed picks for track.",
"File to output mapped orthoBeds that are used.",
"File to output evaluation records.",
"File for html output.",
"[optional default: frame.html] File for html frames.",
"[optional default: not created] Print scores for each intron. orient,support,borf,coding",
"[optional default: 100] Number of introns to pick."
};
void usage()
/** Print usage and quit. */
{
int i=0;
warn("orthoPickIntron - Pick best intron from orthoEval.\n");
for(i=0; ichrom, b->chrom); */
/* if(diff == 0) */
/* { */
/* if (diff == 0) */
/* diff = a->chromStart - b->chromStart; */
/* if(diff == 0) */
/* diff = a->chromEnd - b->chromEnd; */
/* } */
/* return diff; */
/* } */
struct intronEv *intronIvForEv(struct orthoEval *ev, int intron)
/** Return an intronEv record for a particular intron. */
{
struct intronEv *iv = NULL;
struct bed *bed = NULL;
assert(ev);
assert(ev->orthoBed);
assert(intron < ev->orthoBed->blockCount - 1);
bed = ev->orthoBed;
AllocVar(iv);
iv->ev = ev;
iv->chrom = bed->chrom;
iv->e1S = bed->chromStart + bed->chromStarts[intron];
iv->e1E = iv->e1S + bed->blockSizes[intron];
iv->e2S = bed->chromStart + bed->chromStarts[intron+1];
iv->e2E = iv->e2S + bed->blockSizes[intron+1];
iv->agxName = ev->agxNames[intron];
iv->support = ev->basesOverlap;
iv->orientation = ev->orientation[intron];
iv->inCodInt = ev->inCodInt[intron];
if(sameString(ev->borf->name, "dummy"))
iv->borfScore = (5*ev->orthoBed->score) + (ev->borf->score * 100);
else
iv->borfScore = ev->borf->score;
return iv;
}
int scoreForIntronEv(const struct intronEv *iv)
/** Score function based on native support, orientation, and coding. */
{
static FILE *out = NULL;
int score = 0;
char *scoreFile = optionVal("scoreFile", NULL);
if(out == NULL && scoreFile != NULL)
out = mustOpen(scoreFile, "w");
if(out != NULL)
fprintf(out, "%d\t%d\t%f\t%d\n", iv->orientation, iv->support, iv->borfScore, iv->inCodInt);
if(iv->orientation != 0)
score += 200;
// score += iv->support *2;
score += (float) iv->borfScore;
if(iv->inCodInt)
score = score +2;
return score;
}
boolean checkMgcPicks(char *db, struct intronEv *iv)
/** Try to look up an mgc pick in this area. */
{
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr = NULL;
char **row;
int rowOffset = 0;
char *mgcTable = optionVal("mgcTable", "chuckMgcPicked");
char *mgcGenes = optionVal("mgcGenes", "chuckMgcGenes");
char *mgcAList = optionVal("mgcAList", "chuckMgcAList");
boolean foundSome = FALSE;
sr = hRangeQuery(conn, mgcTable, iv->chrom, iv->e1S, iv->e2E, NULL, &rowOffset);
if(sqlNextRow(sr) != NULL)
foundSome = TRUE;
sqlFreeResult(&sr);
if(foundSome == FALSE)
{
sr = hRangeQuery(conn, mgcGenes, iv->chrom, iv->e1S, iv->e2E, NULL, &rowOffset);
if(sqlNextRow(sr) != NULL)
foundSome = TRUE;
sqlFreeResult(&sr);
}
if(foundSome == FALSE)
{
sr = hRangeQuery(conn, mgcAList, iv->chrom, iv->e1S, iv->e2E, NULL, &rowOffset);
if(sqlNextRow(sr) != NULL)
foundSome = TRUE;
sqlFreeResult(&sr);
}
hFreeConn(&conn);
return foundSome;
}
boolean coordOverlappedByTable(char *db, char *chrom, int start, int end, char *table)
/** Return TRUE if there is a record overlapping. */
{
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr = NULL;
char **row;
int rowOffset = 0;
boolean foundSome = FALSE;
sr = hRangeQuery(conn, table, chrom, start, end, NULL, &rowOffset);
if(sqlNextRow(sr) != NULL)
foundSome = TRUE;
sqlFreeResult(&sr);
hFreeConn(&conn);
return foundSome;
}
boolean isOverlappedByTable(char *db, struct intronEv *iv, char *table)
/** Return TRUE if there is a record overlapping. */
{
return coordOverlappedByTable(db, iv->chrom, iv->e1S, iv->e2E, table);
}
boolean isOverlappedByRefSeq(char *db, struct intronEv *iv)
{
return isOverlappedByTable(db, iv, "refGene");
}
boolean isOverlappedByMgcBad(char *db, struct intronEv *iv)
{
return isOverlappedByTable(db, iv, "chuckMgcBad");
}
boolean isOverlappedByEst(char *db, struct intronEv *iv)
{
return isOverlappedByTable(db, iv, "est");
}
boolean isOverlappedByMRna(char *db, struct intronEv *iv)
{
return isOverlappedByTable(db, iv, "mrna");
}
int intronEvalCmp(const void *va, const void *vb)
/** Compare to sort based score function. */
{
const struct intronEv *a = *((struct intronEv **)va);
const struct intronEv *b = *((struct intronEv **)vb);
float aScore = scoreForIntronEv(a);
float bScore = scoreForIntronEv(b);
return bScore - aScore; /* reverse to get largest values first. */
}
boolean bedUniqueInTree(struct rbTree *bedTree, struct intronEv *iv)
{
static struct bed searchLo, searchHi;
struct slRef *refList = NULL, *ref = NULL;
searchLo.chrom = iv->chrom;
searchLo.chromStart =iv->e1S;
searchLo.chromEnd = iv->e2E;
/* searchHi.chrom = iv->chrom; */
/* searchHi.chromStart = iv->e2E; */
/* searchHi.chromEnd = iv->e2E; */
refList = rbTreeItemsInRange(bedTree, &searchLo, &searchLo);
if(refList == NULL)
return TRUE;
else
slFreeList(&refList);
return FALSE;
}
int bedRangeCmp(void *va, void *vb)
{
struct bed *a = va;
struct bed *b = vb;
int diff = strcmp(a->chrom, b->chrom);
if(diff == 0)
{
if(a->chromEnd <= b->chromStart)
diff = -1;
else if(a->chromStart >= b->chromEnd)
diff = 1;
else
diff = 0;
}
return diff;
}
boolean isUniqueCoordAndAgx(char *db, struct intronEv *iv, struct hash *posHash, struct hash *agxHash)
/** Return TRUE if iv isn't in posHash and agxHash.
Return FALSE otherwise. */
{
static char key[1024];
static struct rbTree *bedTree = NULL;
boolean unique = TRUE;
struct bed *bed = NULL;
if(bedTree == NULL)
bedTree = rbTreeNew(bedRangeCmp);
/* Unique location (don't pick same intron twice. */
if(bedUniqueInTree(bedTree, iv))
{
AllocVar(bed);
bed->chrom = cloneString(iv->chrom);
bed->chromStart = iv->e1S;
bed->chromEnd = iv->e2E;
rbTreeAdd(bedTree, bed);
}
else
unique = FALSE;
/* Unique loci, don't pick from same overall loci if possible. */
safef(key, sizeof(key), "%s", iv->agxName);
if(hashFindVal(agxHash, key) == NULL)
hashAdd(agxHash, key, iv);
else
unique = FALSE;
/* Definitely don't pick from same mRNA. */
chopSuffix(iv->ev->orthoBedName);
safef(key, sizeof(key), "%s", iv->ev->orthoBedName);
if(hashFindVal(agxHash, key) == NULL)
hashAdd(agxHash, key, iv);
else
unique = FALSE;
if(unique)
unique = !checkMgcPicks(db, iv);
return unique;
}
struct bed *bedForIv(struct intronEv *iv)
/** Make a bed to represent a intronEv structure. */
{
struct bed *bed = NULL;
AllocVar(bed);
bed->chrom = cloneString(iv->chrom);
bed->chromStart = bed->thickStart = iv->e1S;
bed->chromEnd = bed->thickEnd = iv->e2E;
bed->name = cloneString(iv->ev->orthoBed->name);
if(iv->orientation == 1)
safef(bed->strand, sizeof(bed->strand), "+");
else if(iv->orientation == -1)
safef(bed->strand, sizeof(bed->strand), "-");
else
safef(bed->strand, sizeof(bed->strand), "%s", iv->ev->orthoBed->strand);
bed->blockCount = 2;
AllocArray(bed->chromStarts, 2);
bed->chromStarts[0] = iv->e1S - bed->chromStart;
bed->chromStarts[1] = iv->e2S - bed->chromStart;
AllocArray(bed->blockSizes, 2);
bed->blockSizes[0] = iv->e1E - iv->e1S;
bed->blockSizes[1] = iv->e2E - iv->e2S;
bed->score = scoreForIntronEv(iv);
return bed;
}
void writeOutFrames(FILE *htmlOut, char *fileName, char *db, char *bedFile, char *pos)
/** Write out frame for htmls. */
{
fprintf(htmlOut, "Introns and flanking exons for exploratory PCR\n"
"\n"
"\n", fileName, db, pos, bedFile);
}
void pickIntrons()
/** Top level routine, actually picks the introns. */
{
char *htmlFileName=NULL, *htmlFrameFileName=NULL;
char *bedFileName=NULL, *orthoBedFileName=NULL;
FILE *htmlOut=NULL, *htmlFrameOut=NULL;
FILE *bedOut=NULL, *orthoBedOut=NULL;
char *orthoEvalFile = NULL;
char *db = NULL;
struct orthoEval *ev=NULL, *evList = NULL;
struct intronEv *iv=NULL, *ivList = NULL;
int maxPicks = optionInt("numPicks", 100);
int i=0;
boolean isRefSeq=FALSE, isMgcBad=FALSE;
struct hash *posHash = newHash(12), *agxHash = newHash(12);
struct bed *bed = NULL;
char buff[256];
htmlFileName = optionVal("htmlFile", NULL);
htmlFrameFileName = optionVal("htmlFrameFile", "frame.html");
orthoEvalFile = optionVal("orthoEvalFile", NULL);
db = optionVal("db", NULL);
bedFileName = optionVal("bedOutFile", NULL);
orthoBedFileName = optionVal("orthoBedOut", NULL);
if(htmlFileName == NULL || orthoEvalFile == NULL || db == NULL ||
bedFileName == NULL || orthoBedFileName == NULL )
errAbort("Missing parameters. Use -help for usage.");
warn("Loading orthoEvals.");
evList = orthoEvalLoadAll(orthoEvalFile);
warn("Creating intron records");
for(ev = evList; ev != NULL; ev = ev->next)
{
for(i=0; inumIntrons; i++)
{
occassionalDot();
iv = intronIvForEv(ev, i);
slAddHead(&ivList, iv);
}
}
warn("\nDone");
warn("Sorting");
slSort(&ivList, intronEvalCmp);
warn("Done.");
htmlOut = mustOpen(htmlFileName, "w");
bedOut = mustOpen(bedFileName, "w");
htmlFrameOut = mustOpen(htmlFrameFileName, "w");
orthoBedOut = mustOpen(orthoBedFileName, "w");
i=0;
fprintf(htmlOut, "Num | Mouse Acc. | Score | TS Pick |
\n");
warn("Filtering");
safef(buff, sizeof(buff), "tmp");
for(iv = ivList; iv != NULL && maxPicks > 0; iv = iv->next)
{
if(isUniqueCoordAndAgx(db, iv, posHash, agxHash) && iv->support == 0 && !isOverlappedByRefSeq(db, iv) &&
! isOverlappedByEst(db, iv) && ! isOverlappedByMRna(db, iv))
{
boolean twinScan = (coordOverlappedByTable(db, iv->chrom, iv->e1S, iv->e1E, "mgcTSExpPcr") &&
coordOverlappedByTable(db, iv->chrom, iv->e2S, iv->e2E, "mgcTSExpPcr"));
bed = bedForIv(iv);
if(sameString(buff, "tmp"))
safef(buff, sizeof(buff), "%s:%d-%d", bed->chrom, bed->chromStart-50, bed->chromEnd+50);
// isMgcBad = isOverlappedByMgcBad(iv);
fprintf(htmlOut, "%d | "
"%s | %d | %s |
\n",
++i,bed->chrom, bed->chromStart-50, bed->chromEnd+50, bed->name, bed->score,
twinScan ? "yes" : "no");
bedTabOutN(bed, 12, bedOut);
bedTabOutN(iv->ev->orthoBed, 12, orthoBedOut);
bedFree(&bed);
maxPicks--;
}
}
writeOutFrames(htmlFrameOut, htmlFileName, db, bedFileName, buff);
fprintf(htmlOut, "
\n");
carefulClose(&bedOut);
carefulClose(&htmlOut);
carefulClose(&htmlFrameOut);
carefulClose(&orthoBedOut);
warn("Done.");
hashFree(&posHash);
hashFree(&agxHash);
}
int main(int argc, char *argv[])
/** Main, everybodies favorite function. */
{
doHappyDots = isatty(1);
if(argc == 1)
usage();
optionInit(&argc, argv, optionSpecs);
pickIntrons();
return 0;
}