/* scanIntrons.c */
#include "common.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "wormdna.h"
#include "cheapcgi.h"
#include "htmshell.h"
#define hisSize 15
/* Histogram size. */
static int preHis[hisSize][4];
static int startHis[hisSize][4];
static int endHis[hisSize][4];
static int postHis[hisSize][4];
void addToHis(DNA *dna, int his[hisSize][4])
{
int i;
int val;
for (i=0; i ");
printLineOfHis(his, 'a');
printLineOfHis(his, 'c');
printLineOfHis(his, 'g');
printLineOfHis(his, 't');
printf("
\n");
}
void printHis(int his[hisSize][4])
{
printf("
");
htmlParagraph("Frequency Counts of 5' Ends of Exons:");
printProbs(preHis);
htmlParagraph("Frequency Counts of Intron Starts:");
printProbs(startHis);
htmlParagraph("Frequency Counts of Intron Ends:");
printProbs(endHis);
htmlParagraph("Frequency Counts of 3' Ends of Exons:");
printProbs(postHis);
}
boolean patMatch(DNA *queryString, DNA *targetString, int querySize)
/* Does as match with bs except for the N's in as? */
{
DNA q, t, lastQ;
int i;
lastQ = 0;
for (i=0; iname, b->name);
}
int cmpCounts(const void *va, const void *vb)
/* Compare two slNames. */
{
const struct nameOff *a = *((struct nameOff **)va);
const struct nameOff *b = *((struct nameOff **)vb);
int dif = b->cdnaCount - a->cdnaCount;
if (dif != 0)
return dif;
return strcmp(a->name, b->name);
}
struct nameOff *scanIntronFile(char *preIntronQ, char *startIntronQ,
char *endIntronQ, char *postIntronQ, boolean invert)
{
char intronFileName[600];
FILE *f;
char lineBuf[4*1024];
char *words[4*128];
int wordCount;
int lineCount = 0;
int preLenQ = strlen(preIntronQ);
int startLenQ = strlen(startIntronQ);
int endLenQ = strlen(endIntronQ);
int postLenQ = strlen(postIntronQ);
char *preIntronF, *startIntronF, *endIntronF, *postIntronF;
int preLenF, startLenF, endLenF, postLenF;
int preIx = 6, startIx = 7, endIx =8, postIx = 9;
struct nameOff *list = NULL, *el;
boolean addIt;
int i;
if (preLenQ > 25 || postLenQ > 25 || startLenQ > 40 || endLenQ > 40)
{
errAbort("Can only handle queries up to 25 bases on either side of the intron "
"and 40 bases inside the intron.");
}
sprintf(intronFileName, "%s%s", wormCdnaDir(), "introns.txt");
f = mustOpen(intronFileName, "r");
while (fgets(lineBuf, sizeof(lineBuf), f) != NULL)
{
++lineCount;
wordCount = chopByWhite(lineBuf, words, ArraySize(words));
if (wordCount == ArraySize(words))
{
warn("May have truncated end of line %d of %s",
lineCount, intronFileName);
}
if (wordCount == 0)
continue;
if (wordCount < 11)
errAbort("Unexpected short line %d of %s", lineCount, intronFileName);
preIntronF = words[preIx];
startIntronF = words[startIx];
endIntronF = words[endIx];
postIntronF = words[postIx];
preLenF = strlen(preIntronF);
startLenF = strlen(startIntronF);
endLenF = strlen(endIntronF);
postLenF = strlen(postIntronF);
addIt = FALSE;
if ( ( preLenQ == 0 || patMatch(preIntronQ, preIntronF+preLenF-preLenQ+countSpecial(preIntronQ), preLenQ))
&& (startLenQ == 0 || patMatch(startIntronQ, startIntronF, startLenQ))
&& ( endLenQ == 0 || patMatch(endIntronQ, endIntronF+endLenF-endLenQ+countSpecial(endIntronQ), endLenQ))
&& ( postLenQ == 0 || patMatch(postIntronQ, postIntronF, postLenQ)) )
{
addIt = TRUE;
}
if (invert)
addIt = !addIt;
if (addIt)
{
addIntronToHistogram(preIntronF+preLenF, startIntronF, endIntronF+endLenF, postIntronF);
AllocVar(el);
el->chrom = cloneString(words[1]);
el->name = cloneString(words[5]);
el->start = atoi(words[2]);
el->end = atoi(words[3]);
el->cdnaCount = atoi(words[0]);
memcpy(el->startI, startIntronF, 2);
memcpy(el->endI, endIntronF + endLenF - 2, 2);
assert(wordCount == el->cdnaCount + 10);
for (i=10; icdnaNames, name);
}
slReverse(&el->cdnaNames);
assert(slCount(el->cdnaNames) == el->cdnaCount);
slAddHead(&list, el);
}
}
fclose(f);
slSort(&list, cmpCounts);
return list;
}
void showMatches(struct nameOff *matches)
{
int nameWidth = 13;
printf("");
printf(" ORF cDNA intron supporting\n");
printf(" name count ends cDNA\n");
printf("--------------------------------------\n");
for (;matches != NULL; matches = matches->next)
{
struct slName *name;
printf("%s ",
matches->chrom, matches->start-1000, matches->end+1000, matches->name, matches->start, matches->end, matches->name);
printf("%s", spaces(nameWidth - strlen(matches->name) ) );
printf("%2d %s...%s", matches->cdnaCount, matches->startI, matches->endI);
for (name = matches->cdnaNames; name != NULL; name = name->next)
{
printf(" %s", name->name);
}
printf("\n");
}
printf("\n");
}
void doMiddle()
{
char *preIntron, *startIntron, *endIntron, *postIntron;
int maxCount;
struct nameOff *matchList;
boolean invert = cgiVarExists("Invert");
preIntron = cgiString("preIntron");
startIntron = cgiString("startIntron");
endIntron = cgiString("endIntron");
postIntron = cgiString("postIntron");
/* Just for debugging cut search short if matches special magic */
maxCount = atoi(preIntron);
if (maxCount <= 0)
maxCount = 0x7ffffff;
eraseWhiteSpace(preIntron);
eraseWhiteSpace(startIntron);
eraseWhiteSpace(endIntron);
eraseWhiteSpace(postIntron);
tolowers(preIntron);
tolowers(startIntron);
tolowers(endIntron);
tolowers(postIntron);
matchList = scanIntronFile(preIntron, startIntron, endIntron, postIntron, invert);
if (matchList == NULL)
errAbort("Sorry, no matches to %s%s %s %s %s", (invert ? "inverted " : ""), preIntron, startIntron, endIntron, postIntron);
printf("%d introns matching %s%s(%s %s)%s. ", slCount(matchList),
(invert ? "inverted " : ""), preIntron, startIntron, endIntron, postIntron);
printf("Shortcut to frequency counts.
");
htmlHorizontalLine();
showMatches(matchList);
htmlHorizontalLine();
printf("");
printf("");
printAllHistograms();
printf(" ");
}
int main(int argc, char *argv[])
{
dnaUtilOpen();
/* If run from the command line with the right number of arguments,
* set up QUERY_STRING so we can pretend we're being run by web server
* instead. */
if (argc == 5)
{
char buf[512];
sprintf(buf, "QUERY_STRING=preIntron=%s&startIntron=%s&endIntron=%s&postIntron=%s", argv[1], argv[2], argv[3], argv[4]);
putenv(buf);
}
htmShell("ScanIntron Output", doMiddle, "QUERY");
return 0;
}