/* newIntron.c - look for introns that are present in C. elegans but not * C. briggsae and vice versa. */ #include "common.h" #include "hash.h" #include "dnautil.h" #include "portable.h" #include "gdf.h" #include "wormdna.h" #include "xa.h" FILE *out; /* Output file. */ void report(FILE *f, char *format, ...) /* Write printf formatted report to file and standard out. */ { va_list args; va_start(args, format); vfprintf(f, format, args); vfprintf(stdout, format, args); va_end(args); } #define uglyr report FILE *openXa(char *organism) /* Return file handle for xeno-alignments for given organism. */ { char path[512]; sprintf(path, "%s%s/all.st", wormXenoDir(), organism); return xaOpenVerify(path); } enum gapType /* What type of gap this is - put in typeByte in features list. */ { cIntronGap, cCodingGap, cIgGap, cIeMixedGap, cIgMixedGap, }; char *gapTypeStrings[] = { "intronGap", "codingGap", "igGap", "ieMixdGap", "igMixdGap", }; boolean addInEx(struct gdfGene *gdfList, int start, int end, int *pInCount, int *pExCount, struct gdfGene **retGene) /* Find out how much of segment between start and end intersects introns * and exons in gdfList, and add these to *pInCount and *pExCount. * Returns FALSE if no intersection. Returns gene intersected in *retGene */ { int inCount, exCount, bothCount; struct gdfGene *gdf; for (gdf = gdfList; gdf != NULL; gdf = gdf->next) { struct gdfDataPoint *dp = gdf->dataPoints; int dpCount = gdf->dataCount; int i; inCount = exCount = 0; for (i=0; i s) s = start; e = dp[i+1].start; if (end < e) e = end; if ((size = e - s) > 0) exCount += size; } for (i=1; i s) s = start; e = dp[i+1].start; if (end < e) e = end; if ((size = e - s) > 0) inCount += size; } bothCount = inCount + exCount; if (bothCount > 0) { *retGene = gdf; *pInCount += inCount; *pExCount += exCount; return TRUE; } } return FALSE; } void classifyGapT(struct gdfGene *gdfList, char *chrom, int start, int end, enum gapType *retType, struct gdfGene **retGene) /* Classify based on where gap in target occurred. * Assumes that gdfList is sorted. */ { struct gdfGene *gdf; int inCount = 0, exCount = 0, bothCount; assert(start == end); if (addInEx(gdfList, start-6, end+6, &inCount, &exCount, &gdf) ) { bothCount = inCount + exCount; bothCount = inCount + exCount; if (bothCount >= 8) { *retGene = gdf; if (inCount >= 8) *retType = cIntronGap; else if (exCount >= 8) *retType = cCodingGap; else if (bothCount >= 12) *retType = cIeMixedGap; else *retType = cIgMixedGap; return; } } *retGene = NULL; *retType = cIgGap; return; } void classifyGapQ(struct gdfGene *gdfList, char *chrom, int start, int end, enum gapType *retType, struct gdfGene **retGene) /* Classify based on where gap in query occurred. * Assumes that gdfList is sorted. */ { struct gdfGene *gdf; int inCount = 0, exCount = 0, bothCount; if (addInEx(gdfList, start-6, start, &inCount, &exCount, &gdf) && addInEx(gdfList, end, end+6, &inCount, &exCount, &gdf) ) { bothCount = inCount + exCount; if (bothCount >= 8) { *retGene = gdf; if (inCount >= 8) *retType = cIntronGap; else if (exCount >= 8) *retType = cCodingGap; else if (bothCount >= 12) *retType = cIeMixedGap; else *retType = cIgMixedGap; return; } } *retGene = NULL; *retType = cIgGap; return; } void classifyGap(struct gdfGene *gdfList, char *chrom, int start, int end, char hSym, enum gapType *retType, struct gdfGene **retGene) /* Classify based on where gap occurred. Assumes that gdfList is sorted. */ { if (hSym == 'T') classifyGapT(gdfList, chrom, start, end, retType, retGene); else classifyGapQ(gdfList, chrom, start, end, retType, retGene); } boolean isConserved(char h) /* Return TRUE is hidden symbol indicates conservation */ { return (h == '1' || h == '2' || h == '3' || h == 'H'); } boolean hasIntronEnds(char *intSym, int start, int end, char iStart[2], char iEnd[2]) /* Return TRUE if ends of intron at this position match consensus. */ { boolean hasEnds = (intSym[start] == iStart[0] && intSym[start+1] == iStart[1] && intSym[end-2] == iEnd[0] && intSym[end-1] == iEnd[1]); return hasEnds; } boolean isSlidableToIntron(char *intSym, char *gapSym, int start, int end, int symCount, char iStart[2], char iEnd[2], int *retSlide) /* Return TRUE if could slide gap into an intron. */ { int i; if (hasIntronEnds(intSym, start, end, iStart, iEnd) ) { *retSlide = 0; return TRUE; } for (i=1; i<=5; ++i) { int s = start+i; int e = end+i; char sym; if (e > symCount) break; /* See if can move match around gap. */ sym = intSym[e-1]; if (sym == gapSym[e-1]) if (sym != intSym[s-1]) break; if (hasIntronEnds(intSym, s, e, iStart, iEnd) ) { *retSlide = i; return TRUE; } } for (i=1; i <= 5; ++i) { int s = start-i; int e = end-i; char sym; if (s < 0) break; /* See if can move match around gap. */ sym = intSym[s]; if (sym == gapSym[s]) if (sym != intSym[e]) break; if (hasIntronEnds(intSym, s, e, iStart, iEnd) ) { *retSlide = -i; return TRUE; } } return FALSE; } boolean isIntron(struct xaAli *xa, int start, int end, char hSym, char strand, int *retSlide, boolean *retIsRc) /* Return TRUE if gap looks like an intron. */ { /* Intron boundaries are GT ... AG for forward strand, * CT ... AC for reverse strand. */ char *intSym, *gapSym; int symCount = xa->symCount; if (hSym == 'T') { gapSym = xa->tSym; intSym = xa->qSym; } else { gapSym = xa->qSym; intSym = xa->tSym; } if (strand == '+') { *retIsRc = FALSE; return isSlidableToIntron(intSym, gapSym, start, end, symCount, "gt", "ag", retSlide); } else if (strand == '-') { *retIsRc = TRUE; return isSlidableToIntron(intSym, gapSym, start, end, symCount, "ct", "ac", retSlide); } else { if (isSlidableToIntron(intSym, gapSym, start, end, symCount, "gt", "ag", retSlide)) { *retIsRc = FALSE; return TRUE; } if (isSlidableToIntron(intSym, gapSym, start, end, symCount, "ct", "ac", retSlide)) { *retIsRc = TRUE; return TRUE; } } return FALSE; } struct gapInfo /* Information on one gap. */ { struct gapInfo *next; char *query; int qStart, qEnd; char *target; int tStart, tEnd; char *name; int size; char hSym; enum gapType type; boolean hasIntronEnds; boolean hasStrongHomology; boolean isRc; /* Need to reverse complement to get GT..AG ends? */ int slideCount; /* Amount to slide to get GT..AG ends. */ }; #define exSize 15 #define inSize exSize #define halfSize exSize #define totSize (exSize+inSize) int hist5[totSize][5]; int hist3[totSize][5]; int histCount; int sum4(int *a) { return a[0]+a[1]+a[2]+a[3]; } void printLineOfHis(FILE *f, int his[totSize][5], DNA base, int start, int size) /* Print a single line of histogram. */ { int baseVal = ntVal[base]; int i; long millis; fprintf(f, "%c ", base); for (i=start; i= 10) fprintf(f, "%3d%% ", (millis+5)/10); else fprintf(f, "%d.%d%% ", millis/10, millis%10); } fprintf(f, "\n"); } void printHis(FILE *f, int his[totSize][5], int start, int size) /* Print one frequency histogram. */ { printLineOfHis(f, his, 'a', start, size); printLineOfHis(f, his, 'c', start, size); printLineOfHis(f, his, 'g', start, size); printLineOfHis(f, his, 't', start, size); fprintf(f, "\n"); } void printAllHistograms(FILE *f) /* Print all frequency histograms. */ { fprintf(f, "Frequency Counts of 5' Ends:\n"); printHis(f, hist5, exSize-7, 14); fprintf(f, "Frequency Counts of 3' Ends:\n"); printHis(f, hist3, inSize-7, 14); #ifdef USUAL fprintf(f, "Frequency Counts of 5' Ends of Exons:\n"); printHis(f, hist5, 0, exSize); fprintf(f, "Frequency Counts of Intron Starts:\n"); printHis(f, hist5, exSize, inSize); fprintf(f, "Frequency Counts of Intron Ends:\n"); printHis(f, hist3, 0, inSize); fprintf(f, "Frequency Counts of 3' Ends of Exons:\n"); printHis(f, hist3, inSize, exSize); #endif } int ceOnlyCount; int ceOnlyHomoCount[2*halfSize]; int cbOnlyCount; int cbOnlyHomoCount[2*halfSize]; int bothCount; int bothHomoCount[2*halfSize]; int ceCount; int ceHomoCount[2*halfSize]; void printHomoEnds(FILE *f, char *title, int count, int homoCount[2*halfSize]) /* Print totals of one type of homology. */ { int i; fprintf(f, "%s (%d total)\n", title, count); if (count > 0) { for (i=0; i<2*halfSize; ++i) { if (i == halfSize) fprintf(f, "^ "); fprintf(f, "%2d ", round(100.0 * homoCount[i] / count)); } } fprintf(f, "\n"); } void printHomologousEndStats(FILE *f) /* Print statistics about how 3' and 5' splice sites are to each other. */ { fprintf(f, "Percent of match between 3' and 5' splice sites.\n"); printHomoEnds(f, "C. elegans only", ceOnlyCount, ceOnlyHomoCount); printHomoEnds(f, "C. briggsae only", cbOnlyCount, cbOnlyHomoCount); printHomoEnds(f, "Either unique", bothCount, bothHomoCount); printHomoEnds(f, "C. elegans total", ceCount, ceHomoCount); } void calcCeHomoCount() /* Calculate total homologies between 3' and 5' ends for * all C. elegans introns. */ { char *intronFileName = "C:/biodata/celegans/cDNA/introns.txt"; FILE *f = mustOpen(intronFileName, "r"); int lineCount = 0; static char line[2048]; int wordCount; char *words[16]; char *exonBefore, *exonAfter, *intronStart, *intronEnd; boolean firstTime = TRUE; int ebSize, ieSize; int ebOff, ieOff; int i; while (fgets(line, sizeof(line), f)) { ++lineCount; wordCount = chopLine(line, words); if (wordCount == 0) continue; if (wordCount < 10) errAbort("Short line %d of %s\n", lineCount, intronFileName); exonBefore = words[6]; intronStart = words[7]; intronEnd = words[8]; exonAfter = words[9]; if (firstTime) { ebSize = strlen(exonBefore); ebOff = ebSize - halfSize; ieSize = strlen(intronEnd); ieOff = ieSize - halfSize; firstTime = FALSE; } for (i=0; inext) { totCount += il->count; listCount += 1; if (il->count > maxCount) maxCount = il->count; if (il->count > 1) { ++dupeCount; dupeTotal += il->count; if (il->isQgap) ++dupeQ; else ++dupeT; if (il->onBoth) ++bothCount; } } fprintf(out, "Stats on how many introns unique to species share share %d bases on either end\n", intronEndsSize); fprintf(out, "%d total unique introns\n", totCount); fprintf(out, "%d distinct unique introns\n", listCount); fprintf(out, "%d maximum occurrences of a unique intron\n", maxCount); fprintf(out, "Average unique intron appears %4.2f times\n", (double)totCount/listCount); fprintf(out, "%d dupes total, %d in briggs, %d in elegans\n", dupeCount, dupeT, dupeQ); fprintf(out, "%d introns in dupe sets\n", dupeTotal); fprintf(out, "%d dupes on both\n\n", bothCount); } void writeGap(struct gapInfo *gap, struct xaAli *xa, int symStart, int symEnd, char geneStrand, FILE *f) /* Write out info on one gap to file. */ { char qStart[totSize+1], qEnd[totSize+1]; char tStart[totSize+1], tEnd[totSize+1]; char hStart[totSize+1], hEnd[totSize+1]; int s, e, size; int midSize; int i; char *threePrime, *fivePrime; boolean isQgap; fprintf(f, "%s %s %s hom %s:%d-%d %c %s:%d-%d %c slide %d\n", gapTypeStrings[gap->type], (gap->hasIntronEnds ? " intron" : "!intron"), (gap->hasStrongHomology ? "heavy" : "light"), gap->query, gap->qStart, gap->qEnd, xa->qStrand, gap->target, gap->tStart, gap->tEnd, geneStrand, gap->slideCount); s = symStart-exSize; e = symStart + inSize; if (s < 0) s = 0; size = e-s; uglyf("s %d size %d e %d totSize %d\n", s, size, e, totSize); strncpy(qStart, xa->qSym+s, size); strncpy(tStart, xa->tSym+s, size); strncpy(hStart, xa->hSym+s, size); qStart[size] = tStart[size] = hStart[size] = 0; // uglyf - crashes by here s = symEnd-inSize; midSize = s - e; e = symEnd+exSize; if (e > xa->symCount) e = xa->symCount; size = e-s; strncpy(qEnd, xa->qSym+s, size); strncpy(tEnd, xa->tSym+s, size); strncpy(hEnd, xa->hSym+s, size); qEnd[size] = tEnd[size] = hEnd[size] = 0; if (gap->isRc) { swapBytes(qStart, qEnd, totSize); swapBytes(tStart, tEnd, totSize); swapBytes(hStart, hEnd, totSize); reverseComplement(qStart, totSize); reverseComplement(qEnd, totSize); reverseComplement(tStart, totSize); reverseComplement(tEnd, totSize); reverseBytes(hStart, totSize); reverseBytes(hEnd, totSize); } /* Write out ends of gap to file. */ fprintf(f, "%s ...%d... %s\n", qStart, midSize, qEnd); fprintf(f, "%s ...%d... %s\n", tStart, midSize, tEnd); fprintf(f, "%s ...%d... %s\n\n", hStart, midSize, hEnd); /* Add intron ends to consensus sequence histogram. */ if (gap->hasIntronEnds && gap->type == cCodingGap) { isQgap = (qStart[exSize] == '-'); if (isQgap) { fivePrime = tStart; threePrime = tEnd; } else { fivePrime = qStart; threePrime = qEnd; } if (noInserts(threePrime, totSize) && noInserts(fivePrime, totSize) ) { int *homoCount; for (i=0; ival; il->count += 1; fprintf(f, ">>>%d of set<<<\n", il->count); if (il->isQgap != isQgap) { il->onBoth = TRUE; } } else { AllocVar(il); strcpy(il->ends, idBuf); il->count = 1; il->isQgap = isQgap; slAddHead(&intronList, il); hashAdd(intronHash, idBuf, il); } } } else { static insertCount = 0; warn("Skipping intron with flanking inserts %d", ++insertCount); } } } void slideGap(struct gapInfo *gap, struct xaAli *xa, char gapType, int symStart, int symEnd) /* Rearrange gap position. Make things looking like: cagaggctctcttccaggtaagcttattcaa ...18... atcgatatttttacagccatccttcttgggt Q cagaggctatgttcca--------------- ...18... ---------------gccatccttcttgggt T look like cagaggctctcttccaggtaagcttattcaa ...18... atcgatatttttacagccatccttcttgggt cagaggctatgttccag-------------- ...18... ----------------ccatccttcttgggt instead. */ { int slideCount = gap->slideCount; if (slideCount != 0) { char oc; char *sym = (gapType == 'Q' ? xa->qSym : xa->tSym); char *hSym = xa->hSym; int i; if (slideCount > 0) { for (i=0; i< slideCount; ++i) { int s = symStart + i; int e = symEnd + i; oc = hSym[s]; hSym[s] = hSym[e]; hSym[e] = oc; oc = sym[s]; sym[s] = sym[e]; sym[e] = oc; } } else { for (i=0; i > slideCount; --i) { int s = symStart + i - 1; int e = symEnd + i - 1; oc = hSym[s]; hSym[s] = hSym[e]; hSym[e] = oc; oc = sym[s]; sym[s] = sym[e]; sym[e] = oc; } } } } boolean uniqueGap(struct gapInfo *oldList, struct gapInfo *gap) /* Return TRUE if gap looks to be unique in where it occurs * in query. */ { struct gapInfo *old; int tolerance = 8; for (old = oldList; old != NULL; old = old->next) { if (!sameString(old->query, gap->query)) break; if (intAbs(old->qStart-gap->qStart) <= tolerance && intAbs(old->qEnd-gap->qEnd) <= tolerance) return FALSE; if (sameString(old->target, gap->target) && intAbs(old->tStart-gap->tStart) <= tolerance && intAbs(old->tEnd-gap->tEnd) <= tolerance) return FALSE; } return TRUE; } struct gapInfo *findLargeGaps(struct xaAli *xa, struct gapInfo *oldList) /* Find large gaps in alignment and classify them. */ { struct gdfGene *gdfList; struct gapInfo *gapList = NULL, *gap; int ceIx=0, cbIx=0, symIx=0; int ceStart=0, cbStart=0, symStart=0; int runSize = 0; char sym, lastSym = 0; int symCount = xa->symCount; /* Fetch C. elegans region. */ gdfList = wormGdfGenesInRange(xa->target, xa->tStart, xa->tEnd, &wormSangerGdfCache); /* Run a little state machine that does something at the end of each solid run * of a symbol. */ for (symIx = 0; symIx <= symCount; ++symIx) { sym = xa->hSym[symIx]; if (sym != lastSym) { if (runSize > 32) /* Introns need to be at least this long. */ { /* We're at end of a solid run. */ if (lastSym == 'Q' || lastSym == 'T') { int ceGapStart = xa->tStart + ceStart; int ceGapEnd = xa->tStart + ceIx; struct gdfGene *gdf; char hBefore = xa->hSym[symStart-1]; char hAfter = sym; char strand = '.'; AllocVar(gap); gap->query = cloneString(xa->query); gap->qStart = xa->qStart + cbStart; gap->qEnd = xa->qStart + cbIx; gap->target = cloneString(xa->target); gap->tStart = ceGapStart; gap->tEnd = ceGapEnd; gap->name = cloneString(xa->name); gap->size = runSize; gap->hSym = lastSym; if (uniqueGap(oldList, gap)) { slAddHead(&gapList, gap); classifyGap(gdfList, xa->target, ceGapStart, ceGapEnd, lastSym, &gap->type, &gdf); if (gdf != NULL) strand = gdf->strand; gap->hasIntronEnds = isIntron(xa, symStart, symIx, lastSym, strand, &gap->slideCount, &gap->isRc); if (gap->hasIntronEnds) slideGap(gap, xa, lastSym, symStart, symIx); if (isConserved(hBefore) && isConserved(hAfter)) gap->hasStrongHomology = TRUE; if (gap->hasStrongHomology) { if (lastSym == 'T') writeGap(gap, xa, symStart+gap->slideCount, symIx+gap->slideCount, strand, out); } } } } runSize = 0; ceStart = ceIx; cbStart = cbIx; symStart = symIx; lastSym = sym; } ++runSize; if (xa->qSym[symIx] != '-') ++cbIx; if (xa->tSym[symIx] != '-') ++ceIx; } gdfFreeGeneList(&gdfList); slReverse(&gapList); return gapList; } int countGapType(struct gapInfo *gapList, int type) /* Return number of gaps of given type. */ { struct gapInfo *gap; int count = 0; for (gap = gapList; gap != NULL; gap = gap->next) if (type == gap->type) ++count; return count; } void countGapHomology(struct gapInfo *gapList, int *retWeak, int *retStrong) /* Count up gap homologies. */ { struct gapInfo *gap; int weakCount=0, strongCount=0; for (gap = gapList; gap != NULL; gap = gap->next) { if (gap->hasStrongHomology) ++strongCount; else ++weakCount; } *retWeak = weakCount; *retStrong = strongCount; } int countIntronEnds(struct gapInfo *gapList) /* Count up number of gaps with intron ends. */ { struct gapInfo *gap; int count = 0; for (gap = gapList; gap != NULL; gap = gap->next) if (gap->hasIntronEnds && gap->hasStrongHomology) ++count; return count; } int countHsym(struct gapInfo *gapList, char hSym) /* Count up number of gaps that have given hSym. */ { struct gapInfo *gap; int count = 0; for (gap = gapList; gap != NULL; gap = gap->next) if (hSym == gap->hSym) ++count; return count; } int countStrongCoding(struct gapInfo *gapList) /* Count up number of gaps in coding regions with strong homology */ { struct gapInfo *gap; int count = 0; for (gap = gapList; gap != NULL; gap = gap->next) if (gap->type == cCodingGap && gap->hasStrongHomology) ++count; return count; } int countSpecific(struct gapInfo *gapList, boolean hasStrongHomology, boolean hasIntronEnds, char hSym, int type) /* Count up number of gaps that match homology, intron ends, etc. */ { struct gapInfo *gap; int count = 0; for (gap = gapList; gap != NULL; gap = gap->next) { if (gap->hasStrongHomology == hasStrongHomology && gap->hasIntronEnds == hasIntronEnds && gap->hSym == hSym && gap->type == type) ++count; } return count; } void reportGaps(struct gapInfo *gapList, FILE *f) /* Write out info on gaps to file, and summary info to * console as well. */ { int gapCount; int qGaps, tGaps; int weakGaps, strongGaps; int intronGaps; int typeGaps[5]; int i; gapCount = slCount(gapList); qGaps = countHsym(gapList, 'Q'); tGaps = gapCount - qGaps; countGapHomology(gapList, &weakGaps, &strongGaps); intronGaps = countIntronEnds(gapList); for (i=0; iquery; xa->query = xa->target; xa->target = swaps; swapi = xa->qStart; xa->qStart = xa->tStart; xa->tStart = swapi; swapi = xa->qEnd; xa->qEnd = xa->tEnd; xa->tEnd = swapi; swapc = xa->qStrand; xa->qStrand = xa->tStrand; xa->tStrand = swapc; swaps = xa->qSym; xa->qSym = xa->tSym; xa->tSym = swaps; swapSym(xa->hSym, xa->symCount); } uglyf("%d query %s target %s\n", count, xa->query, xa->target); s = chromFromPath(xa->target); freeMem(xa->target); xa->target = s; if (++count % 500 == 0) printf("Processing %d\n", count); gaps = findLargeGaps(xa, gapList); gapList = slCat(gaps, gapList); xaAliFree(xa); } slReverse(&gapList); report(out, "Processing took %f seconds\n", (clock1000()-startTime)*0.001); reportGaps(gapList, out); printAllHistograms(out); calcCeHomoCount(); printHomologousEndStats(out); printSameIntronStats(out); return 0; }