/* alt3data - extracts alt3 splicing data from intron database. */ #include "common.h" #include "wormdna.h" struct feature { struct feature *next; char *chrom; int start, end; char strand; char *orfName; DNA *dna; int alt3size; int cdnaCount; int shortCdnaCount; }; int cmpFeatures(const void *va, const void *vb) /* Compare two introns. */ { struct feature **pA = (struct feature **)va; struct feature **pB = (struct feature **)vb; struct feature *a = *pA, *b = *pB; int diff; if ((diff = strcmp(a->chrom, b->chrom)) != 0) return diff; if ((diff = a->start - b->start) != 0) return diff; return a->end - b->end; } int cmpAlts(const void *va, const void *vb) /* Compare two introns. */ { struct feature **pA = (struct feature **)va; struct feature **pB = (struct feature **)vb; struct feature *a = *pA, *b = *pB; return a->alt3size - b->alt3size; } boolean sameFeatures(struct feature *a, struct feature *b) /* Returns true if two features cover the same area. */ { if (a == b) return TRUE; if (a == NULL || b == NULL) return FALSE; return cmpFeatures(&a, &b) == 0; } struct feature *skipIrrelevant(struct feature *listSeg, struct feature *feat, int extraSpace) /* Skip parts of listSeg that couldn't matter to feat. Assumes * listSeg is sorted on start. Returns first possibly relevant * item of listSeg. */ { struct feature *seg = listSeg; char *chrom = feat->chrom; int start = feat->start - extraSpace; int skipCount = 0; /* skip past wrong chromosome. */ while (seg != NULL && strcmp(seg->chrom, chrom) < 0) { seg = seg->next; ++skipCount; } /* Skip past stuff that we've passed on this chromosome. */ while (seg != NULL && seg->chrom == chrom && seg->end < start) { seg = seg->next; ++skipCount; } return seg; } struct feature *sortedSearchOverlap(struct feature *listSeg, struct feature *feature, int extra) /* Assuming that listSeg is sorted on start, search through list * until find something that overlaps with feature, or have gone * past where feature could be. Returns overlapping feature or NULL. */ { struct feature *seg = listSeg; char *chrom = feature->chrom; int start = feature->start; int end = feature->end; while (seg != NULL && seg->chrom == chrom && seg->start <= end + extra) { if (!(seg->start - extra >= end || seg->end + extra <= start)) return seg; seg = seg->next; } return NULL; } struct feature *readGff(char *gffName) /* Read a GFF file with ORF-names on features into memory. */ { char line[1024]; int lineCount; char *words[128]; int wordCount; struct feature *list = NULL, *el; FILE *f = mustOpen(gffName, "r"); while (fgets(line, sizeof(line), f)) { ++lineCount; if (line[0] == '#') continue; wordCount = chopLine(line, words); if (wordCount == 0) break; if (wordCount < 9) errAbort("Need at least 9 fields in line %d of %s", lineCount, gffName); AllocVar(el); el->chrom = wormOfficialChromName(words[0]); el->start = atoi(words[3]); el->end = atoi(words[4]); el->cdnaCount = atoi(words[5]); el->strand = words[6][0]; el->orfName = cloneString(words[8]); slAddHead(&list, el); } fclose(f); slReverse(&list); return list; } void fetchDna(struct feature *el) /* Fill in the DNA from list - reverse complementing it if on minus strand. */ { int size; if (el->dna == NULL) { size = el->end - el->start; el->dna = wormChromPart(el->chrom, el->start, size); if (el->strand == '-') reverseComplement(el->dna, size); } } boolean verifyAg(struct feature *feat) /* Return true if feature's DNA ends in AG. */ { int size = feat->end - feat->start; DNA *dna; fetchDna(feat); dna = feat->dna; return (dna[size-2] == 'a' && dna[size-1] == 'g'); } struct feature *scanForAlt3() /* Scan intron database for introns that are identical on 5' side but * differ on 3' side. */ { char intronGffName[512]; struct feature *intronList, *intron; struct feature *seg, *segList; struct feature *altList = NULL; char intronStrand; sprintf(intronGffName, "%s%s", wormCdnaDir(), "introns.gff"); intronList = readGff(intronGffName); segList = intronList; for (intron = intronList; intron != NULL; intron = intron->next) { intronStrand = intron->strand; seg = segList = skipIrrelevant(segList, intron, 0); while ((seg = sortedSearchOverlap(seg, intron, 0)) != NULL) { if (intronStrand != '.' && seg->strand == intronStrand) { struct feature *bigger, *smaller; boolean gotAlt = FALSE; if (intron->strand == '+') { if (intron->start == seg->start && intron->end != seg->end) { gotAlt = TRUE; if (intron->end > seg->end) { bigger = intron; smaller = seg; } else { bigger = seg; smaller = intron; } } } else { if (intron->end == seg->end && intron->start != seg->start) { gotAlt = TRUE; if (intron->start < seg->start) { bigger = intron; smaller = seg; } else { bigger = seg; smaller = intron; } } } if (gotAlt) { struct feature *alt; int alt3size = (bigger->end - bigger->start) - (smaller->end - smaller->start); if (alt3size <= 18) { if (!sameFeatures(bigger, altList)) { if (verifyAg(bigger) && verifyAg(smaller)) { AllocVar(alt); *alt = *bigger; alt->alt3size = alt3size; alt->shortCdnaCount = smaller->cdnaCount; slAddHead(&altList, alt); } } } } } seg = seg->next; } } slReverse(&altList); return altList; } void writeDna(struct feature *list, char *fileName) /* Write out an FA file from list. */ { int shortestSize = 0x7fffffff; int dnaSize; int dnaOff; struct feature *el; FILE *f; int goodCount = 0; DNA *dna; for (el = list; el != NULL; el = el->next) { dnaSize = el->end - el->start; if (shortestSize > dnaSize) shortestSize = dnaSize; } uglyf("Shortest size is %d\n", shortestSize); //shortestSize -= 4; /* Avoid including intron start. */ f = mustOpen(fileName, "w"); for (el = list; el != NULL; el = el->next) { dnaSize = el->end - el->start; dna = el->dna; ++goodCount; dnaOff = dnaSize - shortestSize; fprintf(f, ">%s %s:%d-%d %c alt 3' by %d\n%s\n", el->orfName, el->chrom, el->start, el->end, el->strand, el->alt3size, el->dna + dnaOff); } uglyf("Wrote %d good looking alt-spliced sites\n", goodCount); fclose(f); } int main(int argc, char *argv[]) /* Check command line arguments, make up list of 3' alt spliced things, * use it to make an FA file. */ { char *outName; struct feature *list = NULL; if (argc != 2) { errAbort("alt3data - writes an fa file full of sequence data from alt-3 spliced genes\n" "usage:\n" " alt3data outfile.fa"); } outName = argv[1]; list = scanForAlt3(); slSort(&list, cmpAlts); uglyf("Found %d alt 3' splice sites\n", slCount(list)); writeDna(list, outName); return 0; }