/* ck3and5 - Checks xxxx.3 and xxxx.5 EST pairs to make sure they align near the same * piece of DNA. */ #include "common.h" #include "hash.h" #include "snof.h" #include "cda.h" #include "wormdna.h" struct ck3and5 { struct ck3and5 *next; char *three; char *five; }; FILE *aliFile; struct snof *aliSnof; void openAli() /* Open up alignment file and index for it. */ { char aliName[512]; sprintf(aliName, "%sgood", wormCdnaDir()); aliSnof = snofOpen(aliName); aliFile = wormOpenGoodAli(); } struct cdaAli *getAli(char *name) /* Get alignment of named cDNA. */ { long offset; struct cdaAli *ali; if (!snofFindOffset(aliSnof, name, &offset)) return NULL; fseek(aliFile, offset, SEEK_SET); ali = cdaLoadOne(aliFile); if (ali == NULL) errAbort("Couldn't load %s in ali file", name); return ali; } int oneAlignCount = 0; int farAlignCount = 0; int sameStrandCount = 0; int unrel3and5Count = 0; boolean pairMatch(char *three, char *five, FILE *out) /* Return FALSE if pair doesn't align in plausible place with respect to each other. */ { struct cdaAli *ali3, *ali5; ali3 = getAli(three); ali5 = getAli(five); if (ali3 == NULL && ali5 == NULL) return TRUE; /* Neither one aligned, that's ok. */ if (ali3 == NULL) { fprintf(out, "%s aligned, but not %s\n", five, three); ++oneAlignCount; return FALSE; } if (ali5 == NULL) { fprintf(out, "%s aligned but not %s\n", three, five); ++oneAlignCount; return FALSE; } if (ali3->chromIx != ali5->chromIx) { ++farAlignCount; fprintf(out, "%s is on chromosome %s but %s is on chromosome %s\n", three, wormChromForIx(ali3->chromIx), five, wormChromForIx(ali5->chromIx)); return FALSE; } if (ali3->chromEnd < ali5->chromStart - 100000 || ali3->chromStart > ali5->chromEnd + 100000) { ++farAlignCount; fprintf(out, "%s aligns to %s:%d-%d %s aligns to %s:%d-%d\n", three, wormChromForIx(ali3->chromIx), ali3->chromStart, ali3->chromEnd, five, wormChromForIx(ali5->chromIx), ali5->chromStart, ali5->chromEnd); return FALSE; } if (ali3->strand == ali5->strand) { ++sameStrandCount; fprintf(out, "%s and %s align on same strand\n", three, five); return FALSE; } if (ali5->strand == '+') { if (ali3->chromEnd < ali5->chromStart) { unrel3and5Count++; return FALSE; } } else if (ali5->strand == '-') { if (ali5->chromEnd < ali3->chromStart) { unrel3and5Count++; return FALSE; } } else errAbort("Strand should be + or -, not %c", ali5->strand); return TRUE; } struct hash *buildMultiAlignHash() /* Return a hash table filled with names of cDNAs that align more * than once. */ { struct cdaAli *cda; struct hash *dupeHash; char lastName[128]; char *name; int dupeCount = 0; lastName[0] = 0; dupeHash = newHash(12); while (cda = cdaLoadOne(aliFile)) { name = cda->name; if (sameString(name, lastName)) { if (!hashLookup(dupeHash, name)) { hashAdd(dupeHash, name, NULL); ++dupeCount; } } strcpy(lastName, name); cdaFreeAli(cda); } printf("Found %d multiple alignments\n", dupeCount); return dupeHash; } int main(int argc, char *argv[]) { char *outName; FILE *out; char faName[512]; FILE *fa; char line[1024]; int lineCount=0; char *strand; char *cdnaName; char *lastDot; struct hash *dupeHash; struct hash *hash = newHash(17); struct hashEl *hel; char nameBuf[128]; struct ck3and5 *ckList = NULL, *ck; char nameEnd; int estCount = 0; int matchingPairCount = 0; int badPairCount = 0; if (argc != 2) { errAbort("ck3and5 - checks that 3' and 5' ESTs from the same clone align near each other.\n" "usage:\n" " ck3and5 output.txt"); } outName = argv[1]; out = mustOpen(outName, "w"); /* Scan through cdna file building up list of matching 3'/5'EST's */ sprintf(faName, "%s%s", wormCdnaDir(), "allcdna.fa"); fa = mustOpen(faName, "r"); while (fgets(line, sizeof(line), fa)) { ++lineCount; if (lineCount % 100000 == 0) printf("Line %d of %s\n", lineCount, faName); if (line[0] == '>') { strand = strchr(line, ' '); if (strand == NULL) errAbort("Expecting lots of info, didn't get much line %d of %s:\n%s", lineCount, faName, line); cdnaName = line+1; *strand++ = 0; strncpy(nameBuf, cdnaName, sizeof(nameBuf)); lastDot = strrchr(nameBuf, '.'); if (lastDot != NULL) { ++estCount; *lastDot = 0; nameEnd = lastDot[1]; if ((hel = hashLookup(hash, nameBuf)) != NULL) ck = hel->val; else { AllocVar(ck); slAddHead(&ckList, ck); hashAdd(hash, nameBuf, ck); } if (nameEnd == '3') ck->three = cloneString(cdnaName); else if (nameEnd == '5') ck->five = cloneString(cdnaName); else errAbort("Expecting 3 or 5 line %d of %s, got %c", lineCount, faName, nameEnd); } } } fclose(fa); /* Get duplicate alignment hash. */ openAli(); dupeHash = buildMultiAlignHash(); /* Scan through list looking for ones that have both 3' and 5' versions. * Look these up and make sure they go to more or less the same place. */ for (ck = ckList; ck != NULL; ck = ck->next) { if (ck->three && ck->five && !hashLookup(dupeHash, ck->three) && !hashLookup(dupeHash, ck->five)) { ++matchingPairCount; if (matchingPairCount % 1000 == 0) printf("Matching pair %d\n", matchingPairCount); badPairCount += !pairMatch(ck->three, ck->five, out); } } fprintf(out, "%d bad pairs out of %d matching pairs in %d ESTs\n", badPairCount, matchingPairCount, estCount); printf("%d bad pairs out of %d matching pairs in %d ESTs\n", badPairCount, matchingPairCount, estCount); printf("See %s for details on bad pairs\n", outName); printf("Summary of bad pairs:\n" " %d where only one aligned\n" " %d where aligned far from each other\n" " %d where aligned close but on same strand\n" " %d where 3' and 5' reversed\n", oneAlignCount, farAlignCount, sameStrandCount, unrel3and5Count ); fprintf(out, "Summary of bad pairs:\n" " %d where only one aligned\n" " %d where aligned far from each other\n" " %d where aligned close but on same strand\n" " %d where 3' and 5' reversed\n", oneAlignCount, farAlignCount, sameStrandCount, unrel3and5Count ); return 0; }