/* strainLumps - strain out the most popular lumps from a file created * by lumpRep, to make a set of repeats to screen against. */ #include "common.h" #include "dnautil.h" struct lump /* Holds a repeating lump. */ { struct lump *next; char *seq; int count; }; int cmpCount(const void *va, const void *vb) /* Compare based on count. */ { const struct lump *a = *((struct lump **)va); const struct lump *b = *((struct lump **)vb); return b->count - a->count; } struct lump *readLumps(char *fileName) /* Read in lumps from file. */ { struct lump *lumpList = NULL, *lump = NULL; char line[1024]; int lineCount; char *words[3]; int wordCount; boolean isIndented; FILE *f = mustOpen(fileName, "r"); while (fgets(line, sizeof(line), f)) { ++lineCount; isIndented = isspace(line[0]); wordCount = chopLine(line, words); if (wordCount == 0) continue; /* Allow blank lines. */ if (isIndented) { if (wordCount != 2 || !isdigit(words[0][0])) errAbort("Bad line %d of %s\n", lineCount, fileName); lump->count += atoi(words[0]); } else { AllocVar(lump); lump->seq = cloneString(words[0]); slAddHead(&lumpList, lump); } } fclose(f); slReverse(&lumpList); return lumpList; } void saveBigLumps(struct lump *lumpList, int minSize, char *fileName) /* Save lumps of minSize or greater to fileName in fa format. */ { struct lump *lump; FILE *f = mustOpen(fileName, "w"); int count = 0; for (lump = lumpList; lump != NULL; lump = lump->next) { if (lump->count >= minSize) { ++count; fprintf(f, ">ce_repeat_%d x%d\n", count, lump->count); fprintf(f, "%s\n", lump->seq); } } fclose(f); } int main(int argc, char *argv[]) { char *inName = "../lumpRep/repeats.out"; char *outName = "repeats.fa"; struct lump *lumpList; dnaUtilOpen(); lumpList = readLumps(inName); slSort(&lumpList, cmpCount); saveBigLumps(lumpList, 6, outName); return 0; }