/* geniegff - make up a genigene.gdf file from genie/N.gff */
#include "common.h"
#include "hash.h"
#include "sig.h"

/* Input GFF files, one per chromosome, opened relative to the current
 * directory. */
static char *inNames[] = {
    "I.gff",
    "II.gff",
    "III.gff",
    "IV.gff",
    "V.gff",
    "X.gff",
};

/* Chromosome names written to the c2g file; writeGene indexes this array
 * with gene->chromIx. */
static char *chromNames[] = {
    "i",
    "ii",
    "iii",
    "iv",
    "v",
    "x",
};

/* One coding segment of a gene, with coordinates as read from the GFF file. */
struct exon
    {
    struct exon *next;  /* Next exon in the singly linked list. */
    int start, end;     /* Columns 4 and 5 of the GFF line. */
    };

/* A predicted gene: a named list of exons on one chromosome. */
struct gene
    {
    struct gene *next;   /* Next gene in the singly linked list. */
    char *name;          /* Gene name (column 9 of the GFF line). */
    struct exon *exons;  /* Exons in the order they appear in the file. */
    int start, end;      /* Smallest exon start / largest exon end. */
    UBYTE chromIx;       /* Index into chromNames. */
    char strand;         /* First character of column 7 of the GFF line. */
    };

/* Sort comparator for an array of (struct gene *): orders by start, and
 * breaks ties on end.
 *
 * The tie-break used to test the pointer `a` against 0 instead of `dif`,
 * so it could never run and genes sharing a start were left in arbitrary
 * relative order. */
int cmpGenes(const void *va, const void *vb)
{
    const struct gene *a = *((struct gene *const *)va);
    const struct gene *b = *((struct gene *const *)vb);
    int dif = a->start - b->start;

    if (dif == 0)
        dif = a->end - b->end;
    return dif;
}

/* Write a length-prefixed string to f: a UBYTE count followed by the
 * characters of s (no terminating NUL).
 *
 * Aborts if the length of s cannot be represented in a UBYTE; previously
 * the length was silently truncated, which produced a wrong length byte
 * and a cut-off payload. */
void writeShortString(FILE *f, char *s)
{
    size_t len = strlen(s);
    UBYTE count = (UBYTE)len;

    if ((size_t)count != len)
        errAbort("String too long to write as short string: %s\n", s);
    writeOne(f, count);
    mustWrite(f, s, count);
}

/* Emit one gene to both output files.
 *
 * c2g gets a text line "chrom:start-end strand name", with start written
 * as gene->start - 1.  gl gets a record made of: the gene name as a short
 * string, chromIx, strand, a short holding twice the exon count, and then
 * for each exon its (start - 1) and end as ints.
 * NOTE(review): the start - 1 looks like a 1-based to 0-based conversion;
 * confirm against the consumer of these files. */
void writeGene(struct gene *gene, FILE *c2g, FILE *gl)
{
    short pointCount;
    struct exon *exon;

    fprintf(c2g, "%s:%d-%d %c %s\n", chromNames[gene->chromIx],
            gene->start-1, gene->end, gene->strand, gene->name);

    writeShortString(gl, gene->name);
    writeOne(gl, gene->chromIx);
    writeOne(gl, gene->strand);
    pointCount = slCount(gene->exons) * 2;
    writeOne(gl, pointCount);
    for (exon = gene->exons; exon != NULL; exon = exon->next)
        {
        int start = exon->start - 1;
        writeOne(gl, start);
        writeOne(gl, exon->end);
        }
}

/* Read one chromosome's GFF file and append its genes to c2g and gl.
 *
 * Every non-blank line must have at least 9 words and type "CDS" in
 * column 3.  Consecutive lines that share the name in column 9 become the
 * exons of one gene; a name that reappears after a different gene is an
 * error.  Genes are written sorted by cmpGenes.
 *
 * The gene extents are now computed BEFORE sorting.  Previously slSort ran
 * first, so cmpGenes compared start/end fields that had not been filled in
 * yet and the output was not ordered by position. */
void procOne(char *inName, UBYTE chromIx, FILE *c2g, FILE *gl)
{
    FILE *in = mustOpen(inName, "r");
    struct gene *geneList = NULL, *g = NULL;
    struct exon *exon;
    char line[1024];
    int lineCount = 0;
    char *words[256];
    int wordCount;
    char *type;
    char *geneName;
    char *lastName = "";
    struct hash *hash = newHash(12);

    printf("Processing %s\n", inName);
    while (fgets(line, sizeof(line), in))
        {
        ++lineCount;
        wordCount = chopLine(line, words);
        if (wordCount > 0)
            {
            if (wordCount < 9)
                errAbort("Short line %d of %s\n", lineCount, inName);
            type = words[2];
            if (differentString(type, "CDS"))
                {
                errAbort("Expecting CDS got %s in type field line %d of %s\n",
                         type, lineCount, inName);
                }
            geneName = words[8];
            /* A change of name starts a new gene. */
            if (differentString(lastName, geneName) )
                {
                if (hashLookup(hash, geneName))
                    errAbort("Repeating %s\n", geneName);
                hashAdd(hash, geneName, NULL);
                AllocVar(g);
                g->name = lastName = cloneString(geneName);
                g->strand = words[6][0];
                g->chromIx = chromIx;
                g->exons = NULL;
                slAddHead(&geneList, g);
                }
            AllocVar(exon);
            exon->start = atoi(words[3]);
            exon->end = atoi(words[4]);
            slAddTail(&g->exons, exon);
            }
        }

    /* slAddHead built the list back to front; restore file order. */
    slReverse(&geneList);

    /* Fill in each gene's extent from its exons so the sort below has real
     * keys to compare. */
    for (g=geneList; g != NULL; g=g->next)
        {
        int min = 0x7fffffff;
        int max = -min;
        for (exon = g->exons; exon != NULL; exon = exon->next)
            {
            if (min > exon->start)
                min = exon->start;
            if (max < exon->end)
                max = exon->end;
            }
        g->start = min;
        g->end = max;
        }

    slSort(&geneList, cmpGenes);

    for (g = geneList; g != NULL; g=g->next)
        writeGene(g, c2g, gl);
    fclose(in);
}

/* Entry point: expects the gdf output name and the c2g output name, opens
 * both for writing and starts the gdf file with the glSig signature. */
int main(int argc, char *argv[])
{
    char *gdfName;
    char *c2gName;
    FILE *gdfFile;
    FILE *c2gFile;
    int i;
    bits32 sig = glSig;

    if (argc != 3)
        {
        errAbort("geniegff - makes up a gdf file from Genie gene predictions\n"
                 "usage:\n"
                 " geniegff genigene.gdf c2gFile\n"
                 "This must be run in the same directory as I.gff, II.gff, etc.\n"
                 "generated by Genie\n");
        }
    gdfName = argv[1];
    gdfFile = mustOpen(gdfName, "wb");
    c2gName = argv[2];
    c2gFile = mustOpen(c2gName, "w");
    writeOne(gdfFile, sig);
    for (i=0; i