/* oldGff - module for reading GFFs. This is largely if not * entirely superceded by the gff module. * * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "dnautil.h" #include "oldGff.h" #include "dnaseq.h" #include "htmshell.h" #include "portable.h" #include "localmem.h" #define errfile stdout static char _gffIdent[] = "##gff-version"; struct gffSegLine { char seqname[64]; /* Name of DNA sequence this refers to. */ char source[64]; /* Who put this segment here... */ char feature[64]; /* CDS, E, I, exon, intron, ??? */ long start, end; /* Offsets into DNA array, end inclusive */ char score[62]; /* A number between 0 and 1 */ char strand[4]; /* + or - */ char frame[4]; /* 0, 1, 2, or . */ char group[128]; /* Name of gene cosmid.number. */ }; static int gffSegLineScan(struct gff* gff, struct gffSegLine *seg) { int scanned = sscanf(gff->buf, "%s %s %s %ld %ld %s %1s %s %s", seg->seqname, seg->source, seg->feature, &seg->start, &seg->end, seg->score, seg->strand, seg->frame, seg->group); return scanned; } static boolean _gffGetLine(struct gff *gff) /* Get the next line into a gff file. (private) * return FALSE at EOF or if problem. */ { char *s; s = fgets(gff->buf, gff->bufSize, gff->file); if (s == NULL) { return FALSE; } gff->bytesInBuf = strlen(gff->buf); gff->readIx = 0; gff->lineNumber += 1; return TRUE; } static boolean _gffSeekDoubleSharpLine(struct gff *gff) /* Go find next line that begins with ## */ { for (;;) { if (!_gffGetLine(gff)) return FALSE; if (gff->bytesInBuf >= 2) if (gff->buf[0] == '#' && gff->buf[1] == '#') return TRUE; } } boolean gffOpen(struct gff *gff, char *fileName) /* Initialize gff structure and open file for it. */ { dnaUtilOpen(); /* Initialize structure and open file. */ zeroBytes(gff, sizeof(*gff)); gff->memPool = lmInit(16*1024); gff->fileSize = fileSize(fileName); if (gff->fileSize < 0 || (gff->file = fopen(fileName, "rb")) == NULL) { warn("Couldn't find the file named %s\n", fileName); return FALSE; } strcpy(gff->fileName, fileName); gff->bufSize = ArraySize(gff->buf); /* Make sure it's a gff file. */ _gffSeekDoubleSharpLine(gff); if (strncmp(gff->buf, _gffIdent, strlen(_gffIdent)) != 0) { warn("%s doesn't appear to be a .gff file\n", fileName); return FALSE; } return TRUE; } void gffClose(struct gff *gff) /* Close down gff structure. */ { if (gff->file != NULL) fclose(gff->file); freeMem(gff->dna); lmCleanup(&gff->memPool); zeroBytes(gff, sizeof(*gff)); } #if 0 /* unused */ static boolean _gffAtEof(struct gff *gff) /* Returns TRUE if at the end of gff file. */ { return gff->file == NULL; } #endif #if 0 /* unused */ static char _getGffChar(struct gff *gff) /* Return next byte (not next base) in gff file. Return zero * if at end of file. */ { if (gff->readIx >= gff->bytesInBuf) { if (!_gffGetLine(gff)) return 0; } return gff->buf[gff->readIx++]; } #endif static boolean _gffSeekDna(struct gff *gff) /* Skip through file until you get the DNA */ { static char dnaIdent[] = "##DNA"; rewind(gff->file); for (;;) { if (!_gffGetLine(gff)) return FALSE; if (strncmp(gff->buf, dnaIdent, strlen(dnaIdent)) == 0) { sscanf(gff->buf, "##DNA %s", gff->dnaName); gff->bytesInBuf = 0; /* We're done with gff line. */ return TRUE; } } } static boolean gffNextDnaLine(struct gff *gff) /* Fetches next line of DNA. */ { static char endIdent[] = "##end-DNA"; if (!_gffSeekDoubleSharpLine(gff)) return FALSE; /* Check to see if have reached end of DNA sequence */ if (strncmp(gff->buf, endIdent, strlen(endIdent))==0) { gff->bytesInBuf = 0; /* We're done with gff line. */ return FALSE; } return TRUE; } boolean gffReadDna(struct gff *gff) /* Read all the DNA in a file. */ { long dnaSize = 0; DNA *dna; DNA *line; int lineCount; DNA b; if (gff->dna != NULL) return TRUE; /* We already read it. */ if (!_gffSeekDna(gff)) return FALSE; if ((gff->dna = wantMem(gff->fileSize)) == NULL) { warn("Couldn't allocate %ld bytes for DNA\n", gff->fileSize); return FALSE; } dna = gff->dna; for (;;) { if (!gffNextDnaLine(gff)) break; line = gff->buf + gff->readIx; lineCount = gff->bytesInBuf-gff->readIx; while (--lineCount >= 0) { b = *line++; if ((b = ntChars[(int)b]) != 0) { *dna++ = b; dnaSize += 1; } } } gff->dnaSize = dnaSize; return TRUE; } struct gffGene *gffFindGene(struct gff *gff, char *geneName) /* Find gene with given name. Case sensitive. */ { struct gffGene *g; for (g=gff->genes; g!=NULL; g=g->next) { if (strcmp(geneName, g->name) == 0) return g; } return NULL; } struct gffGene *gffFindGeneIgnoreCase(struct gff *gff, char *geneName) /* Find gene with given name. Not case sensitive. */ { struct gffGene *g; for (g=gff->genes; g!=NULL; g=g->next) { if (differentWord(geneName, g->name) == 0) return g; } return NULL; } /* Allocate memory and clear it to zero. Report error * and kill program if can't allocate it. */ static void *gffNeedMem(struct gff *gff, int size) { return lmAlloc(gff->memPool, size); } static void gffSegmentInsertSort(struct gffSegment **plist, struct gffSegment *seg) /* Insert segment on list, keeping list ordered by start field parameters: gffSegment **plist; Pointer to list. gffSegment *seg; Segment to insert */ { struct gffSegment *next; long segStart = seg->start; for (;;) { next = *plist; if (next == NULL) break; if (next->start > segStart) break; plist = &(next->next); } seg->next = next; *plist = seg; } static void offsetsFromExons(struct gffGene *gene) /* Figure out start and end offsets of gene from it's exons. */ { GffExon *exon; long end = 0; long start = 0x7fffffff; /* I should use a .h file constant here... */ for (exon = gene->exons; exon != NULL; exon = exon->next) { if (exon->start < start) start = exon->start; if (exon->end > end) end = exon->end; } gene->start = start; gene->end = end; } void gffPrintInfo(struct gff *gff, FILE *out) /* Print summary info about file. */ { struct gffGene *gene; fprintf(out, "\n%s\n", gff->fileName); fprintf(out, "DNA %s (%ld bases)\n", gff->dnaName, gff->dnaSize); fprintf(out, "%d genes\n", slCount(gff->genes)); for (gene = gff->genes; gene != NULL; gene = gene->next) { fprintf(out, "gene %s has %ld bases, %d exons, %d introns\n", gene->name, gene->end - gene->start + 1, slCount(gene->exons), slCount(gene->introns)); } } static boolean checkWordCount(struct gff *gff, int wordCount) { if (wordCount >= 9) return TRUE; else { warn("???%s???\n", gff->buf); warn("Can't handle line %d of %s.\n", gff->lineNumber, gff->fileName); return FALSE; } } boolean gffReadGenes(struct gff *gff) /* Read all the gene (as opposed to base) info in file. */ { int wordCount; struct gffSegLine seg; char curGroup[128]; struct gffGene *gene = NULL; GffIntron *intron = NULL; GffExon *exon = NULL; boolean warnedUnknown = FALSE; boolean isNewGene; curGroup[0] = 0; /* Start off with no group */ /* Line scanning loop. */ for (;;) { /* Get next line and parse it into segLine data structure. */ if (!_gffGetLine(gff)) break; /* End of file. */ if (gff->buf[0] == '#') continue; /* Ignore sharp containing lines. */ wordCount = gffSegLineScan(gff, &seg); if (wordCount < 9) continue; /* Ignore blank lines and short ones. */ /* Make sure that start is less than or equal end. */ if (seg.start > seg.end) { warn("start greater than end line %d of %s.\n", gff->lineNumber, gff->fileName); return FALSE; } /* Get the gene we're working on. First see if * it's the same as last time around. */ isNewGene = FALSE; if (strcmp(seg.group, curGroup) != 0) { strcpy(curGroup, seg.group); if ((gene = gffFindGene(gff, seg.group)) == NULL) { /* It's a new gene! */ if (!checkWordCount(gff, wordCount)) return FALSE; isNewGene = TRUE; gene = gffNeedMem(gff, sizeof(*gene)); strcpy(gene->name, seg.group); slAddTail(&gff->genes, gene); gene->strand = seg.strand[0]; gene->frame = atoi(seg.frame); if (differentWord(seg.feature, "CDS") == 0) { gene->start = seg.start-1; gene->end = seg.end-1; } } } /* Look at what sort of feature it is, and decide what to do. */ if (differentWord(seg.feature, "CDS")==0) { /* CDS (coding segments) have been processed already * for the most part. Here just make sure they aren't * duplicated. */ if (!checkWordCount(gff, wordCount)) return FALSE; if (!isNewGene) { if (gene->start != 0 || gene->end != 0) { warn("Warning duplicate CDS for %s\n", seg.group); warn("Line %d of %s\n", gff->lineNumber, gff->fileName); } } } else if (differentWord(seg.feature, "SE") == 0 || differentWord(seg.feature, "IE") == 0 || differentWord(seg.feature, "FE") == 0 || differentWord(seg.feature, "E") == 0 || differentWord(seg.feature, "exon") == 0) { /* It's some sort of exon. We'll deal with the complications * of it being possibly on the minus strand later, so can * tread initial, final, single, and regular exons the same * here. */ if (!checkWordCount(gff, wordCount)) return FALSE; exon = gffNeedMem(gff, sizeof(*exon)); exon->start = seg.start-1; exon->end = seg.end-1; exon->frame = atoi(seg.frame); gffSegmentInsertSort(&gene->exons, exon); } else if (differentWord(seg.feature, "I") == 0 || differentWord(seg.feature, "intron") == 0) { /* It's an intron. */ if (!checkWordCount(gff, wordCount)) return FALSE; intron = gffNeedMem(gff, sizeof(*intron)); intron->start = seg.start-1; intron->end = seg.end-1; intron->frame = atoi(seg.frame); gffSegmentInsertSort(&gene->introns, intron); } else if (strcmp(seg.feature, "IG") == 0) { /* I don't know what it is, but we can ignore it. */ } else { if (!warnedUnknown) { warn("Unknown feature %s line %d of %s, ignoring\n", seg.feature, gff->lineNumber, gff->fileName); warnedUnknown = TRUE; } } } /* Fix up gene length from exons if needed. */ for (gene = gff->genes; gene != NULL; gene = gene->next) { if (gene->start >= gene->end) { offsetsFromExons(gene); } } return TRUE; } static boolean geneDna(struct gff *gff, struct gffGene *gene, int leftExtra, int rightExtra, char **retDna, long *retDnaSize, int *retStartOffset) /* Allocate an array and fill it with dna from a gene. */ { char *dna; char *pt; long geneSize; long i; long seqStart, seqEnd, seqSize; /* Filter out unreasonable looking genes - input to this * program isn't totally clean. */ geneSize = gene->end - gene->start + 1; if (geneSize <= 0 || geneSize >= 1000000) return FALSE; /* Figure out extents of DNA we're going to return. * Return extra they ask for if possible, but clip * it to what is actually in GFF file. */ seqStart = gene->start - leftExtra; seqEnd = gene->end + rightExtra + 1; if (seqStart < 0) seqStart = 0; if (seqEnd > gff->dnaSize) seqEnd = gff->dnaSize; seqSize = seqEnd - seqStart; /* Allocate memory and fetch the dna. */ dna = needMem(seqSize+1); pt = dna; for (i=0; idna[seqStart+i]; *pt = 0; /* Report results back to caller. */ *retDna = dna; *retDnaSize = seqSize; *retStartOffset = (gene->start - seqStart); return TRUE; } static void fixDirectionAndOffsets(struct gffGene *gene, char *dna, long dnaSize, int newStart) /* Reverse complement DNA if gene is on negative strand. * Update offsets of exons and introns in gene to * make them point into dna, rather than into gff->dna. */ { long oldStart; long offset; GffIntron *intron; GffExon *exon; long temp; oldStart = gene->start; offset = oldStart - newStart; gene->start -= offset; gene->end -= offset; for (intron = gene->introns; intron != NULL; intron = intron->next) { intron->start -= offset; intron->end -= offset; } for (exon = gene->exons; exon != NULL; exon = exon->next) { exon->start -= offset; exon->end -= offset; } if (gene->strand == '-') { reverseComplement(dna, dnaSize); temp = reverseOffset(gene->start, dnaSize); gene->start = reverseOffset(gene->end, dnaSize); gene->end = temp; for (intron = gene->introns; intron != NULL; intron = intron->next) { temp = reverseOffset(intron->start, dnaSize); intron->start = reverseOffset(intron->end, dnaSize); intron->end = temp; } for (exon = gene->exons; exon != NULL; exon = exon->next) { temp = reverseOffset(exon->start, dnaSize); exon->start = reverseOffset(exon->end, dnaSize); exon->end = temp; } slReverse(&gene->introns); slReverse(&gene->exons); gene->strand = '+'; } } static struct gffSegment *dupeSegmentList(struct gffSegment *oldList, struct gffSegment *newList) /* Duplicate a segment list into array of fresh memory. */ { struct gffSegment *oldEl, *newEl; if (oldList == NULL) return NULL; for (oldEl = oldList, newEl = newList; oldEl != NULL; oldEl=oldEl->next, newEl += 1) { memcpy(newEl, oldEl, sizeof(*newEl)); newEl->next = ((oldEl->next == NULL) ? NULL : newEl+1); } return newList; } struct gffGene *gffDupeGeneAndSurrounds(struct gff *gff, struct gffGene *oldGene, int leftExtra, int rightExtra) /* Make a duplicate of gene with extra DNA around coding region. * gffFreeGene it when done. */ /* In a perhaps hair brained scheme to save some cycles, * the memory allocation of the intron and exon lists * is shared with that of the gffGene itself. */ { struct gffGene *g; int intronCount = slCount(oldGene->introns); int exonCount = slCount(oldGene->exons); int memSize = sizeof(*g) + (intronCount + exonCount) * sizeof(struct gffSegment); char *memPt; int firstExonOffset; memPt = needMem(memSize); g = (struct gffGene *)memPt; memPt += sizeof(*g); g->exons = (struct gffSegment *)memPt; memPt += exonCount*sizeof(struct gffSegment); g->introns = (struct gffSegment *)memPt; g->next = NULL; g->start = oldGene->start; g->end = oldGene->end; g->strand = oldGene->strand; memcpy(g->name, oldGene->name, sizeof(g->name)); g->exons = dupeSegmentList(oldGene->exons, g->exons); g->introns = dupeSegmentList(oldGene->introns, g->introns); if (!geneDna(gff, oldGene, leftExtra, rightExtra, &g->dna, &g->dnaSize, &firstExonOffset)) { gffFreeGene(&g); return NULL; } fixDirectionAndOffsets(g, g->dna, g->dnaSize, firstExonOffset); return g; } struct gffGene *gffDupeGene(struct gff *gff, struct gffGene *oldGene) /* Make a duplicate of gene (with it's own DNA) */ { return gffDupeGeneAndSurrounds(gff, oldGene, 0, 0); } struct gffGene *gffGeneWithOwnDna(struct gff *gff, char *geneName) /* Find gene with given name. Case sensitive. */ { struct gffGene *oldGene; oldGene = gffFindGeneIgnoreCase(gff, geneName); if (oldGene == NULL) return NULL; return gffDupeGene(gff, oldGene); } void gffFreeGene(struct gffGene **pGene) /* Free a gene returned with dupeGene or geneWithOwnDna. * (You don't want to free the ones returned by findGene, * they are still owned by the gff.) */ { struct gffGene *g = *pGene; if (g == NULL) return; freeMem(g->dna); freeMem(g); *pGene = NULL; } struct dnaSeq *gffReadDnaSeq(char *fileName) /* Open gff file and read DNA sequence from it. */ { struct gff gff; struct dnaSeq *seq = NULL; if (!gffOpen(&gff, fileName)) return NULL; if (gffReadDna(&gff)) { seq = newDnaSeq(gff.dna, gff.dnaSize, gff.dnaName); gff.dna = NULL; } gffClose(&gff); return seq; } boolean gffOpenAndRead(struct gff *gff, char *fileName) /* Open up gff file and read everything in it. */ { if (gffOpen(gff, fileName)) if (gffReadDna(gff)) if (gffReadGenes(gff)) return TRUE; gffClose(gff); return FALSE; }