/* gbtofa - convert from genebank to fa format. Does * filtering to make sure it's really DNA/RNA. */ #include "common.h" #include "dnautil.h" #include "hash.h" void filter(FILE *in, FILE *out) { char locusLine[512]; char *locusWords[8]; int locusWordCount; char *dnaIdWord; int locusCount = 0; char accessionLine[256]; char *accessionWords[3]; int accessionWordCount; char definitionLine[256]; char *definitionWords[16]; int definitionWordCount; char koharaName[64]; char *ourName = NULL; char lineBuf[512]; char *linePt; int basePairs; int twiceBasePairs; DNA *dnaBuf = NULL; int dnaAllocated = 0; int dnaCount; DNA base; char c; boolean gotKohara; boolean isEmbryo; int embryoCount = 0; int uniqCount = 0; int i; int lineSize; struct hash *hash = newHash(12); struct hashEl *el; /* Main loop - once per cDNA. */ for (;;) { LOOP_START: /* Loop to get locus. */ isEmbryo = FALSE; for (;;) { if (fgets(locusLine, sizeof(locusLine), in) == NULL) { goto LOOP_END; } if (strncmp(locusLine, "LOCUS", 5) == 0) { if ((locusWordCount = chopString(locusLine, whiteSpaceChopper, locusWords, ArraySize(locusWords))) < 4) errAbort("Bad LOCUS line, only %d words", locusWordCount); dnaIdWord = locusWords[3]; if (differentWord(dnaIdWord, "bp")) { warn("Locus %s doesn't appear to be nucleic acid (expecting %s got %s), skipping", locusWords[1], "bp", dnaIdWord); goto LOOP_START; } if (!differentWord(locusWords[4], "DNA")) { warn("Locus %s is DNA not RNA, skipping", locusWords[1]); goto LOOP_START; } basePairs = atol(locusWords[2]); if (basePairs <= 0) { errAbort("Expecting positive number of base pairs, got %s", locusWords[2]); } /* Make sure we have room for DNA. */ twiceBasePairs = 2*basePairs; if (twiceBasePairs > dnaAllocated) { dnaAllocated = twiceBasePairs; freeMem(dnaBuf); dnaBuf = needMem(dnaAllocated); } dnaCount = 0; ++locusCount; break; } } /* Loop to get definition. */ gotKohara = FALSE; for (;;) { if (fgets(definitionLine, sizeof(definitionLine), in) == NULL) { warn("Got LOCUS without DEFINITION"); goto LOOP_END; } if (strncmp(definitionLine, "DEFINITION", 10) == 0) { definitionWordCount = chopLine(definitionLine, definitionWords); if (definitionWordCount > 4 && sameString(definitionWords[2], "Yuji") && sameString(definitionWords[3], "Kohara")) { gotKohara = TRUE; } break; } } if (gotKohara) { mustGetLine(in, definitionLine, sizeof(definitionLine)); definitionWordCount = chopLine(definitionLine, definitionWords); if (definitionWordCount >= 3 && sameString(definitionWords[0], "clone") && definitionWords[1][0] == 'y' && definitionWords[1][1] == 'k' && (definitionWords[2][0] == '3' || definitionWords[2][0] == '5') ) { sprintf(koharaName, "%s.%c", definitionWords[1], definitionWords[2][0]); ourName = koharaName; } else if (definitionWordCount >= 7 && sameString(definitionWords[0], "embryo") && sameString(definitionWords[4], "clone") && definitionWords[5][0] == 'y' && definitionWords[5][1] == 'k' && (definitionWords[6][0] == '3' || definitionWords[6][0] == '5') ) { sprintf(koharaName, "%s.%c", definitionWords[5], definitionWords[6][0]); ourName = koharaName; isEmbryo = TRUE; } else { errAbort("Unusual Kohara!"); gotKohara = FALSE; } } /* Loop to get accession. */ for (;;) { if (fgets(accessionLine, sizeof(accessionLine), in) == NULL) { warn("Got LOCUS without ACCESSION"); goto LOOP_END; } if (strncmp(accessionLine, "ACCESSION", 9) == 0) { if ((accessionWordCount = chopString(accessionLine, whiteSpaceChopper, accessionWords, ArraySize(accessionWords))) < 2) errAbort("Bad ACCESSION line, only %d words", accessionWordCount); if (!gotKohara) ourName = accessionWords[1]; break; } } if ((el = hashLookup(hash, ourName)) != NULL) { warn("Got duplicate ourName %s, skipping all but first", ourName); goto LOOP_START; } hashAdd(hash, ourName, NULL); ++uniqCount; if (isEmbryo) ++embryoCount; /* Loop to get origin */ for (;;) { if (fgets(lineBuf, sizeof(lineBuf), in) == NULL) { warn("Got LOCUS without ORIGIN"); goto LOOP_END; } if (strncmp(lineBuf, "ORIGIN", 6) == 0) { break; } } /* Loop to read DNA */ for (;;) { if (fgets(lineBuf, sizeof(lineBuf), in) == NULL) { warn("No // at end of sequence"); break; } if (strncmp(lineBuf, "//", 2) == 0) { break; } /* Add in this line's worth of DNA. */ linePt = lineBuf; while ((c = *linePt++) != 0) { if ((base = ntChars[(int)c]) != 0) { if (dnaCount >= dnaAllocated) { errAbort("Got more than %d bases, header said there " "should only be %d in %s!", dnaCount, basePairs, ourName); } dnaBuf[dnaCount++] = base; } } } /* Now write out in .fa format. */ printf("%d %s\n", uniqCount, ourName); fprintf(out, ">%s", ourName); if (isEmbryo) fprintf(out, " embryo"); fputc('\n', out); for (i=0; i 50) lineSize = 50; mustWrite(out, dnaBuf+i, lineSize); putc('\n', out); } } LOOP_END: printf("Saved %d sequences (%d embryonic) to fa file\n", uniqCount, embryoCount); freeHash(&hash); } int main(int argc, char *argv[]) { char *inName, *outName; FILE *in, *out; if (argc != 3) { printf("gbtofa converts from GeneBank to fa format.\n"); printf("usage: gbtofa in.gb out.fa\n"); exit(-1); } inName = argv[1]; outName = argv[2]; in = mustOpen(inName, "r"); out = mustOpen(outName, "w"); dnaUtilOpen(); filter(in,out); carefulClose(&in); carefulClose(&out); return 0; }