/* Routines for reading and writing fasta format sequence files. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "errabort.h" #include "hash.h" #include "portable.h" #include "dnautil.h" #include "dnaseq.h" #include "fa.h" #include "linefile.h" boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq) /* Read next sequence from .fa file. Return sequence in retSeq. * If retCommentLine is non-null * return the '>' line in retCommentLine. * The whole thing returns FALSE at end of file. * DNA chars are mapped to lower case.*/ { return faReadMixedNext(f, 0, defaultName, mustStartWithComment, retCommentLine, retSeq); } boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName, boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq) /* Read next sequence from .fa file. Return sequence in retSeq. * If retCommentLine is non-null return the '>' line in retCommentLine. * The whole thing returns FALSE at end of file. * Contains parameter to preserve mixed case. */ { char lineBuf[1024]; int lineSize; char *words[1]; int c; off_t offset = ftello(f); size_t dnaSize = 0; DNA *dna, *sequence; char *name = defaultName; if (name == NULL) name = ""; dnaUtilOpen(); if (retCommentLine != NULL) *retCommentLine = NULL; *retSeq = NULL; /* Skip first lines until it starts with '>' */ for (;;) { if(fgets(lineBuf, sizeof(lineBuf), f) == NULL) { if (ferror(f)) errnoAbort("read of fasta file failed"); return FALSE; } lineSize = strlen(lineBuf); if (lineBuf[0] == '>') { if (retCommentLine != NULL) *retCommentLine = cloneString(lineBuf); offset = ftello(f); chopByWhite(lineBuf, words, ArraySize(words)); name = words[0]+1; break; } else if (!mustStartWithComment) { if (fseeko(f, offset, SEEK_SET) < 0) errnoAbort("fseek on fasta file failed"); break; } else offset += lineSize; } /* Count up DNA. */ for (;;) { c = fgetc(f); if (c == EOF || c == '>') break; if (isalpha(c)) { ++dnaSize; } } if (dnaSize == 0) { warn("Invalid fasta format: sequence size == 0 for element %s",name); } /* Allocate DNA and fill it up from file. */ dna = sequence = needHugeMem(dnaSize+1); if (fseeko(f, offset, SEEK_SET) < 0) errnoAbort("fseek on fasta file failed"); for (;;) { c = fgetc(f); if (c == EOF || c == '>') break; if (isalpha(c)) { /* check for non-DNA char */ if (ntChars[c] == 0) { *dna++ = preserveCase ? 'N' : 'n'; } else { *dna++ = preserveCase ? c : ntChars[c]; } } } if (c == '>') ungetc(c, f); *dna = 0; *retSeq = newDnaSeq(sequence, dnaSize, name); if (ferror(f)) errnoAbort("read of fasta file failed"); return TRUE; } struct dnaSeq *faReadOneDnaSeq(FILE *f, char *defaultName, boolean mustStartWithComment) /* Read sequence from FA file. Assumes positioned at or before * the '>' at start of sequence. */ { struct dnaSeq *seq; if (!faReadNext(f, defaultName, mustStartWithComment, NULL, &seq)) return NULL; else return seq; } static bioSeq *nextSeqFromMem(char **pText, boolean isDna, boolean doFilter) /* Convert fa in memory to bioSeq. Update *pText to point to next * record. Returns NULL when no more sequences left. */ { char *name = ""; char *s, *d; struct dnaSeq *seq; int size = 0; char c; char *filter = (isDna ? ntChars : aaChars); char *text = *pText; char *p = skipLeadingSpaces(text); if (p == NULL) return NULL; dnaUtilOpen(); if (*p == '>') { char *end; s = strchr(p, '\n'); if (s != NULL) ++s; name = skipLeadingSpaces(p+1); end = skipToSpaces(name); if (end >= s || name >= s) errAbort("No name in line starting with '>'"); if (end != NULL) *end = 0; } else { s = p; if (s == NULL || s[0] == 0) return NULL; } name = cloneString(name); d = text; if (s != NULL) { for (;;) { c = *s; if (c == 0 || c == '>') break; ++s; if (!isalpha(c)) continue; if (doFilter) { if ((c = filter[(int)c]) == 0) { if (isDna) c = 'n'; else c = 'X'; } } d[size++] = c; } } d[size] = 0; /* Put sequence into our little sequence structure. */ AllocVar(seq); seq->name = name; seq->dna = text; seq->size = size; *pText = s; return seq; } bioSeq *faNextSeqFromMemText(char **pText, boolean isDna) /* Convert fa in memory to bioSeq. Update *pText to point to next * record. Returns NULL when no more sequences left. */ { return nextSeqFromMem(pText, isDna, TRUE); } bioSeq *faNextSeqFromMemTextRaw(char **pText) /* Same as faNextSeqFromMemText, but will leave in * letters even if they aren't in DNA or protein alphabed. */ { return nextSeqFromMem(pText, TRUE, FALSE); } bioSeq *faSeqListFromMemText(char *text, boolean isDna) /* Convert fa's in memory into list of dnaSeqs. */ { bioSeq *seqList = NULL, *seq; while ((seq = faNextSeqFromMemText(&text, isDna)) != NULL) { slAddHead(&seqList, seq); } slReverse(&seqList); return seqList; } bioSeq *faSeqListFromMemTextRaw(char *text) /* Convert fa's in memory into list of dnaSeqs without * converting chars to N's. */ { bioSeq *seqList = NULL, *seq; while ((seq = faNextSeqFromMemTextRaw(&text)) != NULL) { slAddHead(&seqList, seq); } slReverse(&seqList); return seqList; } bioSeq *faSeqFromMemText(char *text, boolean isDna) /* Convert fa in memory to bioSeq. */ { return faNextSeqFromMemText(&text, isDna); } struct dnaSeq *faFromMemText(char *text) /* Return a sequence from a .fa file that's been read into * a string in memory. This cannabalizes text, which should * be allocated with needMem. This buffer becomes part of * the returned dnaSeq, which may be freed normally with * freeDnaSeq. */ { return faNextSeqFromMemText(&text, TRUE); } struct dnaSeq *faReadSeq(char *fileName, boolean isDna) /* Open fa file and read a single sequence from it. */ { int maxSize = fileSize(fileName); int fd; DNA *s; if (maxSize < 0) errAbort("can't open %s", fileName); s = needHugeMem(maxSize+1); fd = open(fileName, O_RDONLY); if (read(fd, s, maxSize) < 0) errAbort("faReadSeq: read failed: %s", strerror(errno)); close(fd); s[maxSize] = 0; return faSeqFromMemText(s, isDna); } struct dnaSeq *faReadDna(char *fileName) /* Open fa file and read a single nucleotide sequence from it. */ { return faReadSeq(fileName, TRUE); } struct dnaSeq *faReadAa(char *fileName) /* Open fa file and read a peptide single sequence from it. */ { return faReadSeq(fileName, FALSE); } static unsigned faFastBufSize = 0; static DNA *faFastBuf; static void expandFaFastBuf(int bufPos, int minExp) /* Make faFastBuf bigger. */ { if (faFastBufSize == 0) { faFastBufSize = 64 * 1024; while (minExp > faFastBufSize) faFastBufSize <<= 1; faFastBuf = needHugeMem(faFastBufSize); } else { DNA *newBuf; unsigned newBufSize = faFastBufSize + faFastBufSize; while (newBufSize < minExp) { newBufSize <<= 1; if (newBufSize <= 0) errAbort("expandFaFastBuf: integer overflow when trying to " "increase buffer size from %u to a min of %u.", faFastBufSize, minExp); } newBuf = needHugeMem(newBufSize); memcpy(newBuf, faFastBuf, bufPos); freeMem(faFastBuf); faFastBuf = newBuf; faFastBufSize = newBufSize; } } void faFreeFastBuf() /* Free up buffers used in fa fast and speedreading. */ { freez(&faFastBuf); faFastBufSize = 0; } boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName) /* Read in next FA entry as fast as we can. Return FALSE at EOF. * The returned DNA and name will be overwritten by the next call * to this function. */ { int c; int bufIx = 0; static char name[256]; int nameIx = 0; boolean gotSpace = FALSE; /* Seek to next '\n' and save first word as name. */ dnaUtilOpen(); name[0] = 0; for (;;) { if ((c = fgetc(f)) == EOF) { *retDna = NULL; *retSize = 0; *retName = NULL; return FALSE; } if (!gotSpace && nameIx < ArraySize(name)-1) { if (isspace(c)) gotSpace = TRUE; else if (c != '>') { name[nameIx++] = c; } } if (c == '\n') break; } name[nameIx] = 0; /* Read until next '>' */ for (;;) { c = fgetc(f); if (c == EOF || c == '>') c = 0; else if (!isalpha(c)) continue; else { c = ntChars[c]; if (c == 0) c = 'n'; } if (bufIx >= faFastBufSize) expandFaFastBuf(bufIx, 0); faFastBuf[bufIx++] = c; if (c == 0) { *retDna = faFastBuf; *retSize = bufIx-1; *retName = name; return TRUE; } } } void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize) /* Write next sequence to fa file. */ { if (dnaSize == 0) return; if (startLine != NULL) fprintf(f, ">%s\n", startLine); writeSeqWithBreaks(f, dna, dnaSize, 50); } void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize) /* Write out FA file or die trying. */ { FILE *f = mustOpen(fileName, "w"); faWriteNext(f, startLine, dna, dnaSize); if (fclose(f) != 0) errnoAbort("fclose failed"); } void faWriteAll(char *fileName, bioSeq *seqList) /* Write out all sequences in list to file. */ { FILE *f = mustOpen(fileName, "w"); bioSeq *seq; for (seq=seqList; seq != NULL; seq = seq->next) faWriteNext(f, seq->name, seq->dna, seq->size); if (fclose(f) != 0) errnoAbort("fclose failed"); } boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName) /* Read in DNA or Peptide FA record in mixed case. Allow any upper or lower case * letter, or the dash character in. */ { char c; int bufIx = 0; static char name[512]; int lineSize, i; char *line; dnaUtilOpen(); /* Read first line, make sure it starts with '>', and read first word * as name of sequence. */ name[0] = 0; if (!lineFileNext(lf, &line, &lineSize)) { *retDna = NULL; *retSize = 0; return FALSE; } if (line[0] == '>') { line = firstWordInLine(skipLeadingSpaces(line+1)); if (line == NULL) errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName); strncpy(name, line, sizeof(name)); name[sizeof(name)-1] = '\0'; /* Just to make sure name is NULL terminated. */ } else { errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName); } /* Read until next '>' */ for (;;) { if (!lineFileNext(lf, &line, &lineSize)) break; if (line[0] == '>') { lineFileReuse(lf); break; } if (bufIx + lineSize >= faFastBufSize) expandFaFastBuf(bufIx, lineSize); for (i=0; i= faFastBufSize) expandFaFastBuf(bufIx, 0); faFastBuf[bufIx] = 0; *retDna = faFastBuf; *retSize = bufIx; *retName = name; if (bufIx == 0) { warn("Invalid fasta format: sequence size == 0 for element %s",name); } return TRUE; } void faToProtein(char *poly, int size) /* Convert possibly mixed-case protein to upper case. Also * convert any strange characters to 'X'. Does not change size. * of sequence. */ { int i; char c; dnaUtilOpen(); for (i=0; iname = cloneString(name); seq->size = size; seq->dna = cloneMem(dna, size+1); slAddHead(&seqList, seq); } slReverse(&seqList); faFreeFastBuf(); return seqList; } static struct dnaSeq *faReadAllSeqMixable(char *fileName, boolean isDna, boolean mixed) /* Return list of all sequences in FA file. * Mixed case parameter overrides isDna. If mixed is false then * will return DNA in lower case and non-DNA in upper case. */ { struct lineFile *lf = lineFileOpen(fileName, FALSE); struct dnaSeq *seqList = faReadAllMixableInLf(lf, isDna, mixed); lineFileClose(&lf); return seqList; } struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase) /* Return hash of all sequences in FA file. */ { boolean isDna = (dnaCase == dnaLower); boolean isMixed = (dnaCase == dnaMixed); struct dnaSeq *seqList = faReadAllSeqMixable(fileName, isDna, isMixed); struct hash *hash = hashNew(18); struct dnaSeq *seq; for (seq = seqList; seq != NULL; seq = seq->next) { if (hashLookup(hash, seq->name)) errAbort("%s duplicated in %s", seq->name, fileName); hashAdd(hash, seq->name, seq); } return hash; } struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna) /* Return list of all sequences in FA file. */ { return faReadAllSeqMixable(fileName, isDna, FALSE); } struct dnaSeq *faReadAllDna(char *fileName) /* Return list of all DNA sequences in FA file. */ { return faReadAllSeq(fileName, TRUE); } struct dnaSeq *faReadAllPep(char *fileName) /* Return list of all peptide sequences in FA file. */ { return faReadAllSeq(fileName, FALSE); } struct dnaSeq *faReadAllMixed(char *fileName) /* Read in mixed case fasta file, preserving case. */ { return faReadAllSeqMixable(fileName, FALSE, TRUE); } struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf) /* Read in mixed case sequence from open fasta file. */ { return faReadAllMixableInLf(lf, FALSE, TRUE); }