/* faends - get just the ends of some fa sequences. */ #include "common.h" #include "dnautil.h" #include "dnaseq.h" #include "fa.h" struct dnaSeq *readAllFa(char *fileName) { struct dnaSeq *list = NULL, *el; FILE *f; int seqCount = 0; uglyf("Loading %s\n", fileName); f = mustOpen(fileName, "r"); while ((el = faReadOneDnaSeq(f, NULL, TRUE)) != NULL) { if (++seqCount % 5000 == 0) uglyf("loaded sequence #%d\n", seqCount); slAddHead(&list, el); } fclose(f); slReverse(&list); return list; } int main(int argc, char *argv[]) { char *inName, *outName; int chopSize; int seqSize; struct dnaSeq *seqList = NULL, *seq; FILE *f; DNA *dna; int seqCount = 0; boolean tail = TRUE; char *headOrTail; if (argc != 5 || !isdigit(argv[3][0])) { errAbort("faends - write just the end part of each sequence to a new fa file\n" "usage:\n" " faends in.fa tail/head size out.fa"); } inName = argv[1]; headOrTail = argv[2]; chopSize = atoi(argv[3]); outName = argv[4]; seqList = readAllFa(inName); if (sameWord(headOrTail, "head")) tail = FALSE; else if (sameWord(headOrTail, "tail")) tail = TRUE; else errAbort("Expecting head or tail as second parameter, got %s", headOrTail); printf("Read %d sequences from %s\n", slCount(seqList), inName); f = mustOpen(outName, "w"); for (seq = seqList; seq != NULL; seq = seq->next) { seqSize = seq->size; dna = seq->dna; if (seqSize >= chopSize-3) { int chop = chopSize; if (chop > seqSize-3) chop = seqSize-3; if (tail) dna += seqSize - chop; else dna[chop] = 0; } assert(seq->name != NULL); assert(dna != NULL); fprintf(f, ">%s\n%s\n", seq->name, dna); } fclose(f); printf("Done writing %s %d bases of each sequence to %s\n", (tail ? "last" : "first"), chopSize, outName); return 0; }