/* splatCheck2 - Check splat output against a MAQ-generated test set.. */ /* This file is copyright 2008 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "sqlNum.h" int offset = 0; boolean bed = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "splatCheck2 - Check splat output against a MAQ-generated test set.\n" "usage:\n" " splatCheck2 in.fa in.splat out.miss out.bad\n" "options:\n" " -offset=N - add offset.\n" " -bed - treat input as bed rather than splat file\n" ); } static struct optionSpec options[] = { {"offset", OPTION_INT}, {"bed", OPTION_BOOLEAN}, {NULL, 0}, }; struct hash *hashFaNames(char *fileName) /* Return a hash full of the names (but not sequence) of all * records in fa file. */ { struct hash *hash = hashNew(0); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { line = skipLeadingSpaces(line); if (line[0] == '>') { line += 1; char *name = firstWordInLine(line); if (name == NULL || name[0] == 0) errAbort("Empty name in fasta file line %d of %s", lf->lineIx, lf->fileName); if (hashLookup(hash, name)) errAbort("Duplicate name %s in fasta file line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(hash, name, NULL); } } lineFileClose(&lf); return hash; } void parseNameNeighborhood(char *name, char **retChrom, int *retStart, int *retEnd) /* Parse chromosome and start/end out of name field. This is expected to look something * like: chr22_382900_383094_3c/1 or chr22_20M_382900_383094_3c/1 */ { char *words[6]; int wordCount = chopByChar(name, '_', words, ArraySize(words)); if (wordCount < 4 || wordCount > 5) errAbort("Don't understand name field"); *retChrom = words[0]; *retStart = sqlUnsigned(words[wordCount-3]); *retEnd = sqlUnsigned(words[wordCount-2]); } int splatCheck2(char *inFa, char *inSplat, char *outMiss, char *outWrong) /* splatCheck2 - Check splat output against a MAQ-generated test set.. */ { int nameCol = (bed ? 3 : 6); struct lineFile *lf = lineFileOpen(inSplat, TRUE); FILE *missF = mustOpen(outMiss, "w"); FILE *badF = mustOpen(outWrong, "w"); char *row[7]; struct hash *allHash = hashFaNames(inFa); struct hash *mappedHash = hashNew(0); /* Keep track of reads we've seen here. */ struct hash *goodHash = hashNew(0); /* Keep track of good reads here. */ while (lineFileNextRow(lf, row, nameCol+1)) { /* Read in line and parse it, track it. */ char *chrom = row[0]; int chromStart = sqlUnsigned(row[1]) - offset; int chromEnd = sqlUnsigned(row[2]) - offset; char *name = row[nameCol]; char *origName = hashStore(mappedHash, name)->name; char *sourceChrom; int sourceStart, sourceEnd; /* Parse out name field to figure out where we expect it to map. */ parseNameNeighborhood(name, &sourceChrom, &sourceStart, &sourceEnd); if (sameString(sourceChrom, chrom) && rangeIntersection(chromStart, chromEnd, sourceStart, sourceEnd) > 0) { hashStore(goodHash, origName); } } struct hashEl *hel, *helList = hashElListHash(allHash); int allCount = allHash->elCount; int missCount = 0, badCount = 0; for (hel = helList; hel != NULL; hel = hel->next) { char *name = hel->name; if (!hashLookup(mappedHash, name)) { fprintf(missF, "%s\n", name); ++missCount; } else { if (!hashLookup(goodHash, hel->name)) { fprintf(badF, "%s\n", hel->name); ++badCount; } } } carefulClose(&badF); carefulClose(&missF); verbose(1, "Total reads %d\n", allCount); verbose(1, "Unmapped %d (%5.2f%%)\n", missCount, 100.0*missCount/allCount); verbose(1, "Mapped wrong %d (%5.2f%%)\n", badCount, 100.0*badCount/allCount); return -(missCount + badCount); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); offset = optionInt("offset", offset); bed = optionExists("bed"); if (argc != 5) usage(); return splatCheck2(argv[1], argv[2], argv[3], argv[4]); return 0; }