/* splatCheck1 - Check that all the test set really is being covered.. */ /* This file is copyright 2008 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "sqlNum.h" #include "fa.h" void usage() /* Explain usage and exit. */ { errAbort( "splatCheck1 - Check that all the test set really is being covered.\n" "Works on test sets generated by splatTestSet.\n" "usage:\n" " splatCheck1 int.fa in.splat out.miss out.bad\n" "where out.miss contains the names of unmapped reads, and out.bad\n" "contains the names of reads that don't map to the right place\n" "options:\n" " -xxx=XXX\n" ); } static struct optionSpec options[] = { {NULL, 0}, }; struct hash *hashFaNames(char *fileName) /* Return a hash full of the names (but not sequence) of all * records in fa file. */ { struct hash *hash = hashNew(0); struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { line = skipLeadingSpaces(line); if (line[0] == '>') { line += 1; char *name = firstWordInLine(line); if (name == NULL || name[0] == 0) errAbort("Empty name in fasta file line %d of %s", lf->lineIx, lf->fileName); if (hashLookup(hash, name)) errAbort("Duplicate name %s in fasta file line %d of %s", name, lf->lineIx, lf->fileName); hashAdd(hash, name, NULL); } } lineFileClose(&lf); return hash; } int splatCheck1(char *inFa, char *inSplat, char *outMiss, char *outWrong) /* splatCheck1 - Check that all the test set really is being covered.. */ { struct lineFile *lf = lineFileOpen(inSplat, TRUE); FILE *missF = mustOpen(outMiss, "w"); FILE *badF = mustOpen(outWrong, "w"); char *row[7]; struct hash *allHash = hashFaNames(inFa); struct hash *mappedHash = hashNew(0); /* Keep track of reads we've seen here. */ struct hash *goodHash = hashNew(0); /* Keep track of good reads here. */ while (lineFileRow(lf, row)) { /* Read in line and parse it, track it. */ int chromStart = sqlUnsigned(row[1]); char *strand = row[5]; char *name = row[6]; hashStore(mappedHash, name); /* Parse out name field to figure out where we expect it to map. */ char *pt = name; char *expectStrand = "+"; if (startsWith("RC_", pt)) { pt += 3; expectStrand = "-"; } char *nameCopy = cloneString(pt); char *parts[4]; int partCount = chopByChar(nameCopy, '_', parts, ArraySize(parts)); if (partCount != 3) errAbort("Can't parse name field line %d of %s", lf->lineIx, lf->fileName); int expectStart = sqlUnsigned(parts[1]); if (sameString(strand, expectStrand)) { int diff = intAbs(chromStart - expectStart); if (diff <= 2) { hashStore(goodHash, name); } } freeMem(nameCopy); } struct hashEl *hel, *helList = hashElListHash(allHash); int allCount = allHash->elCount; int missCount = 0, badCount = 0; for (hel = helList; hel != NULL; hel = hel->next) { char *name = hel->name; if (!hashLookup(mappedHash, name)) { fprintf(missF, "%s\n", name); ++missCount; } else { if (!hashLookup(goodHash, hel->name)) { fprintf(badF, "%s\n", hel->name); ++badCount; } } } carefulClose(&badF); carefulClose(&missF); verbose(1, "Total reads %d\n", allCount); verbose(1, "Unmapped %d (%5.2f%%)\n", missCount, 100.0*missCount/allCount); verbose(1, "Mapped wrong %d (%5.2f%%)\n", badCount, 100.0*badCount/allCount); return -(missCount + badCount); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); return splatCheck1(argv[1], argv[2], argv[3], argv[4]); }