#!/usr/bin/awk -f
#
# Summarize gbSanity output.
#
# Input:  gbSanity log text.  Each run is bracketed by a line starting with
#         "->gbSanity: begin: <db>: ..." and a line starting with
#         "<-gbSanity: completed:".
# Output: for each completed run, a line "<db> totalErr <count>" followed by
#         one "<db> <category> <count>" line for every category whose count
#         is non-zero, then an empty line.  Error lines that match none of
#         the known categories are echoed immediately with an "Unknown: "
#         prefix (they are still included in totalErr).
#

# Zero every per-run counter.  Called at start-up and after each run's
# summary has been printed.
function reset() {
    totalErr = 0;
    refLinkName = 0;
    invalidPsl = 0;
    pslQSizeMismatch = 0;
    exonOverlap = 0;
    faNameMatch = 0;
    gbStatCdnaVer = 0;
    gbStatCdnaModDate = 0;
    qNameNotInIndex = 0;
    noGbIndexProc = 0;
    typeNotMatchExpected = 0;
    cdsEndNotInExon = 0;
    noGbIndexAligned = 0;
    numAlignsNotMatchGbStatus = 0;
    noPepRefSeq = 0;
    noPepRefLink = 0;
    emptyPepRefLink = 0;
    gbSeqNoExtFile = 0;
    pepNotInExtFile = 0;
    pepOffNotFaRec = 0;
    nonNmRefLink = 0;
    nonNmGbSeq = 0;
    nonNmNoRefLink = 0;
    nonNmNoPeptide = 0;
}

# Print "<db> <desc> <cnt>" for one error category, but only when at least
# one such error was seen.
#   desc - category label to print
#   cnt  - number of errors counted for that category
function prErrInfo(desc, cnt) {
    if (cnt > 0) {
        print db, desc, cnt;
    }
}

BEGIN {
    reset();
}

# Start of a run; example line:
#->gbSanity: begin: ce2: tod=2006-03-11T05:47:35
/^->gbSanity: begin: / {
    # Third field is the database name with a trailing colon; strip colons.
    # Uses POSIX gsub() on a copy rather than the gawk-only gensub() so the
    # script also runs under other awk implementations.
    db = $3;
    gsub(/:/, "", db);
}

/Error: .* no psl table/ {
    next;  # not a real error
}

# Every remaining error line counts toward the total; the rules below then
# classify it.  Each classifying rule ends with "next", so a line is counted
# in at most one category (the first one that matches).
/^Error: / {
    totalErr++;
}

/^Error: .*(refFlat|xenoRefFlat).* geneName.*does not match refLink name/ {
    refLinkName++;
    next;
}

/^Error: invalid PSL:/ {
    invalidPsl++;
    next;
}

/^Error: .*psl.*qSize .* != .*size/ {
    pslQSizeMismatch++;
    next;
}

/^Error: .*overlaps previous exon/ {
    exonOverlap++;
    next;
}

/^Error: .* name in fasta header/ {
    faNameMatch++;
    next;
}

/^Error: .* gbStatus.version.* not same gbCdnaInfo.version/ {
    gbStatCdnaVer++;
    next;
}

/^Error: .* gbStatus.modDate .* not same gbCdnaInfo.moddate/ {
    gbStatCdnaModDate++;
    next;
}

/^Error: .* qName not in gbIndex as type/ {
    qNameNotInIndex++;
    next;
}

/^Error: .* no gbIndex processed entry for version/ {
    noGbIndexProc++;
    next;
}

/^Error: .*type mrna,refseq doesn.t match expected mrna,native,refseq/ {
    typeNotMatchExpected++;  # bogus??
    next;
}

/^Error: .* cdsEnd .* not in an exon/ {
    cdsEndNotInExon++;
    next;
}

/^Error: .* no genbank gbIndex aligned object for gbStatus/ {
    noGbIndexAligned++;
    next;
}

/^Error: .*number of alignments found .*does not match expected.* from gbStatus/ {
    numAlignsNotMatchGbStatus++;
    next;
}

/^Error: .*non-NM_ mrnaAcc in refLink/ {
    nonNmRefLink++;
    next;
}

/^Error: .* (YP_|NC_|NR_).* peptide in gbSeq not found in refLink/ {
    nonNmGbSeq++;
    next;
}

/^Error: .* (YP_|NC_|NR_).* not in refLink / {
    nonNmNoRefLink++;
    next;
}

/^Error: .* (YP_|NC_|NR_).* no peptide for RefSeq/ {
    nonNmNoPeptide++;
    next;
}

/^Error: .* NM_.* no peptide for RefSeq/ {
    noPepRefLink++;
    next;
}

/^Error: .* NP_.* peptide in gbSeq not found in refLink/ {
    noPepRefSeq++;
    next;
}

/^Error: .* NM_.* empty protein acc in refLink/ {
    emptyPepRefLink++;
    next;
}

/^Error: .*gbSeq.gbExtFile .* not in gbExtFile table/ {
    gbSeqNoExtFile++;
    next;
}

/^Error: .*NM_.*RefSeq peptide .* not in gbExtFile table/ {
    pepNotInExtFile++;
    next;
}

/^Error: .* gbExtFile offset .* start a fasta record/ {
    pepOffNotFaRec++;
    next;
}

# Anything that reaches this rule matched none of the categories above.
/^Error: / {
    print "Unknown: ",$0;
    next;
}

# End of a run: emit the summary for this database and start over.
/^<-gbSanity: completed:/ {
    print db, "totalErr", totalErr;
    prErrInfo("refLinkName", refLinkName);
    prErrInfo("invalidPsl", invalidPsl);
    prErrInfo("pslQSizeMismatch", pslQSizeMismatch);
    prErrInfo("exonOverlap", exonOverlap);
    prErrInfo("faNameMatch", faNameMatch);
    prErrInfo("gbStatCdnaVer", gbStatCdnaVer);
    prErrInfo("gbStatCdnaModDate", gbStatCdnaModDate);
    prErrInfo("qNameNotInIndex", qNameNotInIndex);
    prErrInfo("noGbIndexProc", noGbIndexProc);
    prErrInfo("typeNotMatchExpected", typeNotMatchExpected);
    prErrInfo("cdsEndNotInExon", cdsEndNotInExon);
    prErrInfo("noGbIndexAligned", noGbIndexAligned);
    prErrInfo("numAlignsNotMatchGbStatus", numAlignsNotMatchGbStatus);
    # The label and the count must be separate arguments; without the comma
    # awk concatenates them into one string and cnt is left unset, so the
    # category would never be printed.
    prErrInfo("noPepRefSeq", noPepRefSeq);
    prErrInfo("noPepRefLink", noPepRefLink);
    prErrInfo("emptyPepRefLink", emptyPepRefLink);
    prErrInfo("gbSeqNoExtFile", gbSeqNoExtFile);
    prErrInfo("pepNotInExtFile", pepNotInExtFile);
    prErrInfo("nonNmRefLink", nonNmRefLink);
    prErrInfo("nonNmGbSeq", nonNmGbSeq);
    prErrInfo("nonNmNoRefLink", nonNmNoRefLink);
    prErrInfo("nonNmNoPeptide", nonNmNoPeptide);
    prErrInfo("pepOffNotFaRec", pepOffNotFaRec);
    print "";
    reset();
}