#include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "chromInfo.h" #include "jksql.h" #include "twoBit.h" #include "dnaseq.h" #include "hgBam.h" #include "bbiFile.h" #include "bigWig.h" #include "errCatch.h" #include "obscure.h" #include "hmmstats.h" #include "basicBed.h" #include "asParse.h" #include "bigBed.h" char *version = "4.6"; #define PEAK_WORDS 16 #define TAG_WORDS 9 #define QUICK_DEFAULT 1000 enum bedType { BED_GRAPH = 0, BROAD_PEAK, NARROW_PEAK, GAPPED_PEAK }; boolean colorSpace; boolean mmPerPair; boolean nMatch; boolean isSorted; boolean privateData; boolean allowOther; boolean allowBadLength; boolean complementMinus; struct hash *chrHash = NULL; char dnaChars[256]; char qualChars[256]; char csQualChars[256]; char seqName[256]; char digits[256]; char alpha[256]; char csSeqName[256]; char bedTypeCols[10]; struct twoBitFile *genome = NULL; int mismatchTotalQuality; int mismatches; int matchFirst=0; int mmCheckOneInN; boolean doReport; double bamPercent = 0.0; char *as = NULL; boolean tab = FALSE; boolean bigBed = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "validateFiles - Validates the format of different genomic files.\n" " Exits with a zero status for no errors detected and non-zero for errors.\n" " Uses filename 'stdin' to read from stdin.\n" " Automatically decompresses Files in .gz, .bz2, .zip, .Z format.\n" " Accepts multiple input files of the same type.\n" " Writes Error messages to stderr\n" "usage:\n" " validateFiles -chromInfo=FILE -options -type=FILE_TYPE file1 [file2 [...]]\n" "\n" " -type=\n" " fasta : Fasta files (only one line of sequence, and no quality scores)\n" " fastq : Fasta with quality scores (see http://maq.sourceforge.net/fastq.shtml)\n" " csfasta : Colorspace fasta (implies -colorSpace)\n" " csqual : Colorspace quality (see link below)\n" " See http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm\n" " bam : Binary Alignment/Map\n" " See http://samtools.sourceforge.net/SAM1.pdf\n" " bigWig : Big Wig\n" " See http://genome.ucsc.edu/goldenPath/help/bigWig.html\n" " bedN[+P] : BED N or BED N+ or BED N+P\n" " where N is a number between 3 and 15 of standard BED columns,\n" " optional + indicates the presence of additional columns\n" " and P is the number of addtional columns\n" " Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n" " See http://genome.ucsc.edu/FAQ/FAQformat.html#format1\n" " bigBedN[+P] : bigBED N or bigBED N+ or bigBED N+P, similar to BED\n" " See http://genome.ucsc.edu/goldenPath/help/bigBed.html\n" " tagAlign : Alignment files, replaced with BAM\n" " pairedTagAlign \n" " broadPeak : ENCODE Peak formats\n" " narrowPeak These are specialized bedN+P formats.\n" " gappedPeak See http://genomewiki.cse.ucsc.edu/EncodeDCC/index.php/File_Formats\n" " bedGraph : BED Graph\n" "\n" " -as=fields.as If you have extra \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here. Applies to bed-related types.\n" " -tab If set, expect fields to be tab separated, normally\n" " expects white space separator. Applies to bed-related types.\n" " -chromDb=db Specify DB containing chromInfo table to validate chrom names\n" " and sizes\n" " -chromInfo=file.txt Specify chromInfo file to validate chrom names and sizes\n" " -colorSpace Sequences include colorspace values [0-3] (can be used \n" " with formats such as tagAlign and pairedTagAlign)\n" " -isSorted Input is sorted by chrom, only affects types tagAlign and pairedTagAlign\n" " -doReport Output report in filename.report\n" " -version Print version\n" "\n" "For Alignment validations\n" " -genome=path/to/hg18.2bit REQUIRED to validate sequence mappings match the genome specified\n" " in the .2bit file. (BAM, tagAlign, pairedTagAlign)\n" " -nMatch N's do not count as a mismatch\n" " -matchFirst=n Only check the first N bases of the sequence\n" " -mismatches=n Maximum number of mismatches in sequence (or read pair) \n" " -mismatchTotalQuality=n Maximum total quality score at mismatching positions\n" " -mmPerPair Check either pair dont exceed mismatch count if validating\n" " pairedTagAlign files (default is the total for the pair)\n" " -mmCheckOneInN=n Check mismatches in only one in 'n' lines (default=1, all)\n" " -allowOther Allow chromosomes that aren't native in BAM's\n" " -allowBadLength Allow chromosomes that have the wrong length in BAM\n" " -complementMinus Complement the query sequence on the minus strand (for testing BAM)\n" " -bamPercent=N.N Percentage of BAM alignments that must be compliant\n" " -privateData Private data so empty sequence is tolerated\n" "\n" ); } static struct optionSpec options[] = { {"type", OPTION_STRING}, {"chromDb", OPTION_STRING}, {"chromInfo", OPTION_STRING}, {"colorSpace", OPTION_BOOLEAN}, {"mismatchTotalQuality", OPTION_INT}, {"genome", OPTION_STRING}, {"mismatches", OPTION_INT}, {"matchFirst", OPTION_INT}, {"mmPerPair", OPTION_BOOLEAN}, {"mmCheckOneInN", OPTION_INT}, {"nMatch", OPTION_BOOLEAN}, {"privateData", OPTION_BOOLEAN}, {"isSorted", OPTION_BOOLEAN}, {"version", OPTION_BOOLEAN}, {"allowOther", OPTION_BOOLEAN}, {"allowBadLength", OPTION_BOOLEAN}, {"complementMinus", OPTION_BOOLEAN}, {"bamPercent", OPTION_FLOAT}, {"doReport", OPTION_BOOLEAN}, {"as", OPTION_STRING}, {"tab", OPTION_BOOLEAN}, {NULL, 0}, }; FILE *reportF; void vaErrAbort(char *format, va_list args); void reportErrAbort(char *format, ...) /* Abort function, with optional (printf formatted) error message. */ #ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif ; void reportWarn(char *format, ...) /* Issue a warning message. */ #ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif ; void report(char *format, ...) /* Issue a warning message. */ #ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif ; void reportErrAbort(char *format, ...) /* Abort function, with optional (printf formatted) error message. */ { va_list args; if (reportF != NULL) { va_start(args, format); vfprintf(reportF, format, args); va_end(args); } va_start(args, format); vaErrAbort(format, args); va_end(args); } void vaWarn(char *format, va_list args); void reportWarn(char *format, ...) /* Issue a warning message. */ { va_list args; va_start(args, format); vaWarn(format, args); va_end(args); va_start(args, format); if (reportF != NULL) vfprintf(reportF, format, args); va_end(args); } void report(char *format, ...) /* Issue a warning message. */ { va_list args; va_start(args, format); if (reportF != NULL) vfprintf(reportF, format, args); va_end(args); } void reportLongWithCommas(long long l) /* Print out a long number with commas at thousands, millions, etc. */ { char ascii[32]; sprintLongWithCommas(ascii, l); report("%s", ascii); } void reportLabelAndLongNumber(char *label, long long l) /* Print label: 1,234,567 format number */ { report("%s: ", label); reportLongWithCommas( l); report("\n"); } boolean checkMismatch(int ch1, int ch2) // checkMismatch -- if the sequence has an N, we call this a mismatch // by default unless nMatch is set, in which case we don't call // it a mismatch { if ((ch1 != 'n') && (ch2 != 'n')) return ch1 != ch2; return !nMatch; } void initArrays() // Set up array of chars // dnaChars: DNA chars ACGTNacgtn, and optionally include colorspace 0-3 // qualChars: fastq quality scores as ascii [!-~] (ord(!)=33, ord(~)=126) // csQualChars: csfasta quality scores are decimals separated by spaces // seqName: fastq sequence name chars [A-Za-z0-9_.:/-] { int i; // number of columns to expect in bedType files bedTypeCols[BED_GRAPH] = 4; bedTypeCols[BROAD_PEAK] = 9; bedTypeCols[NARROW_PEAK] = 10; bedTypeCols[GAPPED_PEAK] = 15; for (i=0 ; i < 256 ; ++i) dnaChars[i] = qualChars[i] = csQualChars[i] = seqName[i] = csSeqName[i] = digits[i] = alpha[i] = 0; dnaChars['a'] = dnaChars['c'] = dnaChars['g'] = dnaChars['t'] = dnaChars['n'] = 1; dnaChars['A'] = dnaChars['C'] = dnaChars['G'] = dnaChars['T'] = dnaChars['N'] = 1; if (colorSpace) { dnaChars['0'] = dnaChars['1'] = dnaChars['2'] = dnaChars['3'] = 1; } for (i= (int)'A' ; i <= (int)'Z' ; ++i) seqName[i] = seqName[i+(int)('a'-'A')] = alpha[i] = alpha[i+(int)('a'-'A')] = 1; for (i= (int)'0' ; i <= (int)'9' ; ++i) seqName[i] = digits[i] = csSeqName[i] = csQualChars[i] = 1; seqName['_'] = seqName['.'] = seqName[':'] = seqName['/'] = seqName['-'] = seqName['#'] = 1; csSeqName[','] = csSeqName['.'] = csSeqName['-'] = csSeqName['#'] = 1; csQualChars[' '] = csQualChars['-'] = 1; for (i= (int)'!' ; i <= (int)'~' ; ++i) qualChars[i] = 1; } boolean checkUnsigned(struct lineFile *lf, char *row, char *s, unsigned *val, char *name) /* Convert series of digits to unsigned integer about * twice as fast as atoi (by not having to skip white * space or stop except at the null byte.) * Returns true if conversion possible, and value is returned in 'val' * Otherwise prints warning and returns false */ { unsigned res = 0; char *p = s; char c; while (((c = *(p++)) >= '0') && (c <= '9')) { res *= 10; res += c - '0'; } if (c != '\0') { reportWarn("Error [file=%s, line=%d]: %s field invalid unsigned number (%s) [%s]" , lf->fileName, lf->lineIx,name, s, row); return FALSE; } *val = res; return TRUE; } boolean checkSigned(struct lineFile *lf, char *row, char *s, int *val, char *name) /* Convert string to signed integer. Unlike atol assumes * all of string is number. * Returns true if conversion possible, and value is returned in 'val' * Otherwise prints warning and returns false */ { int res = 0; char *p, *p0 = s; if (*p0 == '-') p0++; p = p0; while ((*p >= '0') && (*p <= '9')) { res *= 10; res += *p - '0'; p++; } /* test for invalid character, empty, or just a minus */ if ((*p != '\0') || (p == p0)) { reportWarn("Error [file=%s, line=%d]: %s field invalid signed number (%s) [%s]" , lf->fileName, lf->lineIx,name, s, row); return FALSE; } if (*s == '-') *val = -res; else *val = res; return TRUE; } boolean checkString(struct lineFile *lf, char *row, char *s, char *name) // Return TRUE if string has non-zero length // Othewise print warning that name column is empty and return FALSE { verbose(3,"[%s %3d] %s(%s)\n", __func__, __LINE__, name, s); if (strlen(s) > 0) return TRUE; reportWarn("Error [file=%s, line=%d]: %s column empty [%s]", lf->fileName, lf->lineIx,name, row); return FALSE; } boolean checkChrom(struct lineFile *lf, char *row, char *s, unsigned *chromSize) // Return TRUE if string has non-zero length // Othewise print warning that name column is empty and return FALSE { unsigned *size; *chromSize = 0; if (strlen(s) > 0) { if (chrHash == NULL) reportErrAbort("chrom validation requires the -chromInfo or -chromDb option\n"); if (chrHash) { if ( (size = hashFindVal(chrHash, s)) != NULL) { *chromSize = *size; verbose(2,"[%s %3d] hashFindVal(%s -> %p = %u)\n", __func__, __LINE__, s, size, *size); return TRUE; // found chrom } else { reportWarn("Error [file=%s, line=%d]: chrom %s not found [%s]" , lf->fileName, lf->lineIx, s, row); return FALSE; // chrom not found } } else { verbose(2,"[%s %3d] chrom(%s) \n", __func__, __LINE__, s); return TRUE; // chrom name not blank, and not validating against chromInfo } } reportWarn("Error [file=%s, line=%d]: chrom column empty [%s]", lf->fileName, lf->lineIx, row); return FALSE; } int checkSeq(struct lineFile *lf, char *row, char *s, char *name) // Return TRUE if string has non-zero length and contains only chars [ACGTNacgtn0-3] // Othewise print warning that name column is empty and return FALSE { verbose(3,"[%s %3d] inputLine=%d %s seq(%s) [%s]\n", __func__, __LINE__, lf->lineIx, name, s, row); int i; int len = 0; for ( i = 0; s[i] ; ++i) { if (!dnaChars[(int)s[i]]) { if (s==row) reportWarn("Error [file=%s, line=%d]: invalid DNA chars in %s(%s)" , lf->fileName, lf->lineIx,name, s); else reportWarn("Error [file=%s, line=%d]: invalid DNA chars in %s(%s) [%s]" , lf->fileName, lf->lineIx,name, s, row); return 0; } len++; } if (i == 0) { if (privateData) // PrivateData means sequence should be empty return 1; if (s==row) reportWarn("Error [file=%s, line=%d]: %s empty", lf->fileName, lf->lineIx,name); else reportWarn("Error [file=%s, line=%d]: %s empty in line [%s]", lf->fileName, lf->lineIx, name, row); return 0; } else if (privateData) // PrivateData means sequence should be empty { if (s==row) reportWarn("Error [file=%s, line=%d]: %s is not empty but this should be private data", lf->fileName, lf->lineIx,name); else reportWarn("Error [file=%s, line=%d]: %s is not empty but this should be private data in line [%s]" , lf->fileName, lf->lineIx,name, row); return 0; } return len; } boolean checkSeqName(struct lineFile *lf, char *s, char firstChar, char *name) // Return TRUE if string has non-zero length and contains only seqName[] chars // Othewise print warning that seqName is empty and return FALSE { int i; if (s[0] == 0) { reportWarn("Error [file=%s, line=%d]: %s empty [%s]", lf->fileName, lf->lineIx,name, s); return FALSE; } else if (s[0] != firstChar) { reportWarn("Error [file=%s, line=%d]: %s first char invalid (got '%c', wanted '%c') [%s]", lf->fileName, lf->lineIx, name, s[0], firstChar, s); return FALSE; } for ( i = 1; s[i] ; ++i) { if (s[i] == ' ') break; if (!seqName[(int)s[i]]) { reportWarn("Error [file=%s, line=%d]: invalid %s chars in [%s]", lf->fileName, lf->lineIx,name, s); return FALSE; } } return TRUE; } char *getDigits(char *s) // Consume 1 or more digits from s, return pointer to next non-digit // Return NULL if no digits consumed { char *s0 = s; while (digits[(int) *s]) ++s; if (s > s0) return s; else return NULL; } boolean checkTrailingCsSeqName(char *s) // Return true if all chars in s (if any) are csSeqName chars // Return false otherwise { while (csSeqName[(int) *s]) ++s; if (*s == 0) return TRUE; else return FALSE; } // >461_19_209_F3 // T022213002230311203200200322000 // >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0### // T01301010111200210102321210100112312 boolean checkCsSeqName(struct lineFile *lf, char *s) // Return TRUE if string has non-zero length, matches CS name pattern contains only csSeqName[] // chars. Othewise print warning that seqName is empty and return FALSE { char *s0; if (s[0] == 0) { reportWarn("Error [file=%s, line=%d]: sequence name empty [%s]", lf->fileName, lf->lineIx,s); return FALSE; } else if (s[0] != '>') { reportWarn("Error [file=%s, line=%d]: sequence name first char invalid (got '%c', wanted '>') [%s]", lf->fileName, lf->lineIx, s[0], s); return FALSE; } if ( (s0 = getDigits(s+1)) && (*(s0++) == '_') && (s0 = getDigits(s0)) && (*(s0++) == '_') && (s0 = getDigits(s0)) && (*(s0++) == '_') && alpha[(int) *(s0++)] && digits[(int) *(s0++)] && checkTrailingCsSeqName(s0) ) { verbose(2,"[%s %3d] OK [%s] file(%s) line=%d\n", __func__, __LINE__, s, lf->fileName, lf->lineIx); return TRUE; } else { reportWarn("Error [file=%s, line=%d]: invalid sequence name [%s]", lf->fileName, lf->lineIx,s); return FALSE; } } boolean checkQual(struct lineFile *lf, char *s, int len) // Return TRUE if string has non-zero length and contains only qualChars[] chars // Othewise print warning that quality is empty and return FALSE { int i; for ( i = 0; s[i] ; ++i) { if (!qualChars[(int)s[i]]) { reportWarn("Error [file=%s, line=%d]: invalid quality chars in [%s]", lf->fileName, lf->lineIx,s); return FALSE; } } if (i == 0) { reportWarn("Error [file=%s, line=%d]: quality empty [%s]", lf->fileName, lf->lineIx,s); return FALSE; } if (i != len) { reportWarn("Error [file=%s, line=%d]: quality not as long as sequence (%d bases) [%s]" , lf->fileName, lf->lineIx,len, s); return FALSE; } return TRUE; } boolean checkCsQual(struct lineFile *lf, char *s) // Return TRUE if string has non-zero length and contains quality scores // Othewise print warning that quality is empty and return FALSE { int i; for ( i = 0; s[i] ; ++i) { if (!csQualChars[(int)s[i]]) { reportWarn("Error [file=%s, line=%d]: invalid colorspace quality chars in [%s]" , lf->fileName, lf->lineIx,s); return FALSE; } } if (i == 0) { reportWarn("Error [file=%s, line=%d]: colorspace quality empty [%s]", lf->fileName, lf->lineIx,s); return FALSE; } return TRUE; } boolean checkStartEnd(struct lineFile *lf, char *row, char *start, char *end, char *chrom, unsigned chromSize, unsigned *sVal, unsigned *eVal) // Return TRUE if start and end are both >= 0, // and if start <= end // Also check end <= chromSize (special case circular chrM start <= chromSize) // start and end values are returned in sVal and eVal // Othewise print warning and return FALSE { verbose(3,"[%s %3d] inputLine=%d [%s..%s] (chrom=%s,size=%u) [%s]\n" , __func__, __LINE__, lf->lineIx, start, end, chrom, chromSize, row); unsigned s, e; if ( !checkUnsigned(lf, row, start, &s, "chromStart") || !checkUnsigned(lf, row, end, &e, "chromEnd")) return FALSE; *sVal = s; *eVal = e; if (chromSize > 0) { if (e > chromSize && (differentString(chrom, "chrM"))) { reportWarn("Error [file=%s, line=%d]: end(%u) > chromSize(%s=%u) [%s]" , lf->fileName, lf->lineIx,e, chrom, chromSize, row); return FALSE; } else verbose(2,"[%s %3u] end <= chromSize (%u <= %u)\n", __func__, __LINE__, e, chromSize); } if (e >= s) { verbose(2,"[%s %3d] start <= end (%u <= %u)\n", __func__, __LINE__, s, e); return TRUE; } else reportWarn("Error [file=%s, line=%d]: start(%u) > end(%u) [%s]", lf->fileName, lf->lineIx,s, e, row); return FALSE; } boolean checkPeak(struct lineFile *lf, char *row, char *peak, char *start, char *end) // Return TRUE if peak is >= 0 and <= (end-start) // Othewise print warning and return FALSE { verbose(3,"[%s %3d] inputLine=%d peak(%s) (%s,%s) [%s]\n" , __func__, __LINE__, lf->lineIx, peak, start, end, row); unsigned p, s, e; int i; if (!checkSigned(lf, row, peak, &i, "peak")) return FALSE; if (i == -1) return TRUE; if ( !checkUnsigned(lf, row, peak, &p, "peak") || !checkUnsigned(lf, row, start, &s, "chromStart") || !checkUnsigned(lf, row, end, &e, "chromEnd")) return FALSE; if (p > e - s) { reportWarn("Error [file=%s, line=%d]: peak(%u) past block length (%u) [%s]" , lf->fileName, lf->lineIx,p, e - s, row); return FALSE; } return TRUE; } boolean checkIntBetween(struct lineFile *lf, char *row, char *val, char *name, int min, int max) // Return TRUE if val is integer between min and max // Othewise print warning and return FALSE { int i; if (!checkSigned(lf, row, val, &i, name)) return FALSE; verbose(2,"[%s %3d] inputLine=%d [%s] -> [%d] [%s,%d..%d]\n" , __func__, __LINE__, lf->lineIx, val, i, name, min, max); if (i >= min && i <= max) { verbose(2,"[%s %3d] min <= value <= max (%d <= %d <= %d)\n", __func__, __LINE__, min, i, max); return TRUE; } reportWarn("Error [file=%s, line=%d]: %s %d outside bounds (%d, %d) [%s]" , lf->fileName, lf->lineIx,name, i, min, max, row); return FALSE; } boolean checkFloat(struct lineFile *lf, char *row, char *val, char *name) // Return TRUE if val is floating point number // Othewise print warning and return FALSE // taken from sqlNum.c { char* end; double discardMe = strtod(val, &end); if ((end == val) || (*end != '\0')) { reportWarn("Error [file=%s, line=%d]: invalid %s '%s' [%s]", lf->fileName, lf->lineIx,name, val, row); discardMe = 0.0; return FALSE; } return TRUE; } boolean checkStrand(struct lineFile *lf, char *row, char *strand) // Return TRUE if strand == '+' or '-' or '.', // Othewise print warning and return FALSE { if (strlen(strand) == 1 && (*strand == '+' || *strand == '-' || *strand == '.')) { verbose(2,"[%s %3d] strand(%s)\n", __func__, __LINE__, strand); return TRUE; } reportWarn("Error [file=%s, line=%d]: invalid strand '%s' (want '+','-','.') [%s]" , lf->fileName, lf->lineIx,strand, row); return FALSE; } boolean wantNewLine(struct lineFile *lf, char **row, char *msg) { boolean res = lineFileNext(lf, row, NULL); if (!res) reportWarn("Error [file=%s, line=%d]: %s not found", lf->fileName, lf->lineIx, msg); return res; } boolean checkColumns(struct lineFile *lf, char *row, char *buf, char *words[], int wordSize, int expected) // Split buf into wordSize columns in words[] array // Return TRUE if number of columns == expected, otherwise FALSE { int n = 0; if (tab) n = chopByChar(buf, '\t', words, wordSize); else n = chopByWhite(buf, words, wordSize); if (n != expected) { reportWarn("Error [file=%s, line=%d]: found %d columns, expected %d [%s]" , lf->fileName, lf->lineIx, n, expected, row); return FALSE; } return TRUE; } boolean checkMismatchesSeq(struct lineFile *lf, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq) { int i, mm = 0; struct dnaSeq *g; static struct dnaSeq *cacheSeq = NULL; static char cacheChrom[1024]; static char bigArr[100 * 1024]; // 100K limit on tagAlign seqLen struct dnaSeq ourSeq; boolean chrMSizeAjustment=FALSE; if (privateData) // No way to check private data return TRUE; if (!genome) return TRUE; // only check if 2bit file specified if (lf->lineIx % mmCheckOneInN != 0) return TRUE; // dont check if this is not one in N if (!isSorted) { //unsigned end = chromEnd; if (sameString(chrom,"chrM")) { unsigned size = twoBitSeqSize(genome, chrom); if ( chromEnd>size) { chrMSizeAjustment=TRUE; chromEnd=size; } } g = twoBitReadSeqFragLower(genome, chrom, chromStart, chromEnd); } else { // read the whole chrom if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom)) { freeDnaSeq(&cacheSeq); int size = twoBitSeqSize(genome, chrom); cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size); strcpy(cacheChrom, chrom); verbose(2, "read in chrom %s size %d\n",cacheChrom, size); } int len = chromEnd - chromStart; if (len > sizeof(bigArr)) reportErrAbort("static array not big enough for sequence len %d on line %d\n", len, lf->lineIx); g = &ourSeq; g->dna = bigArr; g->size = len; memcpy(g->dna, &cacheSeq->dna[chromStart], len); } if (strand == '-') reverseComplement(g->dna, g->size); if ((g->size != strlen(seq) || g->size != chromEnd-chromStart) && !chrMSizeAjustment) { reportWarn("Error [file=%s, line=%d]: " "sequence (%s) length (%d) does not match genomic coords (%d / %d - %s %d %d %c)", lf->fileName, lf->lineIx, seq, (int)strlen(seq), chromEnd-chromStart, g->size, chrom, chromStart, chromEnd, strand); return FALSE; } int length = g->size; if (matchFirst && (matchFirst < length)) length = matchFirst; for (i=0 ; i < length; ++i) { char c = tolower(seq[i]); if (checkMismatch(c, g->dna[i])) ++mm; } if (mm > mismatches) { reportWarn("Error [file=%s, line=%d]: " "too many mismatches (found %d/%d, maximum is %d) (%s %d %d %c)\nseq=[%s]\ngen=[%s]\n", lf->fileName, lf->lineIx, mm, g->size, mismatches, chrom, chromStart, chromEnd, strand, seq, g->dna); return FALSE; } if (!isSorted) freeDnaSeq(&g); return TRUE; } boolean checkMismatchesSeq1Seq2(struct lineFile *lf, char *chrom, unsigned chromStart, unsigned chromEnd, char strand, char *seq1, char *seq2) { int i, mm1, mm2, len1, len2; struct dnaSeq *g1, *g2; if (privateData) // No way to check private data return TRUE; if (!genome) return TRUE; // dont check unless 2bit file specified if (lf->lineIx % mmCheckOneInN != 0) return TRUE; // dont check if this is not one in N len1 = strlen(seq1); len2 = strlen(seq2); if (strand == '-') { g1 = twoBitReadSeqFragLower(genome, chrom, chromEnd-len1, chromEnd); g2 = twoBitReadSeqFragLower(genome, chrom, chromStart, chromStart+len2); reverseComplement(g1->dna, g1->size); reverseComplement(g2->dna, g2->size); } else { g1 = twoBitReadSeqFragLower(genome, chrom, chromStart, chromStart+len1); g2 = twoBitReadSeqFragLower(genome, chrom, chromEnd-len2, chromEnd); } if (g1->size != len1 || g2->size != len2) { reportWarn("Error [file=%s, line=%d]: sequence lengths (%d, %d) do not match genomic ones (%d, %d)", lf->fileName, lf->lineIx, len1, len2, g1->size, g2->size); return FALSE; } mm1 = 0; for (i=0 ; i < g1->size ; ++i) { char c = tolower(seq1[i]); if (checkMismatch(c, g1->dna[i])) ++mm1; } mm2 = 0; for (i=0 ; i < g2->size ; ++i) { char c = tolower(seq2[i]); if (checkMismatch(c, g2->dna[i])) ++mm2; } if (mmPerPair) { if (mm1 > mismatches || mm2 > mismatches) { reportWarn("Error [file=%s, line=%d]: " "too many mismatches in one or both (seq1=%d/%d, seq2=%d/%d, maximum is %d) (%s %d %d %c)\nseq1=[%s] seq2=[%s]\ngen1=[%s] gen2=[%s]\n", lf->fileName, lf->lineIx, mm1, len1, mm2, len2, mismatches, chrom, chromStart, chromEnd, strand, seq1, seq2, g1->dna, g2->dna); return FALSE; } } else { if (mm1+mm2 > mismatches) { reportWarn("Error [file=%s, line=%d]: " "too many mismatches in pair (seq1=%d/%d, seq2=%d/%d, maximum is %d) (%s %d %d %c)\nseq1=[%s] seq2=[%s]\ngen1=[%s] gen2=[%s]\n", lf->fileName, lf->lineIx, mm1, len1, mm2, len2, mismatches, chrom, chromStart, chromEnd, strand, seq1, seq2, g1->dna, g2->dna); return FALSE; } } freeDnaSeq(&g1); freeDnaSeq(&g2); return TRUE; } void validateTagOrPairedTagAlign(struct lineFile *lf, boolean paired) { char *row; char buf[1024]; char *words[TAG_WORDS]; unsigned chromSize; int size; verbose(2,"[%s %3d] paired=%d file(%s)\n", __func__, __LINE__, paired, lf->fileName); while (lineFileNext(lf, &row, &size)) { unsigned start, end; safecpy(buf, sizeof(buf), row); if (!( checkColumns(lf, row, buf, words, TAG_WORDS, (paired ? 8 : 6)) && checkChrom(lf, row, words[0], &chromSize) && checkStartEnd(lf, row, words[1], words[2], words[0], chromSize, &start, &end) && checkIntBetween(lf, row, words[4], "score", 0, 1000) && checkStrand(lf, row, words[5]) && (paired ? (checkString(lf, row, words[3], "name") && checkSeq(lf, row, words[6], "seq1") && checkSeq(lf, row, words[7], "seq2") && checkMismatchesSeq1Seq2(lf, words[0], start, end, *words[5], words[6], words[7])) : (checkSeq(lf, row, words[3], "sequence") && checkMismatchesSeq(lf, words[0], start, end, *words[5], words[3])) ) ) ) { verbose(2, "%s\n", row); reportErrAbort("Aborting .. found 1 error\n"); } } } // tagAlign // chr1 6082 6117 TCTACTGGCTCTGTGTGTACCAGTCTGTCACTGAG 1000 - // chr1 7334 7369 AGCCAGGGGGTGACGTTGTTAGATTAGATTTCTTA 1000 + void validateTagAlign(struct lineFile *lf) { validateTagOrPairedTagAlign(lf, FALSE); } // pairedTagAlign // chr10 96316360 96310862 9 1000 + TCTCACCCGATAACGACCCCCTCCC TGATCCTTGACTCACTTGCTAATTT // chr8 126727657 126721865 10 1000 + AATTCTTCACCTCTCCTGTTCAAAG TGTGTGAGATCCAAGAATCCTCTCT void validatePairedTagAlign(struct lineFile *lf) { validateTagOrPairedTagAlign(lf, TRUE); } void validateBedVariant(struct lineFile *lf, enum bedType type) { char *row; char buf[1024]; char *words[PEAK_WORDS]; unsigned chromSize; int gappedOffset = (type == GAPPED_PEAK ? 6 : 0); verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while (lineFileNextReal(lf, &row)) { unsigned start, end; safecpy(buf, sizeof(buf), row); if (! ( checkColumns(lf, row, buf, words, PEAK_WORDS, bedTypeCols[type]) && checkChrom(lf, row, words[0], &chromSize) && checkStartEnd(lf, row, words[1], words[2], words[0], chromSize, &start, &end) && ( type == BED_GRAPH ? (checkFloat(lf, row, words[3], "value")) // canonical bedGraph has float in 4th column : // otherwise BROAD_, NARROW_, or GAPPED_PEAK (checkString(lf, row, words[3], "name") && checkIntBetween(lf, row, words[4], "score", 0, 1000) && checkStrand(lf, row, words[5]) // && ((type != GAPPED_PEAK) || ()) // for now dont check all the BED 12 gapped fields && checkFloat(lf, row, words[6 + gappedOffset], "signalValue") && checkFloat(lf, row, words[7 + gappedOffset], "pValue") && checkFloat(lf, row, words[8 + gappedOffset], "qValue") && ((type != NARROW_PEAK) || (checkPeak(lf, row, words[9], words[1], words[2]))) ) ) ) ) { verbose(2, "%s\n", row); reportErrAbort("Aborting .. found 1 error\n"); } } } void validateBroadPeak(struct lineFile *lf) { validateBedVariant(lf, BROAD_PEAK); } void validateNarrowPeak(struct lineFile *lf) { validateBedVariant(lf, NARROW_PEAK); } void validateGappedPeak(struct lineFile *lf) { validateBedVariant(lf, GAPPED_PEAK); } void validateBedGraph(struct lineFile *lf) { validateBedVariant(lf, BED_GRAPH); } void validateBed(struct lineFile *lf, int bedN, int bedP, struct asObject *as) // Validate regular bed [3-15] . and + { char *row; int bufSize = 1024; // bufSize is max row length char *buf = needMem(bufSize); char *words[1024]; unsigned chromSize; int fieldCount = 0; boolean allocedAs = FALSE; struct bed *bed; AllocVar(bed); verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while (lineFileNextReal(lf, &row)) { /* First time through figure out the field count, and if not set, the defined standard field count. */ if (fieldCount == 0) { if (as == NULL) { if (tab) fieldCount = chopByChar(row, '\t', NULL, 0); else fieldCount = chopByWhite(row, NULL, 0); char *asText = bedAsDef(bedN, fieldCount); as = asParseText(asText); allocedAs = TRUE; freeMem(asText); } else { fieldCount = slCount(as->columnList); asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard } if (bedP == -1) // user did not specify how many plus columns there are. { bedP = fieldCount - bedN; if (bedP < 1) reportErrAbort("Aborting: [file=%s, line=%d] fieldCount input (%d) did not match the specification (%s)\n" , lf->fileName, lf->lineIx, fieldCount, optionVal("type", "")); } if (fieldCount != bedN + bedP) reportErrAbort("Aborting: [file=%s, line=%d] fieldCount input (%d) did not match the specification (%s)\n" , lf->fileName, lf->lineIx, fieldCount, optionVal("type", "")); } int rowLen = strlen(row); if (bufSize < rowLen) { bufSize = 2 * rowLen; // allocate twice as much as we need now freeMem(buf); buf = needMem(bufSize); } safecpy(buf, bufSize, row); if (checkColumns(lf, row, buf, words, sizeof words, bedN+bedP) && checkChrom(lf, row, words[0], &chromSize)) { loadAndValidateBed(words, bedN, fieldCount, lf, bed, as, FALSE); if (bed->chromEnd > chromSize) { reportErrAbort("Error [file=%s, line=%d]: bed->chromEnd[%u] > chromSize[%u] [%s]" , lf->fileName, lf->lineIx, bed->chromEnd, chromSize, row); } } else { verbose(2, "%s\n", row); reportErrAbort("Aborting .. found error.\n"); } } if (allocedAs) asObjectFreeList(&as); freez(&bed); } void validateBedN(struct lineFile *lf) { int bedN = 0; int bedP = 0; // parse type char *type = cloneString(optionVal("type", "")); char *plus = strchr(type, '+'); if (plus) { *plus++ = 0; if (isdigit(*plus)) bedP = sqlUnsigned(plus); else bedP = -1; } if (bigBed) { if (!startsWith("bigBed", type)) errAbort("type must begin with \"bigBed\""); type +=6; } else { if (!startsWith("bed", type)) errAbort("type must begin with \"bed\""); type +=3; } bedN = sqlUnsigned(type); if (bedN < 3) reportErrAbort("Aborting .. bed must be 3 or higher, found %d\n", bedN); if (bedN > 15) // maybe someday 15 for microarray data? reportErrAbort("Aborting .. bed must be 15 or lower, found %d\n", bedN); /* Load up as-object if defined in file. */ struct asObject *asObj = NULL; if (as) { /* Parse it and do sanity check. */ asObj = asParseFile(as); if (asObj->next != NULL) errAbort("Can only handle .as files containing a single object."); } validateBed(lf, bedN, bedP, asObj); } // fasta: // >VHE-245683051005-13-1-2-1704 // GTGTTAATTTTCTTGATCTTTCGTTC // >VHE-245683051005-13-1-2-1704 // CTTGCTTTCTAGTTCTTTTAATTGTG void validateFasta(struct lineFile *lf) { char *seqName, *seq; boolean startOfFile = TRUE; verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while ( lineFileNext(lf, &seqName, NULL)) { if (startOfFile) { if (*seqName == '#') continue; else startOfFile = FALSE; } if (! (checkSeqName(lf, seqName, '>', "sequence name") && (wantNewLine(lf, &seq, "fastq sequence line")) && checkSeq(lf, seq, seq, "sequence") ) ) { verbose(2, "%s\n%s\n", seqName, seq); reportErrAbort("Aborting .. found 1 error\n"); } } } // fastq: // @NINA_1_FC30G3VAAXX:5:1:110:908 // ATCGTCAGGTGGGATAATCCTTACCTTTTCCTCCTC // +NINA_1_FC30G3VAAXX:5:1:110:908 // aa`]`a`XQ^VQQ^`aaaaaaa^[[ZG[aXUX[[[X void validateFastq(struct lineFile *lf) { char *seqName, *seq, *qName, *qual; int line = 0; boolean startOfFile = TRUE; verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while ( lineFileNext(lf, &seqName, NULL)) { ++line; if (startOfFile) { if (*seqName == '#') continue; else startOfFile = FALSE; } int len = 0; if (!(checkSeqName(lf, seqName, '@', "sequence name") && (wantNewLine(lf, &seq, "fastq sequence line")) && (len = checkSeq(lf, seq, seq, "sequence")) && (wantNewLine(lf, &qName, "fastq sequence name (quality line)")) && checkSeqName(lf, qName, '+', "quality name") && (wantNewLine(lf, &qual, "quality line")) && checkQual(lf, qual, len) )) { verbose(2,"%s\n%s\n%s\n%s\n", seqName, seq, qName, qual); reportErrAbort("Aborting .. found 1 error\n"); } } reportLabelAndLongNumber("number of sequences", line / 4); } /* Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm CS Fasta: >461_19_209_F3 T022213002230311203200200322000 >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0### T01301010111200210102321210100112312 */ void validateCsfasta(struct lineFile *lf) // Validate Colorspace fasta files { char *seqName = NULL; char *seq = NULL; boolean startOfFile = TRUE; verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while (lineFileNext(lf, &seqName, NULL)) { if (startOfFile) { if (*seqName == '#') continue; else startOfFile = FALSE; } if (! (checkCsSeqName(lf, seqName) && (wantNewLine(lf, &seq, "colorspace sequence name")) && checkSeq(lf, seq, seq, "colorspace sequence") ) ) { verbose(2,"%s\n%s\n", seqName, seq); reportErrAbort("Aborting .. found 1 error\n"); } } } /* Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm Sample:- # Cwd: /home/pipeline # Title: S0033_20080723_2_I22_EA_ >461_19_90_F3 20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11 >461_19_209_F3 16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9 */ void validateCsqual(struct lineFile *lf) // Validate Colorspace quality files { char *seqName = NULL; char *qual = NULL; boolean startOfFile = TRUE; verbose(2,"[%s %3d] file(%s)\n", __func__, __LINE__, lf->fileName); while (lineFileNext(lf, &seqName, NULL)) { if (startOfFile) { if (*seqName == '#') continue; else startOfFile = FALSE; } if (! (checkCsSeqName(lf, seqName) && (wantNewLine(lf, &qual, "colorspace quality line")) && checkCsQual(lf, qual) ) ) { verbose(2,"%s\n%s\n", seqName, qual); reportErrAbort("Aborting .. found 1 error\n"); } } } void validateBigWig(struct lineFile *lf) { if (chrHash == NULL) reportErrAbort("bigWig validation requires the -chromInfo or -chromDb option\n"); struct bbiFile *bbiFile; if (!bigWigFileCheckSigs(lf->fileName)) reportErrAbort("bad signatures in file %s\n", lf->fileName); bbiFile = bigWigFileOpen(lf->fileName); if (bbiFile == NULL) reportErrAbort("Aborting... Cannot open bigWig file: %s\n", lf->fileName); struct bbiChromInfo *bbiChroms = bbiChromList(bbiFile); if (bbiChroms == NULL) reportErrAbort("Aborting... cannot get bigWig chromosome list in file: %s\n", lf->fileName); struct bbiChromInfo *chroms = bbiChroms; for(; chroms; chroms = chroms->next) { unsigned *size; if ( (size = hashFindVal(chrHash, chroms->name)) == NULL) { reportErrAbort("bigWig contains invalid chromosome name: %s\n", chroms->name); } else { if (*size != chroms->size) { reportErrAbort("bigWig contains chromosome with wrong length: %s should be %d bases, " "not %d bases\n", chroms->name, *size, chroms->size); } } } report("version: %d\n", bbiFile->version); report("isCompressed: %s\n", (bbiFile->uncompressBufSize > 0 ? "yes" : "no")); report("isSwapped: %d\n", bbiFile->isSwapped); reportLabelAndLongNumber("primaryDataSize", bbiFile->unzoomedIndexOffset - bbiFile->unzoomedDataOffset); if (bbiFile->levelList != NULL) { long long indexEnd = bbiFile->levelList->dataOffset; reportLabelAndLongNumber("primaryIndexSize", indexEnd - bbiFile->unzoomedIndexOffset); } report("zoomLevels: %d\n", bbiFile->zoomLevels); struct bbiZoomLevel *zoom; for (zoom = bbiFile->levelList; zoom != NULL; zoom = zoom->next) report("\t%d\t%d\n", zoom->reductionLevel, (int)(zoom->indexOffset - zoom->dataOffset)); struct bbiChromInfo *chrom, *chromList = bbiChroms; report("chromCount: %d\n", slCount(chromList)); for (chrom=chromList; chrom != NULL; chrom = chrom->next) report("\t%s %d %d\n", chrom->name, chrom->id, chrom->size); struct bbiSummaryElement sum = bbiTotalSummary(bbiFile); reportLabelAndLongNumber("basesCovered", sum.validCount); report("mean: %f\n", sum.sumData/sum.validCount); report("min: %f\n", sum.minVal); report("max: %f\n", sum.maxVal); report("std: %f\n", calcStdFromSums(sum.sumData, sum.sumSquares, sum.validCount)); } #ifdef USE_BAM struct bamCallbackData { char *file; char *chrom; int *errs; int numAligns; int numNeg; int numPos; struct hash *tagHash; }; boolean checkCigarMismatches(char *file, int line, char *chrom, unsigned chromStart, char strand, char *seq, UBYTE* quals, unsigned int *cigarPacked, int nCigar) { static char cacheChrom[1024]; static struct dnaSeq *cacheSeq = NULL; if (!genome) reportErrAbort("Need to specify genome if checking BAM files"); if ((line % mmCheckOneInN) != 0) return TRUE; // dont check if this is not one in N // read the whole chrom if ((cacheChrom == NULL) || !sameString(chrom, cacheChrom)) { freeDnaSeq(&cacheSeq); int size = twoBitSeqSize(genome, chrom); cacheSeq = twoBitReadSeqFragLower(genome, chrom, 0, size); strcpy(cacheChrom, chrom); verbose(2, "read in chrom %s size %d: aligns to this point %d\n", cacheChrom, size, line); } int curPosGenome = chromStart; int i, j; int mmTotalQual = 0; int mm = 0; char dna[10000], *dnaPtr = dna; for (i = 0; (i < nCigar); i++) { char op; int size = bamUnpackCigarElement(cigarPacked[i], &op); switch (op) { case 'M': // match or mismatch (gapless aligned block) //printf("grabbing %d from %d\n",size,curPosGenome); for (j=0 ; j < size; ++j) { *dnaPtr++ = cacheSeq->dna[curPosGenome]; curPosGenome++; } break; case 'I': // inserted in query case 'S': // skipped query bases at beginning or end ("soft clipping") for(j=0; j < size; j++) *dnaPtr++ = '-'; break; case 'D': // deleted from query case 'N': // long deletion from query (intron as opposed to small del) curPosGenome += size; break; case 'H': // skipped query bases not stored in record's query sequence ("hard clipping") case 'P': // P="silent deletion from padded reference sequence" -- ignore these. break; default: reportErrAbort("unrecognized CIGAR op %c -- update me", op); } } *dnaPtr = 0; if (complementMinus && (strand == '-')) reverseComplement(seq, strlen(seq)); int length = strlen(seq); if (matchFirst && (matchFirst < length)) length = matchFirst; int checkLength = length; int start, incr; if (strand == '-') { start = strlen(seq) - 1; incr = -1; } else { start = 0; incr = 1; } for (i = start; (strand == '-') ? i >= 0 : i < strlen(seq); i += incr) { char c = tolower(seq[i]); if ((dna[i] != '-') && (checkMismatch(c, dna[i]))) { ++mm; if (quals) mmTotalQual += min( round( quals[i] / 10 ) * 10, 30 ); } if (--length == 0) break; } if (mm > mismatches || ((quals != NULL) && (mmTotalQual > mismatchTotalQuality))) { assert(checkLength < 10000); char match[10000]; for(i = 0; i < strlen(seq); i++) { if ((dna[i] == '-') || (tolower(seq[i]) == dna[i])) match[i] = ' '; else match[i] = 'x'; } match[i] = 0; if (mm > mismatches) { reportWarn("Error [file=%s, line=%d]: " "too many mismatches (found %d/%d, maximum is %d) (%s: %d\nquery %s\nmatch %s\ndna %s )\n", file, line, mm, checkLength, mismatches, chrom, chromStart, seq, match, dna); } if ((quals != NULL) && (mmTotalQual > mismatchTotalQuality)) { char squal[10000]; for (i = 0; i < checkLength; i++) squal[i] = '0' + min( round( quals[i] / 10 ), 3 ); reportWarn("Error [file=%s, line=%d]: " "total quality at mismatches too high (found %d, maximum is %d) " "(%s: %d\nquery %s\nmatch %s\ndna %s\nqual %s )\n", file, line, mmTotalQual, mismatchTotalQuality, chrom, chromStart, seq, match, dna, squal); } return FALSE; } return TRUE; } unsigned long long numQuals[256]; unsigned long quals[256]; unsigned int flagOr = 0; int parseBamRecord(const bam1_t *bam, void *data) /* bam_fetch() calls this on each bam alignment retrieved. */ { struct bamCallbackData *bd = data; char *chrom = bd->chrom; char *file = bd->file; int *errs = bd->errs; const bam1_core_t *core = &bam->core; unsigned int *cigarPacked = bam1_cigar(bam); char *query = bamGetQuerySequence(bam, FALSE); char strand = bamIsRc(bam) ? '-' : '+'; UBYTE *queryQuals = NULL; quals[core->qual]++; bd->numAligns++; unsigned char *s = bam1_aux(bam); char str[3]; str[2] = 0; while(s < bam->data + bam->data_len) { str[0] = s[0]; str[1] = s[1]; hashStore(bd->tagHash, str); uint8_t type; s += 2; type = *s; ++s; if (type == 'A') { ++s; } else if (type == 'C') { ++s; } else if (type == 'c') { ++s; } else if (type == 'S') { s += 2; } else if (type == 's') { s += 2; } else if (type == 'I') { s += 4; } else if (type == 'i') { s += 4; } else if (type == 'f') { s += 4; } else if (type == 'd') { s += 8; } else if (type == 'Z' || type == 'H') { while (*s) s++; ++s; } } if (core->flag & BAM_FUNMAP) // read is unmapped... ignore return 0; flagOr |= core->flag; if ( mismatchTotalQuality) { queryQuals = bamGetQueryQuals(bam, FALSE); int ii; int len = strlen(query); for(ii=0; ii < len; ii++) numQuals[queryQuals[ii]]++; } if (bam->core.l_qseq == 0) { reportWarn("zero length sequence on line %d\n", bd->numAligns); ++(*errs); } else if (! checkCigarMismatches(file, bd->numAligns, chrom, bam->core.pos, strand, query, queryQuals, cigarPacked, core->n_cigar)) { char *cigar = bamGetCigar(bam); reportWarn("align: ciglen %d cigar %s qlen %d pos %d length %d strand %c\n" , bam->core.n_cigar, cigar, bam->core.l_qname, bam->core.pos, bam->core.l_qseq, bamIsRc(bam) ? '-' : '+'); ++(*errs); } if ((bamPercent == 0.0) && (*errs) >= 1) reportErrAbort("Aborting .. found %d errors\n", *errs); if (strand == '+') bd->numPos++; else bd->numNeg++; return 0; } void validateBAM(struct lineFile *lf) { if (chrHash == NULL) reportErrAbort("BAM validation requires the -chromInfo or -chromDb option\n"); int errs = 0; samfile_t *fh = samopen(lf->fileName, "rb", NULL); if (fh == NULL) reportErrAbort("Aborting... Cannot open BAM file: %s\n", lf->fileName); bam_header_t *head = fh->header; if (head == NULL) reportErrAbort("Aborting... Bad BAM header in file: %s\n", lf->fileName); int ii; for(ii=0; ii < head->n_targets; ii++) { unsigned *size; verbose(2,"has chrom %s\n", head->target_name[ii]); if ( (size = hashFindVal(chrHash, head->target_name[ii])) == NULL) { reportWarn("BAM contains invalid chromosome name: %s\n", head->target_name[ii]); if (!allowOther) errs++; } else { if (*size != head->target_len[ii]) { reportWarn("BAM contains chromosome with wrong length: %s should be %d bases, not %d bases\n", head->target_name[ii], *size, head->target_len[ii]); if (!allowBadLength) errs++; } } } if (errs > 0) reportErrAbort("Aborting... %d errors found in BAM file\n", errs); if (!genome) return; // only check sequence if 2bit file specified bam_index_t *idx = bam_index_load(lf->fileName); if (idx == NULL) reportErrAbort("couldn't find index file for %s\n", lf->fileName); struct bamCallbackData *bd; AllocVar(bd); bd->file = lf->fileName; if (bamPercent != 0.0) errs = 0; // if we're testing on percent compliant, don't count header errs bd->errs = &errs; bd->numPos = bd->numNeg = 0; struct hash *tagHash = newHash(5); bd->tagHash = tagHash; for(ii=0; ii < head->n_targets; ii++) { char position[256]; safef(position, sizeof position, "%s:%d-%d",head->target_name[ii], 0, head->target_len[ii]); int chromId, start, end; int ret = bam_parse_region(fh->header, position, &chromId, &start, &end); bd->chrom = head->target_name[ii]; verbose(2,"asking for alignments on %s\n",bd->chrom); ret = bam_fetch(fh->x.bam, idx, chromId, start, end, bd, parseBamRecord); } report("number of BAM alignments %d pos %d neg %d errors %d\n", bd->numAligns, bd->numPos, bd->numNeg, errs ); if (doReport) { struct hashCookie cook = hashFirst(tagHash); struct hashEl *hel; report("tags: "); while((hel = hashNext(&cook)) != NULL) { report("%s ",hel->name); } report("\n"); report( "flagOr: %x\n", flagOr); } double percentCorrect = 100.0 * (bd->numAligns - errs)/(double)bd->numAligns; report( "BAM alignment compliancy rate: %g\n", percentCorrect); if (bamPercent != 0.0) { if (percentCorrect < bamPercent) reportErrAbort("Aborting: too many non-compliant BAM alignments (%%%g compliancy rate)", percentCorrect); errs = 0; } report("alignQuals "); for(ii=0; ii < 256; ii++) { report("%ld ",quals[ii]); } report("\n"); report("seqQuals "); for(ii=0; ii < 256; ii++) { report("%lld ",numQuals[ii]); } report("\n"); } #endif void validateFiles(void (*validate)(struct lineFile *lf), int numFiles, char *files[]) /* validateFile - validate format of different track input files. */ { int i; verbose(2,"[%s %3d] numFiles=%d \n", __func__, __LINE__, numFiles); for (i = 0; i < numFiles ; ++i) { struct lineFile *lf = NULL; if (bigBed) lf = lineFileOnBigBed(files[i]); else lf = lineFileOpen(files[i], TRUE); char buffer[10 * 1024]; if (doReport) { safef(buffer, sizeof buffer, "%s.report", files[i]); verbose(2, "opening report %s\n", buffer); reportF = mustOpen(buffer, "w"); } else reportF = NULL; report("validateFiles %s %s\n", version, files[i]); validate(lf); /* set up error catch to allow SIGPIPE errors from gzip */ struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) lineFileClose(&lf); errCatchEnd(errCatch); if (errCatch->gotError) warn("Ignoring: %s", errCatch->message->string); errCatchFree(&errCatch); report("endReport\n"); if (reportF != NULL) { fclose(reportF); reportF = NULL; } } verbose(2,"[%s %3d] done loop\n", __func__, __LINE__); printf( "Error count 0\n"); verbose(2,"[%s %3d] done\n", __func__, __LINE__); } int testFunc(char *f) { char *row; int size; struct lineFile *lf = lineFileOpen(f, TRUE); while (lineFileNext(lf, &row, &size)) printf("size=%d [%s]\n", size, row); printf("done.\n"); return 0; } int main(int argc, char *argv[]) /* Process command line. */ { char *type; void *func; struct hash *funcs = newHash(0); char *chromDb, *chromInfo; optionInit(&argc, argv, options); ++argv; --argc; if (optionExists("version")) reportErrAbort("%s", version); if (argc==0) usage(); type = optionVal("type", ""); if (strlen(type) == 0) reportErrAbort("please specify type"); genome = optionExists("genome") ? twoBitOpen(optionVal("genome",NULL)) : NULL; mismatches = optionInt("mismatches",0); matchFirst = optionInt("matchFirst",0); mmPerPair = optionExists("mmPerPair"); mismatchTotalQuality = optionInt("mismatchTotalQuality",0); nMatch = optionExists("nMatch"); privateData = optionExists("privateData"); isSorted = optionExists("isSorted"); mmCheckOneInN = optionInt("mmCheckOneInN", 1); colorSpace = optionExists("colorSpace") || sameString(type, "csfasta"); allowOther = optionExists("allowOther"); allowBadLength = optionExists("allowBadLength"); complementMinus = optionExists("complementMinus"); bamPercent = optionFloat("bamPercent", bamPercent); doReport = optionExists("doReport"); as = optionVal("as", as); tab = optionExists("tab"); if ((bamPercent != 0.0) && (mmCheckOneInN != 1 )) reportErrAbort("can't specify bamPercent and mmCheckOneInN"); initArrays(); dnaChars[(int)'.'] = 1; // Get chromInfo from DB or file if ( (chromDb = optionVal("chromDb", NULL)) != NULL) { chrHash = chromHashFromDatabase(chromDb); } else if ( (chromInfo=optionVal("chromInfo", NULL)) != NULL) { chrHash = chromHashFromFile(chromInfo); } verbose(2,"[%s %3d] type=%s\n", __func__, __LINE__, type); if (sameString("BAM", type)) reportErrAbort("Type error. Please specify BAM spelled in lower-case as -type=bam."); if (startsWith("bed", type) && isdigit(type[3])) type = "bedN"; if (startsWith("bigBed", type) && isdigit(type[6])) { type = "bedN"; bigBed = TRUE; } // Setup the function hash keyed by type hashAdd(funcs, "tagAlign", &validateTagAlign); // superseded by BAM hashAdd(funcs, "pairedTagAlign", &validatePairedTagAlign); // superseded by BAM hashAdd(funcs, "fasta", &validateFasta); hashAdd(funcs, "fastq", &validateFastq); hashAdd(funcs, "csfasta", &validateCsfasta); hashAdd(funcs, "csqual", &validateCsqual); hashAdd(funcs, "broadPeak", &validateBroadPeak); hashAdd(funcs, "narrowPeak", &validateNarrowPeak); hashAdd(funcs, "gappedPeak", &validateGappedPeak); hashAdd(funcs, "bedGraph", &validateBedGraph); #ifdef USE_BAM hashAdd(funcs, "bam", &validateBAM); #endif hashAdd(funcs, "bigWig", &validateBigWig); hashAdd(funcs, "bedN", &validateBedN); //hashAdd(funcs, "test", &testFunc); if (!(func = hashFindVal(funcs, type))) reportErrAbort("Cannot validate %s type files\n", type); validateFiles(func, argc, argv); return 0; }