""" annotate calls that are present in a secondary VCF file todo: filter for genotype quality """ import vcf import filez import sys import getopt ################################################################################################### def wrap(line, length=80): """ Free function to wrap a long line. If the line is longer than 'length', then the remainder is wrapped into another line. I think this can be replaced by the textwrap.wrap function. """ if len(line) <= length: return line if len(line) == 0: return "" while length>=0 and line[length] != " ": length -= 1 start = length while length>=0 and line[length] == " ": length -= 1 if length>=0: return line[:length+1] + "\n" + wrap(line[start+1:]) while length maxdistance: return 4, (r1, r2) # try all pairs of alternate alleles mincmp, alleles = 4, None for i1, a1 in enumerate(call1["alt"]): for i2, a2 in enumerate(call2["alt"]): code = distance / (maxdistance+1.0) + _cmpcall(r1,a1,r2,a2,posdiff) if code < mincmp: mincmp = code alleles = i1+1, i2+1 return mincmp, alleles ################################################################################################### # bring original and replacement sequence into normal form, apart from rotation # due to position mismatch. For insertions or deletions one of r, a will be empty, # but for replacement calls both will be nonempty def _truncate( r, a, pos ): while len(r)>0 and len(a)>0 and r[-1] == a[-1]: r = r[:-1] a = a[:-1] while len(r)>0 and len(a)>0 and r[0] == a[0]: r = r[1:] a = a[1:] pos = pos+1 return r, a, pos ################################################################################################### # rotate variant call assuming a repetitive reference. Diff = relative position of current call def _rotate( r, a, diff ): if len(r)>0 and len(a)>0: if diff == 0: return r, a return "X"*len(r), "Y"*len(a) # do not rotate replacement calls; ensure mismatch if len(r)>0: a, r = _rotate(a, r, diff) return r, a # now len(r) == 0 return r, a[-(diff % len(a)):] + a[:-(diff % len(a))] ################################################################################################### def _cmpcall( r1, a1, r2, a2, diff ): r1, a1, p1 = _truncate(r1,a1,0) r2, a2, p2 = _truncate(r2,a2,diff) r2, a2 = _rotate( r2,a2,p2-p1) if r1 == r2 and a1 == a2: return 0 # perfect match if len(r1) == len(r2) and len(a1) == len(a2): return 1 # sequence mismatch if cmp(len(r1),len(a1)) == cmp(len(r2),len(a2)): return 2 # type match (ins/del/snp) return 3 ################################################################################################### # forms pairs of best-matching records out of a merged stream def VCFpair( stream, vcforder, maxdistance=0, ignoretype=False, ignorelength=False, ignoresequence=False ): window = [] # encode criteria into match code if ignoretype: matchcode = 3.9999 elif ignorelength: matchcode = 2.9999 elif ignoresequence: matchcode = 1.9999 else: matchcode = 0.9999 streamgood = True while streamgood: # invariant: window contains calls over no more than 2*maxdistance nucleotides # (and if the stream has run out, it is empty) try: record = stream.next() window.append(record) except StopIteration: streamgood = False while (len(window)>0 and ((len(window)>1 and vcforder.distance( window[0][1], window[-1][1] ) > 2*maxdistance) or (not streamgood))): # Make invariant true again by removing calls at head of list. # Any call that matches by current criteria with another in the window is paired # However, find reciprocal best match to avoid a spurious matching taking out a good pair curpair = [0] oldpair = [] while set(curpair) != set(oldpair): oldpair = curpair curpair = [curpair[-1]] bestmatch = None bestcode = 4 for i in range(len(window)): j = curpair[0] if i == j: continue # check that we're comparing between VCF files if window[j][0] != window[i][0]: code, alleles = cmpcall( window[j][1], window[i][1], maxdistance, vcforder ) if code < min(matchcode, bestcode): bestmatch = i bestcode = code bestalleles = alleles if bestmatch != None: curpair.append( bestmatch ) result = [None, None] if len(curpair)==2: # found a best pair. Store VCF line, and matching allele result[ window[oldpair[0]][0] ] = (window[oldpair[0]][1], bestalleles[0]) result[ window[oldpair[1]][0] ] = (window[oldpair[1]][1], bestalleles[1]) yield result del window[ max(oldpair) ] del window[ min(oldpair) ] else: # call at head does not match another result[ window[oldpair[0]][0] ] = (window[oldpair[0]][1], -1) yield result del window[oldpair[0]] # invariant is true again ################################################################################################### # 0=r/r, 1=r/a, 2=a/a, 3=other combination or no call def allelecode( g, allele ): if 'GT' not in g: raise ValueError("Allele call does not have GT field") g = g['GT'] if len(g[0]) == 1: if g[0][0] == 0: return 0,1 if g[0][0] == allele: return 1,1 return 3,1 if not (g[0][0] in [0,allele] and g[0][2] in [0,allele]): return 3,2 return (g[0][0] == allele) + (g[0][2] == allele), 2 ################################################################################################### # number of occurrences of an allele among a set of shared samples def countalleles( call, sharedsamples, allele, bins, homopolbin, microsatbin ): count, popsize = 0, 0 for s in sharedsamples: if s not in call: raise ValueError("Expected sample %s in call" % s) code, ploidy = allelecode( call[s], allele ) if code in [0,1,2]: count += code popsize += ploidy # calculate histogram bin if popsize>0: bin = int( bins*count / (popsize+1.0) ) else: bin = 0 hr = call["info"].get("HR",[0])[0] tr = call["info"].get("TR",[0])[0] if hr>=tr: idx = len([c for c in homopolbin if c < hr]) if idx>0: return count, (bin, "r"+str(idx-1)) return count, (bin, "0") idx = len([c for c in microsatbin if c < tr]) if idx>0: return count, (bin, "u"+str(idx-1)) return count, (bin, "0") ################################################################################################### def genotypeconcordance( call1, call2, allele1, allele2, sharedsamples ): gtconcordance = [ [0,0,0,0] for i in range(4) ] for s in sharedsamples: if s not in call1 or s not in call2: raise ValueError("Expected sample %s in call" % s) code1, ploidy = allelecode( call1[s], allele1 ) code2, ploidy = allelecode( call2[s], allele2 ) gtconcordance[ code1 ][ code2 ] += 1 return gtconcordance ################################################################################################### def updatepairconcordance( call1, call2, allele1, allele2, samples1, samples2, pairconcordance ): gt1 = [ allelecode( call1[s], allele1 )[0] for s in samples1 ] gt2 = [ allelecode( call2[s], allele2 )[0] for s in samples2 ] for i1 in range(len(gt1)): for i2 in range(len(gt2)): if gt1[i1]<3 and gt2[i2]<3: pairconcordance[i1][i2][gt1[i1]][gt2[i2]] += 1 ################################################################################################### def concordances( s1, s2, pairconcordance ): pc = pairconcordance[s1][s2] homcc = pc[0][0] / (0.01+pc[0][0]+pc[0][1]+pc[0][2]) hetcc = pc[1][1] / (0.01+pc[1][0]+pc[1][1]+pc[1][2]) altcc = pc[2][2] / (0.01+pc[2][0]+pc[2][1]+pc[2][2]) conc = (pc[0][0] + pc[1][1] + pc[2][2]) / (0.01+sum(pc[0][:3])+sum(pc[1][:3])+sum(pc[2][:3])) hh1 = (pc[1][0] + pc[1][1] + pc[1][2]) / (0.01+pc[0][0]+pc[0][1]+pc[0][2]+pc[2][0]+pc[2][1]+pc[2][2]) hh2 = (pc[0][1] + pc[1][1] + pc[2][1]) / (0.01+pc[0][0]+pc[1][0]+pc[2][0]+pc[0][2]+pc[1][2]+pc[2][2]) return conc, homcc, hetcc, altcc, hh1, hh2 ################################################################################################### def pair_conc_results_1( samples1, samples2, pairconcordance ): s2dict = dict( ((s,n) for n,s in enumerate(samples2)) ) for n1, s1 in enumerate(samples1): result = [] n2 = s2dict.get(s1,-1) if n2>-1: conc, homcc, hetcc, altcc, hh, dummy = concordances(n1,n2,pairconcordance) concs2 = s1 result += map(lambda x: "%0.3f"%x, [conc,homcc,hetcc,altcc]) else: conc, homcc, hetcc, altcc, hh = [0.0]*5 concs2 = "*" result += ["*"]*4 for n2, s2 in enumerate(samples2): conc0, homcc0, hetcc0, altcc0, hh0, dummy = concordances(n1,n2,pairconcordance) if conc0 > conc or (conc0 == conc and s2 == s1): concs2, conc, homcc, hetcc, altcc, hh = s2, conc0, homcc0, hetcc0, altcc0, hh0 if concs2 == s1: result += ["*"]*5 else: result += [concs2] + map(lambda x: "%0.3f"%x, [conc, homcc, hetcc, altcc]) yield [s1,"%0.3f"%hh] + result ################################################################################################### def pair_conc_results_2( samples1, samples2, pairconcordance ): s1dict = dict( ((s,n) for n,s in enumerate(samples1)) ) for n2, s2 in enumerate(samples2): result = [] n1 = s1dict.get(s2,-1) if n1>-1: conc, homcc, hetcc, altcc, dummy, hh = concordances(n1,n2,pairconcordance) concs1 = s2 result += map(lambda x: "%0.3f"%x, [conc,homcc,hetcc,altcc]) else: conc, homcc, hetcc, altcc, hh = [0.0]*5 concs1 = "*" result += ["*"]*4 for n1, s1 in enumerate(samples1): conc0, homcc0, hetcc0, altcc0, dummy, hh0 = concordances(n1,n2,pairconcordance) if conc0 > conc or (conc0 == conc and s1 == s2): concs1, conc, homcc, hetcc, altcc, hh = s1, conc0, homcc0, hetcc0, altcc0, hh0 if concs1 == s2: result += ["*"]*5 else: result += [concs1] + map(lambda x: "%0.3f"%x, [conc, homcc, hetcc, altcc]) yield [s2,"%0.3f"%hh] + result ################################################################################################### def cross_concordances( samples1, samples2, pairconcordance ): print "VCF1, cross-concordance with VCF2:" print "\t".join( ["Sample","H/H","var","hom","het","alt","Match","var","hom","het","alt"] ) for line in pair_conc_results_1( samples1, samples2, pairconcordance ): print "\t".join(line) print print "VCF2, cross-concordance with VCF1:" print "\t".join( ["Sample","H/H","var","hom","het","alt","Match","var","hom","het","alt"] ) for line in pair_conc_results_2( samples1, samples2, pairconcordance ): print "\t".join(line) print print "Columns: H/H=het/hom ratio (primary VCF)" print " var=variant concordance; hom/hom + het/het + var/var / total" print " * if no matching sample in secondary VCF" print " hom=homozygous concordance; hom/hom / (hom/ref + hom/het + hom/alt)" print " het=heterozygous concordance" print " alt=homozygous alternative concordance" print " Match=best matching sample; * if matches same sample" print " var,hom,het,alt: concordances with best-matching sample" print ################################################################################################### def pretty( b, homopolbin, microsatbin ): j, contextbin = b if contextbin == "0": return "f"+str(j) if contextbin.startswith("r"): return "f%s;HR>=%s" % (j, homopolbin[int(contextbin[1:])]) return "f%s;TR>=%s" % (j,microsatbin[int(contextbin[1:])]) def help(): print "Compare two VCF files" print "Usage: vcf-compare.py [options]" print print " -h, --help This help" print " -oF, --output F Send output (calls from set 1 annotated by match status) to file F" print " -iF, --input F Read input 1 from file F (default stdin)" print " -cF, --input2 F Read input 2 from file F (required)" print " -n, --nosamples Ignore sample information, just compare calls" print " -sR, --select R Select region or regions to compare (chr:start-end,chr:start-end,...)" print " -mK, --match K Match stringency (0 [default] exact; 1: ignore sequence; 2: ignore length; 3: ignore type)" print " -pN, --position N Allow position to mismatch N positions (default 0)" print " -fF1,F2,F3 Allow FILTER values F1, F2, F3 for VCF1 (default None)" print " -gF1,F2,F3 Allow FILTER values F1, F2, F3 for VCF2 (default None)" print " -bN, --bins N Bin by N frequency bins, using input 1 allele counts (default 1)" print " -rN[,N...] Bin by homopolymer run length using thresholds N,N,..." print " -uN[,N...] Bin by microsatellite length using thresholds N,N,..." print " -t, --samplestats Compute per-sample statistics" print " -a, --callstats Compute per-call statistics" print " -3 Set default input version to 3.3" print " -F Fast and sloppy GT parsing" print " -xE, --ignore E Ignore error/warning E" print print " Note: When positions do not match exactly, indel sequences are rotated assuming that the reference is repetitive." # todo: select single chromosome # todo: implement filtering print print "Possible errors are:" v = vcf.VCF() print wrap( ", ".join([v.split(':')[0] for v in v._errors.values()]) ) sys.exit(1) ################################################################################################### def main(): infile, fname = sys.stdin, "" secondaryin = None outfile = None inversion = 40 nosamples = False stringency = 0 maxdistance = 0 regions = None fastGT = False samplestats = False callstats = False homopolbin = [] microsatbin = [] allowfilter1, allowfilter2 = [], [] bins = 1 v = vcf.sortedVCF() try: if len(sys.argv)==1: sys.argv.append("-h") opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:c:l:d:3nm:p:f:g:s:tr:u:b:aF", ["help","output","input","input2","ignore","nosamples","match","position","select","samplestats","bins=","callstats"]) except getopt.GetoptError, err: print str(err) help() for o, a in opts: if o in ["-h","--help"]: help() sys.exit() elif o in ["-n","--nosamples"]: nosamples = True elif o in ["-m","--match"]: stringency = int(a) elif o in ["-p","--position"]: maxdistance = int(a) elif o in ["-x","--ignore"]: v.ignoreerror(a) elif o in ["-o","--output"]: outfile = open(a,'w') elif o in ["-b","--bins"]: bins = int(a) elif o in ["-s","--select"]: regions = vcf.parse_regions(a) elif o in ["-t","--samplestats"]: samplestats = True elif o in ["-r"]: homopolbin = map(int,a.split(',')) elif o in ["-u"]: microsatbin = map(int,a.split(',')) elif o in ["-a","--callstats"]: callstats = True elif o in ["-F"]: fastGT = True elif o in ["-i","--input"]: infile = filez.open(a,'r') fname = a elif o in ["-c","--input2"]: secondaryin = filez.open(a,'r') secondaryfname = a elif o == "-3": inversion = 33 elif o == "-f": allowfilter1 = a.split(',') elif o == "-g": allowfilter2 = a.split(',') if not secondaryin: raise ValueError("Need two input VCF files") v.setversion(inversion) v._fastGT = fastGT # pre-parse first 1000 lines of each file, to catch formatting problems early v0 = vcf.VCF( v, lines=1000, _fastGT=fastGT ) for dummy in v0.parse( infile ): pass infile = filez.open(fname,'r') v0 = vcf.VCF( v, lines=1000, _fastGT=fastGT ) for dummy in v0.parse( secondaryin ): pass secondaryin = filez.open(secondaryfname, 'r') # process data v.setregions(regions) secondary = vcf.sortedVCF(v) secondary._fastGT = fastGT vcfstream1 = v.parse( infile ) vcfstream2 = secondary.parse( secondaryin ) samples1 = v.getsamples() samples2 = secondary.getsamples() sharedsamples = list( set.intersection( set(samples1), set(samples2) ) ) if len(sharedsamples) == 0 and not nosamples: raise ValueError("Files do not share samples (consider option -n)") for f in allowfilter1: if f not in v.getfilter(): print "#WARNING: filter value %s not defined in header" % f for f in allowfilter2: if f not in secondary.getfilter(): print "#WARNING: filter value %s not defined in header" % f if samplestats: pairsamples1 = samples1 pairsamples2 = samples2 pairconcordance = [ [ [ [0,0,0] for i in range(3) ] for s2 in samples2 ] for s1 in samples1 ] if not nosamples: samples1 = sharedsamples samples2 = sharedsamples if outfile: vcfout = vcf.VCF(v) vcfout.setversion(40) for i in ["1","2"]: vcfout.getinfo()['CI'+i] = \ vcf.FORMAT('CI'+i,vcfout.NT_NUMBER,1,"String", "Concordance info VCF%s: NC/NA/AP = Not Called/No Allele in shared samples/Allele Present" % i, "") vcfout.getinfo()['GCC'] = \ vcf.FORMAT('GCC',vcfout.NT_NUMBER,16,"Integer", "Genotype concordance counts: HomRef(1)/HomRef(2), Het(1)/HomRef(2), HomAlt(1)/HomRef(2), Other(1)/HomRef(2), ...", [0]*16) vcfout.getheader().append( ("comment","Secondary input file (for CI1/CI2/GCC): %s" % secondaryfname) ) vcfout.getheader().append( ("source"," ".join(sys.argv)) ) vcfout.writeheader( outfile ) else: vcfout = None chromosomes = set(v.chr_order()) chromosomes.update(secondary.chr_order()) chromosomes = v.chr_order( list(chromosomes) ) vcforder = VCForder(chromosomes) mergedstream = VCFmerge(vcfstream1, vcfstream2, vcforder, filter1=allowfilter1, filter2=allowfilter2 ) pairstream = VCFpair( mergedstream, vcforder, maxdistance, ignoretype=(stringency>=3), ignorelength=(stringency>=2), ignoresequence=(stringency>=1) ) # setup results counters # callconcordance[i][j] i,j are 0,1 or 2 encoding: # 0 = site not called; # 1 = site called but allele not called in intersection # 2 = site and allele called # # gtconcordance[i][j] callconcordance[2][2], and i,j encode # 0 = ref/ref # 1 = alt/ref # 2 = alt/alt # 3 = other ( # Here "alt" = the matching non-reference allele, "ref" is the # reference, "other" a non-call or non-matching non-ref allele contextbins = ["0"] + ["r"+str(i) for i in range(len(homopolbin))] + ["u"+str(i) for i in range(len(microsatbin))] allbins = [] for cb in contextbins: for j in range(bins): allbins.append( (j,cb) ) callconcordance = dict([(j,[ [0,0,0] for i in range(3) ]) for j in allbins ]) gtconcordance = dict([(j,[ [0,0,0,0] for i in range(4) ]) for j in allbins ]) for pair in pairstream: if pair[1] == None: # VCF1 call not matched in VCF2 call = pair[0][0] call["info"]["CI2"] = ["NC"] numalleles, bin = countalleles( call, samples1, 1, bins, homopolbin, microsatbin ) if numalleles == 0: # VCF1 call with no alleles in intersection callconcordance[bin][1][0] += 1 call["info"]["CI1"] = ["NA"] else: callconcordance[bin][2][0] += 1 call["info"]["CI1"] = ["AP"] if outfile: vcfout.write_data( outfile, call ) elif pair[0] == None: # VCF2 call not matched in VCF1 call = pair[1][0] numalleles, dummy = countalleles( call, samples2, 1, bins, homopolbin, microsatbin ) if numalleles == 0: # VCF2 call with no alleles in intersection callconcordance[(0,"0")][0][1] += 1 else: callconcordance[(0,"0")][0][2] += 1 else: # Paired call call1, call2 = pair[0][0], pair[1][0] allele1, allele2 = pair[0][1], pair[1][1] numalleles1, bin = countalleles( call1, samples1, allele1, bins, homopolbin, microsatbin ) numalleles2, dummy = countalleles( call2, samples2, allele2, bins, homopolbin, microsatbin ) callconcordance[bin][ 1+(numalleles1>0) ][ 1+(numalleles2>0) ] += 1 call1["info"]["CI1"] = [["NA","AP"][numalleles1>0]] call1["info"]["CI2"] = [["NA","AP"][numalleles2>0]] gc = genotypeconcordance( call1, call2, allele1, allele2, sharedsamples ) call1["info"]["GCC"] = gc[0]+gc[1]+gc[2]+gc[3] for i1, gc1 in enumerate(gc): for i2, n in enumerate(gc1): gtconcordance[bin][i1][i2] += n if samplestats: updatepairconcordance( call1, call2, allele1, allele2, pairsamples1, pairsamples2, pairconcordance ) if outfile: vcfout.write_data( outfile, call1 ) print "\nConcordance summary" print "-------------------\n" print "\tVCF1: ",fname print "\tVCF2: ",secondaryfname print "\n\nCall concordance:" print "-----------------\n" for b in allbins: print "\t%s\t%s\t%s\t%s\t%s" % ("(bin %s)"%pretty(b,homopolbin,microsatbin),"VCF2:","NC","NA","AP") for i in range(3): print "\t%s\t%s\t%s\t%s\t%s" % (("VCF1:",["NC","NA","AP"][i])+tuple(callconcordance[b][i])) print print "\tNC=no call, NA=no alleles in sample intersection, AP=allele present in intersection" print "\tWith -n, NA and AP report allele presence or absence within all samples rather than the intersection" print "\n\nGenotype concordance (AP/AP calls):" print "-----------------------------------\n" for b in allbins: print "\t%s\t%s\t%s\t%s\t%s\t%s" % ("(bin %s)"%pretty(b,homopolbin,microsatbin),"VCF2:","HomRef","Het","HomAlt","Other") for i in range(4): print "\t%s\t%s\t%s\t%s\t%s\t%s" % (("VCF1:",["HomRef","Het","HomAlt","Other"][i])+tuple(gtconcordance[b][i])) print print "\t(HomRef: homozygous reference; Het: ref/alt where alt=matching non-ref allele," print "\t HomAlt: alt/alt, Other: no call, or unmatched non-ref allele)" print print "Genotype quality (AP/AP calls)" print "------------------------------\n" print "Freqbin\tHomRef\t\t\tHet\t\t\tHomAlt" print "\tTrue\tTotal\tFrac\tTrue\tTotal\tFrac\tTrue\tTotal\tFrac" for b in allbins: print ("\t".join(["%s"]*10)) % (pretty(b,homopolbin,microsatbin), gtconcordance[b][0][0],sum(gtconcordance[b][0][0:3]), "%1.3f" % (gtconcordance[b][0][0]/(0.01+sum(gtconcordance[b][0][0:3]))), gtconcordance[b][1][1],sum(gtconcordance[b][1][0:3]), "%1.3f" % (gtconcordance[b][1][1]/(0.01+sum(gtconcordance[b][1][0:3]))), gtconcordance[b][2][2],sum(gtconcordance[b][2][0:3]), "%1.3f" % (gtconcordance[b][2][2]/(0.01+sum(gtconcordance[b][2][0:3])))) print print "(Assumes VCF1 is true. Non-calls and unmatched non-ref alleles are not considered.)\n" if samplestats: print cross_concordances( pairsamples1, pairsamples2, pairconcordance ) ################################################################################################### if __name__ == "__main__": main()