#! /usr/bin/env python from sys import stdin, stderr, stdout, argv, exit from snp import SNP from getopt import getopt opts, args = getopt(argv[1:], "o:f:ul:c:vhH?") outnames = [] verbose = False freq = 0 removeuninformative = False minimumlength = 0 minimumcount = 0 for o in opts: if o[0][1] == 'o': outnames.append(o[1]) elif o[0][1] == 'f': freq = float(o[1]) if freq < 0 or freq >= 1: raise ValueError, "Frequency of mismatches should be in [0; 1[" elif o[0][1] == 'u': removeuninformative = True elif o[0][1] == 'l': minimumlength = int(o[1]) elif o[0][1] == 'c': minimumcount = int(o[1]) if minimumcount < 0: raise ValueError, "Cannot specify negative minimum SNP count" elif o[0][1] == 'v': verbose = True elif o[0][1] in 'hH?': print "Usage:", argv[0], "[options] [datafile]" print "Read SNP data from datafile and determine extended shared haplotypes" print "\nLegal options are:" print " -fX Allow fraction of X of the sites to differ" print " -lN Only report regions of length N, measured by SNP positions" print " -cN Only report regions of length N, measured by SNP count; if both -l and -c" print " options are invoked, both filters are applied" print " -u Remove singletons" print " -oN Write output to file N" print " -v Output some progress information to standard error along the way" print " -h, -H, -? Print this information and stop" exit(0) # Output string s to all output files def output(s): for f in outfiles: print >> f, s f.flush() # Analyse data from file pointer f def analyse_data(f): results = [] snp = SNP(f) if removeuninformative: snp.removeuninformative(verbose) for i in xrange(snp.n): results.append([]) for j in xrange(i): # Compare accessions i and j if verbose: print >> stderr, "Comparing accessions", snp.getname(j), "and", snp.getname(i) # Start by computing array of match calls match = snp.length * [True] m = snp.length u = 0 for k in xrange(snp.length): if snp.data[i].sequence[k] == '*' or snp.data[j].sequence[k] == '*': match[k] = None m -= 1 u += 1 print >>stderr, snp.data[i].sequence[k], snp.data[j].sequence[k], k elif snp.data[i].sequence[k] != snp.data[j].sequence[k]: match[k] = False m -= 1 if verbose: print >> stderr, m, "of", snp.length, "SNPs match" if u > 0: print >> stderr, u, "of", snp.length, "positions have one or both sequences undefined" # Then find long stretches of shared types shared = [] # Shared regions found start = 0 # Start of current potential region score = -1 # Score for current potential region maxscore = 0 # Maximum scored obtained in current potential region maxpoint = -1 # Position where the maximum score was attained for k in xrange(snp.length): if match[k] is True: # Matching types if score < 0: # Score at previous position negative, so start new region start = k score = freq / (1 - freq) maxscore = score maxpoint = k else: # Score at previous position non-negative, so continue region score += freq / (1 - freq) if score >= maxscore: # Found a new maximum score (when region length is used # to break ties) maxscore = score maxpoint = k elif match[k] is False: # Types are not matching score -= 1 if score < 0: # Current region has negative score score = -1 if maxpoint - start >= minimumcount and int(snp.getposition(maxpoint)) - int(snp.getposition(start)) >= minimumlength: # It just went into arrears, register best region in # current region if it is of sufficient length shared.append((start, maxpoint)) if verbose: stderr.write('#') # Set start value such that current region will not register start = k + 2 else: # Simply ignore sites with undefined characters pass if verbose and shared != []: stderr.write('\n') # Output shared haplotype regions shared.sort(lambda x, y: int(snp.getposition(y[1])) - int(snp.getposition(y[0])) - int(snp.getposition(x[1])) + int(snp.getposition(x[0]))) output("Comparing accessions " + snp.getname(j) + " and " + snp.getname(i)) for interval in shared: c = 0 for k in xrange(interval[0], interval[1] + 1): if match[k] is True: c += 1 output(" " + snp.getposition(interval[0]) + " to " + snp.getposition(interval[1]) + " (" + str((int(snp.getposition(interval[1])) - int(snp.getposition(interval[0]))) / 1000) + "kb) has " + str(c) + " matching positions in region with " + str(interval[1] - interval[0] + 1) + (" SNPs (%.4f%%)" % (100.0 * c / (interval[1] - interval[0] + 1)))) # Set up output files if outnames == []: outfiles = [stdout] else: outfiles = [open(i, 'w') for i in outnames] # Run through specified data files and analyse data if args == []: output("Reading data from standard input") analyse_data(stdin) else: for i in args: output("Reading data from file " + i) f = open(i) analyse_data(f) f.close() if outnames != []: for f in outfiles: f.close()