#! /usr/bin/env python
from sys import stdin, stderr, stdout, argv, exit
from snp import SNP
from getopt import getopt

opts, args = getopt(argv[1:], "o:f:ul:c:vhH?")
outnames = []
verbose = False
freq = 0
removeuninformative = False
minimumlength = 0
minimumcount = 0
for o in opts:
  if o[0][1] == 'o':
    outnames.append(o[1])
  elif o[0][1] == 'f':
    freq = float(o[1])
    if freq < 0 or freq >= 1:
      raise ValueError, "Frequency of mismatches should be in [0; 1["
  elif o[0][1] == 'u':
    removeuninformative = True
  elif o[0][1] == 'l':
    minimumlength = int(o[1])
  elif o[0][1] == 'c':
    minimumcount = int(o[1])
    if minimumcount < 0:
      raise ValueError, "Cannot specify negative minimum SNP count"
  elif o[0][1] == 'v':
    verbose = True
  elif o[0][1] in 'hH?':
    print "Usage:", argv[0], "[options] [datafile]"
    print "Read SNP data from datafile and determine extended shared haplotypes"
    print "\nLegal options are:"
    print " -fX Allow fraction of X of the sites to differ"
    print " -lN Only report regions of length N, measured by SNP positions"
    print " -cN Only report regions of length N, measured by SNP count; if both -l and -c"
    print "     options are invoked, both filters are applied"
    print " -u  Remove singletons"
    print " -oN Write output to file N"
    print " -v  Output some progress information to standard error along the way"
    print " -h, -H, -? Print this information and stop"
    exit(0)

# Output string s to all output files
def output(s):
  for f in outfiles:
    print >> f, s
    f.flush()

# Analyse data from file pointer f
def analyse_data(f):
  results = []
  snp = SNP(f)
  if removeuninformative:
    snp.removeuninformative(verbose)
  for i in xrange(snp.n):
    results.append([])
    for j in xrange(i):
      # Compare accessions i and j
      if verbose:
        print >> stderr, "Comparing accessions", snp.getname(j), "and", snp.getname(i)
      # Start by computing array of match calls
      match = snp.length * [True]
      m = snp.length
      u = 0
      for k in xrange(snp.length):
        if snp.data[i].sequence[k] == '*' or snp.data[j].sequence[k] == '*':
          match[k] = None
          m -= 1
          u += 1
          print >>stderr, snp.data[i].sequence[k], snp.data[j].sequence[k], k
        elif snp.data[i].sequence[k] != snp.data[j].sequence[k]:
          match[k] = False
          m -= 1
      if verbose:
        print >> stderr, m, "of", snp.length, "SNPs match"
        if u > 0:
          print >> stderr, u, "of", snp.length, "positions have one or both sequences undefined" 
      # Then find long stretches of shared types
      shared = [] # Shared regions found
      start = 0 # Start of current potential region
      score = -1 # Score for current potential region
      maxscore = 0 # Maximum scored obtained in current potential region
      maxpoint = -1 # Position where the maximum score was attained
      for k in xrange(snp.length):
        if match[k] is True:
          # Matching types
          if score < 0:
            # Score at previous position negative, so start new region
            start = k
            score = freq / (1 - freq)
            maxscore = score
            maxpoint = k
          else:
            # Score at previous position non-negative, so continue region
            score += freq / (1 - freq)
            if score >= maxscore:
              # Found a new maximum score (when region length is used
              # to break ties)
              maxscore = score
              maxpoint = k
        elif match[k] is False:
          # Types are not matching
          score -= 1
          if score < 0:
            # Current region has negative score
            score = -1
            if maxpoint - start >= minimumcount and int(snp.getposition(maxpoint)) - int(snp.getposition(start)) >= minimumlength:
              # It just went into arrears, register best region in
              # current region if it is of sufficient length
              shared.append((start, maxpoint))
              if verbose:
                stderr.write('#')
            # Set start value such that current region will not register
            start = k + 2
        else:
          # Simply ignore sites with undefined characters
          pass
      if verbose and shared != []:
        stderr.write('\n')
      # Output shared haplotype regions
      shared.sort(lambda x, y: int(snp.getposition(y[1])) - int(snp.getposition(y[0])) - int(snp.getposition(x[1])) + int(snp.getposition(x[0])))
      output("Comparing accessions " + snp.getname(j) + " and " + snp.getname(i))
      for interval in shared:
        c = 0
        for k in xrange(interval[0], interval[1] + 1):
          if match[k] is True:
            c += 1
        output(" " + snp.getposition(interval[0]) + " to " + snp.getposition(interval[1]) + " (" + str((int(snp.getposition(interval[1])) - int(snp.getposition(interval[0]))) / 1000) + "kb) has " + str(c) + " matching positions in region with " + str(interval[1] - interval[0] + 1) + (" SNPs (%.4f%%)" % (100.0 * c / (interval[1] - interval[0] + 1))))


# Set up output files
if outnames == []:
  outfiles = [stdout]
else:
  outfiles = [open(i, 'w') for i in outnames]

# Run through specified data files and analyse data
if args == []:
  output("Reading data from standard input")
  analyse_data(stdin)
else:
  for i in args:
    output("Reading data from file " + i)
    f = open(i)
    analyse_data(f)
    f.close()
if outnames != []:
  for f in outfiles:
    f.close()