#! /usr/bin/env python from sys import stdin, stdout, stderr, exit, argv from getopt import getopt #from hcluster import single, to_tree from disjointset import DisjointSet from defaultdict import Dict from math import log, sin, cos, pi, exp, floor from copy import deepcopy def log10(x): return log(x, 10) opts, args = getopt(argv[1:], "?Hab:c:d:e:hi:l:mo:r:suvw:") outnames = [] # Names of output files verbose = False # Output progress information csort = False # Sort accessions so accessions with more shared haplotypes are close asort = False # Sort accessions alphabetically start = None # Start of region to visualise end = None # End of region to visualise ostart = None # Option specification for start oend = None # Option specification for end uniqueprefix = False # Truncate accession names to the minimum amount required for uniqueness minscore = .5 # Minimum score that a shared haplotype region for an accession can have - determines how faded it can be minobs = None # Minimum score maxobs = None # Maximum score infofiles = None # Files sequencing and SNP information is read from avrange = 50000 # SNP density for a point is averaged over region extending avrange either side density = None # Density of data points in SNP density plots classcolours = [] # Colours used for connected components of shared haplotypes scaledist = .07 # Distance, as fraction of plot length from plots to informative SNP density scale # Function for outputting brief usage description def print_usage(output = stdout, exitcode = None, postscript = None): print >> output, "Usage:", argv[0], "[Options] [input files]" print >> output, "Read information generated by shared haplotype investigation and create an" print >> output, "Octave format plot of the data. Information should all pertain to a single" print >> output, "chromosome, as numerical values of chromosomes are used for sorting" print >> output, "\nLegal options are:" print >> output, " -oN Output plot code to file N. Default is to output to standard output." print >> output, " Multiple output files can be specified by repeated application of this" print >> output, " option" print >> output, " -cC Add colour C (which should be comma separated RGB values) to the colours" print >> output, " used to represent connected components of shared haplotypes. Default is" print >> output, " set of all RGB values with one or two values of 1 and the rest 0. Fading" print >> output, " is defined by interpolation by score between class colour and white." print >> output, " Colours are used in the order they are added. If an insufficient number" print >> output, " of colours is available, the program will crash" print >> output, " -a Sort accessions alphabetically" print >> output, " -s Sort accessions by first clustering by maximum shared length between" print >> output, " haplotypes and then ordering subtrees so sum of shared lengths between" print >> output, " neighbours is maximised. If both the -a and -s options are specified," print >> output, " the last occurrence takes presedence. CURRENTLY DEFUNCT" print >> output, " -bX Set start point of region to X, otherwise 0 is used" print >> output, " -eX Set end point of region to X, otherwise maximum value encountered is used" print >> output, " -u Only include as much of the prefix of an accession name required for" print >> output, " uniqueness in the graphs showing shared haplotypes in 2D plot" print >> output, " -lX Set lowest score of accesion to X; score is used as colour intensity" print >> output, " -iN Use SNP information from file N to affect the background representation" print >> output, " of the accessions (default is a thin, grey line). Multiple files can" print >> output, " be specified" print >> output, " -wX Width of region to base statistics from information files on. Sites with" print >> output, " an absolute difference to current position of at most X will be used" print >> output, " -dX Density of points used for plotting information file statistics for. A" print >> output, " data point every X position will be used to generate the graphs. If no" print >> output, " density is specified, the total set of positions from the information" print >> output, " file be used to define the data points" print >> output, " -rX Distance from plots to informative SNP density scale, measured relative" print >> output, " to the width of the plot." print >> output, " -v Write some information to standard error" print >> output, " -h, -H, -? Print this information and stop" if postscript != None: output.write('\n') print >> output, postscript if exitcode != None: exit(exitcode) # Analyse command line options for o in opts: if o[0][1] == 'o': outnames.append(o[1]) elif o[0][1] == 'i': if infofiles == None: infofiles = [o[1]] else: infofiles.append(o[1]) elif o[0][1] == 'w': avrange = int(o[1]) elif o[0][1] == 'd': density = int(o[1]) elif o[0][1] == 'c': try: c = map(lambda x: float(x), o[1].split(',')) if len(c) != 3 or max(c) > 1 or min(c) < 0: raise ValueError except ValueError: print_usage(output = stderr, exitcode = 1, postscript = "Colour not a valid comma separated RGB value") classcolours.append(c) elif o[0][1] == 's': # csort = True CURRENTLY DEFUNCT asort = False elif o[0][1] == 'a': asort = True csort = False elif o[0][1] == 'l': minscore = float(o[1]) if minscore < 0 or minscore > 1: print_usage(output = stderr, exitcode = 1, postscript = "Minimum score has to be number between 0 and 1") elif o[0][1] == 'r': scaledist = float(o[1]) elif o[0][1] == 'u': uniqueprefix = True elif o[0][1] == 'v': verbose = True elif o[0][1] == 'b': ostart = int(o[1]) elif o[0][1] == 'e': oend = int(o[1]) elif o[0][1] in 'hH?': print_usage(output = stdout, exitcode = 0) # Set up files for output if outnames == []: outfiles = [stdout] else: outfiles = [] for n in outnames: outfiles.append(open(n, 'w')) # Set up files for input if args == []: args = [stdin] # Define default colours if none were specified if classcolours == []: classcolours = [(0, 0, 1), (1, 0, 0), (0, 1, 0), (1, 0, 1), (0, 1, 1), (1, 1, 0), (1, .5, .5), (.5, 1, .5), (.5, .5, 1)] # Binary search for item in arr having valuefn return largest value # not larger than x; arr is assumed to be sorted according to valuefn def bsearchd(x, arr, valuefn): if len(arr) == 0: return None low = 0 high = len(arr) while low < high - 1: mid = (low + high) / 2 if valuefn(arr[mid]) <= x: low = mid else: high = mid return low # Binary search for item in arr having valuefn return smallest value # not smaller than x; arr is assumed to be sorted according to valuefn def bsearchu(x, arr, valuefn): if len(arr) == 0: return None low = -1 high = len(arr) - 1 while low < high - 1: mid = (low + high) / 2 if valuefn(arr[mid]) < x: low = mid else: high = mid return high # Read background information if infofiles != None: # Structure for holding information about overall informative sites informative = set([]) alle = set([]) refprivate = set([]) refseq = None # Class for holding information on a specific site class SiteInf: def __init__(self, heterozygous, coverage): self.heterozygous = (int(heterozygous) == 1) self.coverage = int(coverage) # Class for holding information on a specific accession class AccInfo: # Class initialisation def __init__(self): self.__info__ = [] # List of per site information self.__sorted__ = True # Is list sorted? Iterate over all # positions in points (default is to iterate over all positions # from infofiles) and return average values def iterate(self, points = None): class __ReturnValue: def __init__(self, position, heterozygocity, private, informative, length, coverage): self.pos = position self.het = heterozygocity self.snp = private self.inf = informative self.len = length self.cov = coverage if not self.__sorted__: self.__info__.sort() if points == None: points = alle low = 0 high = 0 hetacc = 0 snpacc = 0 inflist = list(informative) inflist.sort() ilow = 0 ihigh = 0 def privatesnp(i): return int(self.__info__[i][0] not in informative and (self != refseq or self.__info__[i][0] in refprivate)) for i in points: if i < start or i > end: # Ignore points outside start,end interval continue # Update with sites no longer in region averaged over while low < len(self.__info__) and self.__info__[low][0] < i - avrange: hetacc -= int(self.__info__[low][1].heterozygous) snpacc -= privatesnp(low) low += 1 # Update with new sites included in region averaged over while high < len(self.__info__) and self.__info__[high][0] <= i + avrange: hetacc += int(self.__info__[high][1].heterozygous) snpacc += privatesnp(high) high += 1 while ilow < len(inflist) and inflist[ilow] < i - avrange: ilow += 1 while ihigh < len(inflist) and inflist[ihigh] <= i + avrange: ihigh += 1 yield __ReturnValue(i, float(hetacc) / max(1, high - low), snpacc, ihigh - ilow, min(end, i + avrange) - max(start, i - avrange), max(0, self.find(i)[1].coverage)) # Iterator for object def __iter__(self): for i in self.iterate(): yield i # Add new position def add(self, position, heterozygous, coverage): self.__info__.append((position, SiteInf(heterozygous, coverage))) self.__sorted__ = False # Extract positions within global variable avrange distance from position def extract(self, position): if not self.__sorted__: self.__info__.sort() i = bsearchu(position - avrange, self.__info__, lambda x: x[0]) j = bsearchd(position + avrange, self.__info__, lambda x: x[0]) return self.__info__[i:j + 1] # Return element closest to position def find(self, position): if not self.__sorted__: self.__info__.sort() i = bsearchd(position, self.__info__, lambda x: x[0]) if i < len(self.__info__) - 1 and self.__info__[i + 1][0] - position < position - self.__info__[i][0]: i += 1 return self.__info__[i] info = Dict(initfn = AccInfo) count = Dict(0) for fname in infofiles: if verbose: print >> stderr, "Reading information from", fname f = open(fname) s = f.readline() while s != "": s = s.split() info[s[0]].add(int(s[2]), int(s[3]), int(s[4])) count[int(s[2])] += 1 alle.add(int(s[2])) s = f.readline() f.close() # Private SNPs are only registered for accession and reference, # unless it is a reference private SNP where it is registered for # all for i in count.items(): if i[1] > 2 and i[1] < len(info): informative.add(i[0]) elif i[1] == len(info): # Private SNP for reference sequence refprivate.add(i[0]) alle = list(alle) alle.sort() i = -1 for a in info.values(): if len(a.__info__) > i: i = len(a.__info__) refseq = a # Output s to all output files def output(s): for f in outfiles: print >> f, s f.flush() # Determine whether prefix of s matches p def prefix(s, p): return s[:len(p)] == p # Read data from file pointer f def readdata(f): global maxobs, minobs # Class for shared haplotypes class SharedHap: def __init__(self, start, end, score): self.start = start self.end = end self.score = score # Read data data = {} # Shared haplotype regions acc = [] # Accessions encountered while True: s = f.readline() if s == "": break if prefix(s, "Comparing"): # Start of new pair of accessions s = s[len("Comparing accessions "):] i = s.find(" and ") a1 = s[:i].strip() if a1 not in acc: acc.append(a1) a2 = s[i + len(" and "):].strip() if a2 not in acc: acc.append(a2) data[(a1, a2)] = data[(a2, a1)] = [] else: # Specification of shared haplotype s = s.split() if s[0].isdigit(): data[(a1, a2)].append(SharedHap(int(s[0]), int(s[2]), float(s[-1][1:-2]))) # Remember highest and lowest scores observed if maxobs == None or data[(a1, a2)] < maxobs: maxobs = data[(a1, a2)] if minobs == None or data[(a1, a2)] > minobs: minobs = data[(a1, a2)] return data, acc # Clean up data by removing parts not in the region specified by start # and end. If no start/end has been specified, find it. def cleanup(data): global start, end start = ostart end = oend if start == None: for i in data.values(): for j in i: if start == None: start = j.start else: start = min(start, j.start) if end == None: end = 0 for i in data.values(): for j in i: end = max(end, j.end) # Check regions against start and end for i in data.values(): # Run through shared haplotypes for pair yielding i j = 0 while j < len(i): if i[j].end < start or i[j].start > end: # This region is entirely outside start to end region del i[j] else: if i[j].start < start: # This region begins before start i[j].start = start if i [j].end > end: # This region ends after end i[j].end = end j += 1 # Reorder subtrees in t such that sum of shared haplotype lengths # between neighbours, given by d, is maximised. def reorder(t, d): M = {} N = {} L = {} B = {} # Generate list of all leaves under node t def leaves(t): return t.pre_order(lambda x: x.get_id()) # Update M and N for node t and return leaves under t def recurse(t, d, M, N, L, B): if t.is_leaf(): # Reached bottom of recursion M[(t.get_id(), t.get_id())] = 0 return ([t.get_id()], ), None else: left = recurse(t.get_left(), d, M, N, L, B)[0] cleft = sum(left, []) L[t.get_left()] = cleft right = recurse(t.get_right(), d, M, N, L, B)[0] cright = sum(right, []) L[t.get_right()] = cright best = (-1,) b = {} for l in xrange(len(left)): # For subtrees of left subtree for i in left[l]: # For leaves in left subtree for j in cright: # Initialise best score with flanks i, j for all j in right subtree M[(i, j)] = -1 for r in xrange(len(right)): # For subtrees of right subtree for j in right[r]: # For leaves in right subtree # Compute best score with left flank i and left flank of # right subtree j N[(i, j)] = -1 for k in left[len(left) - 1 - l]: if M[(i, k)] + d[(k, j)] > N[(i, j)]: N[(i, j)] = M[(i, k)] + d[(k, j)] b[(i, j)] = k # Compute best score for all i, k flanks where k is not # in same subtree of right subtree as j, unless right # subtree is a leaf for k in right[len(right) - 1 - r]: if N[(i, j)] + M[(j, k)] > M[(i, k)]: M[(i, k)] = N[(i, j)] + M[(j, k)] B[(i, k)] = ((i, b[(i, j)]), (j, k)) # Make score symmetric for j in cright: M[(j, i)] = M[(i, j)] B[(j, i)] = B[(i, j)] if M[(i, j)] > best[0]: best = (M[(i, j)], (i, j)) return (cleft, cright), best[1] # Compute best score obtainable for any pair of flanking accessions # (the pair of accessions uniquely define the node in the tree). best = recurse(t, d, M, N, L, B)[1] # Backtrack score for flanking pair i, j, where i is in left subtree # and j is in right subtree. def backtrack(t, i, j, flip): if not t.is_leaf(): # Still need to recurse deeper flip = flip ^ (i != B[(i, j)][0][0]) backtrack(t.get_left(), B[(i, j)][0][0], B[(i, j)][0][1], flip) backtrack(t.get_right(), B[(i, j)][1][0], B[(i, j)][1][1], flip) if flip: # Swap subtrees t.right, t.left = t.get_left(), t.get_right() backtrack(t, best[0], best[1], False) return t.pre_order(lambda x: x.get_id()) # Sort data by first doing a hierarchical agglomerative clustering # based on maximum similarity, then reorder subtrees such that sum of # similarities between neighbours is maximised. def sortdata(acc, data): # Start by creating distance matrix d = [] D = {} for i in xrange(len(acc)): for j in xrange(i + 1, len(acc)): d.append(0) for k in data[(acc[i], acc[j])]: d[-1] += k.end - k.start D[(i, j)] = D[(j, i)] = d[-1] m = max(d) for i in xrange(len(d)): d[i] = m - d[i] # Perform clustering t = to_tree(single(d)) # Reorder tree to maximise sum of neighbour similarities return map(lambda x: acc[x], reorder(t, D)) # Class representing a shared haplotype class Sharing: def __init__(self, start, end, score, pair): self.start = start self.end = end self.pair = pair self.score = score # Find regions of shared haplotypes and the overlaps in each region def find_regions(data): # Create list of all shared haplotypes in sorted order shared = sum(map(lambda x: map(lambda y: Sharing(y.start, y.end, y.score, x[0]), x[1]), data.items()), []) shared.sort(lambda x, y: x.start - y.start) # Find maximal sets where no shared hapotypes are disjoint end = shared[0].end regions = [] region = [(shared[0].pair, shared[0].score)] overhang = [] newoverhang = [] for i in xrange(1, len(shared)): if shared[i].start > end: # This shared haplotype does not overlap all shared haplotypes # in region; start new region # Eliminate shared haplotypes from previous regions that no longer # extends into current region j = 0 while j < len(overhang): if overhang[j].end >= shared[i - 1].start: # This shared haplotype extends into current region region.append((overhang[j].pair, overhang[j].score)) if overhang[j].end < shared[i].start: # This shared haplotype does not extend into new region del overhang[j] else: # Proceed to next shared haplotype j += 1 # Store start and end of core region, and all shared haplotypes in region regions.append((shared[i - 1].start, end, region)) # Start new region region = [(shared[i].pair, shared[i].score)] end = shared[i].end # Move shared haplotypes part of current region to haplotypes # that can extend into subsequent regions for j in newoverhang: if j.end >= shared[i].start: overhang.append(j) newoverhang = [shared[i]] else: # This shared haplotype belongs to current region end = min(end, shared[i].end) region.append((shared[i].pair, shared[i].score)) newoverhang.append(shared[i]) # Check for addition of shared haplotypes to last region for i in overhang: if i.end >= shared[-1].start: region.append((i.pair, i.score)) regions.append((shared[-1].start, end, region)) return regions # Return string with name of accession i with any extensions removed def prune_acc(s, unique = False, acc = None): if unique: i = 2 for j in acc: if not prefix(s, j): while i <= len(s) and prefix(s[:i], j): i += 1 s = s[:i - 1] return s # Create visualisation of data in file f def __count(): i = 0 while True: i += 1 yield i count = __count().next def visualise(f): if f == stdin: output("# Visualisation of data from standard input") if verbose: print >> stderr, "Reading data from standard input" else: output("# Visualisation of data from " + f) if verbose: print >> stderr, "Reading data from file", f f = open(f) # Read data data, acc = readdata(f) # Clean up data cleanup(data) if csort: # Sort data to group similar accessions together acc = sortdata(acc, data) elif asort: # Sort data alphabetically acc.sort() # Output Octave code for visualising data # Iterator function for points that should be used as data points for # plotting SNP density information def points(): if density != None: for i in xrange(start, end, density): yield i yield end else: i = bsearchu(start, alle, lambda x: x) while i < len(alle) and alle[i] <= end: yield alle[i] i += 1 # Create summary visualisation for the accessions as single entities output("figure(%d)" % count()) output('box("off")') output('axis("off")') output('line([%d %d], [-.5 -.5], "linewidth", 1.5, "color", "k", "linestyle", "-")' % (start, end)) ticstep = 10 ** int(log10((end - start) / 4.0)) if 5 * ticstep <= (end - start) / 4.0: ticstep = 5 * ticstep elif 2 * ticstep <= (end - start) / 4.0: i = 2 * ticstep ticstart = ticstep * (int((start - 1) / ticstep) + 1) for i in xrange(ticstart, end + 1, ticstep): output('line([%d %d], [-.5 -.8], "linewidth", 1.5, "color", "k", "linestyle", "-")' % (i, i)) output('text(%d, -1.2, "%d", "horizontalalignment", "center", "verticalalignment", "top")' % (i, i)) for i in xrange(len(acc)): output('line([%d %d], [%d %d], "linewidth", 1, "color", [.7, .7, .7], "linestyle", ":")' % (start, end, i, i)) output('text(%d, %d, "%s", "horizontalalignment", "right")' % (start - (end - start) * .05, i, prune_acc(acc[i], unique = uniqueprefix, acc = acc))) if infofiles != None: # Function for determining tic value def findtic(x): print x e = int(floor(log(x, 10))) if x / 10 ** e > 2: if x / 10 ** e > 5: m = 5 else: m = 2 else: m = 1 return (m * 10 ** e, m, e) # Start by plotting informative SNP density as background x = [] y = [] for i in points(): x.append(str(i)) y.append(float(bsearchd(min(end, i + avrange), alle, lambda x: x) - bsearchu(max(start, i - avrange), alle, lambda x: x)) / float(min(end, i + avrange) - max(start, i - avrange))) m = max(y) tic = findtic(m) scale = (len(acc) - 1) / m y = map(lambda x: "%.2f" % (x * scale), y) output('line([%d %d %d], [0 0 %d], "color", [.6 .6 .6], "linewidth", 1)' % (end + (end - start) * (scaledist + .01), end + (end - start) * scaledist, end + (end - start) * scaledist, len(acc))) output('line([%d %d], [%.2f, %.2f], "color", [.6, .6, .6], "linewidth", 1)' % (end + (end - start) * scaledist, end + (end - start) * (scaledist + .01), tic[0] * scale, tic[0] * scale)) output('text(%d, 0, "0", "color", [.6, .6, .6])' % (end + (end - start) * (scaledist + .02))) output('text(%d, %.2f, "%s10^{%d}", "color", [.6, .6, .6])' % (end + (end - start) * (scaledist + .02), tic[0] * scale, int(tic[1] != 1) * ("%d * " % tic[1]), tic[2])) output('line([' + " ".join(x) + '], [' + " ".join(y) + '], "color", [.6, .6, .6], "linewidth", .7, "linestyle", "-")') # Create sorted list of all shared haplotype events, and compute # geometric mean of all lengths and scores she = [] meanlength = 0 meanscore = 0 for i in data: if i[0] < i[1]: # We only want one copy of each shared haplotype for j in data[i]: she.extend([(j.start, Sharing(j.start, j.end, j.score, i)), (j.end, Sharing(j.start, j.end, j.score, i))]) meanlength += log(j.end - j.start) meanscore += log(max(.025, 100 - j.score)) she.sort() meanlength = exp(meanlength * 2.0 / len(she)) meanscore = exp(meanscore * 2.0 / len(she)) # Class describing representation class Repr: def __init__(self, group, score, start): self.group = group self.score = score self.start = start # Scan through region, keeping track of which pairs of accessions # share haplotypes conn = {} # Connections current = Dict() # Current annotation for accessions for e in xrange(len(she)): if she[e][0] == she[e][1].start: # New shared haplotype conn[she[e][1].pair] = she[e][1] else: # End of shared haplotype del conn[she[e][1].pair] if e < len(she) - 1 and she[e][0] == she[e + 1][0]: # Deal with events happening at same position in one go continue # Class for holding relevant information about edges class EdgeInf: def __init__(self, score, length): self.score = score self.length = length # Determine representation of current state and plot lines changing status # First compute connected components and connectivities comp = {} # Connected components edges = Dict(lambda: [], True) # Connections for each accession for i in conn: for j in xrange(2): if i[j] not in comp: # First time we have seen this accession comp[i[j]] = DisjointSet(i[j]) # Update with connection between these two accessions comp[i[0]].union(comp[i[1]]) for j in xrange(2): edges[i[j]].append(EdgeInf(conn[i].score, conn[i].end - conn[i].start)) # Function for computing geometric mean def geometric_mean(l): return exp(reduce(lambda x, y: log(y) + x, l, 0) * 1.0 / len(l)) # Function assigning score to haplotype def accscore(edges, compsize): # Function computing score of accession with edge scores given by # edges in component of size compsize # Score dependencies: # Length - linear contribution, normalised by overall mean length = geometric_mean(map(lambda x: x.length, edges)) / meanlength # Connectivity - fraction of full clique this sequence is connected to number = len(edges) / float(compsize - 1) # Score - reciprocal of deviation from 100%, normalised score = meanscore / geometric_mean(map(lambda x: max(0.025, 100 - x.score), edges)) # Clique size - reciprocal of expected tree length compared to expected # separation of pair, mapped to the interval from 1/2 to 1 size = 1 - 1.0 / (log(compsize) * compsize * (compsize - 1)) # Final score finalscore = min(1, length * number * score * size) # Map final score to interval from minscore to 1 return minscore + (1 - minscore) * finalscore # Determine RGB values for specified class and score def acccolour(score, group): return tuple(map(lambda x: 1 - score * (1 - x), classcolours[group])) # Alternative determination of RGB values def altcolour(frac, group): frac = minscore + (1 - minscore) * frac return tuple(map(lambda x: 1 - frac * (1 - x), classcolours[group])) def width(score): # Width ranges from 6 to 20, with 14 for overall mean return 2 * min(10, max(3, 10 - (score - 0.025) * 3 / (meanscore - 0.025))) # Generate output for accessions starting, ending, or changing score here updated = set([]) for a in acc: if a in current: if a in edges: # Accession has shared haplotype extending over event position score = (geometric_mean(map(lambda x: max(0.025, 100 - x.score), edges[a])), len(edges[a]) / float(comp[a].size() - 1)) if score != current[a].score: # But its score changes output('line([%d %d], [%d %d], "linewidth", %.1f, "color", [%.2f, %.2f, %.2f], "linestyle", "-")' % ((current[a].start, she[e][0], acc.index(a), acc.index(a), width(current[a].score[0])) + altcolour(current[a].score[1], current[a].group))) current[a].start = she[e][0] updated.add(a) else: # Accession ends having a shared haplotype at this position output('line([%d %d], [%d %d], "linewidth", %.1f, "color", [%.2f, %.2f, %.2f], "linestyle", "-")' % ((current[a].start, she[e][0], acc.index(a), acc.index(a), width(current[a].score[0])) + altcolour(current[a].score[1], current[a].group))) del current[a] elif a in edges: # Start of shared haplotype for this accession current[a] = Repr(None, (geometric_mean(map(lambda x: max(0.025, 100 - x.score), edges[a])), len(edges[a]) / float(comp[a].size() - 1)), she[e][0]) # Determine classes of connected components and output accessions # changing class here group = Dict(lambda: Dict(0), True) for i in comp: if i in current and current[i].group != None: # Update score for component of i inheriting class of i group[comp[i].find()][current[i].group] += current[i].score[0] else: group[comp[i].find()][None] += current[i].score[0] # Sort class assignments accoring to score vote1 = [] vote2 = [] for i in group: s = 0 for j in group[i]: if j != None: vote1.append((-group[i][j], i, j)) s -= group[i][j] vote2.append((s, i)) # Assign existing classes based on score vote1.sort() group = {} used = len(classcolours) * [False] for i in vote1: if i[1] not in group and not used[i[2]]: # Class with representation i[1] has not been assigned a class # yet, and i[2] is unused - use this assignment. group[i[1]] = i[2] used[i[2]] = True # Assign lowest ranking available class to components not yet # assigned classes vote2.sort() j = 0 for i in vote2: if i[1] not in group: while j < len(used) and used[j]: j += 1 if j >= len(used): raise ValueError, "Ran out of classes at position " + str(she[e][0]) group[i[1]] = j # For each accession in a currently shared haplotype, compute its # representation and perform necessary updates if it has changed for a in current: if current[a].group == None: # Accession only just started having a shared haplotype here current[a].group = group[comp[a].find()] elif current[a].group != group[comp[a].find()]: # Accession changes class at this position if a not in updated: # And we haven't yet output it output('line([%d %d], [%d %d], "linewidth", %.1f, "color", [%.2f, %.2f, %.2f], "linestyle", "-")' % ((current[a].start, she[e][0], acc.index(a), acc.index(a), width(current[a].score[0])) + altcolour(current[a].score[1], current[a].group))) current[a].group = group[comp[a].find()] # Update scores if a in updated: # Update score for accessions where we identified a change in score current[a].score = (geometric_mean(map(lambda x: max(0.025, 100 - x.score), edges[a])), len(edges[a]) / float(comp[a].size() - 1)) # As ends of shared haplotypes are also events, everything should be # properly concluded. for a in acc: if a in current: print >> stderr, "Still accessions with current track" exit(1) # Generate lines representing SNP densities if infofiles != None: x = Dict(initfn = lambda: []) yd = Dict(initfn = lambda: []) maxyd = 0 for a in acc: yf = [] for p in info[a].iterate(points()): x[a].append(str(p.pos)) yd[a].append(float(p.snp) / p.len) if p.snp == 0 and p.inf == 0: yf.append("%.2f" % (acc.index(a) + .5)) else: yf.append('%.2f' % (acc.index(a) + p.snp / float(p.snp + p.inf))) if len(x[a]) > 0: output('line([' + " ".join(x[a]) + '], [' + " ".join(yf) + '], "linewidth", .5, "linestyle", "-", "color", [.3, .3, .3])') maxyd = max(maxyd, max(yd[a])) # Determine scaling for private SNP density graphs tic = findtic(maxyd) scale = .9 / maxyd # Output graphs for private SNP density values for a in acc: if len(x[a]) > 0: output('line([' + " ".join(x[a]) + '], [' + " ".join(map(lambda x: ("%.2f" % (acc.index(a) + x * scale)), yd[a])) + '], "linewidth", .5, "linestyle", "-", "color", [0, 0, 0])') output('line([%(x2).2f %(x1).2f %(x1).2f], [%(y)d %(y)d %(y)d.9], "linewidth", 1, "linestyle", "-", "color", [0, 0, 0])' % {'x1': end + (end - start) * .01, 'x2': end + (end - start) * .02, 'y': acc.index(a)}) output('line([%(x1).2f %(x2).2f], [%(y).2f, %(y).2f], "linewidth", 1, "linestyle", "-", "color", [0, 0, 0])' % {'x1': end + (end - start) * .01, 'x2': end + (end - start) * .02, 'y': acc.index(a) + scale * tic[0]}) output('text(%.2f, %d, "0", "color", [0, 0, 0])' % (end + (end - start) * .03, acc.index(a))) output('text(%.2f, %.2f, "%s10^{%d}", "color", [0, 0, 0])' % (end + (end - start) * .03, acc.index(a) + scale * tic[0], int(tic[1] != 1) * ("%d * " % tic[1]), tic[2])) if f != stdin: f.close() # Run through each input file and generate visualisation for f in args: visualise(f) if outnames != []: for f in outfiles: f.close()