#!/usr/bin/python
import IO
import cousins

#Goal1: Find which pairs of clusters are shared between lines
#Keep only clusters that are shared between cousin lines, the others are probably rearrangements - Keep new clusters and rearrangements in a new file
#Goal2: For every cousin pair, count how many pairs share clusters and how many clusters they share - should be straightforward from the results of Goal 1.
#For the remaining clusters, see if the cousin pair lacking the cluster, has a similar genetic background in the area (considering a 2?5? Mb window around the cluster)

def overlap(row1, row2, chr_col, start_col, end_col):
    if (row1[chr_col] == row2[chr_col]) and ((row1[start_col] >= row2[start_col] and row1[start_col] <= row2[end_col]) or (row1[end_col] >= row2[start_col] and row1[end_col] <= row2[end_col]) or (row2[start_col] >= row1[start_col] and row2[start_col] <= row1[end_col]) or (row2[end_col] >= row1[start_col] and row2[end_col] <= row2[start_col])):
        return True
    return False

def cluster_sharing(pair, clusters_d, mosaic, sharing_threshold = 0.8):
    #print(pair)
    line1, line2 = pair
    shared_clusters = []
    line1_cl, line2_cl ={cl:clusters_d[cl] for cl in clusters_d.keys() if clusters_d[cl][0] == line1}, {cl:clusters_d[cl] for cl in clusters_d.keys() if clusters_d[cl][0] == line2}
    if line1_cl and line2_cl:
        cl_pairs = [(cl1, cl2) for cl1 in line1_cl.keys() for cl2 in line2_cl.keys() if overlap(line1_cl[cl1], line2_cl[cl2], 1, 2, 3)]
        if cl_pairs:
            for cl_pair in cl_pairs:
                chrom = line1_cl[cl_pair[0]][1]
                start, end = min(line1_cl[cl_pair[0]][2], line2_cl[cl_pair[1]][2]), min(line1_cl[cl_pair[0]][3], line2_cl[cl_pair[1]][3])
                cl_sim =  round(cousins.sample_pair_similarity(pair, mosaic, nwindows = 10, sample_every = 1000, genome_length = end-start, onlychromregion = True, region = [chrom, start, end]), 1)
                if line1 == "MAGIC.101" and line2 == "MAGIC.40":
                    print(line1_cl[cl_pair[0]])
                    print(line2_cl[cl_pair[1]])
                    print(cl_sim)
                if cl_sim >= sharing_threshold:
                    shared_clusters.append(cl_pair)
    #print(shared_clusters)
    return(shared_clusters)
                
                

if __name__ == "__main__":
    print("Cluster sharing")
    clustersfile = "/home/martha/DPhil/Recombination/Results2/Clusters/clusters10.txt"
    cousinsfile = "/home/martha/DPhil/Recombination/Results2/CousinLines/cousins.txt"
    mosaicfile = "/home/martha/DPhil/Recombination/Results2/Clusters/mosaic10.clusters.txt"
    writedir = "/home/martha/DPhil/Recombination/Results2/CousinLines/"
    statsfile = writedir + "stats.txt"
    stats = []
    clusters = IO.readtable(clustersfile, ignoreheader = True, sep = '\t')
    clusters = [[line, chrom, int(start), int(end), int(nr_breaks)] for [line, chrom, start, end, nr_breaks] in clusters]
    the_cousins = IO.readtable(cousinsfile, ignoreheader = True,  sep = '\t')
    mosaic = IO.readtable(mosaicfile, ignoreheader = True,  sep = '\t')
    clusters_d = {index:item for index, item in enumerate(clusters)}
    lines = list(set([row[0] for row in mosaic]))
    lines.sort()
    pairs_of_lines = [(l1, l2) for l1 in lines for l2 in lines[lines.index(l1)+1:]]
    pairs_of_cousins = [(row[0], row[ind]) for row in the_cousins for ind in range(1,7,2) if row[0] < row[ind] and float(row[ind+1]) < 0.9]
    #pairs_of_cousins = list(set(pairs_of_cousins))
    #pairs_of_cousins = [tuple(pair) for pair in pairs_of_cousins]
    print(pairs_of_cousins)
    cluster_share = {pair:cluster_sharing(pair, clusters_d, mosaic) for pair in pairs_of_lines}
    #Q_1: How many pairs share each cluster? - Any cluster shared by over 5 pairs should be eliminated as rearrangement
    clusters_pairs = {cl:[] for cl in clusters_d.keys()}
    for pair in cluster_share.keys():
        if cluster_share[pair]:
            for cl_pair in cluster_share[pair]:
                clusters_pairs[cl_pair[0]].append(pair)
                clusters_pairs[cl_pair[1]].append(pair)
    #clusters_d = {cl:clusters_d[cl].append(len(clusters_pairs[cl])) for cl in clusters_d.keys()}
    #print(clusters_d)
    print(clusters_pairs[1])
    rearr = {}
    for cl in clusters_d.keys():
        clusters_d[cl].append(len(clusters_pairs[cl]))
        print(cl)
        print(clusters_d[cl])
        print(clusters_pairs[cl])
        if len(clusters_pairs[cl])>=5:
            rearr[cl] = clusters_d.pop(cl)
    print(clusters_pairs[1])
    stats.append(["Total clusters =", len(clusters)])
    stats.append(["Clusters shared by over 5 lines (likely rearr) =", len(rearr.keys())])
    stats.append(["Remaining clusters = ", len(clusters_d.keys())])
    print(stats)
    print("ACTUAL CLUSTERS = " + str(len(clusters_d.keys())))
    IO.writetable(writedir+"rearr.clusters.sharing.txt", rearr.values())
    #Q_2: How many clusters are shared between cousins?
    #print(cluster_share)
    cl_shared_by_cousins = list(set([cl for cl in clusters_d.keys() for pair in clusters_pairs[cl] if pair in pairs_of_cousins]))
    print(cl_shared_by_cousins)
    stats.append(["Clusters shared by cousins =", len(cl_shared_by_cousins)])
    #Q_3: How many of the remaining clusters share 50% of genetic background with their cousins?
    remaining_clusters = [cl for cl in clusters_d.keys() if cl not in cl_shared_by_cousins]
    sim_gen_background = []
    for cl in remaining_clusters:
        line, chrom, start, end = clusters_d[cl][0:4]
        start = start
        end = end
        curr_cousins = [pair for pair in pairs_of_cousins if pair[0] == line or pair[1] == line]
        for pair in curr_cousins:
            curr_sim = cousins.sample_pair_similarity(pair, mosaic, nwindows = 10, sample_every = 1000, genome_length = end-start, onlychromregion = True, region = [chrom, start, end])
            #print(pair)
            #print(cl)
            #print(curr_sim)
            if (curr_sim > 0.45):
                #print("True")
                sim_gen_background.append(cl)
    sim_gen_background = list(set(sim_gen_background))
    stats.append(["Shared genetic background =", len(sim_gen_background)])
    print(stats)
    for cl in clusters_d.keys():
        if cl in cl_shared_by_cousins:
            clusters_d[cl].append("inter-crossing")
        elif cl in sim_gen_background:
            clusters_d[cl].append("selfing")
        else:
            clusters_d[cl].append("NA")
    print(stats)
    IO.writetable(writedir+"actual.clusters.not.sharing.txt", clusters_d.values())
    IO.writetable(writedir+"clusters.stats.txt", stats)