#!/usr/bin/python import IO import cousins #Goal1: Find which pairs of clusters are shared between lines #Keep only clusters that are shared between cousin lines, the others are probably rearrangements - Keep new clusters and rearrangements in a new file #Goal2: For every cousin pair, count how many pairs share clusters and how many clusters they share - should be straightforward from the results of Goal 1. #For the remaining clusters, see if the cousin pair lacking the cluster, has a similar genetic background in the area (considering a 2?5? Mb window around the cluster) def overlap(row1, row2, chr_col, start_col, end_col): if (row1[chr_col] == row2[chr_col]) and ((row1[start_col] >= row2[start_col] and row1[start_col] <= row2[end_col]) or (row1[end_col] >= row2[start_col] and row1[end_col] <= row2[end_col]) or (row2[start_col] >= row1[start_col] and row2[start_col] <= row1[end_col]) or (row2[end_col] >= row1[start_col] and row2[end_col] <= row2[start_col])): return True return False def cluster_sharing(pair, clusters_d, mosaic, sharing_threshold = 0.8): #print(pair) line1, line2 = pair shared_clusters = [] line1_cl, line2_cl ={cl:clusters_d[cl] for cl in clusters_d.keys() if clusters_d[cl][0] == line1}, {cl:clusters_d[cl] for cl in clusters_d.keys() if clusters_d[cl][0] == line2} if line1_cl and line2_cl: cl_pairs = [(cl1, cl2) for cl1 in line1_cl.keys() for cl2 in line2_cl.keys() if overlap(line1_cl[cl1], line2_cl[cl2], 1, 2, 3)] if cl_pairs: for cl_pair in cl_pairs: chrom = line1_cl[cl_pair[0]][1] start, end = min(line1_cl[cl_pair[0]][2], line2_cl[cl_pair[1]][2]), min(line1_cl[cl_pair[0]][3], line2_cl[cl_pair[1]][3]) cl_sim = round(cousins.sample_pair_similarity(pair, mosaic, nwindows = 10, sample_every = 1000, genome_length = end-start, onlychromregion = True, region = [chrom, start, end]), 1) if line1 == "MAGIC.101" and line2 == "MAGIC.40": print(line1_cl[cl_pair[0]]) print(line2_cl[cl_pair[1]]) print(cl_sim) if cl_sim >= sharing_threshold: shared_clusters.append(cl_pair) #print(shared_clusters) return(shared_clusters) if __name__ == "__main__": print("Cluster sharing") clustersfile = "/home/martha/DPhil/Recombination/Results2/Clusters/clusters10.txt" cousinsfile = "/home/martha/DPhil/Recombination/Results2/CousinLines/cousins.txt" mosaicfile = "/home/martha/DPhil/Recombination/Results2/Clusters/mosaic10.clusters.txt" writedir = "/home/martha/DPhil/Recombination/Results2/CousinLines/" statsfile = writedir + "stats.txt" stats = [] clusters = IO.readtable(clustersfile, ignoreheader = True, sep = '\t') clusters = [[line, chrom, int(start), int(end), int(nr_breaks)] for [line, chrom, start, end, nr_breaks] in clusters] the_cousins = IO.readtable(cousinsfile, ignoreheader = True, sep = '\t') mosaic = IO.readtable(mosaicfile, ignoreheader = True, sep = '\t') clusters_d = {index:item for index, item in enumerate(clusters)} lines = list(set([row[0] for row in mosaic])) lines.sort() pairs_of_lines = [(l1, l2) for l1 in lines for l2 in lines[lines.index(l1)+1:]] pairs_of_cousins = [(row[0], row[ind]) for row in the_cousins for ind in range(1,7,2) if row[0] < row[ind] and float(row[ind+1]) < 0.9] #pairs_of_cousins = list(set(pairs_of_cousins)) #pairs_of_cousins = [tuple(pair) for pair in pairs_of_cousins] print(pairs_of_cousins) cluster_share = {pair:cluster_sharing(pair, clusters_d, mosaic) for pair in pairs_of_lines} #Q_1: How many pairs share each cluster? - Any cluster shared by over 5 pairs should be eliminated as rearrangement clusters_pairs = {cl:[] for cl in clusters_d.keys()} for pair in cluster_share.keys(): if cluster_share[pair]: for cl_pair in cluster_share[pair]: clusters_pairs[cl_pair[0]].append(pair) clusters_pairs[cl_pair[1]].append(pair) #clusters_d = {cl:clusters_d[cl].append(len(clusters_pairs[cl])) for cl in clusters_d.keys()} #print(clusters_d) print(clusters_pairs[1]) rearr = {} for cl in clusters_d.keys(): clusters_d[cl].append(len(clusters_pairs[cl])) print(cl) print(clusters_d[cl]) print(clusters_pairs[cl]) if len(clusters_pairs[cl])>=5: rearr[cl] = clusters_d.pop(cl) print(clusters_pairs[1]) stats.append(["Total clusters =", len(clusters)]) stats.append(["Clusters shared by over 5 lines (likely rearr) =", len(rearr.keys())]) stats.append(["Remaining clusters = ", len(clusters_d.keys())]) print(stats) print("ACTUAL CLUSTERS = " + str(len(clusters_d.keys()))) IO.writetable(writedir+"rearr.clusters.sharing.txt", rearr.values()) #Q_2: How many clusters are shared between cousins? #print(cluster_share) cl_shared_by_cousins = list(set([cl for cl in clusters_d.keys() for pair in clusters_pairs[cl] if pair in pairs_of_cousins])) print(cl_shared_by_cousins) stats.append(["Clusters shared by cousins =", len(cl_shared_by_cousins)]) #Q_3: How many of the remaining clusters share 50% of genetic background with their cousins? remaining_clusters = [cl for cl in clusters_d.keys() if cl not in cl_shared_by_cousins] sim_gen_background = [] for cl in remaining_clusters: line, chrom, start, end = clusters_d[cl][0:4] start = start end = end curr_cousins = [pair for pair in pairs_of_cousins if pair[0] == line or pair[1] == line] for pair in curr_cousins: curr_sim = cousins.sample_pair_similarity(pair, mosaic, nwindows = 10, sample_every = 1000, genome_length = end-start, onlychromregion = True, region = [chrom, start, end]) #print(pair) #print(cl) #print(curr_sim) if (curr_sim > 0.45): #print("True") sim_gen_background.append(cl) sim_gen_background = list(set(sim_gen_background)) stats.append(["Shared genetic background =", len(sim_gen_background)]) print(stats) for cl in clusters_d.keys(): if cl in cl_shared_by_cousins: clusters_d[cl].append("inter-crossing") elif cl in sim_gen_background: clusters_d[cl].append("selfing") else: clusters_d[cl].append("NA") print(stats) IO.writetable(writedir+"actual.clusters.not.sharing.txt", clusters_d.values()) IO.writetable(writedir+"clusters.stats.txt", stats)