#!/usr/bin/python
from __future__ import division
import IO
import pysam
from scipy import spatial as spa
from scipy import cluster as cluster
from collections import Counter
import numpy
import os

#function that selects the n top-scoring candidate qtls from a logp table if we are in haplotype mode
#EXTENSION: for non-haplotype mode using a smith-waterman approach we can define the qtl??
def get_candidate_qtls(logp_table, n_qtls = 3, logp_thres = 6, haplotype_mode = True, qtl_size_limit = 1000000):
    qtls = []
    if haplotype_mode:
        for i in xrange(n_qtls):
            #print('QTL = ' + str(i))
            pvalues = [float(row[2]) for row in logp_table]
            maxp = max(pvalues)
            #print(maxp)
            if (maxp < logp_thres):
                qtls.append(['NA', 'NA', 'NA', 'NA'])
            else:
                maxp_index = pvalues.index(maxp)
                #print(maxp_index)
                chrom, fro, to = 'Chr'+str(logp_table[maxp_index][0]), maxp_index, maxp_index
                while pvalues[fro] > logp_thres and fro>0:
                    fro -= 1
                while pvalues[to] > logp_thres and to<len(logp_table)-1:
                    to += 1
                #print(maxp)
                #print(fro)
                #print(to)
                qtls.append([chrom, int(logp_table[fro+1][1]), int(logp_table[to-1][1]), maxp])
                logp_table = [[c, pos, 0.00000001] if (index >= fro and index <= to) else [c, pos, pval] for index, [c, pos, pval] in enumerate(logp_table)]
    else:
        print('Need to implement SW recursion for non-haplotype test')
    return qtls

#potentially add other criteria
def get_random_seqs(nseqs, size, maskfile):
    numpy.random.seed()
    print('simulate')
    chrom_lengths_dict = {'Chr1':30427671, 'Chr2': 19698289, 'Chr3':23459830, 'Chr4':18585056, 'Chr5':26975502}
    mask = IO.readgff(maskfile)
    samples = list(numpy.random.choice(chrom_lengths_dict.keys(), nseqs))
    i = 1
    rsamples = {}
    for chr_sample in samples:
        if (i % 100) == 0:
            print(i)
        inside_transposon = True
        while(inside_transposon):
            sample_start = numpy.random.randint(0, chrom_lengths_dict[chr_sample])
            sample_end = sample_start + size
            inside_transposon = False
            #############BOTTLENECK!!!
            for row in mask:
                if row[0] == chr_sample and ((sample_start>=int(row[3]) and sample_start<=int(row[4])) or (sample_end>=int(row[3]) and sample_end<=int(row[4])) or (sample_start<=row[3] and sample_end >= row[4])):
                    inside_transposon = True
        rsamples[i] = [str(chr_sample), int(sample_start), int(sample_end)]
        i += 1
    return(rsamples)

def bootstrap(samfiles, n_bootstrap_samples, length_of_each_sample, size_qtl, maskfile):
    refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch']
    rsamples = get_random_seqs(n_bootstrap_samples, length_of_each_sample, maskfile)
    #for each random sample, for the same set of lines, find what is the maximum number of reads mapping to the same area (of the same size as the qtl) - agglomerative clustering????
    #for each random sample return: [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines]
    #mean refers to the number of lines
    bootstrap_samples = {}
    print('Run')
    for i, rs in rsamples.iteritems():
        if (i % 100) == 0:
            print(i)
            #print(rs)
        curr_rand_dict = {}
        curr_rand_dict_coverage = {}
        all_normal = True
        for line, sf in samfiles.iteritems():
            tc = [[read.rnext, read.pnext] for read in sf.fetch(rs[0], rs[1], rs[2]) if abs(read.tlen) >= 1000 or abs(read.tlen) == 0]
            curr_rand_dict[line] = tc
            all_reads = [read.tlen for read in sf.fetch(rs[0], rs[1], rs[2])]
            curr_rand_dict_coverage[line] = len(all_reads)
            mean_coverage = float(numpy.mean(curr_rand_dict_coverage.values()))
            if len(tc) > 0:
                all_normal = False
        if (all_normal):
            bootstrap_samples['_'.join([rs[0], str(rs[1]), str(rs[2])])] = [0, mean_coverage, 0, 0]
        else:
            #find reads (with abnormal tlen) that map to each chromosome and find the largest cluster of size equal to the qtl for each chromosome
            chr_reads = {}
            for chrom in refs[0:5]:
                chrom_pos = [[read_pair[1]] for line in curr_rand_dict.keys() for read_pair in curr_rand_dict[line] if refs[read_pair[0]] == chrom]
                if len(chrom_pos)>1:
                    eudist = spa.distance.pdist(chrom_pos)
                    clusters = list(cluster.hierarchy.fcluster(cluster.hierarchy.linkage(eudist), t = size_qtl, criterion = 'distance'))
                    counts = Counter(clusters)
                    max_clust = [clu for clu in counts.keys() if counts[clu] == max(counts.values())]
                    chr_reads[chrom] = [[refs.index(chrom), pos[0]] for index, pos in enumerate(chrom_pos) if clusters[index] == max_clust[0]]
                elif len(chrom_pos) == 1:
                    chr_reads[chrom] = chrom_pos
                else:
                    chr_reads[chrom] = []
            #print(chr_reads)
            #find which chromosome has the maximum cluster
            cluster_lengths_per_chromosome = {chrom:len(val) for chrom, val in chr_reads.iteritems()}
            max_val = max(cluster_lengths_per_chromosome.values())
            max_chr = [chrom for chrom, val in  cluster_lengths_per_chromosome.iteritems() if val == max_val][0]
            rand_trans_reads = chr_reads[max_chr]
            n_trans_reads = len(rand_trans_reads)
            dict_line_selected_trans_reads = {}
            #find what is the distribution of such reads
            for line, line_trans_reads in curr_rand_dict.iteritems():
                dict_line_selected_trans_reads[line] = 0
                if( rand_trans_reads and line_trans_reads):
                    for rtr in line_trans_reads:
                        if rtr in rand_trans_reads:
                            rand_trans_reads.pop(rand_trans_reads.index(rtr))
                            dict_line_selected_trans_reads[line] += 1
            curr_rand_percentages = {line:dict_line_selected_trans_reads[line]/curr_rand_dict_coverage[line] if curr_rand_dict_coverage[line] > 0 else 0 for line in dict_line_selected_trans_reads.keys()}
            mean_percentage_trans_reads = numpy.mean(curr_rand_percentages.values())
            non_zero_lines = [line for line, crp in curr_rand_percentages.iteritems() if crp > 0]
            n_non_zero_lines = len(non_zero_lines)
            bootstrap_samples['_'.join([rs[0], str(rs[1]), str(rs[2])])] = [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines]
    return bootstrap_samples
        

#function that quantifies the fraction of reads that map from one given region (defined as: [chrom, start, end]) to another
#returns [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, conf_interval_start, conf_interval_end, p_value, num_lines_with_trans, conf_interval_start, conf_interval_end, p_value]
def get_trans(from_reg, to_reg, bamdir, lines_to_consider, n_bootstrap_samples, maskfile):
    max_qtl_size = 3000
    refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch']
    samfiles = {line:pysam.Samfile(bamdir + line + '.bam', 'rb') for line in lines_to_consider}
    #for each line count the number (and estimate the fraction of) the reads that map to to_reg
    min_from_trans_start, max_from_trans_end = float('Inf'), 0
    min_to_trans_start, max_to_trans_end = float('Inf'), 0
    #dict of the form 'MAGIC.XX':[n_transreads, coverage, perc_transreads]
    lines_to_ntransreads = {}
    #dict with all the read start positions per line - will be used only if the from coordinates are too large
    line_all_read_pos = {}
    all_read_pos = []
    for line, sf in samfiles.iteritems():
        n_curr = 0
        line_all_read_pos[line] = []
        for read in sf.fetch(from_reg[0], from_reg[1]-1000, from_reg[2] + 1000):
            if refs[read.rnext] == to_reg[0] and read.pnext >= to_reg[1] and read.pnext <= to_reg[2]:
                n_curr += 1
                if read.pos <= min_from_trans_start:
                    min_from_trans_start = read.pos
                if read.pos + read.qlen >= max_from_trans_end:
                    max_from_trans_end = read.pos + read.qlen
                if read.pnext <= min_to_trans_start:
                    min_to_trans_start = read.pnext
                if read.pnext >= max_to_trans_end:
                    max_to_trans_end = read.pnext + read.qlen
                line_all_read_pos[line].append(read.pos)
                all_read_pos.append(read.pos)
            lines_to_ntransreads[line] = [n_curr]
    if max_from_trans_end == 0:
        return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1]
    #now count the total number of reads only in the region where the translocated reads are detected
    for line, sf in samfiles.iteritems():
        lines_to_ntransreads[line].append(sf.count(from_reg[0], min_from_trans_start, max_from_trans_end))
        if lines_to_ntransreads[line][1] >0:
            lines_to_ntransreads[line].append(lines_to_ntransreads[line][0]/ lines_to_ntransreads[line][1])
        else:
            lines_to_ntransreads[line].append(0)
    #generate random regions and return in a dictionary
    reg_size = max_from_trans_end - min_from_trans_start
    size_qtl = max_to_trans_end - min_to_trans_start
    print('reg_size = ' + str(reg_size)) 
    if reg_size <= max_qtl_size:
        bootstrap_samples = bootstrap(samfiles, n_bootstrap_samples, reg_size, size_qtl, maskfile)
    else:
        #find maximum sliding window
        max_window_reads = 0
        adj_start, adj_end = 0, 0
        for i in range(min_from_trans_start, max_from_trans_end, 100):
            curr_win = len([rpos for rpos in all_read_pos if rpos>=i and rpos< i+max_qtl_size])
            if curr_win >= max_window_reads:
                max_window_reads = curr_win
                adj_start, adj_end = i, i+max_qtl_size
        #update lines_to_ntransreads according to that
        for line, sf in samfiles.iteritems():
            lines_to_ntransreads[line] = [len([r for r in line_all_read_pos[line] if r >= adj_start and r < adj_end])]
            lines_to_ntransreads[line].append(sf.count(from_reg[0], adj_start, adj_end))
            if lines_to_ntransreads[line][1] >0:
                lines_to_ntransreads[line].append(lines_to_ntransreads[line][0]/ lines_to_ntransreads[line][1])
            else:
                lines_to_ntransreads[line].append(0)
        print('adjusted_qtl = ' + str(adj_end - adj_start))
        bootstrap_samples = bootstrap(samfiles, n_bootstrap_samples, adj_end-adj_start, size_qtl, maskfile)
    #get summary statistics for the translocation tested - and get pvalues comparing with the randoms
    number_of_trans_reads = [nreads for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()]
    total_number_of_trans_reads = sum(number_of_trans_reads)
    percentage_trans_reads = [perc for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()]
    mean_percentage_trans_reads = numpy.mean(percentage_trans_reads)
    num_lines_with_trans = len([p for p in percentage_trans_reads if p > 0])
    #get bootstrap confidence interval and p-value for the percentage
    bootstrap_mean_percentage_trans_reads = [mptr for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()]
    perc_conf_interval_start = numpy.percentile(bootstrap_mean_percentage_trans_reads, 2.5)
    perc_conf_interval_end = numpy.percentile(bootstrap_mean_percentage_trans_reads, 97.5)
    perc_p_value = (1 + len([bm for bm in bootstrap_mean_percentage_trans_reads if bm>=mean_percentage_trans_reads]))/(n_bootstrap_samples + 1)
    #get bootstrap confidence interval and p-value for the number of lines
    bootstrap_nlines = [nnzl for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()]
    lines_conf_interval_start = numpy.percentile(bootstrap_nlines, 2.5)
    lines_conf_interval_end = numpy.percentile(bootstrap_nlines, 97.5)
    lines_p_value = (1 + len([bl for bl in bootstrap_nlines if bl>=num_lines_with_trans]))/(n_bootstrap_samples + 1)
    return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, perc_conf_interval_start, perc_conf_interval_end, perc_p_value, num_lines_with_trans, lines_conf_interval_start, lines_conf_interval_end, lines_p_value]
                            
if __name__=='__main__':
    bamdir = '/Net/mus/data/www/bams_magic/'
    bootstrap_samples = 1000
    maskfile = '/Net/mus/data/martha/Recombination/data/transposons.gff'
    writetransfile = './translocations.perms.3000.1000samples.txt'
    #get the candidate qtls for each trait
    logpdir = '/Net/mus/data/martha/Recombination/SharedSegments/MappingTranslocations/HaplotypeMode/'
    logpfiles = [f for f in os.listdir(logpdir) if f.endswith('logP.txt')]
    haplotype_mode = True
    if haplotype_mode:
        logp_thres = 6
        max_split = 3
    else:
        logp_thres = 20
        max_split = 2
    logp = {lf.rsplit('.', max_split)[0]:IO.readtable(logpdir+lf, ignoreheader = True) for lf in logpfiles}
    candidate_qtls = {key:get_candidate_qtls(logp_table) for key, logp_table in logp.iteritems()}
    #print(candidate_qtls)
    #get which lines have the trait
    traitfile = '/home/martha/DPhil/Recombination/Results2/SharedSegments/trans.phenotypes.txt'
    traits = IO.readtable(traitfile, sep = '\t')
    trait_per_line = {}
    for t in traits[0][1:]:
        i = traits[0].index(t)
        lines = [row[0] for row in traits[1:] if row[i] == '1']
        trait_per_line[t] = lines
    #get what the trait's rough coordinates are
    likely_translocations = IO.readtable('./likely.translocations.txt', sep = '\t')
    coords = {}
    for key in logp.keys():
        els = key.split('_')
        chrom, fro = 'Chr' + els[0], int(els[2])
        for row in likely_translocations:
            if els[0] == row[0] and els[1] == row[1] and els[2] == row[2]:
                to = int(row[3])
                break
        coords[key] = [chrom, fro, to]
    translocations = []
    for key in logp.keys():
        for qtl in candidate_qtls[key]:
            print('##########################################################')
            print(key)
            print('Coords = ' + str(coords[key]))
            print('Candidate qtl = ' + str(qtl))
            prefix = [key]
            prefix.extend(coords[key])
            #METRIC 1: find read pairs with one pair in the segment and the other in the qtl region - return number of reads, fraction, p-value
            #ONLY IF THE QTL DOES NOT OVERLAP WITH THE REGION - IF IT DOES - SPLIT THE QTL SUCH THAT THE CURRENT REGION IS REMOVED - if the qtl is contained in the region of interest then this is not a translocation so we don't test this hypothesis at all
            if coords[key][0] == qtl[0] and coords[key][1] <= qtl[1] and coords[key][2] >= qtl[2]:
                print('Case 1 - the qtl completeley overlaps with the region of interest')
                trans_qtl_stats = [0, 0, 0, 0, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1]
                prefix.append('overlap')
                prefix.extend(qtl)
                prefix.extend(trans_qtl_stats)
                translocations.append(prefix)
            elif coords[key][0] == qtl[0] and ((coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]) or (coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]) or (coords[key][1]>= qtl[1] and coords[key][2]<=qtl[2])):
                if coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]:
                    print('Case2a - the qtl overlaps but extends to the right')
                    nqtl = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]]
                    trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile)
                    prefix.append('cis')
                    prefix.extend(qtl)
                    prefix.extend(trans_qtl_stats)
                    print(prefix)
                    translocations.append(prefix)
                elif coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]:
                    print('Case2b - the qtl overlaps but extends to the left')
                    nqtl = [qtl[0], qtl[1], coords[key][1]-5000, qtl[3]]
                    trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile)
                    prefix.append('cis')
                    prefix.extend(qtl)
                    prefix.extend(trans_qtl_stats)
                    print(prefix)
                    translocations.append(prefix)
                else:
                    print('Case2c - the qtl overlaps but extends both sides')
                    nqtl1 = [qtl[0], qtl[1], coords[key][1] - 5000, qtl[3]]
                    nqtl2 = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]]
                    trans_qtl_stats1 = get_trans(from_reg = coords[key], to_reg = nqtl1, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile)
                    trans_qtl_stats2 = get_trans(from_reg = coords[key], to_reg = nqtl2, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile)
                    prefix2 = prefix[:]
                    prefix.append('cis')
                    prefix.extend(nqtl1)
                    prefix.extend(trans_qtl_stats1)
                    prefix2.append('cis')
                    prefix2.extend(nqtl2)
                    prefix2.extend(trans_qtl_stats2)
                    print(prefix)
                    print(prefix2)
                    translocations.append(prefix)
                    translocations.append(prefix2)
            else:
                print('Case 3 - trans qtl')
                prefix.append('trans')
                prefix.extend(qtl)
                trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = qtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile)
                prefix.extend(trans_qtl_stats)
                print(prefix)
                translocations.append(prefix)
    translocations.insert(0, ['ID', 'chr.segment', 'start.segment', 'end.segment', 'qtl_type', 'chr.full_qtl', 'start.full_qtl', 'end.full_qtl', 'pval.full_qtl', 'min_from_trans_start', 'max_from_trans_end', 'min_to_trans_start', 'max_to_trans_end', 'total_number_of_trans_reads', 'mean_percentage_trans_reads', 'perc_conf_interval_start', 'perc_conf_interval_end', 'perc_p_value', 'num_lines_with_trans', 'lines_conf_interval_start', 'lines_conf_interval_end', 'lines_p_value'])
    IO.writetable(writetransfile, translocations)