#!/usr/bin/python from __future__ import division import IO import pysam from scipy import spatial as spa from scipy import cluster as cluster from collections import Counter import numpy import os #function that selects the n top-scoring candidate qtls from a logp table if we are in haplotype mode #EXTENSION: for non-haplotype mode using a smith-waterman approach we can define the qtl?? def get_candidate_qtls(logp_table, n_qtls = 3, logp_thres = 6, haplotype_mode = True, qtl_size_limit = 1000000): qtls = [] if haplotype_mode: for i in xrange(n_qtls): #print('QTL = ' + str(i)) pvalues = [float(row[2]) for row in logp_table] maxp = max(pvalues) #print(maxp) if (maxp < logp_thres): qtls.append(['NA', 'NA', 'NA', 'NA']) else: maxp_index = pvalues.index(maxp) #print(maxp_index) chrom, fro, to = 'Chr'+str(logp_table[maxp_index][0]), maxp_index, maxp_index while pvalues[fro] > logp_thres and fro>0: fro -= 1 while pvalues[to] > logp_thres and to= fro and index <= to) else [c, pos, pval] for index, [c, pos, pval] in enumerate(logp_table)] else: print('Need to implement SW recursion for non-haplotype test') return qtls #potentially add other criteria def get_random_seqs(nseqs, size, maskfile): numpy.random.seed() print('simulate') chrom_lengths_dict = {'Chr1':30427671, 'Chr2': 19698289, 'Chr3':23459830, 'Chr4':18585056, 'Chr5':26975502} mask = IO.readgff(maskfile) samples = list(numpy.random.choice(chrom_lengths_dict.keys(), nseqs)) i = 1 rsamples = {} for chr_sample in samples: if (i % 100) == 0: print(i) inside_transposon = True while(inside_transposon): sample_start = numpy.random.randint(0, chrom_lengths_dict[chr_sample]) sample_end = sample_start + size inside_transposon = False #############BOTTLENECK!!! for row in mask: if row[0] == chr_sample and ((sample_start>=int(row[3]) and sample_start<=int(row[4])) or (sample_end>=int(row[3]) and sample_end<=int(row[4])) or (sample_start<=row[3] and sample_end >= row[4])): inside_transposon = True rsamples[i] = [str(chr_sample), int(sample_start), int(sample_end)] i += 1 return(rsamples) def bootstrap(samfiles, n_bootstrap_samples, length_of_each_sample, size_qtl, maskfile): refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch'] rsamples = get_random_seqs(n_bootstrap_samples, length_of_each_sample, maskfile) #for each random sample, for the same set of lines, find what is the maximum number of reads mapping to the same area (of the same size as the qtl) - agglomerative clustering???? #for each random sample return: [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines] #mean refers to the number of lines bootstrap_samples = {} print('Run') for i, rs in rsamples.iteritems(): if (i % 100) == 0: print(i) #print(rs) curr_rand_dict = {} curr_rand_dict_coverage = {} all_normal = True for line, sf in samfiles.iteritems(): tc = [[read.rnext, read.pnext] for read in sf.fetch(rs[0], rs[1], rs[2]) if abs(read.tlen) >= 1000 or abs(read.tlen) == 0] curr_rand_dict[line] = tc all_reads = [read.tlen for read in sf.fetch(rs[0], rs[1], rs[2])] curr_rand_dict_coverage[line] = len(all_reads) mean_coverage = float(numpy.mean(curr_rand_dict_coverage.values())) if len(tc) > 0: all_normal = False if (all_normal): bootstrap_samples['_'.join([rs[0], str(rs[1]), str(rs[2])])] = [0, mean_coverage, 0, 0] else: #find reads (with abnormal tlen) that map to each chromosome and find the largest cluster of size equal to the qtl for each chromosome chr_reads = {} for chrom in refs[0:5]: chrom_pos = [[read_pair[1]] for line in curr_rand_dict.keys() for read_pair in curr_rand_dict[line] if refs[read_pair[0]] == chrom] if len(chrom_pos)>1: eudist = spa.distance.pdist(chrom_pos) clusters = list(cluster.hierarchy.fcluster(cluster.hierarchy.linkage(eudist), t = size_qtl, criterion = 'distance')) counts = Counter(clusters) max_clust = [clu for clu in counts.keys() if counts[clu] == max(counts.values())] chr_reads[chrom] = [[refs.index(chrom), pos[0]] for index, pos in enumerate(chrom_pos) if clusters[index] == max_clust[0]] elif len(chrom_pos) == 1: chr_reads[chrom] = chrom_pos else: chr_reads[chrom] = [] #print(chr_reads) #find which chromosome has the maximum cluster cluster_lengths_per_chromosome = {chrom:len(val) for chrom, val in chr_reads.iteritems()} max_val = max(cluster_lengths_per_chromosome.values()) max_chr = [chrom for chrom, val in cluster_lengths_per_chromosome.iteritems() if val == max_val][0] rand_trans_reads = chr_reads[max_chr] n_trans_reads = len(rand_trans_reads) dict_line_selected_trans_reads = {} #find what is the distribution of such reads for line, line_trans_reads in curr_rand_dict.iteritems(): dict_line_selected_trans_reads[line] = 0 if( rand_trans_reads and line_trans_reads): for rtr in line_trans_reads: if rtr in rand_trans_reads: rand_trans_reads.pop(rand_trans_reads.index(rtr)) dict_line_selected_trans_reads[line] += 1 curr_rand_percentages = {line:dict_line_selected_trans_reads[line]/curr_rand_dict_coverage[line] if curr_rand_dict_coverage[line] > 0 else 0 for line in dict_line_selected_trans_reads.keys()} mean_percentage_trans_reads = numpy.mean(curr_rand_percentages.values()) non_zero_lines = [line for line, crp in curr_rand_percentages.iteritems() if crp > 0] n_non_zero_lines = len(non_zero_lines) bootstrap_samples['_'.join([rs[0], str(rs[1]), str(rs[2])])] = [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines] return bootstrap_samples #function that quantifies the fraction of reads that map from one given region (defined as: [chrom, start, end]) to another #returns [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, conf_interval_start, conf_interval_end, p_value, num_lines_with_trans, conf_interval_start, conf_interval_end, p_value] def get_trans(from_reg, to_reg, bamdir, lines_to_consider, n_bootstrap_samples, maskfile): max_qtl_size = 3000 refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch'] samfiles = {line:pysam.Samfile(bamdir + line + '.bam', 'rb') for line in lines_to_consider} #for each line count the number (and estimate the fraction of) the reads that map to to_reg min_from_trans_start, max_from_trans_end = float('Inf'), 0 min_to_trans_start, max_to_trans_end = float('Inf'), 0 #dict of the form 'MAGIC.XX':[n_transreads, coverage, perc_transreads] lines_to_ntransreads = {} #dict with all the read start positions per line - will be used only if the from coordinates are too large line_all_read_pos = {} all_read_pos = [] for line, sf in samfiles.iteritems(): n_curr = 0 line_all_read_pos[line] = [] for read in sf.fetch(from_reg[0], from_reg[1]-1000, from_reg[2] + 1000): if refs[read.rnext] == to_reg[0] and read.pnext >= to_reg[1] and read.pnext <= to_reg[2]: n_curr += 1 if read.pos <= min_from_trans_start: min_from_trans_start = read.pos if read.pos + read.qlen >= max_from_trans_end: max_from_trans_end = read.pos + read.qlen if read.pnext <= min_to_trans_start: min_to_trans_start = read.pnext if read.pnext >= max_to_trans_end: max_to_trans_end = read.pnext + read.qlen line_all_read_pos[line].append(read.pos) all_read_pos.append(read.pos) lines_to_ntransreads[line] = [n_curr] if max_from_trans_end == 0: return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1] #now count the total number of reads only in the region where the translocated reads are detected for line, sf in samfiles.iteritems(): lines_to_ntransreads[line].append(sf.count(from_reg[0], min_from_trans_start, max_from_trans_end)) if lines_to_ntransreads[line][1] >0: lines_to_ntransreads[line].append(lines_to_ntransreads[line][0]/ lines_to_ntransreads[line][1]) else: lines_to_ntransreads[line].append(0) #generate random regions and return in a dictionary reg_size = max_from_trans_end - min_from_trans_start size_qtl = max_to_trans_end - min_to_trans_start print('reg_size = ' + str(reg_size)) if reg_size <= max_qtl_size: bootstrap_samples = bootstrap(samfiles, n_bootstrap_samples, reg_size, size_qtl, maskfile) else: #find maximum sliding window max_window_reads = 0 adj_start, adj_end = 0, 0 for i in range(min_from_trans_start, max_from_trans_end, 100): curr_win = len([rpos for rpos in all_read_pos if rpos>=i and rpos< i+max_qtl_size]) if curr_win >= max_window_reads: max_window_reads = curr_win adj_start, adj_end = i, i+max_qtl_size #update lines_to_ntransreads according to that for line, sf in samfiles.iteritems(): lines_to_ntransreads[line] = [len([r for r in line_all_read_pos[line] if r >= adj_start and r < adj_end])] lines_to_ntransreads[line].append(sf.count(from_reg[0], adj_start, adj_end)) if lines_to_ntransreads[line][1] >0: lines_to_ntransreads[line].append(lines_to_ntransreads[line][0]/ lines_to_ntransreads[line][1]) else: lines_to_ntransreads[line].append(0) print('adjusted_qtl = ' + str(adj_end - adj_start)) bootstrap_samples = bootstrap(samfiles, n_bootstrap_samples, adj_end-adj_start, size_qtl, maskfile) #get summary statistics for the translocation tested - and get pvalues comparing with the randoms number_of_trans_reads = [nreads for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()] total_number_of_trans_reads = sum(number_of_trans_reads) percentage_trans_reads = [perc for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()] mean_percentage_trans_reads = numpy.mean(percentage_trans_reads) num_lines_with_trans = len([p for p in percentage_trans_reads if p > 0]) #get bootstrap confidence interval and p-value for the percentage bootstrap_mean_percentage_trans_reads = [mptr for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()] perc_conf_interval_start = numpy.percentile(bootstrap_mean_percentage_trans_reads, 2.5) perc_conf_interval_end = numpy.percentile(bootstrap_mean_percentage_trans_reads, 97.5) perc_p_value = (1 + len([bm for bm in bootstrap_mean_percentage_trans_reads if bm>=mean_percentage_trans_reads]))/(n_bootstrap_samples + 1) #get bootstrap confidence interval and p-value for the number of lines bootstrap_nlines = [nnzl for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()] lines_conf_interval_start = numpy.percentile(bootstrap_nlines, 2.5) lines_conf_interval_end = numpy.percentile(bootstrap_nlines, 97.5) lines_p_value = (1 + len([bl for bl in bootstrap_nlines if bl>=num_lines_with_trans]))/(n_bootstrap_samples + 1) return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, perc_conf_interval_start, perc_conf_interval_end, perc_p_value, num_lines_with_trans, lines_conf_interval_start, lines_conf_interval_end, lines_p_value] if __name__=='__main__': bamdir = '/Net/mus/data/www/bams_magic/' bootstrap_samples = 1000 maskfile = '/Net/mus/data/martha/Recombination/data/transposons.gff' writetransfile = './translocations.perms.3000.1000samples.txt' #get the candidate qtls for each trait logpdir = '/Net/mus/data/martha/Recombination/SharedSegments/MappingTranslocations/HaplotypeMode/' logpfiles = [f for f in os.listdir(logpdir) if f.endswith('logP.txt')] haplotype_mode = True if haplotype_mode: logp_thres = 6 max_split = 3 else: logp_thres = 20 max_split = 2 logp = {lf.rsplit('.', max_split)[0]:IO.readtable(logpdir+lf, ignoreheader = True) for lf in logpfiles} candidate_qtls = {key:get_candidate_qtls(logp_table) for key, logp_table in logp.iteritems()} #print(candidate_qtls) #get which lines have the trait traitfile = '/home/martha/DPhil/Recombination/Results2/SharedSegments/trans.phenotypes.txt' traits = IO.readtable(traitfile, sep = '\t') trait_per_line = {} for t in traits[0][1:]: i = traits[0].index(t) lines = [row[0] for row in traits[1:] if row[i] == '1'] trait_per_line[t] = lines #get what the trait's rough coordinates are likely_translocations = IO.readtable('./likely.translocations.txt', sep = '\t') coords = {} for key in logp.keys(): els = key.split('_') chrom, fro = 'Chr' + els[0], int(els[2]) for row in likely_translocations: if els[0] == row[0] and els[1] == row[1] and els[2] == row[2]: to = int(row[3]) break coords[key] = [chrom, fro, to] translocations = [] for key in logp.keys(): for qtl in candidate_qtls[key]: print('##########################################################') print(key) print('Coords = ' + str(coords[key])) print('Candidate qtl = ' + str(qtl)) prefix = [key] prefix.extend(coords[key]) #METRIC 1: find read pairs with one pair in the segment and the other in the qtl region - return number of reads, fraction, p-value #ONLY IF THE QTL DOES NOT OVERLAP WITH THE REGION - IF IT DOES - SPLIT THE QTL SUCH THAT THE CURRENT REGION IS REMOVED - if the qtl is contained in the region of interest then this is not a translocation so we don't test this hypothesis at all if coords[key][0] == qtl[0] and coords[key][1] <= qtl[1] and coords[key][2] >= qtl[2]: print('Case 1 - the qtl completeley overlaps with the region of interest') trans_qtl_stats = [0, 0, 0, 0, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1] prefix.append('overlap') prefix.extend(qtl) prefix.extend(trans_qtl_stats) translocations.append(prefix) elif coords[key][0] == qtl[0] and ((coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]) or (coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]) or (coords[key][1]>= qtl[1] and coords[key][2]<=qtl[2])): if coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]: print('Case2a - the qtl overlaps but extends to the right') nqtl = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]] trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.append('cis') prefix.extend(qtl) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) elif coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]: print('Case2b - the qtl overlaps but extends to the left') nqtl = [qtl[0], qtl[1], coords[key][1]-5000, qtl[3]] trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.append('cis') prefix.extend(qtl) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) else: print('Case2c - the qtl overlaps but extends both sides') nqtl1 = [qtl[0], qtl[1], coords[key][1] - 5000, qtl[3]] nqtl2 = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]] trans_qtl_stats1 = get_trans(from_reg = coords[key], to_reg = nqtl1, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) trans_qtl_stats2 = get_trans(from_reg = coords[key], to_reg = nqtl2, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix2 = prefix[:] prefix.append('cis') prefix.extend(nqtl1) prefix.extend(trans_qtl_stats1) prefix2.append('cis') prefix2.extend(nqtl2) prefix2.extend(trans_qtl_stats2) print(prefix) print(prefix2) translocations.append(prefix) translocations.append(prefix2) else: print('Case 3 - trans qtl') prefix.append('trans') prefix.extend(qtl) trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = qtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) translocations.insert(0, ['ID', 'chr.segment', 'start.segment', 'end.segment', 'qtl_type', 'chr.full_qtl', 'start.full_qtl', 'end.full_qtl', 'pval.full_qtl', 'min_from_trans_start', 'max_from_trans_end', 'min_to_trans_start', 'max_to_trans_end', 'total_number_of_trans_reads', 'mean_percentage_trans_reads', 'perc_conf_interval_start', 'perc_conf_interval_end', 'perc_p_value', 'num_lines_with_trans', 'lines_conf_interval_start', 'lines_conf_interval_end', 'lines_p_value']) IO.writetable(writetransfile, translocations)