#!/usr/bin/python from __future__ import division import IO import pysam from scipy import spatial as spa from scipy import cluster as cluster from collections import Counter import numpy import os #function that selects the n top-scoring candidate qtls from a logp table if we are in haplotype mode #EXTENSION: for non-haplotype mode using a smith-waterman approach we can define the qtl?? def get_candidate_qtls(logp_table, n_qtls = 3, logp_thres = 6, haplotype_mode = True, qtl_size_limit = 1000000): qtls = [] if haplotype_mode: for i in xrange(n_qtls): #print('QTL = ' + str(i)) pvalues = [float(row[2]) for row in logp_table] maxp = max(pvalues) #print(maxp) if (maxp < logp_thres): qtls.append(['NA', 'NA', 'NA', 'NA']) else: maxp_index = pvalues.index(maxp) #print(maxp_index) chrom, fro, to = 'Chr'+str(logp_table[maxp_index][0]), maxp_index, maxp_index while pvalues[fro] > logp_thres and fro>0: fro -= 1 while pvalues[to] > logp_thres and to= fro and index <= to) else [c, pos, pval] for index, [c, pos, pval] in enumerate(logp_table)] else: print('Need to implement SW recursion for non-haplotype test') return qtls #potentially add other criteria def get_random_seqs(nseqs, size, maskfile): numpy.random.seed() print('simulate') chrom_lengths_dict = {'Chr1':30427671, 'Chr2': 19698289, 'Chr3':23459830, 'Chr4':18585056, 'Chr5':26975502} mask = IO.readgff(maskfile) samples = list(numpy.random.choice(chrom_lengths_dict.keys(), nseqs)) i = 1 rsamples = {} for chr_sample in samples: if (i % 100) == 0: print(i) inside_transposon = True while(inside_transposon): sample_start = numpy.random.randint(0, chrom_lengths_dict[chr_sample]) sample_end = sample_start + size inside_transposon = False #############BOTTLENECK!!! for row in mask: if row[0] == chr_sample and ((sample_start>=int(row[3]) and sample_start<=int(row[4])) or (sample_end>=int(row[3]) and sample_end<=int(row[4])) or (sample_start<=row[3] and sample_end >= row[4])): inside_transposon = True rsamples[i] = [str(chr_sample), int(sample_start), int(sample_end)] i += 1 return(rsamples) def bootstrap(samfiles, n_bootstrap_samples, length_of_each_sample, qtl, maskfile): refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch'] rsamples = get_random_seqs(n_bootstrap_samples, length_of_each_sample, maskfile) #for each random sample return: [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines] #mean refers to the number of lines bootstrap_samples = {} print('Run') for i, rs in rsamples.iteritems(): if (i % 100) == 0: print(i) #print(rs) curr_rand_dict = {} curr_rand_dict_coverage = {} all_normal = True n_trans_reads = 0 for line, sf in samfiles.iteritems(): tc = [[read.rnext, read.pnext] for read in sf.fetch(rs[0], rs[1], rs[2]) if (abs(read.tlen) >= 2000 or abs(read.tlen) == 0) and qtl[0] == read.rnext and read.pnext >= qtl[1] and read.pnext <= qtl[2] and read.mapq > 20] curr_rand_dict[line] = len(tc) all_reads = [read.tlen for read in sf.fetch(rs[0], rs[1], rs[2])] curr_rand_dict_coverage[line] = len(all_reads) mean_coverage = float(numpy.mean(curr_rand_dict_coverage.values())) n_trans_reads += len(tc) curr_rand_percentages = {line:reads_to_qtl/curr_rand_dict_coverage[line] if curr_rand_dict_coverage[line] > 0 else 0 for line, reads_to_qtl in curr_rand_dict.iteritems()} mean_percentage_trans_reads = numpy.mean(curr_rand_percentages.values()) non_zero_lines = [line for line, crp in curr_rand_percentages.iteritems() if crp > 0] n_non_zero_lines = len(non_zero_lines) bootstrap_samples['_'.join([rs[0], str(rs[1]), str(rs[2])])] = [n_trans_reads, mean_coverage, mean_percentage_trans_reads, n_non_zero_lines] return bootstrap_samples #function that quantifies the fraction of reads that map from one given region (defined as: [chrom, start, end]) to another #returns [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, conf_interval_start, conf_interval_end, p_value, num_lines_with_trans, conf_interval_start, conf_interval_end, p_value] def get_trans(from_reg, to_reg, bamdir, lines_to_consider, n_bootstrap_samples, maskfile): max_qtl_size = 3000 refs = ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'Chl', 'Mch'] samfiles = {line:pysam.Samfile(bamdir + line + '.bam', 'rb') for line in lines_to_consider if (line + '.bam') in os.listdir(bamdir)} #for each line count the number (and estimate the fraction of) the reads that map to to_reg min_from_trans_start, max_from_trans_end = float('Inf'), 0 min_to_trans_start, max_to_trans_end = float('Inf'), 0 #dict of the form 'MAGIC.XX':[n_transreads, coverage, perc_transreads] lines_to_ntransreads = {} #dict with all the read start positions per line - will be used only if the from coordinates are too large line_all_read_pos = {} all_read_pos = [] for line, sf in samfiles.iteritems(): n_curr = 0 line_all_read_pos[line] = [] for read in sf.fetch(from_reg[0], from_reg[1], from_reg[2]): if refs[read.rnext] == to_reg[0] and read.pnext >= to_reg[1] and read.pnext <= to_reg[2] and read.mapq >= 20 and (read.isize == 0 or read.isize >= 2000): n_curr += 1 if read.pos <= min_from_trans_start: min_from_trans_start = read.pos if read.pos + read.qlen >= max_from_trans_end: max_from_trans_end = read.pos + read.qlen if read.pnext <= min_to_trans_start: min_to_trans_start = read.pnext if read.pnext >= max_to_trans_end: max_to_trans_end = read.pnext + read.qlen line_all_read_pos[line].append(read.pos) all_read_pos.append(read.pos) lines_to_ntransreads[line] = [n_curr] if max_from_trans_end == 0: return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1] #now count the total number of reads only in the region where the translocated reads are detected for line, sf in samfiles.iteritems(): lines_to_ntransreads[line].append(sf.count(from_reg[0], min_from_trans_start, max_from_trans_end)) if lines_to_ntransreads[line][1] >0: lines_to_ntransreads[line].append(lines_to_ntransreads[line][0]/ lines_to_ntransreads[line][1]) else: lines_to_ntransreads[line].append(0) #generate random regions and return in a dictionary reg_size = max_from_trans_end - min_from_trans_start size_qtl = max_to_trans_end - min_to_trans_start to_qtl = [to_reg[0], min_to_trans_start, max_to_trans_end] #get summary statistics for the translocation tested - and get pvalues comparing with the randoms number_of_trans_reads = [nreads for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()] total_number_of_trans_reads = sum(number_of_trans_reads) percentage_trans_reads = [perc for line, [nreads, cov, perc] in lines_to_ntransreads.iteritems()] mean_percentage_trans_reads = numpy.mean(percentage_trans_reads) num_lines_with_trans = len([p for p in percentage_trans_reads if p > 0]) print('Number of trans reads = ' + str(total_number_of_trans_reads)) if (total_number_of_trans_reads >= 50): print('Bootstrap') bootstrap_samples = bootstrap(samfiles, n_bootstrap_samples, reg_size, to_qtl, maskfile) print('reg_size = ' + str(reg_size)) #get bootstrap confidence interval and p-value for the percentage bootstrap_mean_percentage_trans_reads = [mptr for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()] perc_conf_interval_start = numpy.percentile(bootstrap_mean_percentage_trans_reads, 2.5) perc_conf_interval_end = numpy.percentile(bootstrap_mean_percentage_trans_reads, 97.5) perc_p_value = (1 + len([bm for bm in bootstrap_mean_percentage_trans_reads if bm>=mean_percentage_trans_reads]))/(n_bootstrap_samples + 1) #get bootstrap confidence interval and p-value for the number of lines bootstrap_nlines = [nnzl for key, [ntr, mc, mptr, nnzl] in bootstrap_samples.iteritems()] lines_conf_interval_start = numpy.percentile(bootstrap_nlines, 2.5) lines_conf_interval_end = numpy.percentile(bootstrap_nlines, 97.5) lines_p_value = (1 + len([bl for bl in bootstrap_nlines if bl>=num_lines_with_trans]))/(n_bootstrap_samples + 1) else: print('No bootstrap') perc_conf_interval_start, perc_conf_interval_end, perc_p_value, lines_conf_interval_start, lines_conf_interval_end, lines_p_value = 0, 0, 1, 0, 0, 1 return [min_from_trans_start, max_from_trans_end, min_to_trans_start, max_to_trans_end, total_number_of_trans_reads, mean_percentage_trans_reads, perc_conf_interval_start, perc_conf_interval_end, perc_p_value, num_lines_with_trans, lines_conf_interval_start, lines_conf_interval_end, lines_p_value] if __name__=='__main__': bamdir = '/Net/mus/data/www/bams_magic/' bootstrap_samples = 1000 maskfile = '/Net/mus/data/martha/Recombination/data/transposons.gff' writetransfile = '../translocations.perms.mapq.nobootstrsp.1000samples.txt' #get the candidate qtls for each trait logpdir = '/Net/mus/data/martha/Recombination/SharedSegments/MappingTranslocations/HaplotypeMode/' logpfiles = [f for f in os.listdir(logpdir) if f.endswith('logP.txt')] haplotype_mode = True if haplotype_mode: logp_thres = 6 max_split = 3 else: logp_thres = 20 max_split = 2 logp = {lf.rsplit('.', max_split)[0]:IO.readtable(logpdir+lf, ignoreheader = True) for lf in logpfiles} candidate_qtls = {key:get_candidate_qtls(logp_table) for key, logp_table in logp.iteritems()} #print(candidate_qtls) #get which lines have the trait traitfile = '../Phenotypes/trans.phenotypes.5.txt' traits = IO.readtable(traitfile, sep = '\t') trait_per_line = {} for t in traits[0][1:]: i = traits[0].index(t) lines = [row[0] for row in traits[1:] if row[i] == '1'] trait_per_line[t] = lines #get what the trait's rough coordinates are likely_translocations = IO.readtable('../LikelyTranslocations/likely.translocations.txt', sep = '\t') coords = {} for key in logp.keys(): els = key.split('_') chrom, fro = 'Chr' + els[0], int(els[2]) for row in likely_translocations: if els[0] == row[0] and els[1] == row[1] and els[2] == row[2]: to = int(row[3]) break coords[key] = [chrom, fro, to] translocations = [] for key in logp.keys(): for qtl in candidate_qtls[key]: print('##########################################################') print(key) print('Coords = ' + str(coords[key])) print('Candidate qtl = ' + str(qtl)) prefix = [key] prefix.extend(coords[key]) #METRIC 1: find read pairs with one pair in the segment and the other in the qtl region - return number of reads, fraction, p-value #ONLY IF THE QTL DOES NOT OVERLAP WITH THE REGION - IF IT DOES - SPLIT THE QTL SUCH THAT THE CURRENT REGION IS REMOVED - if the qtl is contained in the region of interest then this is not a translocation so we don't test this hypothesis at all if coords[key][0] == qtl[0] and coords[key][1] <= qtl[1] and coords[key][2] >= qtl[2]: print('Case 1 - the qtl completeley overlaps with the region of interest') trans_qtl_stats = [0, 0, 0, 0, 0, 0, 0, float('Inf'), 1, 0, 0, float('Inf'), 1] prefix.append('overlap') prefix.extend(qtl) prefix.extend(trans_qtl_stats) translocations.append(prefix) elif coords[key][0] == qtl[0] and ((coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]) or (coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]) or (coords[key][1]>= qtl[1] and coords[key][2]<=qtl[2])): if coords[key][1] >= qtl[1] and coords[key][1] <= qtl[2] and coords[key][2] >= qtl[2]: print('Case2a - the qtl overlaps but extends to the right') nqtl = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]] trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.append('cis') prefix.extend(qtl) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) elif coords[key][1] <= qtl[1] and coords[key][2] >= qtl[1] and coords[key][2] <= qtl[2]: print('Case2b - the qtl overlaps but extends to the left') nqtl = [qtl[0], qtl[1], coords[key][1]-5000, qtl[3]] trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = nqtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.append('cis') prefix.extend(qtl) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) else: print('Case2c - the qtl overlaps but extends both sides') nqtl1 = [qtl[0], qtl[1], coords[key][1] - 5000, qtl[3]] nqtl2 = [qtl[0], coords[key][2] + 5000, qtl[2], qtl[3]] trans_qtl_stats1 = get_trans(from_reg = coords[key], to_reg = nqtl1, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) trans_qtl_stats2 = get_trans(from_reg = coords[key], to_reg = nqtl2, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix2 = prefix[:] prefix.append('cis') prefix.extend(nqtl1) prefix.extend(trans_qtl_stats1) prefix2.append('cis') prefix2.extend(nqtl2) prefix2.extend(trans_qtl_stats2) print(prefix) print(prefix2) translocations.append(prefix) translocations.append(prefix2) else: print('Case 3 - trans qtl') prefix.append('trans') prefix.extend(qtl) trans_qtl_stats = get_trans(from_reg = coords[key], to_reg = qtl, bamdir = bamdir, lines_to_consider = trait_per_line[key], n_bootstrap_samples = bootstrap_samples, maskfile = maskfile) prefix.extend(trans_qtl_stats) print(prefix) translocations.append(prefix) translocations.insert(0, ['ID', 'chr.segment', 'start.segment', 'end.segment', 'qtl_type', 'chr.full_qtl', 'start.full_qtl', 'end.full_qtl', 'pval.full_qtl', 'min_from_trans_start', 'max_from_trans_end', 'min_to_trans_start', 'max_to_trans_end', 'total_number_of_trans_reads', 'mean_percentage_trans_reads', 'perc_conf_interval_start', 'perc_conf_interval_end', 'perc_p_value', 'num_lines_with_trans', 'lines_conf_interval_start', 'lines_conf_interval_end', 'lines_p_value']) IO.writetable(writetransfile, translocations)