#!/usr/bin/python from __future__ import division import IO import pysam from collections import Counter import numpy import os import BAM from scipy import stats as stats #counts reads mapping on the same strand within a segment, calculates their percentage and estimates their mean insert size to get a feel for the size of the inversion def inversions(rearr, trait_per_line, bamdir): print('Inversions') #inv - dict of the form trait_id: same_strand_pairs, lines_with_same_strand_pairs, isize_same_strand_pairs inv = {} for trait, lines in trait_per_line.iteritems(): samfiles = {line:pysam.Samfile(bamdir + line + '.bam', 'rb') for line in lines if (line + '.bam') in os.listdir(bamdir)} boundaries = [[row[1], int(row[2]), int(row[3])]for row in rearr if row[0] == trait][0] lines_w_inv = 0 total_inversions = 0 isizes = [] mean_inv_size = 0 for line, sf in samfiles.iteritems(): rev_isizes = [abs(read.isize) for read in sf.fetch(boundaries[0], boundaries[1], boundaries[2]) if read.is_reverse == read.mate_is_reverse and read.mapq>20] total_inversions += len(rev_isizes) if rev_isizes: lines_w_inv += 1 isizes.extend(rev_isizes) if isizes: mean_inv_isize = numpy.mean(isizes) inv[trait] = [total_inversions, lines_w_inv, mean_inv_isize] frearr = [] for row in rearr: rr = row[:] rr.extend(inv[row[0]]) frearr.append(rr) return(frearr) #estimates the average coverage in the regions of interest, the median p-value of the coverage of the region compared to total coverage and the number of lines with significantly increased coverage (0.05 level) def coverage(rearr, trait_per_line, bamdir): print('Coverage') cov = {} for trait, lines in trait_per_line.iteritems(): samfiles = {line:bamdir + line + '.bam' for line in lines if (line + '.bam') in os.listdir(bamdir)} boundaries = [[row[1], int(row[2]), int(row[3])] for row in rearr if row[0] == trait][0] coverages = [] cov_pvalues = [] for line, sf in samfiles.iteritems(): reg_coverage = BAM.coverage_of_region(sf, boundaries[0], boundaries[1], boundaries[2], return_probability = False) coverages.append((reg_coverage * 51)/(boundaries[2] - boundaries[1])) average_coverage = BAM.average_coverage(sf) pvalue = sum(list(stats.binom.pmf(range(reg_coverage*51, boundaries[2]-boundaries[1]), boundaries[2]-boundaries[1], average_coverage))) cov_pvalues.append(pvalue) high_coverage_lines = len([cov_pvalue for cov_pvalue in cov_pvalues if cov_pvalue <= 0.05]) mean_coverage = numpy.mean(coverages) cov[trait] = [mean_coverage, numpy.median(cov_pvalues), high_coverage_lines] frearr = [] for row in rearr: print(cov[row[0]]) rr = row[:] rr.extend(cov[row[0]]) frearr.append(rr) return(frearr) if __name__=='__main__': translocations_file = '../RearrFromReads/translocations.perms.mapq.nobootstrap.1000samples.txt' bamdir = '/data/www/bams_magic/' rearr = IO.readtable(translocations_file, sep = '\t', ignoreheader = False) header = rearr[0] rearr = rearr[1:] write_file = '../RearrFromReads/all.rearr.txt' traitfile = '../Phenotypes/trans.phenotypes.5.txt' traits = IO.readtable(traitfile, sep = '\t') trait_per_line = {} for t in traits[0][1:]: i = traits[0].index(t) lines = [row[0] for row in traits[1:] if row[i] == '1'] trait_per_line[t] = lines #new header = rearr = inversions(rearr, trait_per_line, bamdir) header.extend(['pairs_on_same_strand', 'lines_with_same_strand_pairs', 'isize_same_strand_pairs']) rearr = coverage(rearr, trait_per_line, bamdir) header.extend(['mean_coverage', 'median_pvalue', 'high_coverage_lines']) rearr.insert(0, header) IO.writetable(write_file, rearr)