#!/usr/bin/env python import csv import os import pysam import re import fnmatch import array #Read a table from a csv file - if boolean variable snps_only = TRUE, then read only snps #-if int variable bppos is >0 , then only rows with well defined positions are read, positions column is given #by the number in int_only class csvtable: def __init__(self, directory, first_col, last_col, snps_only, bppos, header): self.data = [] with open(directory, 'rU') as f: reader = csv.reader(f) if (snps_only == True) & (bppos>0): if (header == True): head = reader.next() h = [head[bppos], head[first_col:last_col]] self.data.append(h) for row in reader: pos = float(row[bppos]) snps = row[first_col:last_col] if (pos % int(pos) == 0) & (allsnps(snps) == True): snps = row[first_col:last_col] snps = [int(pos), snps] self.data.append(snps) else: for row in reader: snps = row[first_col:last_col] self.data.append(snps) f.close() #Check that all values in a string list l are SNPs def allsnps(l): flag = True lengths = [len(element) for element in l] if sum(lengths)> len(l) : flag = False return flag #Read a column from a csv file as a vector class csvvector: def __init__(self, directory, column): self.data = [] with open(directory, 'rU') as f: reader = csv.reader(f) for row in reader: self.data.append(row[column]) f.close() #Read a fasta file #class read_fasta_file(infile): # sequences = [] # with open (infile, 'rU') as f: # sequence = [] # line = f.readline().rstrip() # title = line[1:len(line)] # while line != "": # line = f.readline().rstrip() # if not line.startswith(">"): # sequence.append(line) # else: # sequences.append((title, "".join(sequence))) # title = line[1:len(line)] # sequence = "" # f.close() # fastaseq = dict(sequences) # return(fastaseq) #get fragments from a fasta file def get_frag(fastaseq, ref, start, end): return("".join[">", fastaseq[ref][start:end]]) def verifybreaks(bamdir, breaks, bp, variants): final_breaks = [] print(bamdir) for file in os.listdir(bamdir): print(file) if fnmatch.fnmatch(file, '*.bam'): samfile = pysam.Samfile(bamdir+"/"+file, "rb") line = samfile.header['RG'][0]['SM'] print(line) curr_line_breaks = [[nn, mline, chrom, sp, ep, bef, aft, int(yes), int(no)] for [nn, mline, chrom, (sp, ep, bef, aft, yes, no)] in breaks if mline == line] for breakpoint in curr_line_breaks: iter = samfile.fetch(current_chromosome, breakpoint[3], breakpoint[3]+1) for x in iter: flag = 0 if (x.pos + len(x.seq) >= breakpoint[4]): print "Position is ", x.pos+1, "and read sequence is ", x.seq read_start, read_end, qual = x.pos+1, x.pos+len(x.seq), x.qual snps = [snp for snp in bp if ((snp >= read_start) & (snp <= read_end))] contr = 0 for snp in snps: snp_in_read = x.seq[snp-read_start] snp_index = bp.index(snp) if (snp < breakpoint[3]): snp_in_variants = variants[bp.index(snp)][founders[breakpoint[5]]][0] if ((snp_in_variants in matches[snp_in_read])==False): flag = flag+1 contr = contr +1 print "Snp is ", snp print "Read: ", snp_in_read, " Variants: ", snp_in_variants elif (snp > breakpoint[4]): snp_in_variants = variants[bp.index(snp)][founders[breakpoint[6]]][0] if ((snp_in_variants in matches[snp_in_read])==False): flag = flag+1 contr = contr +1 print "Snp is ", snp print "Read: ", snp_in_read, " Variants: ", snp_in_variants if (flag == 0): breakpoint[7] = breakpoint[7]+1 else: breakpoint[8] = breakpoint[8]+1 print "Breakpoint :", breakpoint else: a = 0 #print ("No coverage by a single read") final_breaks.append(breakpoint) samfile.close() return final_breaks b = csvtable('/data/martha/Recombination/data/breakpoints/breaks50.csv', 0, 9, False, 0, True)