#!/usr/bin/env python import csv import os import pysam import re import fnmatch import array #Read a table from a csv file - if boolean variable snps_only = TRUE, then read only snps #-if int variable bppos is >0 , then only rows with well defined positions are read, positions column is given #by the number in int_only class csvtable: def __init__(self, directory, first_col, last_col, snps_only, bppos, header): self.data = [] with open(directory, 'rU') as f: reader = csv.reader(f) if (snps_only == True) & (bppos>0): if (header == True): head = reader.next() h = [head[bppos], head[first_col:last_col]] self.data.append(h) for row in reader: pos = float(row[bppos]) snps = row[first_col:last_col] if (pos % int(pos) == 0) & (allsnps(snps) == True): snps = row[first_col:last_col] snps = [int(pos), snps] self.data.append(snps) else: for row in reader: snps = row[first_col:last_col] self.data.append(snps) f.close() #Check that all values in a string list l are SNPs def allsnps(l): flag = True lengths = [len(element) for element in l] if sum(lengths)> len(l) : flag = False return flag #Read a column from a csv file as a vector class csvvector: def __init__(self, directory, column): self.data = [] with open(directory, 'rU') as f: reader = csv.reader(f) for row in reader: self.data.append(row[column]) f.close() #Read a fasta file class read_fasta_file: def __init__(self, infile, chrom_num): self.sequences = [] with open (infile, 'rU') as f: sequence = "" line = f.readline().rstrip() title = line while line != "": line = f.readline().rstrip() if not line.startswith(">"): sequence = sequence +line else: self.sequences = self.sequences + [(title, sequence)] title = line sequence = "" f.close() def verifybreaks(bamdir, breaks, bp, variants): final_breaks = [] print(bamdir) for file in os.listdir(bamdir): print(file) if fnmatch.fnmatch(file, '*.bam'): samfile = pysam.Samfile(bamdir+"/"+file, "rb") line = samfile.header['RG'][0]['SM'] print(line) curr_line_breaks = [[nn, mline, chrom, sp, ep, bef, aft, int(yes), int(no)] for [nn, mline, chrom, (sp, ep, bef, aft, yes, no)] in breaks if mline == line] for breakpoint in curr_line_breaks: iter = samfile.fetch(current_chromosome, breakpoint[3], breakpoint[3]+1) for x in iter: flag = 0 if (x.pos + len(x.seq) >= breakpoint[4]): print "Position is ", x.pos+1, "and read sequence is ", x.seq read_start, read_end, qual = x.pos+1, x.pos+len(x.seq), x.qual snps = [snp for snp in bp if ((snp >= read_start) & (snp <= read_end))] contr = 0 for snp in snps: snp_in_read = x.seq[snp-read_start] snp_index = bp.index(snp) if (snp < breakpoint[3]): snp_in_variants = variants[bp.index(snp)][founders[breakpoint[5]]][0] if ((snp_in_variants in matches[snp_in_read])==False): flag = flag+1 contr = contr +1 print "Snp index is ", bp.index(snp) print "Read: ", snp_in_read, " Variants: ", snp_in_variants elif (snp > breakpoint[4]): snp_in_variants = variants[bp.index(snp)][founders[breakpoint[6]]][0] if ((snp_in_variants in matches[snp_in_read])==False): flag = flag+1 contr = contr +1 print "Snp index is ", bp.index(snp) print "Read: ", snp_in_read, " Variants: ", snp_in_variants if (flag == 0): breakpoint[7] = breakpoint[7]+1 else: breakpoint[8] = breakpoint[8]+1 print "Breakpoint :", breakpoint else: a = 0 #print ("No coverage by a single read") final_breaks.append(breakpoint) samfile.close() return final_breaks #Script #bamdir='/Users/martha/Documents/DPhil/Recombination/Data/MagicLines/BamFiles/' gendir = '/Net/x9000/projects/mott/xiang/magicpool/' #Low coverage folders #bamdirs = [gendir + 'bcarab_1st/recon/', gendir + 'bcarab_2nd/recon/', gendir + 'bcarab_3rd/recon/', gendir + 'bcarab_4th/recon/', gendir + 'bcarab_5th/recon/', gendir + 'bcarab_6th/recon/', gendir + 'bcarab_7th/recon/'] #High coverage folder bamdirs = [gendir + 'bcarab_hi10/'] fdir = '/data/martha/Recombination/data/' #reffile = fdir + '19Genomes/ReferenceGenome/tair9.masked.fa' #writefile = '/Users/martha/Documents/DPhil/Recombination/Results/Reads/LowCoverage/breaksverified1.csv' writefile = '/data/martha/Recombination/breaksver-highcov.csv' accessions = ["bur-0","can-0","col-0","ct-1","edi-0","hi-0","kn-0","ler-0","mt-0","no-0","oy-0","po-0","rsch-4","sf-2","tsu-0","wil-2","ws-0","wu-0","zu-0"] matches = {'A':['A', 'W', 'M', 'R', 'D', 'H', 'V', 'N', '0'], 'C':['C', 'S', 'M', 'Y', 'B', 'H', 'V', 'N', '0'], 'G':['G', 'S', 'K', 'R', 'B', 'D', 'V', 'N', '0'], 'T':['T', 'W', 'K', 'Y', 'B', 'D', 'H', 'N', '0'], 'N':['A', 'C', 'G', 'T', 'W', 'S', 'M', 'K', 'R', 'Y', 'B', 'D', 'H', 'V', 'N', '0']} final_breaks = [] chromosomes = ['1','2','3','4','5'] #chromosomes = ['5'] breakpoints = csvtable(fdir + 'breaks.csv', 0, 9, False, 0, True) allbreaks = breakpoints.data for bamdir in bamdirs: for chrom_n in chromosomes: current_chromosome = 'Chr'+chrom_n print (current_chromosome) breaks = [[nn, mline, chrom, (int(sp), int(ep), bef, aft, int(yes), int(no))] for [nn, mline, chrom, sp, ep, bef, aft, yes, no] in allbreaks if chrom == chrom_n] chrom = csvtable(fdir + current_chromosome + '.alleles.csv', 6, 25, True, 3, True) #variants contains: SNP position + a list of variants for each annotation variants = chrom.data founders = dict([(variants[0][1][i], i) for i in range(19)]) bp = [var[0] for var in variants[1:len(variants)]] bp.insert(0, 0) variants = [var[1] for var in variants[0:len(variants)]] final_breaks += verifybreaks(bamdir, breaks, bp, variants) #write to csv with open(writefile, 'wb') as f: writer = csv.writer(f) writer.writerows(final_breaks) writebreaks = csv.writer(open(writefile, 'wb')) writebreaks.writerows(final_breaks) #RS #columbia = read_fasta_file("/Users/martha/Documents/DPhil/Recombination/Data/19Genomes/ReferenceGenome/tair9.masked.fa", 7) #ref = columbia.sequences[int(chrom_n)-1] #ref = ref[1] #From each bam file extract reads that span the breakpoint