#!/usr/bin/env python
import csv
import os
import pysam
import re
import fnmatch
import array

#Read a table from a csv file - if boolean variable snps_only = TRUE, then read only snps
#-if int variable bppos is >0 , then only rows with well defined positions are read, positions column is given
#by the number in int_only
class csvtable:
    def __init__(self, directory, first_col, last_col, snps_only, bppos, header):
        self.data = []
        with open(directory, 'rU') as f:
            reader = csv.reader(f)
            if (snps_only == True) & (bppos>0):
                if (header == True):
                    head = reader.next()
                    h = [head[bppos], head[first_col:last_col]]
                    self.data.append(h)
                    for row in reader:
                        pos = float(row[bppos])
                        snps = row[first_col:last_col]
                        if (pos % int(pos) == 0) & (allsnps(snps) == True):
                            snps = row[first_col:last_col]
                            snps = [int(pos), snps]
                            self.data.append(snps)
            else:
                for row in reader:
                    snps = row[first_col:last_col]
                    self.data.append(snps)
            f.close()

#Check that all values in a string list l are SNPs
def allsnps(l):
    flag = True
    lengths = [len(element) for element in l]
    if sum(lengths)> len(l) : flag = False
    return flag
        

#Read a column from a csv file as a vector
class csvvector:
    def __init__(self, directory, column):
        self.data = []
        with open(directory, 'rU') as f:
            reader = csv.reader(f)
            for row in reader:
                self.data.append(row[column])
            f.close()
            
#Read a fasta file
class read_fasta_file:
    def __init__(self, infile, chrom_num):
        self.sequences = []
        with open (infile, 'rU') as f:
            sequence = ""
            line = f.readline().rstrip()
            title = line
            while line != "":
                line = f.readline().rstrip()
                if not line.startswith(">"):
                    sequence = sequence +line
                else:
                    self.sequences = self.sequences + [(title, sequence)]
                    title = line
                    sequence = ""
            f.close()

def verifybreaks(bamdir, breaks, bp, variants):
    final_breaks = []
    print(bamdir)
    for file in os.listdir(bamdir):
        print(file)
        if fnmatch.fnmatch(file, '*.bam'):
            samfile = pysam.Samfile(bamdir+"/"+file, "rb")
            line = samfile.header['RG'][0]['SM']
	    print(line)	
            curr_line_breaks = [[nn, mline, chrom, sp, ep, bef, aft, int(yes), int(no)] for [nn, mline, chrom, (sp, ep, bef, aft, yes, no)] in breaks if mline == line]
            for breakpoint in curr_line_breaks:
                iter = samfile.fetch(current_chromosome, breakpoint[3], breakpoint[3]+1)
                for x in iter:
                    flag = 0
                    if (x.pos + len(x.seq) >= breakpoint[4]):
			print "Position is ", x.pos+1, "and read sequence is ", x.seq
                        read_start, read_end, qual = x.pos+1, x.pos+len(x.seq), x.qual
                        snps = [snp for snp in bp if ((snp >= read_start) & (snp <= read_end))]
                        contr = 0
                        for snp in snps:
                            snp_in_read = x.seq[snp-read_start]
                            snp_index = bp.index(snp)
                            if (snp < breakpoint[3]):
                                snp_in_variants = variants[bp.index(snp)][founders[breakpoint[5]]][0] 
                                if ((snp_in_variants in matches[snp_in_read])==False):
                                    flag = flag+1
                                    contr = contr +1
				    print "Snp index is ", bp.index(snp)
				    print "Read: ", snp_in_read, " Variants: ", snp_in_variants		
                            elif (snp > breakpoint[4]):
                                snp_in_variants = variants[bp.index(snp)][founders[breakpoint[6]]][0] 
                                if ((snp_in_variants in matches[snp_in_read])==False):
                                    flag = flag+1
                                    contr = contr +1
				    print "Snp index is ", bp.index(snp)
				    print "Read: ", snp_in_read, " Variants: ", snp_in_variants		
                        if (flag == 0):
                            breakpoint[7] = breakpoint[7]+1
                        else:
                            breakpoint[8] = breakpoint[8]+1
                        print "Breakpoint :", breakpoint
                    else: a = 0 #print ("No coverage by a single read")
                final_breaks.append(breakpoint)
            samfile.close()
    return final_breaks
    
    
#Script
#bamdir='/Users/martha/Documents/DPhil/Recombination/Data/MagicLines/BamFiles/'
gendir = '/Net/x9000/projects/mott/xiang/magicpool/'
#Low coverage folders
#bamdirs = [gendir + 'bcarab_1st/recon/', gendir + 'bcarab_2nd/recon/', gendir + 'bcarab_3rd/recon/', gendir + 'bcarab_4th/recon/', gendir + 'bcarab_5th/recon/', gendir + 'bcarab_6th/recon/', gendir + 'bcarab_7th/recon/'] 
#High coverage folder
bamdirs = [gendir + 'bcarab_hi10/']
fdir = '/data/martha/Recombination/data/'
#reffile = fdir + '19Genomes/ReferenceGenome/tair9.masked.fa'
#writefile = '/Users/martha/Documents/DPhil/Recombination/Results/Reads/LowCoverage/breaksverified1.csv'
writefile = '/data/martha/Recombination/breaksver-highcov.csv'

accessions = ["bur-0","can-0","col-0","ct-1","edi-0","hi-0","kn-0","ler-0","mt-0","no-0","oy-0","po-0","rsch-4","sf-2","tsu-0","wil-2","ws-0","wu-0","zu-0"]

matches = {'A':['A', 'W', 'M', 'R', 'D', 'H', 'V', 'N', '0'],
           'C':['C', 'S', 'M', 'Y', 'B', 'H', 'V', 'N', '0'],
           'G':['G', 'S', 'K', 'R', 'B', 'D', 'V', 'N', '0'],
           'T':['T', 'W', 'K', 'Y', 'B', 'D', 'H', 'N', '0'],
           'N':['A', 'C', 'G', 'T', 'W', 'S', 'M', 'K', 'R', 'Y', 'B', 'D', 'H', 'V', 'N', '0']}

final_breaks = []
chromosomes = ['1','2','3','4','5']
#chromosomes = ['5']
breakpoints = csvtable(fdir + 'breaks.csv', 0, 9, False, 0, True)
allbreaks = breakpoints.data


for bamdir in bamdirs:
    for chrom_n in chromosomes:
        current_chromosome = 'Chr'+chrom_n
        print (current_chromosome)
        breaks = [[nn, mline, chrom, (int(sp), int(ep), bef, aft, int(yes), int(no))] for [nn, mline, chrom, sp, ep, bef, aft, yes, no] in allbreaks if chrom == chrom_n]
        chrom = csvtable(fdir + current_chromosome + '.alleles.csv', 6, 25, True, 3, True)
        #variants contains: SNP position + a list of variants for each annotation
        variants = chrom.data
        founders = dict([(variants[0][1][i], i) for i in range(19)])
        bp = [var[0] for var in variants[1:len(variants)]]
        bp.insert(0, 0)
        variants = [var[1] for var in variants[0:len(variants)]]
        final_breaks += verifybreaks(bamdir, breaks, bp, variants)

#write to csv
with open(writefile, 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(final_breaks)
writebreaks = csv.writer(open(writefile, 'wb'))
writebreaks.writerows(final_breaks)


#RS
#columbia = read_fasta_file("/Users/martha/Documents/DPhil/Recombination/Data/19Genomes/ReferenceGenome/tair9.masked.fa", 7)
#ref = columbia.sequences[int(chrom_n)-1]
#ref = ref[1]


#From each bam file extract reads that span the breakpoint