"""calculates ibs matrix between all wild and inbred strains"""
from tables   import *
from itertools import *
from numpy.random import randint
import matplotlib
matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import numpy as np
from numpy import array, dtype, newaxis,zeros,ones,log
from bisect import *
import sys
max=3000000000
wildtestchr="chr19"
inbredtestchr="19"
inbredsearch = "(chrom=='%s')" % inbredtestchr
wildsearch = "(chrom=='%s') " % wildtestchr

def main(infile, outfile):
    def index(a, x):
        'Locate the leftmost value exactly equal to x. If not present return -1'
        i = bisect_left(a, x)
        if i != len(a) and a[i] == x:
            return i
        return -1

    def getwildsites():
        h5file = openFile(infile, "r")
        wildsites = h5file.root.Genotypes.Sites
        wildsitesrows = wildsites.readWhere(wildsearch)
        wildsitesrows=wildsitesrows[:max]
        wildsitespositions = tuple((x[0][3:], x[1]) for x in wildsitesrows)
        wildgenos = h5file.root.Genotypes
        return wildgenos, wildsites, wildsitesrows, wildsitespositions

    def getinbredsites():
        h5file2 = openFile("inbredgeno.h5", "r")
        inbredsites = h5file2.root.Genotypes.Sites
        #index=siteinbred.cols.chrom.createIndex()
        inbredsitesrows=inbredsites.readWhere(inbredsearch)
        inbredsitesrows=inbredsitesrows[:max]
        inbredsitespositions=tuple((x[0], x[1]) for x in inbredsitesrows)
        inbredgenos = h5file2.root.Genotypes
        return inbredgenos, inbredsites, inbredsitesrows, inbredsitespositions

    def printoverlapsandfigures():
        uniqueinbredsites = set(inbredsitespositions)
        uniquewildsites = set(wildsitespositions)
        print 'wild snps ', len(uniquewildsites)
        print '17 strains snps', len(uniqueinbredsites)
        print 'intersection ', len(uniquewildsites&uniqueinbredsites)
        novel = uniquewildsites-uniqueinbredsites
        print 'snps in wild but not in 17 strains ', len(novel)
        novelrows = [x for x in wildsitesrows if (x[0][3:], x[1]) in novel]
        novelpass=[x for x in novelrows if x[6].startswith('PASS')]
        print 'novel snps that pass', len(novelpass), ' a proportion of ', len(novelpass)/float(len(novelrows))
        totalpass=[x for x in wildsitesrows if  x[6].startswith('PASS')]
        print 'total that pass', len(totalpass), ' a proportion of ' , len(totalpass)/float(len(wildsitesrows))
        samplen=10000 # how many to sample
        wildsitesrows_r = wildsitesrows.view(np.recarray)
        qualitysample = wildsitesrows_r[randint(0,len(wildsitesrows_r), samplen)].qual
        fig = plt.figure()
        plt.hist(qualitysample,bins=100, log=True, range=(0, 30000), normed=True)
        fig.savefig('qualitysample.png')
        r_novel = np.array(novelrows, dtype=novelrows[0].dtype).view(np.recarray)
        qualitysample = r_novel[randint(0, len(r_novel), samplen)].qual
        fig = plt.figure()
        plt.hist(qualitysample, bins=100, log=True, range=(0, 30000) ,normed=True)
        fig.savefig('qualitynovel.png')

    def getsamplenames(wildgenos, inbredgenos):
        samplenameswild = [x for x in dict(wildgenos._v_leaves).keys() if x.startswith('sample')]
        samplenamesinbred=[x for x in dict(inbredgenos._v_leaves).keys() if x.startswith('sample')]
        return samplenameswild, samplenamesinbred

    def getsampleslist():
        #retuturns list of tuple(sites, genotypes) representing all samples
        allsamples = [(inbredsitespositions, getattr(inbredgenos, name)[inbredcoords]) for name in samplenamesinbred]
        selectedwildsites=tuple([tuple(x) for x in array(wildsitespositions)[array(passedsnps)]])
        allsamples += [(selectedwildsites, getattr(wildgenos, name)[wildcoords]) for name in  samplenameswild]
        return allsamples

    wildgenos, wildsites, wildsitesrows, wildsitespositions= getwildsites()
    inbredgenos, inbredsites, inbredsitesrows, inbredsitespositions = getinbredsites()
    samplenameswild, samplenamesinbred = getsamplenames(wildgenos, inbredgenos)
    printoverlapsandfigures()
    #have to get the genotypes that correspond to sites in the test set whatever is in the test set
    inbredcoords = inbredsites.getWhereList(inbredsearch)[:max]
    wildcoords = wildsites.getWhereList(wildsearch)[:max]
    passedsnps=[y for x, y in zip(wildsitesrows, np.arange(len(wildcoords))) if  x[6].startswith('PASS')]
    wildcoords=wildcoords[passedsnps]
    allsamples=getsampleslist()
    ## Get all of the samples and compare them one with another. if the site is in one but not the other
    ## then the one without a site is assumed to be a reference allele.
    ## Need to build a matrix of IBS.
    numsamples=len(allsamples)
    ibsmatrix=zeros((numsamples, numsamples), dtype=int)
    lookupix={} # dict (list1, list2)  of  list1 indexes of sites in list2. Contains either the index or a -1.
    print 'num samples = ',len(allsamples)
    for i, (sitesi, genotypesi) in enumerate(allsamples):
        for j, (sitesj, genotypesj) in enumerate(allsamples):
            print i, j
            sys.stdout.flush()
            if i<j:
                continue
            #represents: for each site in j  the matching site in the sample i
            ixt = lookupix.get((sitesi, sitesj))
            if ixt is None:
                lookupix[(sitesi, sitesj)] = [index(sitesi, x) for x in sitesj]
                ixt=lookupix.get((sitesi,sitesj))

            #array of null rows
            nullstructi = array([(-1,) * len(genotypesi.dtype)], dtype=genotypesi.dtype).repeat(len(ixt))
#            if i==17 and j==17:
#                import pdb; pdb.set_trace()
            #ixt contains either the index or a -1. if it has an index then check the genotype, if its -1 then there is no matching site in sitej so add a null row
            matchedgenotypesi = np.where(array(ixt) >= 0, genotypesi[ixt], nullstructi)
            #genotype 'dots' are encoded as a struct of -1 values. This means uncalled. I'll assume ref for now. Also non matching snps are encoding as the same
            #calculate number of identical alleles
            #calculate a single genotype score representing number of non ref alleles
            igt0 = matchedgenotypesi.view(np.recarray)['GT_0']
            igt1 = matchedgenotypesi.view(np.recarray)['GT_1']
            #if nonmatching site then its homozygous ref, likewise if uncalled otherwise the number of non zeros is the number of non ref alleles.
            iscores=(igt0 > 0) * 1 + (igt1 > 0) * 1
            matchedgenotypesj = genotypesj
            jgt0 = matchedgenotypesj.view(np.recarray)['GT_0']
            jgt1 = matchedgenotypesj.view(np.recarray)['GT_1']
            #if nonmatching site then its homozygous ref, likewise if uncalled otherwise the number of non zeros is the number of non ref alleles.
            jscores = (jgt0 > 0) * 1 + (jgt1 > 0) * 1
            diff = abs(jscores - iscores)
            #need to check the number of mismatching alleles between the two samples
            ibsmatrix[i, j] = sum(diff)

    np.savetxt(outfile, ibsmatrix)

inputfile = sys.argv[1] if len(sys.argv)>1 else "geno.h5"
outfile=sys.argv[2] if len(sys.argv)>2 else "ibsmatrix.txt"

if __name__ == "__main__":
    main(inputfile, outfile)