"""calculates ibs matrix between all wild and inbred strains""" from tables import * from itertools import * from numpy.random import randint import matplotlib matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab! import matplotlib.pyplot as plt import numpy as np from numpy import array, dtype, newaxis,zeros,ones,log from bisect import * import sys max=3000000000 wildtestchr="chr19" inbredtestchr="19" inbredsearch = "(chrom=='%s')" % inbredtestchr wildsearch = "(chrom=='%s') " % wildtestchr def main(infile, outfile): def index(a, x): 'Locate the leftmost value exactly equal to x. If not present return -1' i = bisect_left(a, x) if i != len(a) and a[i] == x: return i return -1 def getwildsites(): h5file = openFile(infile, "r") wildsites = h5file.root.Genotypes.Sites wildsitesrows = wildsites.readWhere(wildsearch) wildsitesrows=wildsitesrows[:max] wildsitespositions = tuple((x[0][3:], x[1]) for x in wildsitesrows) wildgenos = h5file.root.Genotypes return wildgenos, wildsites, wildsitesrows, wildsitespositions def getinbredsites(): h5file2 = openFile("inbredgeno.h5", "r") inbredsites = h5file2.root.Genotypes.Sites #index=siteinbred.cols.chrom.createIndex() inbredsitesrows=inbredsites.readWhere(inbredsearch) inbredsitesrows=inbredsitesrows[:max] inbredsitespositions=tuple((x[0], x[1]) for x in inbredsitesrows) inbredgenos = h5file2.root.Genotypes return inbredgenos, inbredsites, inbredsitesrows, inbredsitespositions def printoverlapsandfigures(): uniqueinbredsites = set(inbredsitespositions) uniquewildsites = set(wildsitespositions) print 'wild snps ', len(uniquewildsites) print '17 strains snps', len(uniqueinbredsites) print 'intersection ', len(uniquewildsites&uniqueinbredsites) novel = uniquewildsites-uniqueinbredsites print 'snps in wild but not in 17 strains ', len(novel) novelrows = [x for x in wildsitesrows if (x[0][3:], x[1]) in novel] novelpass=[x for x in novelrows if x[6].startswith('PASS')] print 'novel snps that pass', len(novelpass), ' a proportion of ', len(novelpass)/float(len(novelrows)) totalpass=[x for x in wildsitesrows if x[6].startswith('PASS')] print 'total that pass', len(totalpass), ' a proportion of ' , len(totalpass)/float(len(wildsitesrows)) samplen=10000 # how many to sample wildsitesrows_r = wildsitesrows.view(np.recarray) qualitysample = wildsitesrows_r[randint(0,len(wildsitesrows_r), samplen)].qual fig = plt.figure() plt.hist(qualitysample,bins=100, log=True, range=(0, 30000), normed=True) fig.savefig('qualitysample.png') r_novel = np.array(novelrows, dtype=novelrows[0].dtype).view(np.recarray) qualitysample = r_novel[randint(0, len(r_novel), samplen)].qual fig = plt.figure() plt.hist(qualitysample, bins=100, log=True, range=(0, 30000) ,normed=True) fig.savefig('qualitynovel.png') def getsamplenames(wildgenos, inbredgenos): samplenameswild = [x for x in dict(wildgenos._v_leaves).keys() if x.startswith('sample')] samplenamesinbred=[x for x in dict(inbredgenos._v_leaves).keys() if x.startswith('sample')] return samplenameswild, samplenamesinbred def getsampleslist(): #retuturns list of tuple(sites, genotypes) representing all samples allsamples = [(inbredsitespositions, getattr(inbredgenos, name)[inbredcoords]) for name in samplenamesinbred] selectedwildsites=tuple([tuple(x) for x in array(wildsitespositions)[array(passedsnps)]]) allsamples += [(selectedwildsites, getattr(wildgenos, name)[wildcoords]) for name in samplenameswild] return allsamples wildgenos, wildsites, wildsitesrows, wildsitespositions= getwildsites() inbredgenos, inbredsites, inbredsitesrows, inbredsitespositions = getinbredsites() samplenameswild, samplenamesinbred = getsamplenames(wildgenos, inbredgenos) printoverlapsandfigures() #have to get the genotypes that correspond to sites in the test set whatever is in the test set inbredcoords = inbredsites.getWhereList(inbredsearch)[:max] wildcoords = wildsites.getWhereList(wildsearch)[:max] passedsnps=[y for x, y in zip(wildsitesrows, np.arange(len(wildcoords))) if x[6].startswith('PASS')] wildcoords=wildcoords[passedsnps] allsamples=getsampleslist() ## Get all of the samples and compare them one with another. if the site is in one but not the other ## then the one without a site is assumed to be a reference allele. ## Need to build a matrix of IBS. numsamples=len(allsamples) ibsmatrix=zeros((numsamples, numsamples), dtype=int) lookupix={} # dict (list1, list2) of list1 indexes of sites in list2. Contains either the index or a -1. print 'num samples = ',len(allsamples) for i, (sitesi, genotypesi) in enumerate(allsamples): for j, (sitesj, genotypesj) in enumerate(allsamples): print i, j sys.stdout.flush() if i= 0, genotypesi[ixt], nullstructi) #genotype 'dots' are encoded as a struct of -1 values. This means uncalled. I'll assume ref for now. Also non matching snps are encoding as the same #calculate number of identical alleles #calculate a single genotype score representing number of non ref alleles igt0 = matchedgenotypesi.view(np.recarray)['GT_0'] igt1 = matchedgenotypesi.view(np.recarray)['GT_1'] #if nonmatching site then its homozygous ref, likewise if uncalled otherwise the number of non zeros is the number of non ref alleles. iscores=(igt0 > 0) * 1 + (igt1 > 0) * 1 matchedgenotypesj = genotypesj jgt0 = matchedgenotypesj.view(np.recarray)['GT_0'] jgt1 = matchedgenotypesj.view(np.recarray)['GT_1'] #if nonmatching site then its homozygous ref, likewise if uncalled otherwise the number of non zeros is the number of non ref alleles. jscores = (jgt0 > 0) * 1 + (jgt1 > 0) * 1 diff = abs(jscores - iscores) #need to check the number of mismatching alleles between the two samples ibsmatrix[i, j] = sum(diff) np.savetxt(outfile, ibsmatrix) inputfile = sys.argv[1] if len(sys.argv)>1 else "geno.h5" outfile=sys.argv[2] if len(sys.argv)>2 else "ibsmatrix.txt" if __name__ == "__main__": main(inputfile, outfile)