from tables import * from itertools import * from numpy.random import randint import matplotlib matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab! import matplotlib.pyplot as plt import numpy as np from numpy import array, dtype, newaxis,zeros,ones,log from bisect import * import sys max=3000000000 chroms=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chrX', 'chrY'] #chroms=[ 'chrY'] def main(infile, outfile): print "#infile=%s, outfile=%s" % (infile, outfile) def index(a, x): 'Locate the leftmost value exactly equal to x. If not present return -1' i = bisect_left(a, x) if i != len(a) and a[i] == x: return i return -1 def getwildsites(wildsearch): h5file = openFile(infile, "r") wildsites = h5file.root.Genotypes.Sites wildsitesrows = wildsites.readWhere(wildsearch) wildsitesrows=wildsitesrows[:max] wildsitespositions = tuple((x[0][3:], x[1]) for x in wildsitesrows) wildgenos = h5file.root.Genotypes wherelist = wildsites.getWhereList(wildsearch) return wildgenos, wildsites, wildsitesrows, wildsitespositions, wherelist def printcounts(): for i, wildtestchr in enumerate(chroms): wildsearch = "(chrom=='%s') " % wildtestchr wildgenos, wildsites, wildsitesrows, wildsitespositions, wildsiteswherelist = getwildsites(wildsearch) totalpass = [x for x in wildsitesrows if x[6].startswith('PASS')] totalpasswherelist = [y for x, y in zip(wildsitesrows, wildsiteswherelist) if x[6].startswith('PASS')] missing=[] for samplename in getsamplenames(wildgenos): samplegenotypes = getattr(wildgenos, samplename) samplegenotypesthischrom = samplegenotypes[totalpasswherelist] missing.append(str(sum(samplegenotypesthischrom["GT_0"]!=-1))) missingcolumns = "\t".join(missing) if i==0: print '%s\t%s\t%s\t%s\t%s' % ('wildtestchr', 'pass', 'total', 'proportion', '\t'.join(getsamplenames(wildgenos))) print '%s\t%s\t%s\t%s\t%s' % (wildtestchr, len(totalpass), len(wildsitesrows), len(totalpass)/float(len(wildsitesrows)), missingcolumns) samplen = 10000 # how many to sample wildsitesrows_r = wildsitesrows.view(np.recarray) qualitysample = wildsitesrows_r[randint(0,len(wildsitesrows_r), samplen)].qual # fig = plt.figure() # plt.hist(qualitysample,bins=100, log=True, range=(0, 30000), normed=True) # fig.savefig('qualitysample.png') def getsamplenames(wildgenos): samplenameswild = [x for x in dict(wildgenos._v_leaves).keys() if x.startswith('sample')] return samplenameswild printcounts() wildgenos, wildsites, wildsitesrows, wildsitespositions, wildsiteswherelist = getwildsites("(chrom=='chr19') ") # samplenameswild= getsamplenames(wildgenos) # wildcoords = wildsites.getWhereList(wildsearch)[:max] # passedsnps=[y for x, y in zip(wildsitesrows, np.arange(len(wildcoords))) if x[6].startswith('FINAL')] # wildcoords = wildcoords[passedsnps] inputfile = sys.argv[1] if len(sys.argv)>1 else "geno.h5" outfile=sys.argv[2] if len(sys.argv)>2 else "snpcounts.txt" if __name__ == "__main__": main(inputfile, outfile)