from tables import * from itertools import * from numpy.random import randint import matplotlib matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab! import matplotlib.pyplot as plt import numpy as np from numpy import array, dtype, newaxis,zeros,ones,log from bisect import * import sys max=3000000000 #chroms=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chrX', 'chrY'] chroms=[ 'chrY'] def main(infile, outfile): print "#infile=%s, outfile=%s" % (infile, outfile) def index(a, x): 'Locate the leftmost value exactly equal to x. If not present return -1' i = bisect_left(a, x) if i != len(a) and a[i] == x: return i return -1 def getwildsites(wildsearch): h5file = openFile(infile, "r") wildsites = h5file.root.Genotypes.Sites wildsitesrows = wildsites.readWhere(wildsearch) wildsitesrows=wildsitesrows[:max] wildsitespositions = tuple((x[0][3:], x[1]) for x in wildsitesrows) wildgenos = h5file.root.Genotypes wherelist = wildsites.getWhereList(wildsearch) return wildgenos, wildsites, wildsitesrows, wildsitespositions, wherelist def printcounts(): for i, wildtestchr in enumerate(chroms): wildsearch = "(chrom=='%s') " % wildtestchr wildgenos, wildsites, wildsitesrows, wildsitespositions, wildsiteswherelist = getwildsites(wildsearch) totalpass = [x for x in wildsitesrows if x[6].startswith('PASS')] totalpasswherelist = [y for x, y in zip(wildsitesrows, wildsiteswherelist) if x[6].startswith('PASS')] samplen = 100000 # how many to sample qualitysample = [x[1] for x in array(totalpass)[randint(0,len(totalpass), samplen)]] fig = plt.figure() plt.hist(qualitysample, bins=1000, normed=True) fig.savefig('pos%s.png'%wildtestchr) def getsamplenames(wildgenos): samplenameswild = [x for x in dict(wildgenos._v_leaves).keys() if x.startswith('sample')] return samplenameswild printcounts() wildgenos, wildsites, wildsitesrows, wildsitespositions, wildsiteswherelist = getwildsites("(chrom=='chr19') ") # samplenameswild= getsamplenames(wildgenos) # wildcoords = wildsites.getWhereList(wildsearch)[:max] # passedsnps=[y for x, y in zip(wildsitesrows, np.arange(len(wildcoords))) if x[6].startswith('FINAL')] # wildcoords = wildcoords[passedsnps] inputfile = sys.argv[1] if len(sys.argv)>1 else "geno.h5" outfile=sys.argv[2] if len(sys.argv)>2 else "snpcounts.txt" if __name__ == "__main__": main(inputfile, outfile)