""" This module contains various functions and classes for handling variant information, including utilities for generating combinations of variants, haplotypes, and genotypes. """ from __future__ import division import logging import os import pysam from variant import Variant from pysam import version FILE_VAR = 2 # Duplication of definition in variant.pxd logger = logging.getLogger("Log") ################################################################################################### class VariantCandidateReader(object): """ A class to read variant information from a vcf file, and return a stream of variant instances. """ def __init__(self, fileNames, options): """ Constructor. Takes the name of a vcf file. """ self.options = options self.vcfFiles = [] for fileName in fileNames: # Currently creating the index and compressed VCFs with pysam doesn't work if ".gz" not in fileName: #if not os.path.exists(fileName + ".gz"): # pysam.ctabix.tabix_compress(fileName, fileName + ".gz", force=True) #if not os.path.exists(fileName + ".gz.tbi"): # pysam.ctabix.tabix_index(fileName + ".gz", preset='vcf', force=True) logger.error("") logger.error("Source File %s does not look like a bgzipped VCF (i.e. name does not end in .gz)" %(fileName)) logger.error("You must supply a VCF that has been compressed with bgzip, and indexed with tabix") logger.error("Compress the VCF using bgzip: bgzip %s --> %s.gz" %(fileName, fileName)) logger.error("Remember to use the 'tabix -p vcf %s.gz' command to index the compressed file" %(fileName)) logger.error("This should create an index file: %s.gz.tbi" %(fileName)) logger.error("") raise StandardError, "Input VCF source file %s was not compressed and indexed" %(fileName) #if "gz" not in fileName: # self.vcfFiles.append(pysam.Tabixfile(fileName + ".gz")) #else: # self.vcfFiles.append(pysam.Tabixfile(fileName)) # Compressed VCF. Tabix will complain if there's no index. else: self.vcfFiles.append(pysam.Tabixfile(fileName)) def __del__(self): """ Destructor. Make sure to close files. """ pass def Variants(self, chromosome, start, end): """ Generator funtion. Yields variants in order of genomic co-ordinate. """ vcfLines = None varList = [] offByOne = False maxSize = self.options.maxSize if float(version.__version__) < 0.6: offByOne = True for vcfFile in self.vcfFiles: try: vcfLines = vcfFile.fetch(chromosome, start, end, parser=pysam.asVCF()) except Exception, e: logger.warning("Could not retrieve variants from source file in region %s:%s-%s. Error was %s" %(chromosome,start,end,e)) continue for line in vcfLines: chrom = line.contig pos = line.pos if offByOne: pos -= 1 ref = line.ref altCol = line.alt alts = altCol.split(",") lenRef = len(ref) for alt in alts: lenAlt = len(alt) varSize = abs(lenAlt - lenRef) if varSize > maxSize: logger.debug("Skipping large variant of size %s in source file. Maximum allowed variant size is %s" %(varSize, maxSize)) continue # SNP if lenRef == 1 and lenAlt == 1: var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR) varList.append(var) # MNP elif lenRef == lenAlt: var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR) varList.append(var) # Anything else else: # VCF4 is -1 indexed for indels, so trim off first base tempRef = ref[1:] tempAlt = alt[1:] tempPos = pos removed = tempRef added = tempAlt # Trim the matching bits off and shift position. This will decompose # multi-variant sites into individual alleles at different positions. while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[0] == tempAlt[0]: tempRef = tempRef[1:] tempAlt = tempAlt[1:] removed = tempRef added = tempAlt tempPos +=1 # Skip weird cases for now if len(removed) != 0 and len(added) != 0: continue #logger.error("Dodgy variant found at %s:%s, with ref=%s, alt = %s" %(chrom,pos,ref,alt)) #logger.error("This will probably break something later on...") var = Variant(chromosome, tempPos, removed, added, 0, FILE_VAR) varList.append(var) varList = sorted(list(set(varList))) logger.debug("Found %s variants in region %s in source file" %(len(varList), "%s:%s-%s" %(chromosome,start,end))) return varList ###################################################################################################