""" genome mapping, Python module that takes care of quick remapping of genomic coordinates written: Oliver Stegle, November 2010 """ import h5py import scipy as SP import os import sys import pdb import string class MappingTool(object): def __init__(self,mapping_file_name=None,h5db=None): """either mapping_file (mapping_file) or open database handle (h5db)""" self.pos_map = None self.mapping_file_name = mapping_file_name self.h5db = h5db self.name = None self.chrom = None self.cached = False def get_chrom_len_ref(self): """chromosome lengths of reference (col-0)""" return self.pos_map[:,1].max() def get_chrom_len_strain(self): """chromosome lengths of strain""" return self.pos_map[:,2].max() def cache_chromosome(self,name,chrom): """cache the translation tables for a particular strin (name) and chromosome (chrom) Note: this needs quite a bit of ram to build up the forward and reverse mapping tables """ self.name=name self.chrom=chrom self._open_h5db() #1. check whether chrom is in translation table #check name: if self.h5db.get(name) is not None: name = name elif self.h5db.get(string.upper(name)) is not None: name = string.upper(name) elif self.h5db.get(string.lower(name)) is not None: name = string.lower(name) else: print "name %s not in mapping DB" % (name) return False #read with current name chr_names = self.h5db[name].items() chr_names = [item[0] for item in chr_names] if chrom not in chr_names: self.cached = False return False self.pos_map = self.h5db[name][chrom] pos_map_col = self.pos_map[:,1] pos_map_strain = self.pos_map[:,2] #initiate reference lookup Imiss_col = (pos_map_col==-1) Imiss_strain= (pos_map_strain==-1) col0_chr_len = pos_map_col.max() strain_chr_len = pos_map_strain.max() #inverse mapps self.inv_map_strain = (-1)*SP.ones([strain_chr_len+1],dtype='int') self.inv_map_col = (-1)*SP.ones([col0_chr_len+1],dtype='int') self.inv_map_strain[pos_map_strain[~Imiss_strain]]=pos_map_col[~Imiss_strain] self.inv_map_col[pos_map_col[~Imiss_col]]=pos_map_strain[~Imiss_col] self.cached = True return True def map_coordinate(self,strain=None,col0=None): """map coordinates from strain to col0 or wise versa strain: strain coordinates of curren mapping chromosome col0 : col0 coordinates of current mapping chromosome See also cache_chromosome """ if strain is not None: if not self.cached: return SP.ones_like(strain)*(-1) return self.inv_map_strain[strain] elif col0 is not None: if not self.cached: return SP.ones_like(col0)*(-1) return self.inv_map_col[col0] pass #####PRIVATE##### def _open_h5db(self): """open h5db handle if not open already""" if self.h5db is None: self.h5db = h5py.File(self.mapping_file_name,'r') def _binary_search(self,x,a,lo=0,hi=None): """binary search""" if hi is None: hi = len(a) while lo < hi: mid = (lo+hi)//2 midval = a[mid] if midval < x: lo = mid+1 elif midval > x: hi = mid else: return mid return -1