""" Coordinate transformation from stran -> col-0 and vice versa This tools reads sam files and does read per read mapping. The mapping process is devidede in two parts. a) handling of sam files (this file) b) mapping itself (see genome_mapping) Written: Andre Kahles, Oliver Stegle November 2010 """ import sys #import path for h5py etc., we need this (Tuebingen) on some machines where h5py is not in the native path #sys.path.insert(0,'/agbs/agkbshare/software/LINUX64/python/lib/python2.6/site-packages') import genome_mapping as mapping import re import pdb import scipy as SP def parse_options(argv): """Parses options from the command line """ from optparse import OptionParser, OptionGroup parser = OptionParser() required = OptionGroup(parser, 'REQUIRED') required.add_option('-a', '--alignment', dest='alignment', metavar='FILE', help='alignment file in sam format', default='-') required.add_option('-o', '--outfile', dest='outfile', metavar='PATH', help='outfile', default='-') required.add_option('-t', '--trans_file', dest='trans_file', metavar='FILE', help='transformation file (hdf5 file with mapping coordinates)', default='-') required.add_option('-s', '--strain_name', dest='strain_name', help='Name of strain in transformation file', default='-') optional = OptionGroup(parser, 'OPTIONAL') optional.add_option('-e', '--skip_empty', dest='skip_empty_reads', action='store_true', help='verbosity', default=False) optional.add_option('-v', '--verbose', dest='verbose', action='store_true', help='verbosity', default=False) parser.add_option_group(required) parser.add_option_group(optional) (options, args) = parser.parse_args() if len(argv) < 3: parser.print_help() sys.exit(2) return options def map_strain_positions_debug(strain_name, chromosome, positions): """Debug function, not needed""" #return positions a = 17280390 test = [] pdb.set_trace() for i in range(len(positions)): test.append(a) a += 1 if a % 10 == 0: test[-1] = -1 test.append(-2) test.append(-2) return test def map_strain_positions(MP,strain_name, chromosome, positions): """Mapping function: MP: mapping object for the purpose of keeping an internal mapping cache chromosome: chromosome string (source coordinates) positions : vector with positions (source coordinates) only coordinates>0 are mapped, the remaining coordinates are kept as they are. new insertions are marked as -2 deletions are marked as -1 """ if len(positions)==0: return positions positions = SP.array(positions) if (MP.chrom!=chromosome) or (MP.name !=strain_name): print "updating cache for %s/%s" % (strain_name,chromosome) if not MP.cache_chromosome(strain_name, chromosome): print "Chromosome %s not found in mapping table, skipping" % (chromosome) #conduct actualy mapping Ivalid = (positions>0) Xpos = positions.copy() Xpos[Ivalid] = MP.map_coordinate(strain=positions[Ivalid]) #no insertions/deletions at all? if (Xpos.max()-Xpos.min()==Xpos.shape[0]-1): return Xpos #else: loop through and check that everything is sane pos = [Xpos[0]] #last pos is the first valid position Ivalid = (Xpos>0) if Ivalid.any(): lp = Xpos[Ivalid][0] else: #if everything deleted there is no point in tracking anything lp = 0 for p in Xpos[1::]: #copy insertions if p==-1: pos.append(p) #account for intronic things elif p==-2: pos.append(p) lp +=1 #finally, new insertions from translation: elif (p-lp<=1): pos.append(p) #update last valid position lp = p else: #insertion of some type pos.extend([-2]*(p-lp-1)) pos.append(p) lp = p pass return SP.array(pos) def main(): """Function parsing and transforming and writing sam-file""" options = parse_options(sys.argv) outfile = open(options.outfile, 'w') strain_name = options.strain_name if options.verbose: print 'Reading alignments from %s' % options.alignment print 'Working on strain: %s' % options.strain_name #create mapping object MP = mapping.MappingTool(mapping_file_name = options.trans_file) line_num=0 skip_counter = 0 for line in open(options.alignment, 'r'): line_num+=1 if line_num % 10000 == 0 and options.verbose: print '[lines processed: %i / skipped: %i' % (line_num, skip_counter) sl = line.strip().split('\t') if len(sl) < 9: print >> outfile, line, continue (size, op) = (re.split('[^0-9]', sl[5])[:-1], re.split('[0-9]*', sl[5])[1:]) try: size = [int(i) for i in size] except ValueError: print sl sys.exit(-1) ### extract all exonic positions from the cigar string start_pos = int(sl[3]) - 1 exon_range = [[]] exon_sum = 0 cigar = [""] for pp in range(len(op)): if op[pp] == 'H': continue if op[pp] == 'S': exon_sum += size[pp] continue if not op[pp] in ['N', 'D']: cigar[-1] += (size[pp]*op[pp]) elif op[pp] != 'D': cigar.append("") exon_range.append([]) seg_start = start_pos + exon_sum if op[pp] == 'I': exon_range[-1].extend([-1 for i in range(size[pp])]) elif op[pp] == 'D': exon_range[-1].extend([-2 for i in range(size[pp])]) elif op[pp] != 'N': exon_range[-1].extend(range(seg_start, seg_start + size[pp])) if op[pp] != 'I': exon_sum += size[pp] chrm = sl[2] new_exon_range = [] for exon in exon_range: new_exon_range.append(map_strain_positions(MP, strain_name, chrm, exon)) #new_exon_range.append(map_strain_positions_debug(strain_name, chrm, exon)) curr_size = 0 last_new_pos = 0 _cigar = [""] if new_exon_range[0][0] > -1: curr_code = cigar[0][0] elif new_exon_range[0][0] == -1: curr_code = 'I' elif new_exon_range[0][0] == -2: curr_code = 'D' for idx in range(len(new_exon_range)): offset = 0 for p in range(len(new_exon_range[idx])): ## transformation induced insertion if new_exon_range[idx][p] > -1: ## log insertions up to now if last_new_pos == -1: if curr_code != 'N': _cigar[-1] += (str(curr_size) + 'I') curr_size = 0 curr_code = cigar[idx][p - offset] ## log deletions up to now if last_new_pos == -2: if curr_code != 'N': _cigar[-1] += (str(curr_size) + 'D') curr_size = 0 curr_code = cigar[idx][p - offset] last_new_pos = 0 ### extending previous segment if cigar[idx][p - offset] == curr_code: curr_size += 1 ### starting new segment else: if curr_code != 'N': _cigar[-1] += (str(curr_size) + curr_code) curr_code = cigar[idx][p - offset] curr_size = 1 ### handle positions missing in the new genome elif new_exon_range[idx][p] == -1: if last_new_pos != -1 and curr_code != 'I': if curr_code != 'N': _cigar[-1] += (str(curr_size) + curr_code) curr_code = 'I' curr_size = 1 else: curr_size += 1 last_new_pos = -1 ### handle positons that are additional in the new genome elif new_exon_range[idx][p] == -2: if last_new_pos != -2 and curr_code != 'D': if curr_code != 'N': _cigar[-1] += (str(curr_size) + curr_code) curr_code = 'D' curr_size = 1 else: curr_size += 1 offset += 1 last_new_pos = -2 ### do we have to add an intron? if idx < len(new_exon_range) - 1: ### finish current exon if curr_code != 'N': _cigar[-1] += (str(curr_size) + curr_code) _cigar.append("") last_new_pos = 0 curr_size = 0 curr_code = 'N' _cigar[-1] += (str(curr_size) + curr_code) ### check if no position of the alignment could be found in traget genome merged_new_exon_range = [position for exon in new_exon_range for position in exon] if sum(merged_new_exon_range) <= -1 * len(merged_new_exon_range): if options.skip_empty_reads: skip_counter += 1 continue else: __cigar = '' for _p in _cigar: __cigar += _p sl[3] = 0 _flag = int(sl[1]) _flag -= (_flag & 16) _flag -= (_flag & 2) _flag += 4 sl[1] = str(_flag) else: ### place introns - if necessary if len(_cigar) > 1: intron_first = [] intron_last = [] ### check for valid intron start idx_start = 0 _deletions = 0 while idx_start < len(new_exon_range) - 1: for idx in range(idx_start, len(new_exon_range)): _deletions = 0 for _idx1 in range(1, len(new_exon_range[idx]) + 1): valid_intron_first = new_exon_range[idx][-_idx1] if valid_intron_first >= 0: break elif valid_intron_first == -2: _deletions += 1 if valid_intron_first >= 0: break if valid_intron_first >= 0: intron_first.append((valid_intron_first, _deletions, idx)) else: break idx2 = 0 for idx2 in range(idx + 1, len(new_exon_range)): for _idx2 in range(len(new_exon_range[idx2])): valid_intron_last = new_exon_range[idx2][_idx2] if valid_intron_last >= 0: break elif valid_intron_last == -2: _deletions += 1 if valid_intron_last >= 0: intron_last.append((valid_intron_last, _deletions, idx2)) break if idx2 > 0 and idx2 < len(new_exon_range): idx_start = idx2 else: break ### infer introns __cigar = "" # exons up to first intron for _idx in range(intron_first[0][2] + 1): __cigar += _cigar[_idx] # introns for intron in range(min(len(intron_first), len(intron_last))): intron_span = intron_last[intron][0] - intron_first[intron][0] - 1 if intron_span > 0: __cigar += (str(intron_span) + 'N') # missing exons spanned by that intron are marked as insertions at the end of that intron if intron_last[intron][2] - intron_first[intron][2] > 1: insertions = 0 for _idx in range(intron_first[intron][2] + 1, intron_last[intron][2]): insertions += (new_exon_range[_idx].tolist().count(-1)) __cigar += (str(insertions) + 'I') # following exon __cigar += _cigar[intron_last[intron][2]] # skipped exons at the end if len(intron_last) > 0: for _idx in range(intron_last[-1][2] + 1, len(_cigar)): __cigar += _cigar[_idx] else: for _idx in range(intron_first[0][2] + 1, len(_cigar)): __cigar += _cigar[_idx] else: __cigar = _cigar[0] ### transform start position current_valid_first = new_exon_range[0][0] _idx = 0 offset = 0 for ex in range(len(new_exon_range)): for p in range(len(new_exon_range[ex])): current_valid_first = new_exon_range[ex][p] if current_valid_first >= 0: break ### only count deletions elif current_valid_first == -2: offset += 1 if current_valid_first >= 0: #offset += p break #offset += len(new_exon_range[ex]) sl[3] = current_valid_first - offset if op[0] == 'S': sl[3] -= size[0] ### transform mate start position isl7 = int(sl[7]) if isl7 != 0: sl[7] = 0 #To fix this is tricky. We will take care of this strain information at a later time. #sl[7] = map_strain_positions(MP, strain_name, chrm, [isl7])[0] #sl[7] = map_strain_positions_debug(strain_name, chrm, [isl7])[0] ### prepend / append clipped regions if op[0] in ['S', 'H']: __cigar = (str(size[0]) + op[0] + __cigar) if op[-1] in ['S', 'H']: __cigar += (str(size[-1]) + op[-1]) sl[5] = __cigar _line = '' for bb in sl: _line += (str(bb) + '\t') print >> outfile, _line[:-1] outfile.close() if options.verbose: print '\nfinished processing\nwrote corrected alignments to %s' % options.outfile print '[lines processed: %i / skipped: %i' % (line_num, skip_counter) if __name__ == "__main__": #Example call #the sole relevant file is maps.hdf5 which contains all neede mapping information for the 20 strains. #sys.arv = ['transform_coord.py -a /kyb/agbs/stegle/work/20ArabStrains/AlignmentEvaluation/out/bur_0_nss.sam -o ./out/bur_0_nss_col0.sam -t /kyb/agbs/stegle/work/20ArabStrains/genome_coordinates/out/hdf5/maps.hdf5 -s bur_0' main()