"""
Coordinate transformation from stran -> col-0 and vice versa
This tools reads sam files and does read per read mapping.
The mapping process is devidede in two parts.

a) handling of sam files (this file)
b) mapping itself (see genome_mapping)

Written: Andre Kahles, Oliver Stegle November 2010 
"""

import sys
#import path for h5py etc., we need this (Tuebingen) on some machines where h5py is not in the native path 
#sys.path.insert(0,'/agbs/agkbshare/software/LINUX64/python/lib/python2.6/site-packages')
import genome_mapping as mapping
import re
import pdb
import scipy as SP


def parse_options(argv):

    """Parses options from the command line """

    from optparse import OptionParser, OptionGroup

    parser = OptionParser()
    required = OptionGroup(parser, 'REQUIRED')
    required.add_option('-a', '--alignment', dest='alignment', metavar='FILE', help='alignment file in sam format', default='-')
    required.add_option('-o', '--outfile', dest='outfile', metavar='PATH', help='outfile', default='-')
    required.add_option('-t', '--trans_file', dest='trans_file', metavar='FILE', help='transformation file (hdf5 file with mapping coordinates)', default='-')
    required.add_option('-s', '--strain_name', dest='strain_name', help='Name of strain in transformation file', default='-')
    optional = OptionGroup(parser, 'OPTIONAL')
    optional.add_option('-e', '--skip_empty', dest='skip_empty_reads', action='store_true', help='verbosity', default=False)
    optional.add_option('-v', '--verbose', dest='verbose', action='store_true', help='verbosity', default=False)
    parser.add_option_group(required)
    parser.add_option_group(optional)

    (options, args) = parser.parse_args()
    
    if len(argv) < 3:
        parser.print_help()
        sys.exit(2)

    return options


def  map_strain_positions_debug(strain_name, chromosome, positions):
    """Debug function, not needed"""
    #return positions
    a = 17280390
    test = []
    pdb.set_trace()
    for i in range(len(positions)):
        test.append(a)
        a += 1
        if a % 10 == 0:
            test[-1] = -1
            test.append(-2)
            test.append(-2)
    return test

def  map_strain_positions(MP,strain_name, chromosome, positions):
    """Mapping function:
    MP: mapping object for the purpose of keeping an internal mapping cache
    chromosome: chromosome string (source coordinates)
    positions : vector with positions (source coordinates)
    
    only coordinates>0 are mapped, the remaining coordinates are kept as they are.
    new insertions are marked as -2
    deletions are marked as -1
    """
    if len(positions)==0:
        return positions
    positions = SP.array(positions)
    if (MP.chrom!=chromosome) or (MP.name !=strain_name):
        print "updating cache for %s/%s" % (strain_name,chromosome)
        if not MP.cache_chromosome(strain_name, chromosome):
            print "Chromosome %s not found in mapping table, skipping" % (chromosome)
    #conduct actualy mapping
    Ivalid = (positions>0)
    Xpos   = positions.copy()
    Xpos[Ivalid] = MP.map_coordinate(strain=positions[Ivalid])      
    #no insertions/deletions at all?
    if (Xpos.max()-Xpos.min()==Xpos.shape[0]-1):
        return Xpos
    #else: loop through and check that everything is sane
    pos = [Xpos[0]]
    #last pos is the first valid position
    Ivalid = (Xpos>0)
    if Ivalid.any():
        lp  = Xpos[Ivalid][0]
    else:
        #if everything deleted there is no point in tracking anything
        lp = 0
    for p in Xpos[1::]:
        #copy insertions
        if p==-1:
            pos.append(p)
        #account for intronic things
        elif p==-2:
            pos.append(p)
            lp +=1
        #finally, new insertions from translation:
        elif (p-lp<=1):
            pos.append(p)
            #update last valid position
            lp = p
        else:
            #insertion of some type
            pos.extend([-2]*(p-lp-1))
            pos.append(p)
            lp = p
            pass
    return SP.array(pos)


def main():
    """Function parsing and transforming and writing sam-file"""

    options = parse_options(sys.argv)

    outfile = open(options.outfile, 'w')
    strain_name = options.strain_name

    if options.verbose:
        print 'Reading alignments from %s' % options.alignment
        print 'Working on strain: %s' % options.strain_name
    #create mapping object
    MP = mapping.MappingTool(mapping_file_name = options.trans_file)
    
    line_num=0
    skip_counter = 0
    for line in open(options.alignment, 'r'):
        line_num+=1
        if line_num % 10000 == 0 and options.verbose:
            print '[lines processed: %i / skipped: %i' % (line_num, skip_counter)
        sl = line.strip().split('\t')
        if len(sl) < 9:
            print >> outfile, line,
            continue
        
        (size, op) = (re.split('[^0-9]', sl[5])[:-1], re.split('[0-9]*', sl[5])[1:])
        try:
            size = [int(i) for i in size]
        except ValueError:
            print sl
            sys.exit(-1)

        ### extract all exonic positions from the cigar string
        start_pos = int(sl[3]) - 1
        exon_range = [[]]
        exon_sum = 0
        cigar = [""]
        for pp in range(len(op)):
            if op[pp] == 'H':
                continue
            if op[pp] == 'S':
                exon_sum += size[pp]
                continue

            if not op[pp] in ['N', 'D']:
                cigar[-1] += (size[pp]*op[pp])
            elif op[pp] != 'D':
                cigar.append("")
                exon_range.append([])

            seg_start = start_pos + exon_sum
            if op[pp] == 'I':
                exon_range[-1].extend([-1 for i in range(size[pp])])
            elif op[pp] == 'D':
                exon_range[-1].extend([-2 for i in range(size[pp])])
            elif op[pp] != 'N':
                exon_range[-1].extend(range(seg_start, seg_start + size[pp]))

            if op[pp] != 'I':
                exon_sum += size[pp]
            
        chrm = sl[2]

        new_exon_range = []
        for exon in exon_range:
            new_exon_range.append(map_strain_positions(MP, strain_name, chrm, exon))
            #new_exon_range.append(map_strain_positions_debug(strain_name, chrm, exon))

        curr_size = 0
        last_new_pos = 0
        _cigar = [""]
        if new_exon_range[0][0] > -1:
            curr_code = cigar[0][0]
        elif new_exon_range[0][0] == -1:
            curr_code = 'I'
        elif new_exon_range[0][0] == -2:
            curr_code = 'D'

        for idx in range(len(new_exon_range)):
            offset = 0
            for p in range(len(new_exon_range[idx])):
                ## transformation induced insertion
                if new_exon_range[idx][p] > -1:
                    ## log insertions up to now
                    if last_new_pos == -1:
                        if curr_code != 'N':
                            _cigar[-1] += (str(curr_size) + 'I')
                        curr_size = 0
                        curr_code = cigar[idx][p - offset]

                    ## log deletions up to now
                    if last_new_pos == -2:
                        if curr_code != 'N':
                            _cigar[-1] += (str(curr_size) + 'D')
                        curr_size = 0
                        curr_code = cigar[idx][p - offset]

                    last_new_pos = 0

                    ### extending previous segment
                    if cigar[idx][p - offset] == curr_code:
                        curr_size += 1
                    ### starting new segment
                    else:
                        if curr_code != 'N':
                            _cigar[-1] += (str(curr_size) + curr_code)
                        curr_code = cigar[idx][p - offset]
                        curr_size = 1
                ### handle positions missing in the new genome
                elif new_exon_range[idx][p] == -1:
                    if last_new_pos != -1 and curr_code != 'I':
                        if curr_code != 'N':
                            _cigar[-1] += (str(curr_size) + curr_code)
                        curr_code = 'I'
                        curr_size = 1
                    else:
                        curr_size += 1
                    last_new_pos = -1
                ### handle positons that are additional in the new genome
                elif new_exon_range[idx][p] == -2:
                    if last_new_pos != -2 and curr_code != 'D':
                        if curr_code != 'N':
                            _cigar[-1] += (str(curr_size) + curr_code)
                        curr_code = 'D'
                        curr_size = 1
                    else:
                        curr_size += 1
                    offset += 1
                    last_new_pos = -2

            ### do we have to add an intron?
            if idx < len(new_exon_range) - 1:
                ### finish current exon
                if curr_code != 'N':
                    _cigar[-1] += (str(curr_size) + curr_code)
                _cigar.append("")
                last_new_pos = 0
                curr_size = 0
                curr_code = 'N'
        _cigar[-1] += (str(curr_size) + curr_code)
       
        ### check if no position of the alignment could be found in traget genome
        merged_new_exon_range = [position for exon in new_exon_range for position in exon]
        if sum(merged_new_exon_range) <= -1 * len(merged_new_exon_range):
            if options.skip_empty_reads:
                skip_counter += 1
                continue
            else:
                __cigar = ''
                for _p in _cigar:
                    __cigar += _p
                sl[3] = 0
                _flag = int(sl[1])
                _flag -= (_flag & 16)
                _flag -= (_flag & 2)
                _flag += 4
                sl[1] = str(_flag)
        else:
            ### place introns - if necessary
            if len(_cigar) > 1:
                intron_first = []
                intron_last = []
                ### check for valid intron start
                idx_start = 0
                _deletions = 0
                while idx_start < len(new_exon_range) - 1:
                    for idx in range(idx_start, len(new_exon_range)):
                        _deletions = 0
                        for _idx1 in range(1, len(new_exon_range[idx]) + 1):
                            valid_intron_first = new_exon_range[idx][-_idx1]
                            if valid_intron_first >= 0:
                                break
                            elif valid_intron_first == -2:
                                _deletions += 1
                        if valid_intron_first >= 0:
                            break
                    if valid_intron_first >= 0:
                        intron_first.append((valid_intron_first, _deletions, idx))
                    else:
                        break
                    idx2 = 0
                    for idx2 in range(idx + 1, len(new_exon_range)):
                        for _idx2 in range(len(new_exon_range[idx2])):
                            valid_intron_last = new_exon_range[idx2][_idx2]
                            if valid_intron_last >= 0:
                                break
                            elif valid_intron_last == -2:
                                _deletions += 1
                        if valid_intron_last >= 0:
                            intron_last.append((valid_intron_last, _deletions, idx2))
                            break
                    if idx2 > 0 and idx2 < len(new_exon_range):
                        idx_start = idx2
                    else:
                        break
     
                ### infer introns
                __cigar = ""
                # exons up to first intron
                for _idx in range(intron_first[0][2] + 1):
                    __cigar += _cigar[_idx]
                    
                # introns
                for intron in range(min(len(intron_first), len(intron_last))):
                    intron_span = intron_last[intron][0] - intron_first[intron][0] - 1
                    if intron_span > 0:
                        __cigar += (str(intron_span) + 'N')
                    # missing exons spanned by that intron are marked as insertions at the end of that intron
                    if intron_last[intron][2] - intron_first[intron][2] > 1:
                        insertions = 0
                        for _idx in range(intron_first[intron][2] + 1, intron_last[intron][2]):
                            insertions += (new_exon_range[_idx].tolist().count(-1))
                        __cigar += (str(insertions) + 'I')
                    # following exon
                    __cigar += _cigar[intron_last[intron][2]]
                # skipped exons at the end
                if len(intron_last) > 0:
                    for _idx in range(intron_last[-1][2] + 1, len(_cigar)):
                        __cigar += _cigar[_idx]
                else:
                    for _idx in range(intron_first[0][2] + 1, len(_cigar)):
                        __cigar += _cigar[_idx]
            else:
                __cigar = _cigar[0]
     
            ### transform start position
            current_valid_first = new_exon_range[0][0]
            _idx = 0
            offset = 0
            for ex in range(len(new_exon_range)):
                for p in range(len(new_exon_range[ex])):
                    current_valid_first = new_exon_range[ex][p]
                    if current_valid_first >= 0:
                        break
                    ### only count deletions
                    elif current_valid_first == -2:
                        offset += 1
                if current_valid_first >= 0:
                    #offset += p
                    break
                #offset += len(new_exon_range[ex])
                

            sl[3] = current_valid_first - offset
            if op[0] == 'S':
                sl[3] -= size[0]

            ### transform mate start position
            isl7 = int(sl[7])
            if isl7 != 0:
                sl[7] = 0
               #To fix this is tricky. We will take care of this strain information at a later time. 
               #sl[7] = map_strain_positions(MP, strain_name, chrm, [isl7])[0] 
               #sl[7] = map_strain_positions_debug(strain_name, chrm, [isl7])[0] 

        ### prepend / append clipped regions
        if op[0] in ['S', 'H']:
            __cigar = (str(size[0]) + op[0] + __cigar)
        if op[-1] in ['S', 'H']:
            __cigar += (str(size[-1]) + op[-1])

        sl[5] = __cigar

        _line = ''
        for bb in sl:
            _line += (str(bb) + '\t')

        print >> outfile, _line[:-1]

    outfile.close()

    if options.verbose:
        print '\nfinished processing\nwrote corrected alignments to %s' % options.outfile
        print '[lines processed: %i / skipped: %i' % (line_num, skip_counter)
    
if __name__ == "__main__":
    #Example call
    #the sole relevant file is maps.hdf5 which contains all neede mapping information for the 20 strains.
    #sys.arv = ['transform_coord.py -a /kyb/agbs/stegle/work/20ArabStrains/AlignmentEvaluation/out/bur_0_nss.sam -o ./out/bur_0_nss_col0.sam -t /kyb/agbs/stegle/work/20ArabStrains/genome_coordinates/out/hdf5/maps.hdf5 -s bur_0'
    main()