=pod

=head1 LICENSE

  Copyright (c) 1999-2013 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 NAME

Bio::EnsEMBL::Utils::IO::GTFSerializer - Transcript to GTF converter

=head1 SYNOPSIS

use Bio::EnsEMBL::Utils::IO::GTFSerializer;

my $serializer = Bio::EnsEMBL::Utils::IO::GTFSerializer->new($output_fh);

=head1 DESCRIPTION

Subclass of Serializer that can turn a transcript into a series of lines 
for the GTF format.

=cut

package Bio::EnsEMBL::Utils::IO::GTFSerializer;

use strict;
use warnings;

use Bio::Tools::CodonTable;
use Bio::EnsEMBL::SeqFeature;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Utils::IO::FeatureSerializer;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;

use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer);

my %strand_conversion = ( '1' => '+', '0' => '.', '-1' => '-');

=head2 print_feature

    Arg [1]    : Bio::EnsEMBL::Transcript
    Example    : $serializer->print_feature($transcript)
    Description: 
    Returntype : none

=cut

sub print_feature {
  my $self = shift;
  my $transcript = shift;

  throw(sprintf "Feature is of type %s. Cannot write non transcripts to GTF", ref($transcript))
    unless check_ref($transcript, "Bio::EnsEMBL::Transcript");

  # filehandle is inherited
  my $fh = $self->{'filehandle'};

  my $slice = $transcript->slice();
  my $sliceoffset = $slice->start-1;
  my $idstr = $slice->seq_region_name; 

  my @startcs =  $self->_make_start_codon_features($transcript);
  my @endcs   =  $self->_make_stop_codon_features($transcript);
  my ($hasstart, $hasend) = $self->_check_start_and_stop($transcript);

  my $dbname = $transcript->adaptor()->dbc()->dbname();
  my $vegadb = $dbname =~ /vega/;
  my $gene = $transcript->get_Gene();

  my ($biotype_display, $transcript_biotype);
  {
    no warnings 'uninitialized';
    $biotype_display = $vegadb ? $gene->status . '_' . $gene->biotype : $gene->biotype;
    $transcript_biotype = $vegadb ? $transcript->status . '_' . $transcript->biotype : $transcript->biotype;
  }

  my @translateable_exons;
  @translateable_exons = 
    @{$transcript->get_all_translateable_Exons} 
      if $transcript->translation;

  my ($count, $intrans, $instop) = (1, 0, 0);

  foreach my $exon (@{$transcript->get_all_Exons}) {
    my $strand = $strand_conversion{$exon->strand};

    print $fh 
      # Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below
      $idstr . "\t" . 
      # Column 2 - source
      $transcript_biotype . "\t" . 
      # Column 3 - feature type name
      'exon' . "\t" . 
      # Column 4 - start, the start coordinate of the feature
      ($exon->start+$sliceoffset) . "\t". 
      # Column 5 - end, coordinates (absolute) for the end of this feature
      ($exon->end+$sliceoffset) . "\t". 
      # Column 6 - score, for variations only
      "." . "\t". 
      # Column 7 - strand, forward (+) or reverse (-)
      $strand . "\t". 
      # Column 8 - frame (reading phase), what base of this feature is the first base of a codon
      "." . "\t";
    # Column 9 - attribute, a ;-separated list of key-value pairs (additional feature info)
    $self->_print_attribs($gene, $biotype_display, $transcript, $count, 'exon', $exon, $vegadb);
    print $fh "\n";

    $intrans = 1 
      if $transcript->translation and 
	$exon == $transcript->translation->start_Exon;

    if ($intrans) {
      # print the CDS of this exon

      my $cdsexon = shift @translateable_exons;
      #
      # Here is computing the value of the GTF frame (taking into
      # account the Ensembl convention), but it's misleadingly called phase
      #
      my $phase = $cdsexon->phase;
      if ($cdsexon->phase == 1) {
        $phase = 2;
      } elsif ($cdsexon->phase == 2) {
        $phase = 1;
      } elsif ($cdsexon->phase == -1) {
        $phase = 0;
      }

      my $exon_start = $cdsexon->start;
      my $exon_end   = $cdsexon->end;
      if ($transcript->translation && 
          $hasend && 
          ($exon->end >= $endcs[0]->start && $exon->start <= $endcs[0]->end)) {

        if ($cdsexon->strand == 1) {
          $exon_end = $cdsexon->end - $endcs[0]->length;
        } else {
          $exon_start = $cdsexon->start + $endcs[0]->length;
        }
      }

      if ($exon_start <= $cdsexon->end &&
          $exon_end >= $cdsexon->start &&
          !$instop) {
        print $fh $idstr . "\t" . 
	  $transcript_biotype . "\t" . 
          'CDS' . "\t" . 
          ($exon_start + $sliceoffset) . "\t". 
          ($exon_end + $sliceoffset) . "\t". 
          "." . "\t". 
          $strand . "\t". 
          $phase . "\t";
        $self->_print_attribs($gene, $biotype_display, $transcript, $count, 'CDS');
        print $fh "\n";
      }
    }
    if ($transcript->translation && 
        $exon == $transcript->translation->start_Exon && $hasstart) {
      my $tmpcnt = $count;
      foreach my $startc (@startcs) {
	# here we should check the start codon covers 3 bases	
        print $fh $idstr . "\t" . 
	  $transcript_biotype . "\t" . 
          'start_codon' . "\t" . 
          ($startc->start+$sliceoffset) . "\t". 
          ($startc->end+$sliceoffset) . "\t". 
          "." . "\t". 
          $strand . "\t". 
          $startc->phase . "\t";

        $self->_print_attribs($gene, $biotype_display, $transcript, $tmpcnt++, 'start_codon');
        print $fh "\n";
      }
    }
    if ($transcript->translation && 
        ($exon == $transcript->translation->end_Exon)) {
      if ($hasend) {
        my $tmpcnt = $count - $#endcs;

        foreach my $endc (@endcs) {
	  # here we should check the stop codon covers 3 bases
          print $fh $idstr . "\t" . 
	    $transcript_biotype . "\t" . 
            'stop_codon' . "\t" . 
            ($endc->start+$sliceoffset) . "\t". 
            ($endc->end+$sliceoffset) . "\t". 
            "." . "\t". 
            $strand . "\t". 
            $endc->phase . "\t";

          $self->_print_attribs($gene, $biotype_display, $transcript, $tmpcnt++, 'stop_codon');
          print $fh "\n";
        }
      }
      $intrans = 0;
    }

    if (scalar(@endcs) && 
        ($exon->end >= $endcs[0]->start && $exon->start <= $endcs[0]->end)) {
      $instop = 1;
    }

    $count++;
  }

}

=head2 _print_attribs

    Arg []     : 
    Example    : 
    Description: 
    Returntype : None

=cut

sub _print_attribs {
  my ($self, $gene, $gene_biotype, $transcript, $count, $type, $exon, $vegadb) = @_;

  my $gene_name;
  $gene_name = $gene->external_name;
  $gene_name =~ s/^[A-Z]{1,3}:// if $vegadb;

  my $trans_name;
  $trans_name = $transcript->external_name;
  $trans_name =~ s/^[A-Z]{1,3}:// if $vegadb;

  my $fh = $self->{'filehandle'};

  print $fh "gene_id \"" .  get_id_from_obj($gene) . "\";" .
            " transcript_id \"" . get_id_from_obj($transcript) . "\";";
  print $fh " exon_number \"$count\";";
  print $fh " gene_name \"" . $gene_name . "\";" if ($gene_name);
  print $fh " gene_biotype \"" . $gene_biotype ."\";";
  print $fh " transcript_name \"" . $trans_name . "\";" if ($trans_name);
  if ($type eq 'CDS') {
    print $fh ' protein_id "' . get_id_from_obj($transcript->translation) . '";';
  }
  if($exon) {
    printf $fh ' exon_id "%s";', get_id_from_obj($exon);
  }
  return;
}

sub get_id_from_obj {
  my ($obj) = @_;
  my $id = $obj->stable_id();
  $id = $obj->dbID() unless defined $id;
  return $id;
}

=head2 _make_start_codon_features

    Arg [1]    : Bio::EnsEMBL::Transcript
    Example    : 
    Description: 
    Returntype : Array

=cut

sub _make_start_codon_features {
  my ($self, $trans) = @_;
  defined $trans or
    throw("Transcript object not defined");

  return (()) unless $trans->translation;

  my @pepgencoords = $trans->pep2genomic(1,1);

  # cdna can come padded these days so allow gap at the start
  if($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Gap')){
    shift @pepgencoords;
  }

  if(scalar(@pepgencoords) > 3) {
    throw(sprintf "Pep start for transcript %s does not map cleanly", $trans->display_id);
  }

  unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    throw(sprintf "Pep start for transcript %s maps to gap", $trans->display_id);
  }
  unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    throw(sprintf "Pep start (end of) for transcript %s maps to gap", $trans->display_id);
  }

  my @translateable = @{$trans->get_all_translateable_Exons};
  my @startc_feat;
  my $phase = 0;
  foreach my $pepgencoord (@pepgencoords) {
    push @startc_feat, 
      new Bio::EnsEMBL::SeqFeature(-seqname => $trans->stable_id,
				   -source_tag => 'starttrans',
				   -primary_tag => 'similarity',
				   -start => $pepgencoord->start,
				   -end   => $pepgencoord->end,
				   -phase => $phase,
				   -strand => $translateable[0]->strand);
    $phase = 3 - ($pepgencoord->end - $pepgencoord->start + 1);
  }
  if ($translateable[0]->strand == 1) {
    @startc_feat = sort {$a->start <=> $b->start } @startc_feat;
  } else {
    @startc_feat = sort {$b->start <=> $a->start } @startc_feat;
  }

  return @startc_feat;

}

=head2 _make_stop_codon_features

    Arg [1]    : Bio::EnsEMBL::Transcript
    Example    : 
    Description: 
    Returntype : Array

=cut

sub _make_stop_codon_features {
  my ($self, $trans) = @_;
  defined $trans or
    throw("Transcript object not defined");

  return (()) unless $trans->translation;

  my @translateable = @{$trans->get_all_translateable_Exons};

  my $cdna_endpos = $trans->cdna_coding_end;

  my @pepgencoords = $trans->cdna2genomic($cdna_endpos-2,$cdna_endpos);

  if(scalar(@pepgencoords) > 3) {
    throw(sprintf "Pep end for transcript %s does not map cleanly", $trans->display_id);
  }
  unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    throw(sprintf "Pep end for transcript %s maps to gap", $trans->display_id);
  }
  unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    throw(sprintf "Pep end (end of) for transcript %s maps to gap", $trans->display_id);
  }

  my @stopc_feat;
  my $phase = 0;
  foreach my $pepgencoord (@pepgencoords) {
    push @stopc_feat, 
      new Bio::EnsEMBL::SeqFeature(-seqname => $trans->display_id,
				   -source_tag => 'endtrans',
				   -primary_tag => 'similarity',
				   -start => $pepgencoord->start,
				   -end   => $pepgencoord->end,
				   -phase => $phase,
				   -strand => $translateable[0]->strand);
    $phase = 3 - ($pepgencoord->end-$pepgencoord->start+1);
  }

  if ($translateable[0]->strand == 1) {
    @stopc_feat = sort {$a->start <=> $b->start } @stopc_feat;
  } else {
    @stopc_feat = sort {$b->start <=> $a->start } @stopc_feat;
  }

  return @stopc_feat;

}

=head2 _check_start_and_stop

    Arg [1]    : Bio::EnsEMBL::Transcript
    Example    : 
    Description: 
    Returntype : Array

=cut

sub _check_start_and_stop {
  my ($self, ,$trans) = @_;

  return (0,0) unless defined $trans->translation;
  my ($has_start, $has_end);

  # transcript could be annotated has having incomplete
  # CDS at either 5', 3' end or both
  my @attrib = @{$trans->get_all_Attributes('cds_start_NF')};
  $has_start = (scalar @attrib == 1 and $attrib[0]->value() == 1)?0:1;
  @attrib = @{$trans->get_all_Attributes('cds_end_NF')};
  $has_end = (scalar @attrib == 1 and $attrib[0]->value() == 1)?0:1;
  return (0, 0) unless $has_start and $has_end;

  #
  # even if the transcript is not annotated with incomplete start/end 
  # CDS, we need to test whether the extracted start/stop codons are valid
  #
  # use translateable_seq (CDS) instead of spliced_seq (CDNA) which is
  # not padded for non-triplet issues
  #
  # $has_start = $has_end = 1;
  # my $coding_start = $trans->cdna_coding_start;
  # my $coding_end = $trans->cdna_coding_end;
  # my $cdna_seq = uc($trans->spliced_seq);
  # my $startseq = substr($cdna_seq, $coding_start-1, 3);
  # my $endseq = substr($cdna_seq, $coding_end-3, 3);
  #
  my $cds_seq = uc($trans->translateable_seq); 
  my $startseq = substr($cds_seq, 0, 3); 
  my $endseq = substr($cds_seq, -3); 

  # reimplemented since there are alternatively valid codon tables
  # $has_start = 0  if ($startseq ne "ATG");
  # $has_end = 0 if ($endseq ne "TAG" && $endseq ne "TGA" && $endseq ne "TAA");
  
  my ($attrib) = @{ $trans->slice()->get_all_Attributes('codon_table') };

  my $codon_table_id;
  $codon_table_id = $attrib->value()
    if defined $attrib;
  $codon_table_id ||= 1; # default codon table (vertebrate)
  my $codon_table = Bio::Tools::CodonTable->new( -id => $codon_table_id );

  $has_start = 0 
    unless $codon_table->is_start_codon($startseq);
  $has_end = 0 
    unless $codon_table->is_ter_codon($endseq);
  
  return ($has_start, $has_end);

}

1;