=pod

=head1 LICENSE

  Copyright (c) 1999-2013 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 NAME

GFFParser - simple gff3 parser.


=head1 SYNOPSIS

use strict;
use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
use IO::File;

my $file_name = "features.gff";
my $fh = IO::File->new($file_name, 'r');
my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);

my @header_lines = @{$parser->parse_header()};
#do something with the header lines array, e.g. print array elements

foreach my $header_line (@header_lines) {
    print $header_line . "\n";
}
print "\n\n";
my $feature = $parser->parse_next_feature();

while (defined($feature) ) {

    my %feature = %{$feature};

    #do something with the feature, e.g. print hash keys and values 
    foreach my $key (keys %feature) {
    if ($key ne 'attribute') {
        print $key . " " . $feature{$key} ."\n";
    } else {
        print $key . "\n";
        my %attribs =  %{$feature{$key}};
        foreach my $attrib_key (keys %attribs) {
        printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($attribs{$attrib_key})}));

        }
    }
    }
    print "\n\n";
    $feature = $parser->parse_next_feature();
}

my $sequence = $parser->parse_next_sequence();

while (defined($sequence)) {
    my %sequence = %{$sequence};

    foreach my $key (keys %sequence) {      
        print $key . " " . $sequence{$key} ."\n";
    }
    print "\n\n";   

    $sequence = $parser->parse_next_sequence();
}

$parser->close();

$fh->close();


=head1 DESCRIPTION

GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml

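Use the parse_header method to parse the GFF3 file header, parse_next_feature to parse the next feature line in the file, and parse_next_sequence to read the next FASTA record from the sequence section that follows a ##FASTA directive.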

This class can be extended to convert a feature hash into a feature object reversing
the processing done by GFFSerializer.

=cut

package Bio::EnsEMBL::Utils::IO::GFFParser;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use IO::File;
use URI::Escape;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;


# Lookup table translating the GFF3 strand column into the strand value
# stored in a parsed feature. Symbols that are not listed here (for
# example '.') have no entry, so looking them up yields undef.
my %strand_conversion = (
    '+' => '1',
    '-' => '-1',
    '?' => '0',
);

=head2 new

    Constructor
    Arg [1]    : File handle to read GFF3 formatted text from
    Description: Creates a parser that keeps the given file handle for all
                 later reads.
    Returntype : Bio::EnsEMBL::Utils::IO::GFFParser
    Exceptions : Thrown if no (defined) file handle is supplied

=cut

sub new {
    my ( $class, $filehandle ) = @_;

    # The handle is only checked for definedness here.
    throw("GFFParser requires a valid filehandle to a GFF3 formatted file")
        if !defined $filehandle;

    return bless { filehandle => $filehandle }, $class;
}

=head2 parse_header

    Arg [1]    : None (reads from the file handle given to the constructor)
    Description: Reads the leading run of lines that start with '#' or with
                 whitespace and returns an arrayref with each header line
                 (a line starting with '##') stored in an array element.
                 Other lines in that run are skipped. Reading stops at a
                 ##FASTA directive, at the first line outside the run, or at
                 end of input. The line that stopped the scan is stored in
                 first_non_header_line so that the next read returns it
                 again. A warning is issued when a gff-version directive
                 names a version other than 3.
    Returntype : Arrayref of GFF3 file header lines

=cut

sub parse_header {

    my $self = shift;

    my @header_lines;
    my $next_line;

    # '|' is not part of the character class: only '#' and whitespace mark
    # a line as belonging to the header region.
    while ( ( $next_line = $self->_read_line() ) && $next_line =~ /^[\#\s]/ ) {

        # The ##FASTA directive starts the sequence section. The match is
        # anchored so that a comment that merely mentions ##FASTA does not
        # end the header early.
        last if $next_line =~ /^\#\#FASTA/;

        # Only lines starting with '##' are returned as header lines.
        next unless $next_line =~ /^\#{2}/;

        push @header_lines, $next_line;

        if ( $next_line =~ /gff-version\s+(\d+)/ && $1 != 3 ) {
            warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
        }
    }

    # Keep the line that ended the header so it is not lost to the caller
    # of the next read.
    if ( defined($next_line) ) {
        $self->{'first_non_header_line'} = $next_line;
    }

    return \@header_lines;
}

=head2 parse_next_feature

    Arg [1]    : None (reads from the file handle given to the constructor)
    Description: Skips comment lines (starting with '#'), blank lines and
                 lines starting with '//', then parses the next line as a
                 tab-separated feature and returns a hashref in the format -
                 {
                   seqid => scalar,
                   source => scalar,
                   type => scalar,
                   start => scalar,
                   end => scalar,
                   score => scalar,
                   strand => scalar,
                   phase => scalar,
                   attribute => hashref,
                 }
                 seqid, source, type and the attribute names and values are
                 URI-unescaped. strand is 1, 0 or -1 for '+', '?' and '-',
                 and undef for any other symbol. An attribute with several
                 comma-separated values is stored as an arrayref, otherwise
                 as a scalar. The attribute key is only present when the
                 ninth column holds attributes (it is absent for an empty
                 column or the '.' placeholder). Everything after the first
                 '#' on the line is discarded as a trailing comment.
    Returntype : Hashref of a GFF3 feature line, or undef once a ##FASTA
                 directive or the end of the input is reached

=cut

sub parse_next_feature {

    my $self = shift;

    my $feature_line;

    while ( defined( my $candidate = $self->_read_line() ) ) {

        # The ##FASTA directive ends the feature section. Anchored so that
        # a feature line that merely contains the text is still parsed.
        last if $candidate =~ /^\#\#FASTA/;

        next if $candidate =~ /^\#/
             || $candidate =~ /^\s*$/
             || $candidate =~ /^\/\//;

        $feature_line = $candidate;
        last;
    }

    return undef unless $feature_line;

    #strip off trailing comments
    $feature_line =~ s/\#.*//;

    my @chunks = split( /\t/, $feature_line );

    # A truncated line has no strand column; avoid using undef as a hash key.
    my $strand_symbol = $chunks[6];
    my $strand;
    if ( defined $strand_symbol ) {
        $strand = $strand_conversion{$strand_symbol};
    }

    my %feature = (
        'seqid'  => uri_unescape( $chunks[0] ),
        'source' => uri_unescape( $chunks[1] ),
        'type'   => uri_unescape( $chunks[2] ),
        'start'  => $chunks[3],
        'end'    => $chunks[4],
        'score'  => $chunks[5],
        'strand' => $strand,
        'phase'  => $chunks[7],
    );

    # '.' is the GFF3 placeholder for an undefined column, not an attribute.
    my $attribute_column = $chunks[8];
    if ( $attribute_column && $attribute_column ne '.' ) {
        my %attributes;

        foreach my $pair ( split( /;/, $attribute_column ) ) {

            # A doubled or padded ';' leaves an empty fragment behind.
            next if $pair =~ /^\s*$/;

            # Limit of 2: only the first '=' separates tag from value, so a
            # value that itself contains '=' is kept whole.
            my ( $name, $value ) = split( /=/, $pair, 2 );
            $name = uri_unescape($name);

            # A tag without '=' has no value and is stored as undef.
            my @split_values;
            if ( defined $value ) {
                @split_values = map { uri_unescape($_) } split( /,/, $value );
            }

            if ( scalar(@split_values) > 1 ) {
                $attributes{$name} = \@split_values;
            }
            else {
                $attributes{$name} = $split_values[0];
            }
        }

        $feature{'attribute'} = \%attributes;
    }

    return \%feature;
}

=head2 parse_next_sequence

    Arg [1]    : None (reads from the file handle given to the constructor)
    Description: Collects the next FASTA record and returns a hashref in
                 the format -
                 {
                   header => scalar,
                   sequence => scalar,
                 }
                 header is the first line starting with '>' and sequence is
                 the concatenation of the lines read up to the following
                 '>' line, which is stored in next_fasta_header for the
                 next call. Lines starting with '#' or '//' and blank lines
                 are skipped.
    Returntype : Hashref of a GFF3 sequence record, or undef when neither a
                 header nor any sequence could be read

=cut

sub parse_next_sequence {

    my ($self) = @_;

    my ( $fasta_header, $residues );

    LINE:
    while ( my $line = $self->_read_line() ) {

        # Comments, blank lines and '//' lines carry no sequence data.
        next LINE
            if $line =~ /^\#/
            or $line =~ /^\s*$/
            or $line =~ /^\/\//;

        # Anything that is not a '>' line extends the current sequence.
        if ( $line !~ /^>/ ) {
            $residues .= $line;
            next LINE;
        }

        # First '>' line seen: it names the record being collected.
        if ( !$fasta_header ) {
            $fasta_header = $line;
            next LINE;
        }

        # A second '>' line belongs to the following record; remember it
        # for the next call and stop.
        $self->{'next_fasta_header'} = $line;
        last LINE;
    }

    return undef if !$residues && !$fasta_header;

    return { header => $fasta_header, sequence => $residues };
}


# Returns the next line for the parsing methods, without its trailing
# newline, or undef at end of input.
#
# A line stored in first_non_header_line or next_fasta_header is returned
# (and cleared) before anything new is read from the file handle, in that
# order of priority.
#
# Blank lines are skipped with a loop rather than by recursion, so a long
# run of blank lines neither deepens the call stack nor triggers Perl's
# "Deep recursion" warning.
sub _read_line {

    my $self = shift;

    foreach my $slot ( 'first_non_header_line', 'next_fasta_header' ) {
        next unless defined $self->{$slot};

        my $buffered = $self->{$slot};
        $self->{$slot} = undef;
        return $buffered;
    }

    my $fh = $self->{'filehandle'};

    while ( defined( my $line = <$fh> ) ) {
        chomp $line;

        # Skip empty lines. The truth test is deliberate: callers treat a
        # false line as end of input, so a line holding just "0" is
        # skipped as well instead of being returned.
        return $line if $line;
    }

    return undef;
}

=head2 close

    Arg [1]    : None
    Description: Releases the parser's reference to the file handle by
                 setting the stored handle to undef. The handle itself is
                 not closed by this method.

=cut

sub close {

    my ($self) = @_;

    # Only the parser's own reference is dropped here.
    $self->{'filehandle'} = undef;

}

1;