package tim_db_helper::bam;

# modules
require Exporter;
use strict;
use Carp;
use Statistics::Lite qw(mean);
use Bio::DB::Sam;


# Exported names
our @ISA = qw(Exporter);
our @EXPORT = qw(
	collect_bam_scores
	collect_bam_position_scores
);

# Hashes of opened file objects
our %OPENED_BAMFILES; # opened bam file objects
	# in empirical testing, this doesn't really seem to speed things up
	# like I thought it would
	# oh well, keep it anyway????


# The true statement
1; 


### Modules ###


### Collect Bam scores only
sub collect_bam_scores {
	
	# we will collect positioned values but
	# only return the values
	
	# grab the method from the passed arguments
	my $method = $_[3];
	
	# collect the raw data
	my %bam_data = _collect_bam_data(@_);
	
	# combine multiple datapoints at the same position
	my @values;
	if ($method eq 'length') {
		# each hash value is an array of one or more datapoints
		foreach my $position (keys %bam_data) {
			push @values, mean( @{ $bam_data{$position} } );
		}
	}
	else {
		# score (coverage) or count
		# each value is a count
		@values = values %bam_data;
	}
	
	# return collected data
	return @values;
	
}


### Collect positioned Bam scores
sub collect_bam_position_scores {
	
	# grab the method from the passed arguments
	my $method = $_[3];
	
	# collect the raw data
	my %bam_data = _collect_bam_data(@_);
	
	# combine multiple datapoints at the same position
	if ($method eq 'length') {
		# each value is an array of one or more datapoints
		# we will take the simple mean
		foreach my $position (keys %bam_data) {
			$bam_data{$position} = mean( @{$bam_data{$position}} );
		}
	}
	
	# return collected data
	return %bam_data;
}


### Actual collection of scores
sub _collect_bam_data {
	
	# pass the required information
	unless (scalar @_ >= 5) {
		croak " At least five arguments must be passed to collect Bam data!\n";
	}
	my ($region, $region_strand, $stranded, $method, @bam_features) = @_;
		# method can be score, count, or length
	
	# set up hash, either position => count or position => [scores]
	my %bam_data;
	
	# look at each bamfile
	# usually there is only one, but there may be more than one
	foreach my $feature (@bam_features) {
	
		## Get the name of the bigbed file
		my $bamfile;
		
		if ($feature =~ /^file:(.+)$/) {
			# the passed feature appears to specify a file
			$bamfile = $1;
			
			# check file
			unless (-e $bamfile) {
				croak " Bam file '$bamfile' does not exist!\n";
				return;
			}
		}
		elsif ($feature =~ /^http|ftp/i) {
			# a remote file
			
			# this should be supported by Bio::DB::Sam
			$bamfile = $feature;
		}
		else {
			# otherwise we assume the passed feature is a database object
			
			# get bedfile name
			($bamfile) = $feature->get_tag_values('bamfile');
		}
		croak " no bamfile specified!\n" unless $bamfile;
		
		
		## Open the Bam File
		my $bam;
		if (exists $OPENED_BAMFILES{$bamfile} ) {
			# this file is already opened, use it
			$bam = $OPENED_BAMFILES{$bamfile};
		}
		else {
			# this file has not been opened yet, open it
			$bam = Bio::DB::Sam->new(-bam => $bamfile) or
				croak " unable to open Bam file '$bamfile'";
			
			# store the opened object for later use
			$OPENED_BAMFILES{$bamfile} = $bam;
		}
			
		
		# Set the code to filter alignments based on strand 
		my $filter;
		if ($stranded eq 'sense' and $region_strand == 1) {
			$filter = sub {
				my $a = shift;
				return $a->strand == 1 ? 1 : 0;
			};
		}
		elsif ($stranded eq 'sense' and $region_strand == -1) {
			$filter = sub {
				my $a = shift;
				return $a->strand == -1 ? 1 : 0;
			};
		}
		elsif ($stranded eq 'antisense' and $region_strand == 1) {
			$filter = sub {
				my $a = shift;
				return $a->strand == -1 ? 1 : 0;
			};
		}
		elsif ($stranded eq 'antisense' and $region_strand == -1) {
			$filter = sub {
				my $a = shift;
				return $a->strand == 1 ? 1 : 0;
			};
		}
		else {
			# no strand requested, take all
			$filter = sub {
				return 1;
			};
		}
		
		
		## Collect the data according to the requested method
		if ($method eq 'score') {
			# collecting scores, or in this case, basepair coverage of 
			# alignments over the requested region
			
			my $coverage;
			if ($stranded eq 'sense' or $stranded eq 'antisense') {
				# Cannot currently collect stranded data with the coverage 
				# method. I will keep the filter in here anyway to future-proof 
				# in case Lincoln ever adds this support (don't hold your 
				# breath!)
				
				($coverage) = $bam->features(
					-type     => 'coverage',
					-seq_id   => $region->seq_id,
					-start    => $region->start,
					-end      => $region->end,
					-filter   => $filter,
				);
			}
			else {
				# no stranded data wanted
				($coverage) = $bam->features(
					-type     => 'coverage',
					-seq_id   => $region->seq_id,
					-start    => $region->start,
					-end      => $region->end,
				);
			}
			
			# convert the coverage data
			# by default, this should return the coverage at 1 bp resolution
			if ($coverage) {
				my @scores = $coverage->coverage;
				for (my $i = $region->start; $i <= $region->end; $i++) {
					$bam_data{$i} += shift @scores;
				}
			}
		}
		
		else {
			# either collecting counts or length
			# working with actual alignments
			
			my @alignments;
			if ($stranded eq 'sense' or $stranded eq 'antisense') {
				
				@alignments = $bam->features(
					-type     => 'match',
					-seq_id   => $region->seq_id,
					-start    => $region->start,
					-end      => $region->end,
					-filter   => $filter,
				);
			}
			else {
				# no stranded data wanted
				@alignments = $bam->features(
					-type     => 'match',
					-seq_id   => $region->seq_id,
					-start    => $region->start,
					-end      => $region->end,
				);
			}
			
			# process the alignments
			if ($method eq 'count') {
				foreach my $a (@alignments) {
					# enumerate at the alignment's midpoint
					my $position = int( ( ($a->start + $a->end) / 2) + 0.5);
					$bam_data{$position} += 1;
				}
			}
			elsif ($method eq 'length') {
				foreach my $a (@alignments) {
					# record length at the alignment's midpoint
					my $position = int( ( ($a->start + $a->end) / 2) + 0.5);
					push @{ $bam_data{$position} }, ($a->end - $a->start + 1);
				}
			}
		
		}
	}

	
	# return collected data
	return %bam_data;
}


__END__


=head1 NAME

tim_db_helper::bam

=head1 DESCRIPTION

This module is used to collect the dataset scores from a binary 
bam file (.bam). The file may be identified in one of two ways. First,
it may be referenced in the database. Typically, a single 
feature representing the dataset is present across each chromosome. The 
feature should contain an attribute ('bamfile') that references the 
location of the binary file representing the alignments. Second, 
the local location of the file may be directly passed to the subroutine. 

In either case, the file is read using the Bio::DB::Sam module, and 
the values extracted from the region of interest. 

Collected data values may be restricted to strand by specifying the desired 
strandedness, 
depending on the method of data collection. Collecting scores, or basepair 
coverage of alignments over the region of interest, does not currently support 
stranded data collection (as of this writing). However, enumerating 
alignments (count method) and collecting alignment lengths do support 
stranded data collection.

If stranded coverage is desired, the best solution is to split the bam file 
into two files according to alignment strand using the biotoolbox script 
'split_bam_by_strand.pl'. 

Currently, paired-end bam files are treated as single-end files. If paired-end 
alignments are to be analyzed, they should be processed into another format 
(BigWig or BigBed).

For loading bam files into a Bio::DB database, see the biotoolbox perl 
script 'big_filegff3.pl'.

To speed up the program and avoid repetitive opening and 
closing of the files, the opened bam file object is stored in a global 
hash in case it is needed again.

=head1 USAGE

The module requires Lincoln Stein's Bio::DB::Sam to be installed. 

Load the module at the beginning of your program.

	use tim_db_helper::bam;

It will automatically export the name of the subroutines. 

=over

=item collect_bam_scores

This subroutine will collect only the data values from a binary bam file 
for the specified database region. The positional information of the 
scores is not retained, and the values are best further processed through 
some statistical method (mean, median, etc.).

The subroutine is passed five or more arguments in the following order:
    
    1) The database object representing the genomic region of interest. 
       This should be a Bio::DB::SeqFeature object that supports the 
       start, end, and strand methods.
    2) The strand of the original feature (or region), -1, 0, or 1.
    3) A scalar value representing the desired strandedness of the data 
       to be collected. Acceptable values include "sense", "antisense", 
       "none" or "no". Only those scores which match the indicated 
       strandedness are collected.
    4) The method or type of data collected. 
       Acceptable values include 'score' (returns the basepair coverage
       of alignments over the region of interest), 'count' (returns the 
       number of alignments found at each base position in the region, 
       recorded at the alignment's midpoint), or 'length' (returns the 
       mean lengths of the alignments found at each base position in 
       the region, recorded at the alignment's midpoint). 
    5) One or more database feature objects that contain the reference 
       to the .bam file. They should contain the attribute 'bamfile' 
       which has the path to the Bam file. Alternatively, pass one 
       or more filenames of .bam files. Each filename should be 
       prefixed with 'file:' to indicate that it is a direct file 
       reference, and not a database object.

The subroutine returns an array of the defined dataset values found within 
the region of interest. 

=item collect_bam_position_scores

This subroutine will collect the score values from a binary bam file 
for the specified database region keyed by position. 

The subroutine is passed five or more arguments in the following order:
    
    1) The database object representing the genomic region of interest. 
       This should be a Bio::DB::SeqFeature object that supports the 
       start, end, and strand methods.
    2) The strand of the original feature (or region), -1, 0, or 1.
    3) A scalar value representing the desired strandedness of the data 
       to be collected. Acceptable values include "sense", "antisense", 
       "none" or "no". Only those scores which match the indicated 
       strandedness are collected.
    4) The method or type of data collected. 
       Acceptable values include 'score' (returns the basepair coverage
       of alignments over the region of interest), 'count' (returns the 
       number of alignments found at each base position in the region, 
       recorded at the alignment's midpoint), or 'length' (returns the 
       mean lengths of the alignments found at each base position in 
       the region, recorded at the alignment's midpoint). 
    5) One or more database feature objects that contain the reference 
       to the .bam file. They should contain the attribute 'bamfile' 
       which has the path to the Bam file. Alternatively, pass one 
       or more filenames of .bam files. Each filename should be 
       prefixed with 'file:' to indicate that it is a direct file 
       reference, and not a database object.

The subroutine returns a hash of the defined dataset values found within 
the region of interest keyed by position. The feature midpoint is used 
as the key position. When multiple features are found at the same 
position, a simple mean (for length data methods) or sum 
(for count methods) is returned.

=back

=head1 AUTHOR

 Timothy J. Parnell, PhD
 Howard Hughes Medical Institute
 Dept of Oncological Sciences
 Huntsman Cancer Institute
 University of Utah
 Salt Lake City, UT, 84112

This package is free software; you can redistribute it and/or modify
it under the terms of the GPL (either version 1, or at your option,
any later version) or the Artistic License 2.0.