#$Id: MapTileUtils.pm 16123 2009-09-17 12:57:27Z cjfields $ package Bio::Search::Tiling::MapTileUtils; use strict; use warnings; use Exporter; BEGIN { our @ISA = qw( Exporter ); our @EXPORT = qw( get_intervals_from_hsps interval_tiling decompose_interval _allowable_filters _set_attributes _mapping_coeff); } # tiling trials # assumed: intervals are [$a0, $a1], with $a0 <= $a1 =head1 NAME Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm =head1 SYNOPSIS Not used directly. =head1 DESCRIPTION Not used directly. =head1 NOTE An "interval" in this module is defined as an arrayref C<[$a0, $a1]>, where C<$a0, $a1> are scalar numbers satisfying C<$a0 E= $a1>. =head1 AUTHOR Mark A. Jensen - maj -at- fortinbras -dot- us =head1 APPENDIX =head2 interval_tiling Title : interval_tiling() Usage : @tiling = interval_tiling( \@array_of_intervals ) Function: Find minimal set of intervals covering the input set Returns : array of arrayrefs of the form ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...) Args : arrayref of intervals =cut sub interval_tiling { return unless $_[0]; # no input my $n = scalar @{$_[0]}; my %input; @input{(0..$n-1)} = @{$_[0]}; my @active = (0..$n-1); my @hold; my @tiled_ints; my @ret; while (@active) { my $tgt = $input{my $tgt_i = shift @active}; push @tiled_ints, $tgt_i; my $tgt_not_disjoint = 1; while ($tgt_not_disjoint) { $tgt_not_disjoint = 0; while (my $try_i = shift @active) { my $try = $input{$try_i}; if ( !are_disjoint($tgt, $try) ) { $tgt = min_covering_interval($tgt,$try); push @tiled_ints, $try_i; $tgt_not_disjoint = 1; } else { push @hold, $try_i; } } if (!$tgt_not_disjoint) { push @ret, [ $tgt => [@tiled_ints] ]; @tiled_ints = (); } @active = @hold; @hold = (); } } return @ret; } =head2 decompose_interval Title : decompose_interval Usage : @decomposition = decompose_interval( \@overlappers ) Function: Calculate the disjoint decomposition of a set of overlapping intervals, each annotated with a list of covering intervals Returns : array of arrayrefs of the form ( [[@interval] => [@indices_of_coverers]], ... ) Args : arrayref of intervals (arrayrefs like [$a0, $a1], with Note : Each returned interval is associated with a list of indices of the original intervals that cover that decomposition component (scalar size of this list could be called the 'coverage coefficient') Note : Coverage: each component of the decomp is completely contained in the input intervals that overlap it, by construction. Caveat : This routine expects the members of @overlappers to overlap, but doesn't check this. =cut ### what if the input intervals don't overlap?? They MUST overlap; that's ### what interval_tiling() is for. sub decompose_interval { return unless $_[0]; # no input my @ints = @{$_[0]}; my (%flat,@flat); ### this is ok, but need to handle the case where a lh and rh endpoint ### coincide... # decomposition -- # flatten: # every lh endpoint generates (lh-1, lh) # every rh endpoint generates (rh, rh+) foreach (@ints) { $flat{$$_[0]-1}++; $flat{$$_[0]}++; $flat{$$_[1]}++; $flat{$$_[1]+1}++; } # sort, create singletons if nec. my @a; @a = sort {$a<=>$b} keys %flat; # throw out first and last (meeting a boundary condition) shift @a; pop @a; # look for singletons @flat = (shift @a, shift @a); if ( $flat[1]-$flat[0] == 1 ) { @flat = ($flat[0],$flat[0], $flat[1]); } while (my $a = shift @a) { if ($a-$flat[-2]==2) { push @flat, $flat[-1]; # create singleton interval } push @flat, $a; } if ($flat[-1]-$flat[-2]==1 and @flat % 2) { push @flat, $flat[-1]; } # component intervals are consecutive pairs my @decomp; while (my $a = shift @flat) { push @decomp, [$a, shift @flat]; } # for each component, return a list of the indices of the input intervals # that cover the component. my @coverage; foreach my $i (0..$#decomp) { foreach my $j (0..$#ints) { unless (are_disjoint($decomp[$i], $ints[$j])) { if (defined $coverage[$i]) { push @{$coverage[$i]}, $j; } else { $coverage[$i] = [$j]; } } } } return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp); } =head2 are_disjoint Title : are_disjoint Usage : are_disjoint( [$a0, $a1], [$b0, $b1] ) Function: Determine if two intervals are disjoint Returns : True if the intervals are disjoint, false if they overlap Args : array of two intervals =cut sub are_disjoint { my ($int1, $int2) = @_; return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]); return 0; } =head2 min_covering_interval Title : min_covering_interval Usage : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] ) Function: Determine the minimal covering interval for two intervals Returns : an interval Args : two intervals =cut sub min_covering_interval { my ($int1, $int2) = @_; my @a = sort {$a <=> $b} (@$int1, @$int2); return [$a[0],$a[-1]]; } =head2 get_intervals_from_hsps Title : get_intervals_from_hsps Usage : @intervals = get_intervals_from_hsps($type, @hsp_objects) Function: Return array of intervals of the form [ $start, $end ], from an array of hsp objects Returns : an array of intervals Args : scalar $type, array of HSPI objects; where $type is one of 'hit', 'subject', 'query' =cut sub get_intervals_from_hsps { my $type = shift; my @hsps; if (ref($type) =~ /HSP/) { push @hsps, $type; $type = 'query'; } elsif ( !grep /^$type$/,qw( hit subject query ) ) { die "Unknown HSP type '$type'"; } $type = 'hit' if $type eq 'subject'; push @hsps, @_; my @ret; foreach (@hsps) { # push @ret, [ $_->start($type), $_->end($type)]; push @ret, [ $_->range($type) ]; } return @ret; } # fast, clear, nasty, brutish and short. # for _allowable_filters(), _set_mapping() # covers BLAST, FAST families # FASTA is ambiguous (nt or aa) based on alg name only my $alg_lookup = { 'N' => { 'mapping' => [1,1], 'def_context' => ['p_','p_'], 'has_strand' => [1, 1], 'has_frame' => [0, 0]}, 'P' => { 'mapping' => [1,1], 'def_context' => ['all','all'], 'has_strand' => [0, 0], 'has_frame' => [0, 0]}, 'X' => { 'mapping' => [3, 1], 'def_context' => [undef,'all'], 'has_strand' => [1, 0], 'has_frame' => [1, 0]}, 'Y' => { 'mapping' => [3, 1], 'def_context' => [undef,'all'], 'has_strand' => [1, 0], 'has_frame' => [1, 0]}, 'TA' => { 'mapping' => [1, 3], 'def_context' => ['all',undef], 'has_strand' => [0, 1], 'has_frame' => [0, 1]}, 'TN' => { 'mapping' => [1, 3], 'def_context' => ['p_',undef], 'has_strand' => [1,1], 'has_frame' => [0, 1]}, 'TX' => { 'mapping' => [3, 3], 'def_context' => [undef,undef], 'has_strand' => [1, 1], 'has_frame' => [1, 1]}, 'TY' => { 'mapping' => [3, 3], 'def_context' => [undef,undef], 'has_strand' => [1, 1], 'has_frame' => [1, 1]} }; =head2 _allowable_filters Title : _allowable_filters Usage : _allowable_filters($Bio_Search_Hit_HitI, $type) Function: Return the HSP filters (strand, frame) allowed, based on the reported algorithm Returns : String encoding allowable filters: s = strand, f = frame Empty string if no filters allowed undef if algorithm unrecognized Args : A Bio::Search::Hit::HitI object, scalar $type, one of 'hit', 'subject', 'query'; default is 'query' =cut sub _allowable_filters { my $hit = shift; my $type = shift; $type ||= 'q'; unless (grep /^$type$/, qw( h q s ) ) { warn("Unknown type '$type'; returning ''"); return ''; } $type = 'h' if $type eq 's'; my $alg = $hit->algorithm; # pretreat (i.e., kludge it) $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/); for ($hit->algorithm) { /MEGABLAST/i && do { return qr/[s]/; }; /(.?)BLAST(.?)/i && do { return $$alg_lookup{$1.$2}{$type}; }; /(.?)FAST(.?)/ && do { return $$alg_lookup{$1.$2}{$type}; }; do { # unrecognized last; }; } return; } =head2 _set_attributes Title : _set_attributes Usage : $tiling->_set_attributes() Function: Sets attributes for invocant that depend on algorithm name Returns : True on success Args : none Note : setting based on the configuration table %alg_lookup =cut sub _set_attributes { my $self = shift; my $alg = $self->hit->algorithm; # pretreat (i.e., kludge it) $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/); for ($alg) { /MEGABLAST/i && do { ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1); ($self->{_def_context_query},$self->{_def_context_hit}) = ('p_','p_'); ($self->{_has_frame_query},$self->{_has_frame_hit}) = (0, 0); ($self->{_has_strand_query},$self->{_has_strand_hit}) = (1, 1); last; }; /(.?)BLAST(.?)/i && do { ($self->{_mapping_query},$self->{_mapping_hit}) = @{$$alg_lookup{$1.$2}{mapping}}; ($self->{_def_context_query},$self->{_def_context_hit}) = @{$$alg_lookup{$1.$2}{def_context}}; ($self->{_has_frame_query},$self->{_has_frame_hit}) = @{$$alg_lookup{$1.$2}{has_frame}}; ($self->{_has_strand_query},$self->{_has_strand_hit}) = @{$$alg_lookup{$1.$2}{has_strand}}; last; }; /(.?)FAST(.?)/ && do { ($self->{_mapping_query},$self->{_mapping_hit}) = @{$$alg_lookup{$1.$2}{mapping}}; ($self->{_def_context_query},$self->{_def_context_hit}) = @{$$alg_lookup{$1.$2}{def_context}}; ($self->{_has_frame_query},$self->{_has_frame_hit}) = @{$$alg_lookup{$1.$2}{has_frame}}; ($self->{_has_strand_query},$self->{_has_strand_hit}) = @{$$alg_lookup{$1.$2}{has_strand}}; last; }; do { # unrecognized $self->warn("Unrecognized algorithm '$alg'; defaults may not work"); ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1); ($self->{_def_context_query},$self->{_def_context_hit}) = ('all','all'); ($self->{_has_frame_query},$self->{_has_frame_hit}) = (0,0); ($self->{_has_strand_query},$self->{_has_strand_hit}) = (0,0); return 0; last; }; } return 1; } sub _mapping_coeff { my $obj = shift; my $type = shift; my %type_i = ( 'query' => 0, 'hit' => 1 ); unless ( ref($obj) && $obj->can('algorithm') ) { $obj->warn("Object type unrecognized"); return undef; } $type ||= 'query'; unless ( grep(/^$type$/, qw( query hit subject ) ) ) { $obj->warn("Sequence type unrecognized"); return undef; } $type = 'hit' if $type eq 'subject'; my $alg = $obj->algorithm; # pretreat (i.e., kludge it) $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/); for ($alg) { /MEGABLAST/i && do { return 1; }; /(.?)BLAST(.?)/i && do { return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}]; }; /(.?)FAST(.?)/ && do { return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}]; }; do { # unrecognized last; }; } return; } 1; # need our own subsequencer for hsps. package Bio::Search::HSP::HSPI; use strict; use warnings; =head2 matches_MT Title : matches_MT Usage : $hsp->matches($type, $action, $start, $end) Purpose : Get the total number of identical or conserved matches in the query or sbjct sequence for the given HSP. Optionally can report data within a defined interval along the seq. Returns : scalar int Args : Comments : Relies on seq_str('match') to get the string of alignment symbols between the query and sbjct lines which are used for determining the number of identical and conservative matches. Note : Modeled on Bio::Search::HSP::HSPI::matches =cut sub matches_MT { my( $self, @args ) = @_; my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args); my @actions = qw( identities conserved searchutils ); # prep $type $self->throw("Type not specified") if !defined $type; $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject)); $type = 'hit' if $type eq 'subject'; # prep $action $self->throw("Action not specified") if !defined $action; $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions); my ($len_id, $len_cons); my $c = Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type); if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) { $self->throw("Both start and end are required"); } elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) { ## Get data for the whole alignment. # the reported values x mapping $self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match'); ($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved); for ($action) { $_ eq 'identities' && do { return $len_id; }; $_ eq 'conserved' && do { return $len_cons; }; $_ eq 'searchutils' && do { return ($len_id, $len_cons); }; do { $self->throw("What are YOU doing here?"); }; } } else { ## Get the substring representing the desired sub-section of aln. my($start,$stop) = $self->range($type); if ( $beg < $start or $stop < $end ) { $self->throw("Start/stop out of range [$start, $stop]"); } # handle gaps my $match_str = $self->seq_str('match'); if ($self->gaps) { # strip the homology string of gap positions relative # to the target type $match_str = $self->seq_str('match'); my $tgt = $self->seq_str($type); my $encode = $match_str ^ $tgt; my $zap = '-'^' '; $encode =~ s/$zap//g; $tgt =~ s/-//g; $match_str = $tgt ^ $encode; # match string is now the correct length for substr'ing below, # given that start and end are gapless coordinates in the # blast report } my $seq = ""; $seq = substr( $match_str, int( ($beg-$start)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ), int( 1+($end-$beg)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ) ); if(!CORE::length $seq) { $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop"); } $seq =~ s/ //g; # remove space (no info). $len_cons = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type)); $seq =~ s/\+//g; # remove '+' characters (conservative substitutions) $len_id = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type)); for ($action) { $_ eq 'identities' && do { return $len_id; }; $_ eq 'conserved' && do { return $len_cons; }; $_ eq 'searchutils' && do { return ($len_id, $len_cons); }; do { $self->throw("What are YOU doing here?"); }; } } } package Bio::Search::Tiling::MapTileUtils; # a graphical depiction of a set of intervals sub _ints_as_text { my $ints = shift; my @ints = @$ints; my %pos; for (@ints) { $pos{$$_[0]}++; $pos{$$_[1]}++; } my @pos = sort {$a<=>$b} keys %pos; @pos = map {sprintf("%03d",$_)} @pos; #header my $max=0; $max = (length > $max) ? length : $max for (@pos); for my $j (0..$max-1) { my $i = $max-1-$j; my @line = map { substr($_, $j, 1) || '0' } @pos; print join('', @line), "\n"; } print '-' x @pos, "\n"; undef %pos; @pos{map {sprintf("%d",$_)} @pos} = (0..@pos); foreach (@ints) { print ' ' x $pos{$$_[0]}, '[', ' ' x ($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x (@pos-$pos{$$_[1]}), "\n"; } } 1;