=head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =cut =head1 NAME Bio::EnsEMBL::TopLevelAssemblyMapper - Handles mapping between a given coordinate system and the toplevel pseudo coordinate system. =head1 SYNOPSIS $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); $asma = $db->get_AssemblyMapperAdaptor(); $csa = $db->get_CoordSystemAdaptor(); my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); # map to toplevel coord system for this region @chr_coords = $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, $ctg_cs ); # list toplevel seq_region_ids for this region @chr_ids = $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, $ctg_cs ); =head1 DESCRIPTION The TopLevelAssemblyMapper performs mapping between a provided coordinate system and the toplevel pseudo cooordinate system. The toplevel coordinate system is not a real coordinate system, but represents the highest coordinate system that can be mapped to in a given region. It is only possible to perform unidirectional mapping using this mapper, because it does not make sense to map from the toplevel coordinate system to another coordinate system. =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::TopLevelAssemblyMapper; use Bio::EnsEMBL::Utils::Exception qw(throw); use Bio::EnsEMBL::Mapper; use Bio::EnsEMBL::CoordSystem; use Scalar::Util qw(weaken); =head2 new Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for the database this mapper is using. Arg [2] : Toplevel CoordSystem Arg [3] : Other CoordSystem Description: Creates a new TopLevelAssemblyMapper object Returntype : Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper Exceptions : throws if any of the 3 arguments are missing/ not : of the correct type. Caller : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor Status : Stable =cut sub new { my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; my $class = ref($caller) || $caller; if(!ref($toplevel_cs)) { throw('Toplevel CoordSystem argument expected.'); } if(!ref($other_cs)) { throw('Other CoordSystem argument expected.'); } if(!$toplevel_cs->is_top_level()) { throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); } if($other_cs->is_top_level()) { throw("Other CoordSystem argument should not be toplevel CoordSystem."); } my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); my $coord_systems = $cs_adaptor->fetch_all(); my $self = bless {'coord_systems' => $coord_systems, 'toplevel_cs' => $toplevel_cs, 'other_cs' => $other_cs}, $class; $self->adaptor($adaptor); return $self; } sub adaptor { my $self = shift; weaken($self->{'adaptor'} = shift) if(@_); return $self->{'adaptor'}; } =head2 map Arg [1] : string $frm_seq_region The name of the sequence region to transform FROM Arg [2] : int $frm_start The start of the region to transform FROM Arg [3] : int $frm_end The end of the region to transform FROM Arg [4] : int $strand The strand of the region to transform FROM Arg [5] : Bio::EnsEMBL::CoordSystem The coordinate system to transform FROM Arg [6] : if set will do a fastmap Example : @coords = $mapper->map('X', 1_000_000, 2_000_000, 1, $chr_cs); Description: Transforms coordinates from one coordinate system to another. Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or Bio::EnsEMBL::Mapper:Gap objects Exceptions : thrown if if the specified TO coordinate system is not one of the coordinate systems associated with this mapper Caller : general Status : Stable =cut sub map { throw('Incorrect number of arguments.') if(@_ != 6 && @_ != 7); my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs, $fastmap) = @_; if($frm_cs->is_top_level()) { throw("The toplevel CoordSystem can only be mapped TO, not FROM."); } my @tmp; push @tmp, $frm_seq_region_name; my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; my $mapper = $self->{'mapper'}; my $toplevel_cs = $self->{'toplevel_cs'}; my $other_cs = $self->{'other_cs'}; my $adaptor = $self->adaptor; if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . " is neither the assembled nor the component coordinate system " . " of this AssemblyMapper"); } my $coord_systems = $self->{'coord_systems'}; my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); # # TBD try to make this more efficient # my $from_rank = $other_cs->rank(); foreach my $cs (@$coord_systems) { last if($cs->rank >= $from_rank); #check if a mapping path even exists to this coordinate system my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; if(@mapping_path) { # Try to map to this coord system. If we get back any coordinates then # it is our 'toplevel' that we were looking for my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); if($fastmap) { my @result = $mapper->fastmap($frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs); return @result if(@result); } else { my @coords = $mapper->map($frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs); if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { return @coords; } } } } # the toplevel coordinate system for the region requested *is* the # requested region. if($fastmap) { return ($seq_region_id,$frm_start, $frm_end, $frm_strand, $other_cs); } return Bio::EnsEMBL::Mapper::Coordinate->new ($seq_region_id,$frm_start,$frm_end, $frm_strand, $other_cs); } # # for polymorphism with AssemblyMapper # =head2 flush Args : none Example : none Description: polymorphism with AssemblyMapper, does nothing Returntype : none Exceptions : none Status : Stable =cut sub flush {} =head2 fastmap Arg [1] : string $frm_seq_region The name of the sequence region to transform FROM Arg [2] : int $frm_start The start of the region to transform FROM Arg [3] : int $frm_end The end of the region to transform FROM Arg [4] : int $strand The strand of the region to transform FROM Arg [5] : Bio::EnsEMBL::CoordSystem The coordinate system to transform FROM Example : @coords = $mapper->fastmap('X', 1_000_000, 2_000_000, 1, $chr_cs); Description: Transforms coordinates from one coordinate system to another. Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or Bio::EnsEMBL::Mapper:Gap objects Exceptions : thrown if if the specified TO coordinate system is not one of the coordinate systems associated with this mapper Caller : general Status : Stable =cut sub fastmap { my $self = shift; return $self->map(@_,1); } =head2 assembled_CoordSystem Arg [1] : none Example : $cs = $mapper->assembled_CoordSystem Description: Retrieves the assembled CoordSystem from this mapper Returntype : Bio::EnsEMBL::CoordSystem Exceptions : none Caller : internal, AssemblyMapperAdaptor Status : Stable =cut sub assembled_CoordSystem { my $self = shift; return $self->{'toplevel_cs'}; } =head2 component_CoordSystem Arg [1] : none Example : $cs = $mapper->component_CoordSystem Description: Retrieves the component CoordSystem from this mapper Returntype : Bio::EnsEMBL::CoordSystem Exceptions : none Caller : internal, AssemblyMapperAdaptor Status : Stable =cut sub component_CoordSystem { my $self = shift; return $self->{'other_cs'}; } sub _list { my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; my $mapper = $self->{'mapper'}; my $toplevel_cs = $self->{'toplevel_cs'}; my $other_cs = $self->{'other_cs'}; my $adaptor = $self->adaptor; if($frm_cs->is_top_level()) { throw("The toplevel CoordSystem can only be mapped TO, not FROM."); } if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . " is neither the assembled nor the component coordinate system " . " of this AssemblyMapper"); } my $coord_systems = $self->{'coord_systems'}; my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); # # TBD try to make this more efficient # my $from_rank = $other_cs->rank(); foreach my $cs (@$coord_systems) { last if($cs->rank >= $from_rank); #check if a mapping path even exists to this coordinate system my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; if(@mapping_path) { # Try to map to this coord system. If we get back any coordinates then # it is our 'toplevel' that we were looking for my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); my @result; my @tmp; push @tmp, $frm_seq_region_name; my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; if($seq_regions) { @result = $mapper->list_seq_regions($frm_seq_region_name, $frm_start, $frm_end, $frm_cs); } else { @result = $mapper->list_ids($frm_seq_region_name, $frm_start, $frm_end, $frm_cs); } return @result if(@result); } } # the toplevel coordinate system for the region requested *is* the return ($frm_seq_region_name); # requested region. if($seq_regions) { return ($frm_seq_region_name); } #this seems a bit silly and inefficient, but it is probably never #called anyway. my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), $frm_seq_region_name, undef,undef,undef,$other_cs); return ($slice_adaptor->get_seq_region_id($slice)); } =head2 list_seq_regions Arg [1] : string $frm_seq_region The name of the sequence region of interest Arg [2] : int $frm_start The start of the region of interest Arg [3] : int $frm_end The end of the region to transform of interest Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs The coordinate system to obtain overlapping ids of Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} Description: Retrieves a list of overlapping seq_region names of another coordinate system. This is the same as the list_ids method but uses seq_region names rather internal ids Returntype : List of strings Exceptions : none Caller : general Status : Stable =cut sub list_seq_regions { throw('Incorrect number of arguments.') if(@_ != 5); return _list(@_,1); } =head2 list_ids Arg [1] : string $frm_seq_region The name of the sequence region of interest. Arg [2] : int $frm_start The start of the region of interest Arg [3] : int $frm_end The end of the region to transform of interest Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs The coordinate system to obtain overlapping ids of Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} Description: Retrieves a list of overlapping seq_region internal identifiers of another coordinate system. This is the same as the list_seq_regions method but uses internal identfiers rather than seq_region strings Returntype : List of ints Exceptions : thrown if the from CoordSystem is the toplevel coord system thrown if the from CoordSystem is not the one used in the mapper Caller : general Status : Stable =cut sub list_ids { throw('Incorrect number of arguments.') if(@_ != 5); return _list(@_,0); } 1;