=head1 LICENSE Copyright (c) 1999-2013 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/legal/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at . Questions may also be sent to the Ensembl help desk at . =cut # # Ensembl module for Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor # # Copyright (c) 2005 Ensembl # # You may distribute this module under the same terms as perl itself # # =head1 NAME Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor =head1 SYNOPSIS Adaptor for IndividualGenotype objects. =head1 DESCRIPTION This adaptor provides database connectivity for IndividualGenotype objects. IndividualGenotypes may be retrieved from the Ensembl variation database by several means using this module. =head1 METHODS =cut package Bio::EnsEMBL::Variation::DBSQL::IndividualGenotypeAdaptor; use strict; use warnings; use vars qw(@ISA); use Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor; use Bio::EnsEMBL::Variation::IndividualGenotype; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); use Scalar::Util qw(weaken); our $CACHE_SIZE = 5; @ISA = ('Bio::EnsEMBL::Variation::DBSQL::BaseGenotypeAdaptor'); # stores a listref of individual genotype objects sub store { my ($self, $gts, $merge) = @_; throw("First argument to store is not a listref") unless ref($gts) eq 'ARRAY'; # sort genotypes into rows by variation my %by_var; push @{$by_var{$_->variation->dbID.'_'.($_->{subsnp} ? $_->{subsnp} : '')}}, $_ for @$gts; # get unique genotypes and codes my %unique_gts = map {$_->genotype_string().'|'.(defined($_->phased) ? $_->phased : 'NULL') => 1} @$gts; foreach my $gt_key(keys %unique_gts) { my @split = split /\|/, $gt_key; my $phased = pop @split; $unique_gts{$gt_key} = $self->_genotype_code(\@split, $phased); } # get variation objects my %var_objs = map {$_->variation->dbID => $_->variation} @$gts; my $dbh = $self->dbc->db_handle; my $sth = $dbh->prepare_cached(qq{ INSERT INTO compressed_genotype_var( variation_id, subsnp_id, genotypes ) VALUES (?,?,?) }); my $update_sth = $dbh->prepare(qq{ UPDATE compressed_genotype_var SET genotypes = ? WHERE variation_id = ? AND subsnp_id = ? }); my $rows_added = 0; foreach my $combo_id(keys %by_var) { my $genotype_string = ''; my ($var_id, $subsnp_id) = split /\_/, $combo_id; $subsnp_id = undef if $subsnp_id eq ''; # check for existing genotypes my @existing_gts = ($merge ? grep { (defined($_->{subsnp}) && defined($subsnp_id) && $_->{subsnp} eq $subsnp_id) || (!defined($_->{subsnp}) && !defined($subsnp_id)) } @{$self->fetch_all_by_Variation($var_objs{$var_id})} : ()); # update if existing if(@existing_gts) { # refresh unique_gts %unique_gts = map {$_->genotype_string().'|'.(defined($_->phased) ? $_->phased : 'NULL') => 1} (@existing_gts, @$gts); foreach my $gt_key(keys %unique_gts) { my @split = split /\|/, $gt_key; my $phased = pop @split; $unique_gts{$gt_key} = $self->_genotype_code(\@split, $phased); } # make sure we don't put in duplicates my %by_ind = map {$_->individual->dbID => $_} (@existing_gts, @$gts); $genotype_string .= pack("ww", $_->individual->dbID, $unique_gts{$_->genotype_string.'|'.(defined($_->phased) ? $_->phased : 'NULL')}) for values %by_ind; $update_sth->execute( $genotype_string, $var_id, $subsnp_id ); } else { $genotype_string .= pack("ww", $_->individual->dbID, $unique_gts{$_->genotype_string.'|'.(defined($_->phased) ? $_->phased : 'NULL')}) for @$gts; $sth->execute( $var_id, $subsnp_id, $genotype_string ); $rows_added++; } } $sth->finish; $update_sth->finish; return $rows_added; } # similar to store above but writes to old-style non-compressed table # defaults to individual_genotype_multiple_bp since # tmp_individual_genotype_single_bp will only accept single nucleotide genotypes sub store_uncompressed { my ($self, $gts, $table) = @_; $table ||= 'individual_genotype_multiple_bp'; throw("First argument to store is not a listref") unless ref($gts) eq 'ARRAY'; my $dbh = $self->dbc->db_handle; my $q_string = join ",", map {'(?,?,?,?,?)'} @$gts; my @args = map { $_->{_variation_id} || $_->variation->dbID, $_->{subsnp}, $_->allele1, $_->allele2, $_->individual ? $_->individual->dbID : undef, } @$gts; my $sth = $dbh->prepare_cached(qq{ INSERT INTO $table( variation_id, subsnp_id, allele_1, allele_2, individual_id ) VALUES $q_string }); $sth->execute(@args); $sth->finish; return scalar @$gts; } =head2 fetch_all_by_Variation Arg [1] : Bio::EnsEMBL::Variation $variation Example : my $var = $variation_adaptor->fetch_by_name( "rs1121" ) $igtypes = $igtype_adaptor->fetch_all_by_Variation( $var ) Description: Retrieves a list of individual genotypes for the given Variation. If none are available an empty listref is returned. Returntype : listref Bio::EnsEMBL::Variation::IndividualGenotype Exceptions : none Caller : general Status : Stable =cut sub fetch_all_by_Variation { my $self = shift; my $variation = shift; my $sample = shift; if(!ref($variation) || !$variation->isa('Bio::EnsEMBL::Variation::Variation')) { throw('Bio::EnsEMBL::Variation::Variation argument expected'); } if(!defined($variation->dbID())) { warning("Cannot retrieve genotypes for variation without dbID"); return []; } my $results; # check cache # All ind gt objects for a variation are stored on the adaptor. # They are stored in an array of limited size so that the memory usage # doesn't go out of control for when the API requests genotypes for many # variations sequentially. my $cached; if(defined($self->{_cache})) { foreach my $stored(@{$self->{_cache}}) { my @keys = keys %{$stored}; $cached = $stored->{$keys[0]} if $keys[0] eq $variation->dbID; last if defined($cached); } } if(!defined($cached)) { $cached = $self->generic_fetch("g.variation_id = " . $variation->dbID()); for(@$cached) { $_->variation($variation); weaken($_->{'variation'}); } # add genotypes for this variant to the cache push @{$self->{_cache}}, {$variation->dbID => $cached}; # shift off first element to keep cache within size limit shift @{$self->{_cache}} if scalar @{$self->{_cache}} > $CACHE_SIZE; } if (defined $sample && defined $sample->dbID){ if($sample->isa('Bio::EnsEMBL::Variation::Individual')) { @$results = grep {$_->individual->dbID == $sample->dbID} @{$cached}; } elsif($sample->isa('Bio::EnsEMBL::Variation::Population')) { my %include = map {$_->dbID => 1} @{$sample->get_all_Individuals}; @$results = grep {$include{$_->individual->dbID}} @{$cached}; } else { throw("Argument supplied is not of type Bio::EnsEMBL::Variation::Individual or Bio::EnsEMBL::Variation::Population"); } } return $cached; } sub fetch_all_by_Slice { my $self = shift; my $cga = $self->db->get_IndividualGenotypeFeatureAdaptor(); return $cga->fetch_all_by_Slice(@_); } sub _tables{ my $self = shift; return (['compressed_genotype_var','g'],['failed_variation','fv']); } #Add a left join to the failed_variation table sub _left_join { return ([ 'failed_variation', 'fv.variation_id = g.variation_id']); } sub _columns{ return qw(g.variation_id g.subsnp_id g.genotypes); } sub _objs_from_sth{ my $self = shift; my $sth = shift; my ($variation_id, $subsnp_id, $genotypes); $sth->bind_columns(\$variation_id, \$subsnp_id, \$genotypes); my (%individual_hash, %gt_code_hash, @results); my %done; while($sth->fetch) { my @genotypes = unpack("(ww)*", $genotypes); while(@genotypes) { my $individual_id = shift @genotypes; my $gt_code = shift @genotypes; ## temp fix for duplicated 1KG data my $ss = $subsnp_id; $ss = 0 unless defined $ss ; next if $done{$individual_id}{$gt_code}{$ss}; $done{$individual_id}{$gt_code}{$ss} = 1; my $igtype = Bio::EnsEMBL::Variation::IndividualGenotype->new_fast({ _variation_id => $variation_id, subsnp => $subsnp_id, adaptor => $self, }); $individual_hash{$individual_id} ||= []; push @{$individual_hash{$individual_id}}, $igtype; $gt_code_hash{$gt_code} ||= []; push @{$gt_code_hash{$gt_code}}, $igtype; push @results, $igtype; } } # fetch individuals my $ia = $self->db()->get_IndividualAdaptor(); my $inds = $ia->fetch_all_by_dbID_list([keys %individual_hash]); foreach my $i (@$inds) { foreach my $igty (@{$individual_hash{$i->dbID()}}) { $igty->{individual} = $i; } } # get all genotypes from codes my $gtca = $self->db->get_GenotypeCodeAdaptor(); my $gtcs = $gtca->fetch_all_by_dbID_list([keys %gt_code_hash]); foreach my $gtc(@$gtcs) { foreach my $igty(@{$gt_code_hash{$gtc->dbID}}) { $igty->{genotype} = $gtc->genotype; $igty->{phased} = $gtc->phased; } } # check for defined genotype and individual @results = grep {defined $_->{genotype} && defined $_->{individual}} @results; return \@results; } 1;