# $Id: selex.pm 16123 2009-09-17 12:57:27Z cjfields $ # # BioPerl module for Bio::AlignIO::selex # based on the Bio::SeqIO::selex module # by Ewan Birney # and Lincoln Stein # # and the SimpleAlign.pm module of Ewan Birney # # Copyright Peter Schattner # # You may distribute this module under the same terms as perl itself # _history # September 5, 2000 # POD documentation - main docs before the code =head1 NAME Bio::AlignIO::selex - selex sequence input/output stream =head1 SYNOPSIS # Do not use this module directly. Use it via the L class. use Bio::AlignIO; use strict; my $in = Bio::AlignIO->new(-format => 'selex', -file => 't/data/testaln.selex'); while( my $aln = $in->next_aln ) { } =head1 DESCRIPTION This object can transform L objects to and from selex flat file databases. =head1 FEEDBACK =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHORS - Peter Schattner Email: schattner@alum.mit.edu =head1 CONTRIBUTORS Jason Stajich, jason-at-bioperl.org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::AlignIO::selex; use strict; use base qw(Bio::AlignIO); =head2 next_aln Title : next_aln Usage : $aln = $stream->next_aln() Function: returns the next alignment in the stream. Tries to read *all* selex It reads all non whitespace characters in the alignment area. For selexs with weird gaps (eg ~~~) map them by using $al->map_chars('~','-') Returns : L object Args : NONE =cut sub next_aln { my $self = shift; my $entry; my ($start,$end,%align,$name,$seqname,%hash,@c2name, %accession,%desc); my $aln = Bio::SimpleAlign->new(-source => 'selex'); # in selex format, every non-blank line that does not start # with '#=' is an alignment segment; the '#=' lines are mark up lines. # Of particular interest are the '#=GF AC ' # lines, which give accession numbers for each segment while( $entry = $self->_readline) { if( $entry =~ /^\#=GS\s+(\S+)\s+AC\s+(\S+)/ ) { $accession{ $1 } = $2; next; } elsif( $entry =~ /^\#=GS\s+(\S+)\s+DE\s+(.+)\s*$/ ) { $desc{$1} .= $2; } elsif ( $entry =~ /^([^\#]\S+)\s+([A-Za-z\.\-\*]+)\s*/ ) { my ($name,$seq) = ($1,$2); if( ! defined $align{$name} ) { push @c2name, $name; } $align{$name} .= $seq; } } # ok... now we can make the sequences foreach my $name ( @c2name ) { if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { $seqname = $1; $start = $2; $end = $3; } else { $seqname=$name; $start = 1; $end = length($align{$name}); } my $seq = Bio::LocatableSeq->new ('-seq' => $align{$name}, '-display_id' => $seqname, '-start' => $start, '-end' => $end, '-description' => $desc{$name}, '-accession_number' => $accession{$name}, ); $aln->add_seq($seq); } return $aln if $aln->num_sequences; return; } =head2 write_aln Title : write_aln Usage : $stream->write_aln(@aln) Function: writes the $aln object into the stream in selex format Returns : 1 for success and 0 for error Args : L object =cut sub write_aln { my ($self,@aln) = @_; my ($namestr,$seq,$add); my ($maxn); foreach my $aln (@aln) { $maxn = $aln->maxdisplayname_length(); foreach $seq ( $aln->each_seq() ) { $namestr = $aln->displayname($seq->get_nse()); $add = $maxn - length($namestr) + 2; $namestr .= " " x $add; $self->_print (sprintf("%s %s\n",$namestr,$seq->seq())) or return; } } $self->flush if $self->_flush_on_write && defined $self->_fh; return 1; } 1;