# $Id: nexus.pm 16123 2009-09-17 12:57:27Z cjfields $ # # BioPerl module for Bio::AlignIO::nexus # # Copyright Heikki Lehvaslaiho # =head1 NAME Bio::AlignIO::nexus - NEXUS format sequence input/output stream =head1 SYNOPSIS Do not use this module directly. Use it via the L class. use Bio::AlignIO; my $in = Bio::AlignIO->new(-format => 'nexus', -file => 'aln.nexus'); while( my $aln = $in->next_aln ) { # do something with the alignment } =head1 DESCRIPTION This object can transform L objects to and from NEXUS data blocks. See method documentation for supported NEXUS features. =head1 ACKNOWLEDGEMENTS Will Fisher has written an excellent standalone NEXUS format parser in Perl, readnexus. A number of tricks were adapted from it. =head1 FEEDBACK =head2 Support Please direct usage questions or support issues to the mailing list: I rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible. =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web: http://bugzilla.open-bio.org/ =head1 AUTHORS - Heikki Lehvaslaiho Email: heikki-at-bioperl-dot-org =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::AlignIO::nexus; use vars qw(%valid_type); use strict; no strict "refs"; use base qw(Bio::AlignIO); BEGIN { %valid_type = map {$_, 1} qw( dna rna protein standard ); # standard throws error: inherited from Bio::PrimarySeq } =head2 new Title : new Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename'); Function: returns a new Bio::AlignIO object to handle clustalw files Returns : Bio::AlignIO::clustalw object Args : -verbose => verbosity setting (-1,0,1,2) -file => name of file to read in or with ">" - writeout -fh => alternative to -file param - provide a filehandle to read from/write to -format => type of Alignment Format to process or produce Customization of nexus flavor output -show_symbols => print the symbols="ATGC" in the data definition (MrBayes does not like this) boolean [default is 1] -show_endblock => print an 'endblock;' at the end of the data (MyBayes does not like this) boolean [default is 1] =cut sub _initialize { my ($self, @args) = @_; $self->SUPER::_initialize(@args); my ($show_symbols, $endblock) = $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args); my @names = qw(symbols endblock); for my $v ( $show_symbols, $endblock ) { $v = 1 unless defined $v; # default value is 1 my $n = shift @names; $self->flag($n, $v); } } =head2 next_aln Title : next_aln Usage : $aln = $stream->next_aln() Function: Returns the next alignment in the stream. Supports the following NEXUS format features: - The file has to start with '#NEXUS' - Reads in the name of the alignment from a comment (anything after 'TITLE: ') . - Sequence names can be given in a taxa block, too. - If matchchar notation is used, converts them back to sequence characters. - Does character conversions specified in the NEXUS equate command. - Sequence names of type 'Homo sapiens' and Homo_sapiens are treated identically. Returns : L object Args : =cut sub next_aln { my $self = shift; my $entry; my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, $match, $gap, $missing, $equate, $interleave, $name,$str,@names,$seqname,$start,$end,$count,$seq); local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.'; local $Bio::LocatableSeq::GAP_SYMBOLS = '\-'; my $aln = Bio::SimpleAlign->new(-source => 'nexus'); # file starts with '#NEXUS' but we allow white space only lines before it $entry = $self->_readline; $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/; return unless $entry; $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") unless ($entry && $entry =~ /^#NEXUS/i); # skip anything before either the taxa or data block # but read in the optional title in a comment while (defined($entry = $self->_readline)) { local ($_) = $entry; /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; last if /^begin +data/i || /^begin +taxa/i; } $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; # data and taxa blocks my $incomment; while (defined ($entry = $self->_readline)) { local ($_) = $entry; next if s/\[[^\]]+\]//g; # remove comments if( s/\[[^\]]+$// ) { $incomment = 1; # skip line if it is now empty or contains only whitespace next if /^\s*$/; } elsif($incomment) { if( s/^[^\]]*\]// ) { $incomment = 0; } else { next; } } elsif( /taxlabels/i ) { # doesn't deal with taxlabels adequately and can mess things up! # @names = $self->_read_taxlabels; } else { /ntax\s*=\s*(\d+)/i and $seqcount = $1; /nchar\s*=\s*(\d+)/i and $residuecount = $1; /matchchar\s*=\s*(.)/i and $match = $1; /gap\s*=\s*(.)/i and $gap = $1; /missing\s*=\s*(.)/i and $missing = $1; /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; /datatype\s*=\s*(\w+)/i and $alphabet = lc $1; /interleave/i and $interleave = 1 ; last if /matrix/io; } } $self->throw("Not a valid NEXUS sequence file. Datatype not specified.") unless $alphabet; $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") unless $valid_type{$alphabet}; $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^") if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/; $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^") if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/; $aln->gap_char($gap); $aln->missing_char($missing); # # if data is not right after the matrix line # read the empty lines out # while ($entry = $self->_readline) { unless ($entry =~ /^\s+$/) { $self->_pushback($entry); last; } } # # matrix command # # first alignment section if (@names == 0) { # taxa block did not exist while ($entry = $self->_readline) { local ($_) = $entry; if( s/\[[^\]]+\]//g ) { #] remove comments next if /^\s*$/; # skip line if it is now empty or contains only whitespace } if ($interleave && defined$count && ($count <= $seqcount)) { /^\s+$/ and last; } else { /^\s+$/ and next; } /^\s*;/ and last; # stop if colon at end of matrix is on it's own line #/^\s*;\s*$/ and last; if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) { # get single and double quoted names, or all the first # nonwhite word as the name, and remained is seq #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #' $name = ($2 || $3); if ($4) { # seq is on same line as name # this is the usual NEXUS format $str = $4; } else { # otherwise get seq from following lines. No comments allowed # a less common matrix format, usually used for very long seqs $str=''; while (local ($_) = $self->_readline) { my $str_tmp = $_; $str_tmp =~ s/[\s;]//g; $str .= $str_tmp; last if length$str == $residuecount; } } $name =~ s/ /_/g; push @names, $name; $str =~ s/[\s;]//g; $count = @names; $hash{$count} = $str; } $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; /;/ and last; # stop if colon at end of matrix is on the same line as the last seq } } # interleaved sections $count = 0; if ( $interleave ) { # only read next section if file is interleaved while( $entry = $self->_readline) { local ($_) = $entry; if( s/\[[^\]]+\]//g ) { #] remove comments next if /^\s*$/; # skip line if it is now empty or contains only whitespace } /^\s*;/ and last; # stop if colon at end of matrix is on it's own line $count = 0, next if $entry =~ /^\s*$/; if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #' $str = $4; $str =~ s/[\s;]//g; $count++; $hash{$count} .= $str; }; $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; /;/ and last; # stop if colon at end of matrix is on the same line as the last seq } } return 0 if @names < 1; # sequence creation $count = 0; foreach $name ( @names ) { $count++; if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { $seqname = $1; $start = $2; $end = $3; } else { $seqname=$name; $start = 1; $str = $hash{$count}; $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g; $end = length($str); } # consistency test $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count})) unless CORE::length($hash{$count}) == $residuecount; $seq = Bio::LocatableSeq->new('-seq'=>$hash{$count}, '-id'=>$seqname, '-start'=>$start, '-end'=>$end, 'alphabet'=>$alphabet ); $aln->add_seq($seq); } # if matchchar is used $aln->unmatch($match) if $match; # if equate ( e.g. equate="T=C G=A") is used if ($equate) { $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; } while (defined $entry && $entry !~ /endblock/i) { $entry = $self->_readline; } return $aln if $aln->num_sequences; return; } sub _read_taxlabels { my ($self) = @_; my ($name, @names); while (my $entry = $self->_readline) { last if $entry =~ m/^\s*(END)?;/i; if( $entry =~ m/\s*(\S+)\s+/ ) { ($name) = ($1); $name =~ s/\[[^\[]+\]//g; $name =~ s/\W/_/g; push @names, $name; } } return @names; } =head2 write_aln Title : write_aln Usage : $stream->write_aln(@aln) Function: Writes the $aln object into the stream in interleaved NEXUS format. Everything is written into a data block. SimpleAlign methods match_char, missing_char and gap_char must be set if you want to see them in the output. Returns : 1 for success and 0 for error Args : L object =cut sub write_aln { my ($self,@aln) = @_; my $count = 0; my $wrapped = 0; my $maxname; my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); my ($match, $missing, $gap,$symbols) = ('', '', '',''); foreach my $aln (@aln) { if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); next; } $self->throw("All sequences in the alignment must be the same length") unless $aln->is_flush($self->verbose); $length = $aln->length(); $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", $aln->id, $aln->num_sequences, $length)); $match = "match=". $aln->match_char if $aln->match_char; $missing = "missing=". $aln->missing_char if $aln->missing_char; $gap = "gap=". $aln->gap_char if $aln->gap_char; $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $self->flag('symbols') && $aln->symbol_chars); $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols)); # account for single quotes round names my $indent = $aln->maxdisplayname_length+2; $aln->set_displayname_flat(); foreach $seq ( $aln->each_seq() ) { my $nmid = $aln->displayname($seq->get_nse()); if( $nmid =~ /[^\w\d\.]/ ) { # put name in single quotes incase it contains any of # the following chars: ()[]{}/\,;:=*'"`+-<> that are not # allowed in PAUP* and possible other software $name = sprintf("%-${indent}s", "\'" . $nmid . "\'"); } else { $name = sprintf("%-${indent}s", $nmid); } $hash{$name} = $seq->seq; push(@arr,$name); } while( $count < $length ) { # there is another block to go! foreach $name ( @arr ) { my $dispname = $name; # $dispname = '' if $wrapped; $self->_print (sprintf("%${indent}s ",$dispname)); $tempcount = $count; $index = 0; while( ($tempcount + 10 < $length) && ($index < 5) ) { $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); $tempcount += 10; $index++; } # last if( $index < 5) { # space to print! $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); $tempcount += 10; } $self->_print ("\n"); } $self->_print ("\n\n"); $count = $tempcount; $wrapped = 1; } if( $self->flag('endblock') ) { $self->_print (";\n\nendblock;\n"); } else { $self->_print (";\n\nend;\n"); } } $self->flush if $self->_flush_on_write && defined $self->_fh; return 1; } =head2 flag Title : flag Usage : $obj->flag($name,$value) Function: Get/Set a flag value Returns : value of flag (a scalar) Args : on set, new value (a scalar or undef, optional) =cut sub flag{ my ($self,$name,$val) = @_; return $self->{'flag'}->{$name} = $val if defined $val; return $self->{'flag'}->{$name}; } 1;