/**************************************************************************** * FILE: alignment.c * AUTHOR: William Stafford Noble, Charles E. Grant * CREATE DATE: 10/28/2004 * PROJECT: EVOMCAST * DESCRIPTION: Multiple alignment of biological sequences. * COPYRIGHT: 1998-2008, UCSD, UCSC, UW ****************************************************************************/ #include #include #include "alignment.h" #include "alphabet.h" #include "utils.h" #include "string-list.h" #include "object-list.h" #include "clustalw-io.h" #define MAX_ALIGNMENT_NAME 40 // Longest alignment ID. #define MAX_ALIGNMENT_COMMENT 128 // Longest comment. extern char alphabet[]; // Instantiate the ALIGNMENT_T type. struct alignment { char name[MAX_ALIGNMENT_NAME + 1]; // Alignment program name. char desc[MAX_ALIGNMENT_COMMENT + 1]; // Alignment description. int length; // Length of the Alignment int num_sequences; // Number of sequences in the alignment SEQ_T** sequences; // The aligned sequences. char* consensus_string; // The string representing the alignment // N.B. At some point, we'll need to assign an alignment ID. }; /**************************************************************************** * Allocate one alignment object. Name and description may be NULL. * * Returns a pointer to the newly created alignment. ****************************************************************************/ ALIGNMENT_T* allocate_alignment( char* name, char* description, int num_sequences, SEQ_T** sequences, char* consensus_string ) { assert(num_sequences > 0); assert(sequences != NULL); assert(consensus_string != NULL); // Allocate the alignment object. ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T)); if (new_alignment == NULL) { die("Error allocating alignment\n"); } // Store the name, truncating if necessary. if (name != NULL) { strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME); new_alignment->name[MAX_ALIGNMENT_NAME] = '\0'; if (strlen(new_alignment->name) != strlen(name)) { fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n", name, new_alignment->name); } } // Store the description, truncating if necessary. if (description != NULL) { strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT); new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0'; } // Store the sequences. new_alignment->sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*)); if (new_alignment->sequences == NULL) { die("Error allocating sequences\n"); } new_alignment->num_sequences = num_sequences; int seq_length = strlen(get_raw_sequence(sequences[0])); int i; for (i = 0; i < num_sequences; i++) { myassert(TRUE, strlen(get_raw_sequence(sequences[i])) == seq_length, "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n", get_seq_name(sequences[0]), seq_length, i, get_seq_name(sequences[i]), strlen(get_raw_sequence(sequences[i])), get_raw_sequence(sequences[i])); new_alignment->sequences[i] = allocate_seq(get_seq_name(sequences[i]), get_seq_description(sequences[i]), get_seq_offset(sequences[i]), get_raw_sequence(sequences[i]) ); } // Fill in the remaining fields. new_alignment->length = seq_length; copy_string(&(new_alignment->consensus_string), consensus_string); return(new_alignment); } /**************************************************************************** * Get and set various fields. ****************************************************************************/ char* get_alignment_name(ALIGNMENT_T* alignment) { assert(alignment != NULL); return(alignment->name); } void set_alignment_name(char* name, ALIGNMENT_T* alignment) { assert(name != NULL); assert(alignment != NULL); alignment->name[0] = 0; strncat(alignment->name, name, MAX_ALIGNMENT_NAME); } char* get_alignment_description(ALIGNMENT_T* alignment) { assert(alignment != NULL); return(alignment->desc); } char* get_consensus_string(ALIGNMENT_T* alignment) { assert(alignment != NULL); return(alignment->consensus_string); } SEQ_T* get_consensus_sequence(double threshold, ALIGNMENT_T* alignment) { char* seq_string = NULL; unsigned char c = 0; unsigned char most_freq_char = 0; #define NUM_CHARS 127 char char_counts[NUM_CHARS]; int i = 0; int j = 0; double max_char_freq = 0.0; SEQ_T* consensus; assert(alignment != NULL); seq_string = mm_malloc(alignment->length * sizeof(char) + 1); if (seq_string == NULL) { die("Error allocating consensus sequence string\n"); } // For each column in the alignment for (i = 0; i < alignment->length; i++) { most_freq_char = 0; memset(char_counts, 0, NUM_CHARS * sizeof(char)); // Count character occurances for (j = 0; j < alignment->num_sequences; j++) { c = get_seq_char(i, alignment->sequences[j]); char_counts[c]++; } // Find the index of the character that occurs the most frequently for (c = 0; c < NUM_CHARS; c++) { most_freq_char = char_counts[most_freq_char] >= char_counts[c] ? most_freq_char : c; } // If the most frequent character exceeds the threshold // it will be the consensus character. max_char_freq = (double) char_counts[most_freq_char] / (double) alignment->num_sequences; if (max_char_freq >= threshold) { seq_string[i] = most_freq_char; } else { // Otherwise the consensus is the gap character seq_string[i] = '-'; } } seq_string[i] = '\0'; consensus = allocate_seq("Consensus", "", 0, seq_string); if (seq_string != NULL) myfree(seq_string); return(consensus); } int get_alignment_length(ALIGNMENT_T* alignment) { assert(alignment != NULL); return(alignment->length); } int get_num_aligned_sequences(ALIGNMENT_T* alignment) { assert(alignment != NULL); return(alignment->num_sequences); } SEQ_T** get_alignment_sequences(ALIGNMENT_T* alignment) { assert(alignment != NULL); assert(alignment->sequences != NULL); return(alignment->sequences); } SEQ_T* get_alignment_sequence(int index, ALIGNMENT_T* alignment) { assert(alignment != NULL); assert(alignment->sequences != NULL); assert(index >= 0); assert(index < alignment->num_sequences); return(alignment->sequences[index]); } SEQ_T* get_alignment_sequence_by_name(char* name, ALIGNMENT_T* alignment) { int i = 0; SEQ_T* sequence = NULL; assert(alignment != NULL); assert(alignment->sequences != NULL); assert(name != NULL); for (i = 0; i < alignment->num_sequences; i++) { if (strcmp(name, get_seq_name(alignment->sequences[i])) == 0) { sequence = alignment->sequences[i]; break; } } return sequence; } int get_num_identical_sites(ALIGNMENT_T* alignment) { char *c = NULL; int length = 0; int num_identical_sites = 0; assert(alignment != NULL); assert(alignment->consensus_string != NULL); c = alignment->consensus_string; while (*c != '\0') { length++; if (*c == '*') { num_identical_sites++; } c++; } assert(length == alignment->length); return(num_identical_sites); } int get_num_conserved_sites(ALIGNMENT_T* alignment) { char *c = NULL; int length = 0; int num_conserved_sites = 0; assert(alignment != NULL); c = alignment->consensus_string; while (*c != '\0') { length++; if (*c == ':') { num_conserved_sites++; } c++; } assert(length == alignment->length); return(num_conserved_sites); } int get_num_semiconserved_sites(ALIGNMENT_T* alignment) { char *c = NULL; int length = 0; int num_semiconserved_sites = 0; assert(alignment != NULL); c = alignment->consensus_string; while (*c != '\0') { length++; if (*c == '.') { num_semiconserved_sites++; } c++; } assert(length == alignment->length); return(num_semiconserved_sites); } int get_num_nonconserved_sites(ALIGNMENT_T* alignment) { char *c = NULL; int length = 0; int num_nonconserved_sites = 0; assert(alignment != NULL); c = alignment->consensus_string; while (*c != '\0') { length++; if (*c == '.') { num_nonconserved_sites++; } c++; } assert(length == alignment->length); return(num_nonconserved_sites); } int count_residues(char* seq) { int count = 0; while(*seq != '\0') { if (*seq != '-' && *seq != '.') count++; seq++; } return count; } /**************************************************************************** * Fill in a null terminated string with the bases in one column of the * alignment. The user must allocate the memory for the string, which should * be large enough to store one characters from each sequence in the alignment * plus the trailing null character. This is done for reasons of efficiency, * since in most cases the user will be making for this call iteratively over * the length of the alignment. ****************************************************************************/ void get_alignment_col(int col, char* alignment_col, ALIGNMENT_T* alignment) { int i; for (i = 0; i < alignment->num_sequences; i++, alignment_col++) { *alignment_col = get_seq_char(col, alignment->sequences[i]); } *alignment_col = '\0'; } /************************************************************************* * Convert the string representing an alignment column into an integer * which will be the column index for that alignment column in the PSSM. * If the alphabet has m characters, and the alignment columns have n entries, * the array of all alignment columns is conveniently numbered by the set of * consecutive n-digit base m numerals: * AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333. *************************************************************************/ int hash_alignment_col(ALPH_T alph, char* alignment_col, int alignment_col_size) { // The alignment column must have at least // one character aside from the terminating null. assert(alignment_col_size >= 1); assert(alignment_col != NULL); int asize = alph_size(alph, ALPH_SIZE); int hash = 0; int place = 1; int i, j; for (i = alignment_col_size - 1; i >= 0; i--) { // Find alignment column character in alphabet j = alph_index(alph, alignment_col[i]); if (j == -1 || j >= asize) { die("Alignment character %c not found in alphabet\n", alignment_col[i]); } hash += j * place; place *= asize; } return hash; } // hash_alignment_col /************************************************************************* * Convert an integer representing a column in a PSSM into the * corresponding alignment column string. * If the alphabet has m characters, and the alignment columns have n entries, * the array of all alignment columns is conveniently numbered by the set of * consecutive n-digit base m numerals: * AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333. * The caller must allocate the memory for the alignment column string. * The memory required is the number of sequences in the alignment, plus one * for the terminating null. *************************************************************************/ void unhash_alignment_col( ALPH_T alph, int alignment_col_index, char *alignment_col, int alignment_col_size ) { int asize = alph_size(alph, ALPH_SIZE); assert(alignment_col_index >= 0); assert( alignment_col_index < pow( (double) asize, (double) alignment_col_index ) ); assert(alignment_col != NULL); assert(alignment_col_size >= 1); alignment_col[alignment_col_size] = '\0'; int i, j; for (i = alignment_col_size - 1; i >= 0; i--) { j = alignment_col_index % asize; alignment_col_index -= j; alignment_col[i] = alph_char(alph, j); alignment_col_index /= asize; } } // unhash_alignment_col /************************************************************************* * Build array containing the counts of columns in the alignment * Caller is responsible for freeing the returned array. * If input parameter "freqs" is NULL, allocates the array. * Otherwise, the counts are added to the existing counts in the counts * array. Ignores all columns containing gaps or ambiguity characters: * [.-nNxX] *************************************************************************/ static ARRAY_T* build_alignment_column_counts( ALPH_T alph, ALIGNMENT_T* alignment, ARRAY_T* counts ) { assert(alignment != NULL); int asize = alph_size(alph, ALPH_SIZE); // Calculate number of possible alignment columns // and create storage for counting occurences. int num_seqs = get_num_aligned_sequences(alignment); int num_alignment_cols = (int) pow((double) asize, (double) num_seqs); if (counts == NULL) { counts = allocate_array(num_alignment_cols); } // Count how many examples of each column occur in the alignment. // Skip columns that contain gaps or ambiguity characters. int alignment_length = get_alignment_length(alignment); char* alignment_col = mm_malloc(sizeof(char) * (num_seqs + 1)); alignment_col[num_seqs] = 0; int i, h; for(i = 0; i < alignment_length; i++) { get_alignment_col(i, alignment_col, alignment); if (strchr(alignment_col, '-') != NULL) { continue; } if (strchr(alignment_col, '.') != NULL) { continue; } if (strchr(alignment_col, 'N') != NULL) { continue; } if (strchr(alignment_col, 'n') != NULL) { continue; } if (strchr(alignment_col, 'X') != NULL) { continue; } if (strchr(alignment_col, 'x') != NULL) { continue; } h = hash_alignment_col(alph, alignment_col, num_seqs); incr_array_item(h, 1, counts); } return counts; } // build_alignment_column_counts /**************************************************************************** * Return a list containing the empirical column frequency distributions * for all alignments in the input. * * Each file in the list of filenames is read and the species list is * determined. The counts of each occurring column are tallied. * All files with the same species lists get their counts combined. * * The returned list contains one distribution per species list that * occurs in some alignment. ****************************************************************************/ OBJECT_LIST_T* get_alignment_column_freqs_list (ALPH_T alph, STRING_LIST_T* filenames, BOOLEAN_T remove_allgap_seqs) { int file_index; int num_filenames = get_num_strings(filenames); ARRAY_T* alignment_column_freqs = NULL; OBJECT_LIST_T* alignment_column_freqs_list = new_object_list(equal_string_lists, (void*)copy_string_list, free_string_list, free_array); // Consider each alignment in turn. for(file_index = 0; file_index < num_filenames; file_index++) { char* filename = get_nth_string(file_index, filenames); if (verbosity >= NORMAL_VERBOSE && !(file_index % 1)) { fprintf( stderr, "Computing column freqs: alignment file number %d of %d total files.\n", file_index+1, num_filenames ); } // Read the alignment int ref_seq_index = 0; ALIGNMENT_T* alignment = read_alignment_from_file(filename, TRUE, remove_allgap_seqs, &ref_seq_index); STRING_LIST_T* alignment_species = get_species_names(alignment); // Try to retrieve the counts so far for this list of species. alignment_column_freqs = (ARRAY_T*)retrieve_object( alignment_species, alignment_column_freqs_list ); // Found counts for current species list? if (alignment_column_freqs) { // Add counts from current alignment. (void) build_alignment_column_counts(alph, alignment, alignment_column_freqs); // Note: objects in lists are references, so no need to re-store // after modification. } // Didn't find counts for this species list, so create new array of counts. else { alignment_column_freqs = build_alignment_column_counts(alph, alignment, NULL); store_object( (void*)alignment_column_freqs, (void*)alignment_species, 0.0, // Score alignment_column_freqs_list ); } // free space used by alignment free_alignment(alignment); } // each filename fprintf(stderr, "\n"); // Convert counts to frequencies by retrieving each array of counts // and dividing by the total counts for that list of species. while ( (alignment_column_freqs = retrieve_next_object(alignment_column_freqs_list) ) != NULL ) { int i; int num_freqs = get_array_length(alignment_column_freqs); double total_counts; // Get total counts. for (i=total_counts=0; inum_sequences; s++) { for (i = 0; i < alignment->length; i++) { c = get_seq_char(i, alignment->sequences[s]); aindex = alph_index(alph, c); // c might be an ambiguity code. We don't count ambiguity codes. if (aindex != -1 && aindex < asize) { num_bases[aindex]++; total_bases++; } } } freqs = allocate_array(asize); for (i = 0; i < asize; i++) { set_array_item(i, (double) num_bases[i] / (double) total_bases, freqs); } // Clean up the count of characters myfree(num_bases); return freqs; } /**************************************************************************** * Get a cumulative count of gaps within one sequence of the alignment ****************************************************************************/ int* get_cumulative_gap_count(int seqIndex, ALIGNMENT_T* alignment) { int* results = (int *)mm_malloc( sizeof(int) * alignment->length ); int i = 0; char c = get_seq_char(i, alignment->sequences[seqIndex]); if (c == '-' || c == '.') { results[i] = 1; } else { results[i] = 0; } for (i = 1; i < alignment->length; i++) { c = get_seq_char(i, alignment->sequences[seqIndex]); if (c == '-' || c == '.') { results[i] = results[i-1] + 1; } else { results[i] = results[i-1]; } } return results; } /**************************************************************************** * Does a column of an alignment contain gaps? ****************************************************************************/ BOOLEAN_T alignment_site_has_gaps(int index, ALIGNMENT_T* alignment) { int i = 0; for (i = 0; i < alignment->num_sequences; i++) { char c = get_seq_char(index, alignment->sequences[i]); if (c == '-' || c == '.') { return(TRUE); } } return(FALSE); } /**************************************************************************** * Does a column of an alignment contain any ambiguity codes? ****************************************************************************/ BOOLEAN_T alignment_site_ambiguous(ALPH_T alph, int index, ALIGNMENT_T* alignment) { int i; for (i = 0; i < alignment->num_sequences; i++) { char c = get_seq_char(index, alignment->sequences[i]); if (!alph_is_concrete(alph, c)) return(TRUE); } return(FALSE); } /**************************************************************************** * Create a lookup table for converting an index into an alignment to an index * into a gapless version of one of the sequences in the alignment. ****************************************************************************/ int* make_alignment_to_seq_table(int ref_seq_index, ALIGNMENT_T* an_alignment) { char* raw_seq = NULL; int align_length = 0; int align_index = 0; int seq_index = 0; int *table = NULL; SEQ_T* seq = NULL; align_length = get_alignment_length(an_alignment); seq = get_alignment_sequence(ref_seq_index, an_alignment); raw_seq = get_raw_sequence(seq); // Table is indexed by position in the alignment // Table values are the corresponding position in the // reference sequence not including gaps. table = (int *) mm_malloc((align_length) * sizeof(int)); seq_index = 0; for (align_index = 0; align_index < align_length; align_index++) { if (raw_seq[align_index] != '-' && raw_seq[align_index] != '.') { seq_index++; } table[align_index] = seq_index; } return table; } /**************************************************************************** * Create a lookup table for converting an index into a sequence to an index * into the alignment. Note that because there are many alignment positions * that correspond to a sequence position we take the first occurence. * JCH: I have added this function for the sake of the BLS scan mode * so that single mode matches in each sequence can be mapped back * to positions in the alignment. ****************************************************************************/ int* make_seq_to_alignment_table(int ref_seq_index, ALIGNMENT_T* an_alignment) { char* raw_seq = NULL; int align_length = 0; int align_index = 0; int seq_index = 0; int *table = NULL; SEQ_T* seq = NULL; align_length = get_alignment_length(an_alignment); seq = get_alignment_sequence(ref_seq_index, an_alignment); raw_seq = get_raw_sequence(seq); // Table is indexed by position in the sequence // Table values are the first corresponding // position in the alignment. table = (int *) mm_malloc((align_length) * sizeof(int)); seq_index = 0; table[seq_index] = 0; for (align_index = 0; align_index < align_length; align_index++) { if (raw_seq[align_index] != '-' && raw_seq[align_index] != '.') { seq_index++; table[seq_index] = align_index; } } return table; } /**************************************************************************** * Get a list of the names of the species in the alignment. ****************************************************************************/ STRING_LIST_T* get_species_names(ALIGNMENT_T* an_alignment) { STRING_LIST_T* return_value; int i_seq; int num_seqs; // Allocate a new string list. return_value = new_string_list(); // Extract all the sequence names and add them to the list. num_seqs = get_num_aligned_sequences(an_alignment); for (i_seq = 0; i_seq < num_seqs; i_seq++) { add_string(get_seq_name(get_alignment_sequence(i_seq, an_alignment)), return_value); } return(return_value); } /**************************************************************************** * Extract a small alignment out of the middle of a larger alignment. ****************************************************************************/ ALIGNMENT_T* extract_subalignment (int start, int width, ALIGNMENT_T* alignment) { int num_sequences = get_num_aligned_sequences(alignment); SEQ_T** sequences = get_alignment_sequences(alignment); SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*)); // Extract the specified columns into a new list of sequences. int i_seq = 0; char* subsequence = mm_malloc((width + 1) * sizeof(char)); for (i_seq = 0; i_seq < num_sequences; i_seq++) { SEQ_T* this_seq = sequences[i_seq]; char* raw_seq = get_raw_sequence(this_seq); strncpy(subsequence, raw_seq + start, width); subsequence[width] = '\0'; subsequences[i_seq] = allocate_seq(get_seq_name(this_seq), get_seq_description(this_seq), get_seq_offset(this_seq), subsequence); } // Extract the consensus string in the specified columns. char* consensus = get_consensus_string(alignment); char* subconsensus = mm_malloc(sizeof(char) * (width + 1)); strncpy(subconsensus, consensus + start, width); subconsensus[width] = '\0'; // Allocate and return the new alignment. ALIGNMENT_T* subalignment = allocate_alignment(get_alignment_name(alignment), get_alignment_description(alignment), num_sequences, subsequences, subconsensus); // Free local dynamic memory. for (i_seq = 0; i_seq < num_sequences; i_seq++) { free_seq(subsequences[i_seq]); } myfree(subsequences); myfree(subsequence); return(subalignment); } /**************************************************************************** * Remove from the alignment all columns that contain gaps for the * specified species. ****************************************************************************/ ALIGNMENT_T* remove_alignment_gaps (char* species, ALIGNMENT_T* alignment) { // Locate this species in the alignment. int species_index = get_index_in_string_list(species, get_species_names(alignment)); if (species_index == -1) { die("Can't find %s in alignment.\n", species); } SEQ_T* this_seq = get_alignment_sequence(species_index, alignment); // Get the dimensions of the original matrix. int num_sequences = get_num_aligned_sequences(alignment); int alignment_length = get_alignment_length(alignment); // Allocate memory for raw sequences that will constitute the new alignment. char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences); int i_seq = 0; for (i_seq = 0; i_seq < num_sequences; i_seq++) { raw_sequences[i_seq] = (char*)mm_calloc(alignment_length + 1, sizeof(char*)); } char* consensus = get_consensus_string(alignment); char* new_consensus = (char*)mm_calloc(alignment_length + 1, sizeof(char*)); // Iterate over all columns. int i_column; int i_raw = 0; for (i_column = 0; i_column < alignment_length; i_column++) { // Is there a gap? char this_char = get_seq_char(i_column, this_seq); if ((this_char != '-') && (this_char != '.')) { // If no gap, then copy over this column. for (i_seq = 0; i_seq < num_sequences; i_seq++) { SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment); char this_char = get_seq_char(i_column, this_sequence); raw_sequences[i_seq][i_raw] = this_char; } new_consensus[i_raw] = consensus[i_column]; i_raw++; } } // Create new sequence objects. SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*)); for (i_seq = 0; i_seq < num_sequences; i_seq++) { SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment); new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence), get_seq_description(this_sequence), get_seq_offset(this_sequence), raw_sequences[i_seq]); } // Allocate and return the new alignment. ALIGNMENT_T* new_alignment = allocate_alignment(get_alignment_name(alignment), get_alignment_description(alignment), num_sequences, new_sequences, new_consensus); // Free local dynamic memory. for (i_seq = 0; i_seq < num_sequences; i_seq++) { myfree(raw_sequences[i_seq]); free_seq(new_sequences[i_seq]); } myfree(raw_sequences); myfree(new_sequences); myfree(new_consensus); return(new_alignment); } /**************************************************************************** * Remove from an alignment any sequence whose ID is not in a given list. * * N.B. It is NOT an error for the given list to contain sequence IDs that * are not in the alignment. ****************************************************************************/ ALIGNMENT_T* remove_alignment_seqs (STRING_LIST_T* seqs_to_keep, ALIGNMENT_T* alignment) { // Extract the names of the sequences in the alignment. STRING_LIST_T* alignment_species = get_species_names(alignment); int num_species = get_num_strings(alignment_species); // Count how many sequences will be in the new alignment. int i_species; int num_final = 0; for (i_species = 0; i_species < num_species; i_species++) { char* this_species = get_nth_string(i_species, alignment_species); if (have_string(this_species, seqs_to_keep)) { num_final++; } else { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Removing %s from alignment.\n", this_species); } } } // Allocate space for the new sequences. SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_final * sizeof(SEQ_T*)); // Copy the sequences. int final_index = 0; num_species = get_num_strings(seqs_to_keep); for (i_species = 0; i_species < num_species; i_species++) { char* this_species = get_nth_string(i_species, seqs_to_keep); // If the requested ID is in the alignment, then copy over the sequence. if (have_string(this_species, alignment_species)) { SEQ_T* this_seq = get_alignment_sequence_by_name(this_species, alignment); new_sequences[final_index] = copy_sequence(this_seq); final_index++; } } // Allocate and return the new alignment. char *consensus = NULL; copy_string(&consensus, get_consensus_string(alignment)); ALIGNMENT_T* new_alignment = allocate_alignment(get_alignment_name(alignment), get_alignment_description(alignment), num_final, new_sequences, consensus); return(new_alignment); } /**************************************************************************** * Create a new alignment with any sequence that contains nothing but * gap ('-') characters removed. Returns the new alignment. Does not * change the old alignment. * If there are no all-gap sequences, the returned alignment is the * same object as the original alignment. ****************************************************************************/ static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment) { ALIGNMENT_T* new_alignment; int i_aln; int l_aln = get_num_aligned_sequences(alignment); STRING_LIST_T* keeper_seqs = new_string_list(); // Identify the all-gap sequences. for (i_aln=0; i_alnsequences); alignment->sequences = sequences; // Find the new index of the reference sequence. *ref_seq_index = get_index_in_string_list(ref_name, alignment_species); } if (remove_allgap_seqs) { ALIGNMENT_T* new_alignment = remove_allgap_sequences(alignment); if (new_alignment != alignment) { free_alignment(alignment); alignment = new_alignment; } } return(alignment); } // read_alignment_from_file /************************************************************************* * Print an alignment in PHYLIP format. *************************************************************************/ #define OUTPUT_WIDTH 60 void print_phylip_alignment (ALIGNMENT_T* the_alignment, FILE* outfile) { int i_seq; int i_position; char buffer[OUTPUT_WIDTH+1]; char* this_sequence; fprintf(outfile, "%d %d\n", the_alignment->num_sequences, the_alignment->length); /* Print the IDs and initial sequences. */ for (i_seq = 0; i_seq < the_alignment->num_sequences; i_seq++) { /* Print the ID. */ strncpy(buffer, get_seq_name(the_alignment->sequences[i_seq]), 10); buffer[10] = '\0'; fprintf(outfile, "%-10s ", buffer); /* Print the first block of sequence. */ this_sequence = get_raw_sequence(the_alignment->sequences[i_seq]); strncpy(buffer, &(this_sequence[0]), OUTPUT_WIDTH); buffer[OUTPUT_WIDTH] = '\0'; fprintf(outfile, "%s\n", buffer); } /* Blank line between sequences. */ fprintf(outfile, "\n"); /* Print successive blocks. */ for (i_position = OUTPUT_WIDTH; i_position < the_alignment->length; i_position += OUTPUT_WIDTH) { for (i_seq = 0; i_seq < the_alignment->num_sequences; i_seq++) { this_sequence = get_raw_sequence(the_alignment->sequences[i_seq]); strncpy(buffer, &(this_sequence[i_position]), OUTPUT_WIDTH); buffer[OUTPUT_WIDTH] = '\0'; fprintf(outfile, " %s\n", buffer); } /* Blank line between sequences. */ fprintf(outfile, "\n"); } } /**************************************************************************** * Free one alignment object. ****************************************************************************/ void free_alignment(ALIGNMENT_T* alignment) { if (alignment == NULL) { return; } else { if (alignment->consensus_string != NULL) { myfree(alignment->consensus_string); } if (alignment->sequences != NULL) { int i; for(i = 0; i < alignment->num_sequences; i++) { assert(alignment->sequences[i] != NULL); free_seq(alignment->sequences[i]); } myfree(alignment->sequences); } myfree(alignment); } }