/****************************************************************************** * FILE: seq-reader-from-fasta.c * AUTHOR: Charles Grant, Bill Noble, Tim Bailey * CREATION DATE: 2010-09-17 * COPYRIGHT: 2010 UW * * This file contains the concrete implementation for a * data-block-reader UDT for reading sequence segments from a FASTA file. * CONSIDER: This and the PSP_DATA_BLOCK_READER_T data structure are almost * identical. Can they be refactored to share more of the implementation? *****************************************************************************/ #include #include #include #include #include #include #include #include "alphabet.h" #include "seq-reader-from-fasta.h" typedef struct seq_reader_from_fasta { BOOLEAN_T at_start_of_line; int current_position; char* filename; size_t filename_len; // Includes trailing '\0' size_t filename_buffer_len; char* sequence_header; size_t sequence_header_len; // Includes trailing '\0' size_t sequence_buffer_len; char* sequence_name; size_t sequence_name_len; FILE *fasta_file; ALPH_T alphabet; } SEQ_READER_FROM_FASTA_T; // Forward declarations DATA_BLOCK_READER_T *new_seq_reader_from_fasta(ALPH_T alph, const char *filename); void free_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader); BOOLEAN_T close_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader); BOOLEAN_T reset_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader); BOOLEAN_T read_seq_header_from_seq_reader_from_fasta( SEQ_READER_FROM_FASTA_T *fasta_reader ); BOOLEAN_T get_seq_name_from_seq_reader_from_fasta( DATA_BLOCK_READER_T *reader, char **name // OUT ); BOOLEAN_T go_to_next_sequence_in_seq_reader_from_fasta( DATA_BLOCK_READER_T *reader ); BOOLEAN_T get_next_data_block_from_seq_reader_from_fasta( DATA_BLOCK_READER_T *reader, DATA_BLOCK_T *data_block ); /****************************************************************************** * This function creates an instance of a data block reader UDT for reading * sequence segments from a FASTA file. *****************************************************************************/ DATA_BLOCK_READER_T *new_seq_reader_from_fasta(ALPH_T alph, const char *filename) { SEQ_READER_FROM_FASTA_T *fasta_reader = mm_malloc(sizeof(SEQ_READER_FROM_FASTA_T) * 1); fasta_reader->at_start_of_line = TRUE; int filename_len = strlen(filename) + 1; fasta_reader->filename = mm_malloc(sizeof(char)* filename_len); fasta_reader->filename_len = filename_len; strncpy(fasta_reader->filename, filename, filename_len); fasta_reader->current_position = 0; fasta_reader->sequence_header = NULL; fasta_reader->sequence_header_len = 0; fasta_reader->sequence_name = NULL; fasta_reader->sequence_name_len = 0; fasta_reader->alphabet = alph; if ( open_file( filename, "r", TRUE, "FASTA", "sequences", &(fasta_reader->fasta_file )) == FALSE) { die("Couldn't open the file %s.\n", filename); } DATA_BLOCK_READER_T *reader = new_data_block_reader( (void *) fasta_reader, free_seq_reader_from_fasta, close_seq_reader_from_fasta, reset_seq_reader_from_fasta, get_next_data_block_from_seq_reader_from_fasta, go_to_next_sequence_in_seq_reader_from_fasta, get_seq_name_from_seq_reader_from_fasta ); return reader; } /****************************************************************************** * This function frees an instance of the sequence FASTA reader UDT. *****************************************************************************/ void free_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader) { SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); myfree(fasta_reader->filename); fasta_reader->filename_len = 0; fasta_reader->filename_buffer_len = 0; myfree(fasta_reader->sequence_header); fasta_reader->sequence_header_len = 0; fasta_reader->sequence_buffer_len = 0; myfree(fasta_reader->sequence_name); fasta_reader->sequence_name_len = 0; myfree(fasta_reader); } /****************************************************************************** * This function closes a sequence FASTA reader UDT. *****************************************************************************/ BOOLEAN_T close_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader) { BOOLEAN_T result = FALSE; SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); fasta_reader->current_position = 0; if (fasta_reader->fasta_file != NULL) { if (fclose(fasta_reader->fasta_file) == EOF) { die( "Error closing file: %s.\nError message: %s\n", fasta_reader->filename, strerror(errno) ); } else { result = TRUE; } } return result; } /****************************************************************************** * This function resets a sequence FASTA reader UDT. *****************************************************************************/ BOOLEAN_T reset_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader) { SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); if (fasta_reader->fasta_file == stdin) { die("Unable to rewind when reading sequence from standard input\n"); } else { rewind(fasta_reader->fasta_file); } fasta_reader->current_position = -1; fasta_reader->at_start_of_line = TRUE; return TRUE; } /****************************************************************************** * This function attempts to parse genomic coordinates from the * current sequence header. If successful it will set the sequence * name to the chromosome name, and set the starting sequence position. * * Returns TRUE if it was able to find genomic coordinates, FALSE otherwise. *****************************************************************************/ static BOOLEAN_T parse_genomic_coordinates( SEQ_READER_FROM_FASTA_T *fasta_reader ) { #define NUM_SUBMATCHES 4 #define ERROR_MESSAGE_SIZE 100 #define BUFFER_SIZE 512 static BOOLEAN_T first_time = TRUE; static regex_t fasta_header_regex; static regmatch_t matches[NUM_SUBMATCHES]; char error_message[ERROR_MESSAGE_SIZE]; int status = 0; if (first_time == TRUE) { // Initialize regular express for extracting chromsome coordinates; // Expected format is name:start-stop status = regcomp( &fasta_header_regex, "[[:space:]]*([^[:space:]:]+):([[:digit:]]+)-([[:digit:]]+)", REG_EXTENDED | REG_ICASE | REG_NEWLINE ); if (status != 0) { regerror(status, &fasta_header_regex, error_message, ERROR_MESSAGE_SIZE); die( "Error while intitializing regular expression\n" "for parsing genome coordinates from FASTA header: %s\n", error_message ); } first_time = FALSE; } BOOLEAN_T found_coordinates = FALSE; char* header = fasta_reader->sequence_header; status = regexec( &fasta_header_regex, header, NUM_SUBMATCHES, matches, 0 ); if(!status) { // The sequence header contains genomic coordinates found_coordinates = TRUE; // Copy chromosome name and position to reader char buffer[BUFFER_SIZE]; int name_len = matches[1].rm_eo - matches[1].rm_so; strncpy(buffer, header + matches[1].rm_so, name_len); buffer[name_len] = 0; fasta_reader->sequence_name = strdup(buffer); if (fasta_reader->sequence_name == NULL) { die("Unable to allocate memory while parsing sequence header.\n"); } fasta_reader->sequence_name_len = name_len; size_t pos_len = matches[2].rm_eo - matches[2].rm_so; strncpy(buffer, header + matches[2].rm_so, pos_len); buffer[pos_len] = 0; fasta_reader->current_position = atol(buffer) - 1; } else if(status != REG_NOMATCH ){ regerror(status, &fasta_header_regex, error_message, 100); die("Error trying to parse genome coordinates from sequence header: %s\n" "error message is %s\n", header, error_message ); } else { found_coordinates = FALSE; } return found_coordinates; #undef NUM_SUBMATCHES #undef ERROR_MESSAGE_SIZE #undef BUFFER_SIZE } /****************************************************************************** * This function attempts to parse the sequence name from the * current sequence header. The sequence name is the string up to the first * white space in the sequence header. * * Returns TRUE if it was able to genomic coordinates, FALSE otherwise. *****************************************************************************/ static BOOLEAN_T parse_seq_name( SEQ_READER_FROM_FASTA_T *fasta_reader ) { #define NUM_SUBMATCHES 2 #define ERROR_MESSAGE_SIZE 100 #define BUFFER_SIZE 512 static BOOLEAN_T first_time = TRUE; static regex_t fasta_header_regex; static regmatch_t matches[NUM_SUBMATCHES]; char error_message[ERROR_MESSAGE_SIZE]; int status = 0; if (first_time == TRUE) { // Initialize regular express for extracting chromsome coordinates; // Expected format is name:start-stop status = regcomp( &fasta_header_regex, "([^[:space:]:]+)[[:space:]]*", REG_EXTENDED | REG_ICASE | REG_NEWLINE ); if (status != 0) { regerror(status, &fasta_header_regex, error_message, ERROR_MESSAGE_SIZE); die( "Error while intitializing regular expression\n" "for parsing sequence name from FASTA header: %s\n", error_message ); } first_time = FALSE; } BOOLEAN_T found_name = FALSE; char* header = fasta_reader->sequence_header; status = regexec( &fasta_header_regex, header, NUM_SUBMATCHES, matches, 0 ); if(!status) { // The sequence header contains genomic coordinates found_name = TRUE; // Copy sequence name to reader char buffer[BUFFER_SIZE]; int name_len = matches[1].rm_eo - matches[1].rm_so; strncpy(buffer, header + matches[1].rm_so, name_len); buffer[name_len] = 0; fasta_reader->sequence_name = strdup(buffer); if (fasta_reader->sequence_name == NULL) { die("Unable to allocate memory while parsing sequence header.\n"); } fasta_reader->sequence_name_len = name_len; } else if(status != REG_NOMATCH ){ regerror(status, &fasta_header_regex, error_message, 100); die("Error trying to parse name from sequence header: %s\n" "error message is %s\n", header, error_message ); } else { found_name = FALSE; } return found_name; #undef NUM_SUBMATCHES #undef ERROR_MESSAGE_SIZE #undef BUFFER_SIZE } /****************************************************************************** * This function reads the entire sequence header at the start of a new sequence. * The current position is assumed to be start of a new sequence. * Read from the current position to the end of the current line. * * Returns TRUE if it was able to read the sequence text, FALSE if * EOF reached before the terminal newline was found. Dies if other errors * are encountered. *****************************************************************************/ BOOLEAN_T read_seq_header_from_seq_reader_from_fasta( SEQ_READER_FROM_FASTA_T *fasta_reader ) { int result = FALSE; // Initial allocation of sequence buffer const size_t initial_buffer_len = 100; if (fasta_reader->sequence_header == NULL) { fasta_reader->sequence_header = mm_malloc(sizeof(char) * initial_buffer_len); fasta_reader->sequence_buffer_len = initial_buffer_len; } // Look for EOL int c = 0; int seq_index = 0; while((c = fgetc(fasta_reader->fasta_file)) != EOF) { if (seq_index >= fasta_reader->sequence_buffer_len) { // Need to grow buffer fasta_reader->sequence_header = mm_realloc( fasta_reader->sequence_header, 2 * fasta_reader->sequence_buffer_len ); fasta_reader->sequence_buffer_len = 2 * fasta_reader->sequence_buffer_len; } if (c == '\n') { // Found EOL fasta_reader->sequence_header[seq_index] = '\0'; fasta_reader->sequence_header_len = seq_index + 1; fasta_reader->at_start_of_line = TRUE; result = TRUE; break; } else { // Keep looking for EOL fasta_reader->sequence_header[seq_index] = c; ++seq_index; } } // At this point c is EOL or EOF if (c == EOF) { if (ferror(fasta_reader->fasta_file)) { // EOF could actually indicate an error. die( "Error reading file:%s.\nError message: %s\n", fasta_reader->filename, strerror(ferror(fasta_reader->fasta_file)) ); } else if (feof(fasta_reader->fasta_file)) { // Reached EOF before reaching EOL for the sequence. fasta_reader->sequence_header[0] = '\0'; fasta_reader->sequence_header_len = 0; } } return result; } /****************************************************************************** * This function gets the name of the current sequence from the data block * reader. The name of the sequence is passed using the name parameter. * The caller is responsible for freeing the memory for the sequence name. * * Returns TRUE if successful, FALSE if there is no current sequence, as * at the start of the file. *****************************************************************************/ BOOLEAN_T get_seq_name_from_seq_reader_from_fasta( DATA_BLOCK_READER_T *reader, char **name // OUT ) { BOOLEAN_T result = FALSE; SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); if (fasta_reader->sequence_name == NULL || fasta_reader->sequence_name_len <= 0) { result = FALSE; } else { *name = strdup(fasta_reader->sequence_name); result = TRUE; } return result; } /****************************************************************************** * Read from the current position in the file to the first prior after the * start of the next sequence. Set the value of the current sequence. * * Returns TRUE if it was able to advance to the next sequence, FALSE if * EOF reached before the next sequence was found. Dies if other errors * encountered. *****************************************************************************/ BOOLEAN_T go_to_next_sequence_in_seq_reader_from_fasta(DATA_BLOCK_READER_T *reader) { BOOLEAN_T result = FALSE; SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); fasta_reader->current_position = 0; int c = 0; while((c = fgetc(fasta_reader->fasta_file)) != EOF) { if (fasta_reader->at_start_of_line == TRUE && c == '>') { break; } else if (c == '\n') { fasta_reader->at_start_of_line = TRUE; } else { fasta_reader->at_start_of_line = FALSE; } } // At this point c is '>' or EOF if (c == '>') { BOOLEAN_T found_genomic_coordinates = FALSE; result = read_seq_header_from_seq_reader_from_fasta(fasta_reader); if (result == TRUE) { // Look for genomic coordinates in header found_genomic_coordinates = parse_genomic_coordinates(fasta_reader); } if (found_genomic_coordinates == FALSE) { // Look for whitespace in header // The sequence name is the string before the white space. BOOLEAN_T found_name = FALSE; found_name = parse_seq_name(fasta_reader); if (found_name == FALSE) { die( "Unable to find sequence name in header %s.\n", fasta_reader->sequence_header ); } } } else { if (ferror(fasta_reader->fasta_file)) { die( "Error reading file:%s.\nError message: %s\n", fasta_reader->filename, strerror(ferror(fasta_reader->fasta_file)) ); } else if (feof(fasta_reader->fasta_file)) { // Reached EOF before reaching the start of the sequence result = FALSE; } } return result; } /****************************************************************************** * Fills in the next data block for the sequence. * During the first call for the sequence it fills in the full data block. * On successive calls, shifts the sequence in the block down one position * and reads one more character. * * Returns TRUE if it was able to completely fill the block, FALSE if * the next sequence or EOF was reached before the block was filled. * Dies if other errors encountered. *****************************************************************************/ BOOLEAN_T get_next_data_block_from_seq_reader_from_fasta( DATA_BLOCK_READER_T *reader, DATA_BLOCK_T *data_block ) { BOOLEAN_T result = FALSE; char *raw_seq = get_sequence_from_data_block(data_block); int block_size = get_block_size_from_data_block(data_block); int num_read = get_num_read_into_data_block(data_block); SEQ_READER_FROM_FASTA_T *fasta_reader = (SEQ_READER_FROM_FASTA_T *) get_data_block_reader_data(reader); if (num_read == block_size) { // Block is alread full, shift all elements in the block down by one position // FIXME CEG: Inefficient, replace with circular buffer. memmove(raw_seq, raw_seq + 1, block_size - 1); num_read = block_size - 1; raw_seq[num_read] = 0; } int c = 0; while((c = fgetc(fasta_reader->fasta_file)) != EOF) { if (isspace(c)) { // Skip over white space if (c == '\n') { fasta_reader->at_start_of_line = TRUE; } else { fasta_reader->at_start_of_line = FALSE; } continue; } else if (c == '>' && fasta_reader->at_start_of_line == TRUE) { // We found the start of a new sequence while trying // to fill the block. Leave the block incomplete. c = ungetc(c, fasta_reader->fasta_file); if (ferror(fasta_reader->fasta_file)) { die( "Error while reading file:%s.\nError message: %s\n", fasta_reader->filename, strerror(ferror(fasta_reader->fasta_file)) ); } raw_seq[num_read] = 0; break; } else { // Fill in another character in the block raw_seq[num_read] = toupper(c); // Check that character is legal in alphabet. // If not, replace with wild card character. if (!alph_is_known(fasta_reader->alphabet, raw_seq[num_read])) { raw_seq[num_read] = alph_wildcard(fasta_reader->alphabet); fprintf( stderr, "Warning: %c is not a valid character in %s alphabet.\n" " Converting %c to %c.\n", c, alph_name(fasta_reader->alphabet), c, raw_seq[num_read] ); } ++num_read; if (num_read == block_size) { // block is full result = TRUE; break; } } } if (c == EOF && ferror(fasta_reader->fasta_file)) { die( "Error while reading file:%s.\nError message: %s\n", fasta_reader->filename, strerror(ferror(fasta_reader->fasta_file)) ); } ++fasta_reader->current_position; set_start_pos_for_data_block(data_block, fasta_reader->current_position); set_num_read_into_data_block(data_block, num_read); return result; }