/**************************************************************************** * FILE: seq.c * AUTHOR: William Stafford Noble * CREATE DATE: 06/24/2002 * PROJECT: MHMM * DESCRIPTION: Biological sequences. * COPYRIGHT: 1998-2008, UCSD, UCSC, UW ****************************************************************************/ #include #include #include #include #include "utils.h" #include "seq.h" #define MAX_SEQ_NAME 100 // Longest sequence ID. #define MAX_SEQ_COMMENT 128 // Longest comment. // maximum count to keep track of when calculating a background const static long BG_CALC_CHUNK = LONG_MAX; // Instantiate the SEQ_T type. struct seq { char name[MAX_SEQ_NAME + 1]; // Sequence ID. char desc[MAX_SEQ_COMMENT + 1]; // Sequence description. int length; // Sequence length. int offset; // Offset from the start of complete sequence. float weight; // Sequence weight. char* sequence; // The actual sequence. int* intseq; // The sequence in integer format. int* gc; // Cumulative GC counts; note: 2Gb size limit. double total_gc; // Total frequency of G and C in sequence; DNA only BOOLEAN_T is_complete; // Is this the end of the sequence? }; // For calculating the background // of a truly huge dataset keep // track of the result up to size // long max and then store weights // depending on how many chunks // have been normalized into the // final result struct bgcalc { // the alphabet ALPH_T alph; // the number of bases seen so far // in this chunk long chunk_seen; // the counts of the individual // bases seen so far long *chunk_counts; // the background seen so far double *bg; // the weight of the background // seen so far. Note that 1 chunk // has the weight of 1. long weight; }; /**************************************************************************** * Allocate one sequence object. ****************************************************************************/ SEQ_T* allocate_seq (char* name, char* description, int offset, char* sequence) { SEQ_T* new_sequence; // Allocate the sequence object. new_sequence = (SEQ_T*)mm_malloc(sizeof(SEQ_T)); // Store the name, truncating if necessary. strncpy(new_sequence->name, name, MAX_SEQ_NAME); new_sequence->name[MAX_SEQ_NAME] = '\0'; if (strlen(new_sequence->name) != strlen(name)) { fprintf(stderr, "Warning: truncating sequence ID %s to %s.\n", name, new_sequence->name); } // Store the description, truncating if necessary. strncpy(new_sequence->desc, description, MAX_SEQ_COMMENT); new_sequence->desc[MAX_SEQ_COMMENT] = '\0'; // Store the sequence. copy_string(&(new_sequence->sequence), sequence); // Fill in the remaining fields. new_sequence->length = strlen(sequence); new_sequence->offset = offset; new_sequence->weight = 1.0; new_sequence->intseq = NULL; new_sequence->gc = NULL; new_sequence->is_complete = TRUE; return(new_sequence); } /**************************************************************************** * Get and set various fields. ****************************************************************************/ char* get_seq_name (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->name); } char* get_seq_description (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->desc); } int get_seq_length (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->length); } int get_seq_offset (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->offset); } float get_seq_weight (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->weight); } void set_seq_weight (float weight, SEQ_T* a_sequence) { assert(a_sequence != NULL); a_sequence->weight = weight; } unsigned char get_seq_char (int index, SEQ_T* a_sequence) { assert(a_sequence != NULL); assert(a_sequence->sequence != NULL); assert(index >= 0); assert(index <= a_sequence->length); return(a_sequence->sequence[index]); } void set_seq_char (int index, char a_char, SEQ_T* a_sequence) { assert(a_sequence != NULL); assert(a_sequence->sequence != NULL); assert(index >= 0); assert(index <= a_sequence->length); a_sequence->sequence[index] = a_char; } int get_seq_int (int index, SEQ_T* a_sequence) { assert(a_sequence != NULL); assert(a_sequence->sequence != NULL); assert(index >= 0); assert(index < a_sequence->length); return(a_sequence->intseq[index]); } int get_seq_gc (int index, SEQ_T* a_sequence) { assert(a_sequence != NULL); assert(a_sequence->sequence != NULL); assert(index >= 0); assert(index < a_sequence->length); return(a_sequence->gc[index]); } char* get_raw_sequence (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->sequence); } char* get_raw_subsequence (int start, int stop, SEQ_T* a_sequence) { assert(a_sequence != NULL); assert((stop - start) >= 0); char *sequence = get_raw_sequence(a_sequence); char *subsequence = mm_malloc((stop - start + 2) * sizeof(char)); strncpy(subsequence, sequence + start, stop - start + 1); subsequence[stop - start + 1] = 0; return(subsequence); } int* get_int_sequence (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->intseq); } int* get_gc_sequence (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->gc); } double get_total_gc_sequence (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->total_gc); } void set_total_gc_sequence (SEQ_T* a_sequence, double gc) { assert(a_sequence != NULL); a_sequence->total_gc = gc; } /************************************************************************** * Copy a sequence object. Memory must be freed by caller. **************************************************************************/ SEQ_T* copy_sequence (SEQ_T* source_sequence) { // Allocate the sequence object. SEQ_T* new_sequence = allocate_seq(source_sequence->name, source_sequence->desc, source_sequence->offset, source_sequence->sequence); // Copy additional fields. new_sequence->weight = source_sequence->weight; new_sequence->is_complete = source_sequence->is_complete; if (source_sequence->intseq != NULL) { new_sequence->intseq = (int*)mm_malloc(sizeof(int) * source_sequence->length); int i; for (i = 0; i < source_sequence->length; i++) { new_sequence->intseq[i] = source_sequence->intseq[i]; } } if (source_sequence->gc != NULL) { new_sequence->gc = (int*)mm_malloc(sizeof(int) * source_sequence->length); int i; for (i = 0; i < source_sequence->length; i++) { new_sequence->gc[i] = source_sequence->gc[i]; } } return(new_sequence); } /************************************************************************** * Sometimes a sequence object contains only a portion of the actual * sequence. This function tells whether or not the end of this * sequence object corresponds to the end of the actual sequence. **************************************************************************/ void set_complete (BOOLEAN_T is_complete, SEQ_T* a_sequence) { assert(a_sequence != NULL); a_sequence->is_complete = is_complete; } BOOLEAN_T is_complete (SEQ_T* a_sequence) { assert(a_sequence != NULL); return(a_sequence->is_complete); } /*************************************************************************** * Add or remove Xs from either side of the sequence. ***************************************************************************/ static void add_flanking_xs (SEQ_T* sequence, ALPH_T alph) { char* new_seq = NULL; // Pointer to copy of the sequence. new_seq = (char*)mm_calloc(sequence->length + 3, sizeof(char)); strcpy(&(new_seq[1]), sequence->sequence); new_seq[0] = alph_wildcard(alph); new_seq[sequence->length + 1] = alph_wildcard(alph); new_seq[sequence->length + 2] = '\0'; myfree(sequence->sequence); sequence->sequence = new_seq; sequence->length += 2; } void remove_flanking_xs (SEQ_T* sequence) { char* new_seq; // Copy of the sequence. new_seq = (char*)mm_calloc(sequence->length - 1, sizeof(char)); strncpy(new_seq, &(sequence->sequence[1]), sequence->length - 2); new_seq[sequence->length - 2] = '\0'; myfree(sequence->sequence); sequence->sequence = new_seq; sequence->length -= 2; } void allocate_int_sequence (SEQ_T* sequence) { sequence->intseq = (int *)mm_malloc(sizeof(int) * sequence->length); } /************************************************************************** * Prepare a sequence for recognition by * - making sure it is uppercase, * - making sure it doesn't contain illegal characters, * - adding flanking Xs to match START/END states, and * - converting it to an integer format * - computing cumulative GC counts * * In the integer form, each character in the sequence is replaced by * the index of that character in the alphabet array. Thus, if the * alphabet is 'ACGT', every occurence of the letter 'G' in the * sequence will be represented by the index 2. **************************************************************************/ void prepare_sequence (SEQ_T* sequence, ALPH_T alph) { int i_seq; // Index in the sequence. int badchar; // Number of characters converted. char wildcard; // Wildcard character wildcard = alph_wildcard(alph); badchar = 0; for (i_seq = 0; i_seq < get_seq_length(sequence); i_seq++) { // Make sure the sequence is uppercase. if (islower((int)(sequence->sequence)[i_seq])) { (sequence->sequence)[i_seq] = toupper((int)(sequence->sequence)[i_seq]); } // Convert non-alphabetic characters to ambiguous. if (alph_index(alph, (sequence->sequence)[i_seq]) == -1) { fprintf(stderr, "%c -> %c\n", (sequence->sequence)[i_seq], wildcard); (sequence->sequence)[i_seq] = wildcard; badchar++; } } // Tell the user about the conversions. if (badchar) { fprintf(stderr, "Warning: converted %d non-alphabetic ", badchar); fprintf(stderr, "characters to %c in sequence %s.\n", wildcard, get_seq_name(sequence)); } // Add flanking X's. add_flanking_xs(sequence, alph); // Make the integer sequence. sequence->intseq = (int *)mm_malloc(sizeof(int) * get_seq_length(sequence)); for (i_seq = 0; i_seq < get_seq_length(sequence); i_seq++) { (sequence->intseq)[i_seq] = alph_index(alph, (sequence->sequence)[i_seq]); } // // Get cumulative GC counts. // if (alph == DNA_ALPH) { int len = get_seq_length(sequence); char c = (sequence->sequence)[0]; // first character sequence->gc = (int *)mm_malloc(sizeof(int) * get_seq_length(sequence)); // set count at first position (sequence->gc)[0] = (c == 'G' || c == 'C') ? 1 : 0; // set cumulative counts at rest of postitions for (i_seq = 1; i_seq < len; i_seq++) { c = (sequence->sequence)[i_seq]; (sequence->gc)[i_seq] = (c == 'G' || c == 'C') ? (sequence->gc)[i_seq-1] + 1 : (sequence->gc)[i_seq-1]; } } } /*************************************************************************** * Remove the first N bases of a given sequence. ***************************************************************************/ void shift_sequence (int offset, SEQ_T* sequence) { char* smaller_sequence; // Make a smaller copy of the raw sequence. assert(offset > 0); assert(offset <= sequence->length); smaller_sequence = (char*)mm_malloc((offset + 1) * sizeof(char)); strcpy(smaller_sequence, &(sequence->sequence[offset])); assert((int)strlen(smaller_sequence) == (sequence->length - offset)); // Put the smaller sequence into the sequence object. myfree(sequence->sequence); sequence->sequence = smaller_sequence; sequence->offset += offset; sequence->length -= offset; // Free the integer version. myfree(sequence->intseq); sequence->intseq = NULL; // Free the GC counts. myfree(sequence->gc); sequence->gc = NULL; } /*************************************************************************** * Get the maximum sequence length from a set of sequences. ***************************************************************************/ int get_max_seq_length (int num_seqs, SEQ_T** sequences) { int max_length; int this_length; int i_seq; max_length = 0; for (i_seq = 0; i_seq < num_seqs; i_seq++) { this_length = get_seq_length(sequences[i_seq]); if (this_length > max_length) { max_length = this_length; } } return(max_length); } /*************************************************************************** * Get the maximum sequence ID length from a set of sequences. ***************************************************************************/ int get_max_seq_name (int num_seqs, SEQ_T** sequences) { int max_length; int this_length; int i_seq; max_length = 0; for (i_seq = 0; i_seq < num_seqs; i_seq++) { this_length = strlen(get_seq_name(sequences[i_seq])); if (this_length > max_length) { max_length = this_length; } } return(max_length); } /*************************************************************************** * Set the sequence weights according to an external file. * * If the filename is "none," "internal," or NULL, then the weights are * set uniformly. ***************************************************************************/ void set_sequence_weights (char* weight_filename, // Name of file containing sequence weights. int num_seqs, // Number of sequences. SEQ_T** sequences) // The sequences. { ARRAY_T* weights; FILE * weight_file; int i_seq; /* Allocate the weights. */ weights = allocate_array(num_seqs); /* Set uniform weights if no file was supplied. */ if ((weight_filename == NULL) || (strcmp(weight_filename, "none") == 0) || (strcmp(weight_filename, "internal") == 0)) { init_array(1.0, weights); } /* Read the weights from a file. */ else { if (open_file(weight_filename, "r", FALSE, "weight", "weights", &weight_file) == 0) exit(1); read_array(weight_file, weights); fclose(weight_file); /* Normalize the weights so they add to the total number of sequences. */ normalize(0.0, weights); scalar_mult(num_seqs, weights); } /* Assign the weights to the sequences. */ for (i_seq = 0; i_seq < num_seqs; i_seq++) { (sequences[i_seq])->weight = get_array_item(i_seq, weights); } /* Free the weights. */ free_array(weights); } /**************************************************************************** * Return an array containing the frequencies in the sequence for each * character of the alphabet. Characters not in the alphabet are not * counted. ****************************************************************************/ ARRAY_T* get_sequence_freqs (SEQ_T* seq, ALPH_T alph) { char c = 0; int a_index = 0; int a_size = 0; int i = 0; int total_bases = 0; int* num_bases = NULL; ARRAY_T* freqs = NULL; // Initialize counts for each character in the alphabet a_size = alph_size(alph, ALPH_SIZE); num_bases = mm_malloc(a_size * sizeof(int)); for (i = 0; i < a_size; i++) { num_bases[i] = 0; } for (i = 0; i < seq->length; i++) { c = get_seq_char(i, seq); if (c != '-' && c != '-') { a_index = alph_index(alph, c); if (a_index == -1 || a_index >= a_size) continue; num_bases[a_index]++; total_bases++; } } freqs = allocate_array(a_size); for (i = 0; i < a_size; i++) { set_array_item(i, (double) num_bases[i] / (double) total_bases, freqs); } // Clean up the count of characters myfree(num_bases); return freqs; } /**************************************************************************** * Return an array containing the frequencies in the sequences for each * character of the alphabet. Characters not in the alphabet are not * counted. * * When seq is provided it returns null, otherwise it converts the accumulated * result in bgcalc into a background. * * * Pseudocode example: * ALPH_T alph = ... * BGCALC_T *bgcalc = NULL; * for each seq: * calculate_background(alph, seq, &bgcalc); * ARRAY_T *bg = calculate_background(NULL, &bgcalc); ****************************************************************************/ ARRAY_T* calculate_background( ALPH_T alph, SEQ_T* seq, BGCALC_T** bgcalc ){ BGCALC_T *calc; int a_size, i, a_index; char c; double freq, chunk_part, chunk_freq; ARRAY_T *background; assert(bgcalc != NULL); assert(seq != NULL || *bgcalc != NULL); // get the alphabet // get the alphabet size a_size = alph_size(alph, ALPH_SIZE); if (*bgcalc == NULL) { //allocate and initialize calc calc = mm_malloc(sizeof(BGCALC_T)); calc->alph = alph; calc->chunk_seen = 0; calc->weight = 0; calc->chunk_counts = mm_malloc(a_size * sizeof(long)); calc->bg = mm_malloc(a_size * sizeof(double)); for (i = 0; i < a_size; ++i) { calc->chunk_counts[i] = 0; calc->bg[i] = 0; } *bgcalc = calc; } else { calc = *bgcalc; assert(alph == calc->alph); if (calc->weight == LONG_MAX) return NULL; } if (seq == NULL) { // no sequence so calculate the final result background = allocate_array(alph_size(alph, ALL_SIZE)); if (calc->weight == 0) { if (calc->chunk_seen > 0) { // when we haven't had to approximate yet // just do a normal background calculation for (i = 0; i < a_size; i++) { freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; set_array_item(i, freq, background); } } else { fputs("Uniform\n", stdout); // when there are no counts then return uniform freq = (double) 1 / (double) a_size; for (i = 0; i < a_size; i++) { set_array_item(i, freq, background); } } } else { if (calc->chunk_seen > 0) { // combine the frequencies for the existing chunks with the counts // for the partially completed chunk chunk_part = (double) calc->chunk_seen / (double) BG_CALC_CHUNK; for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; freq = ((calc->bg[i] * calc->weight) + (chunk_freq * chunk_part)) / (calc->weight + chunk_part); set_array_item(i, freq, background); } } else { // in the odd case we get to an integer number of chunks for (i = 0; i < a_size; i++) { set_array_item(i, calc->bg[i], background); } } } calc_ambigs(alph, FALSE, background); // free bgcalc structure free(calc->bg); free(calc->chunk_counts); free(calc); *bgcalc = NULL; return background; } // we have a sequence to add to the background calculation for (i = 0; i < seq->length; i++) { c = get_seq_char(i, seq); a_index = alph_index(alph, c); if (a_index == -1 || a_index >= a_size) continue; calc->chunk_counts[a_index]++; calc->chunk_seen++; if (calc->chunk_seen == BG_CALC_CHUNK) { if (calc->weight == 0) { for (i = 0; i < a_size; i++) { calc->bg[i] = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; } } else { for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; calc->bg[i] = (calc->bg[i] * calc->weight + chunk_freq) / (calc->weight + 1); } } calc->weight++; // reset the counts for the next chunk for (i = 0; i < a_size; i++) { calc->chunk_counts[i] = 0; } calc->chunk_seen = 0; // I don't think it is feasible to reach this limit // but I guess I'd better check anyway if (calc->weight == LONG_MAX) { fprintf(stderr, "Sequence data set is so large that even the " "approximation designed for large datasets can't handle it!"); return NULL; } } } return NULL; } /********************************************************************** shuffle_sequence() shuffle a given sequences based on their content **********************************************************************/ void shuffle_sequence( SEQ_T* seq, /* original sequence IN */ unsigned int seed, /* seed IN */ SEQ_T** target /* target sequence OUT */ ){ my_srand(seed); assert(*target==NULL); // reset target if not null if (*target != NULL){ free_seq(*target); } *target = allocate_seq(get_seq_name(seq),"shuffled",get_seq_offset(seq),get_raw_sequence(seq)); char *raw = get_raw_sequence(*target); /* copy original in temp string */ char* tmp = (char*)mm_calloc(get_seq_length(seq)+1,sizeof(char)); strcpy(tmp,get_raw_sequence(seq)); tmp[get_seq_length(seq)]='\0'; int i,j; char *ss; char *dd; for(j=0,i=get_seq_length(seq);i>0;i--){ // Pick a random number in the range: int pick = rand() % i; raw[j++] = tmp[pick]; // "shift" routine here eliminates the "picked" base from the _src string: // dd starts at the picked position: ss is one beyond that: for( dd = tmp+pick , ss = dd + 1 ; *dd ; *dd++=*ss++ ); } myfree(tmp); } /**************************************************************************** * Free one sequence object. ****************************************************************************/ void free_seq (SEQ_T* a_sequence) { if (a_sequence == NULL) { return; } myfree(a_sequence->sequence); myfree(a_sequence->intseq); myfree(a_sequence->gc); myfree(a_sequence); }