#include #include #include #include #include #include #include "array-list.h" #include "linked-list.h" #include "motif-in-meme-text.h" #include "motif-spec.h" #include "red-black-tree.h" #include "regex-utils.h" #include "string-builder.h" #include "string-match.h" #include "utils.h" enum mtext_state { MTEXT_FIND_VERSION, // also look for tag MTEXT_PRE_MOTIF, MTEXT_IN_LETTER_FREQ, MTEXT_IN_BACKGROUND, MTEXT_IN_MOTIF, MTEXT_IN_PSPM, MTEXT_IN_PSSM, MTEXT_PRE_C_SITES, MTEXT_IN_C_SITES, MTEXT_DONE }; typedef enum mtext_state MTEXT_STATE_EN; struct minfo { int length; }; struct fscope { int options_found; int options_returned; BOOLEAN_T is_html; int vmajor; int vminor; int vpatch; char *release; char *datafile; ALPH_T alphabet; int strands; char *bg_source; ARRAY_T *background; ARRAY_T *letter_freqs; int motif_count; struct minfo* motif_lookup; double scanned_sites_threshold; ARRAYLST_T *scanned_sites; }; struct mscope { int options_found; int optins_returned; BOOLEAN_T has_width; BOOLEAN_T has_evalue; BOOLEAN_T has_sites; MOTIF_T *motif; }; struct patterns { regex_t html; regex_t version; regex_t release; regex_t datafile; regex_t alphabet; regex_t strands; regex_t pos_strand; regex_t neg_strand; regex_t letter_freq; regex_t background; regex_t bg_source; regex_t freq_pair; regex_t whitespace; regex_t motif_intro; regex_t motif_id2; regex_t motif_kv_pair; regex_t motif_pspm; regex_t motif_pssm; regex_t motif_num; regex_t motif_url; regex_t s_sites_header; regex_t s_sites_dashes; regex_t s_sites_titles; regex_t s_sites_divider; regex_t s_sites_seq; regex_t s_sites_block; regex_t s_sites_offset; }; typedef struct mtext MTEXT_T; struct mtext { int options; char *file_name; short format_match; MTEXT_STATE_EN state; LINKLST_T *errors; LINKLST_T *motif_queue; STR_T *line; int counter; // used for a few things, like keeping track of which position of the background I'm up to. double sum; // used to check sums of probabilities struct patterns re; struct fscope fscope; struct mscope mscope; }; static const char * HTML_RE = ""; static const char * VERSION_RE = "^[[:space:]]*MEME[[:space:]]+version" "[[:space:]]+([0-9]+)(\\.([0-9]+)(\\.([0-9]+))?)?([^0-9].*)?$"; static const char * RELEASE_RE ="[[:space:]]+\\(Release[[:space:]]+date:" "[[:space:]]+([^[:space:]][^\\)]+)\\)"; static const char * DATAFILE_RE = "^[[:space:]]*DATAFILE[[:space:]]*=" "[[:space:]]*([^[:space:]].*)$"; static const char * ALPHABET_RE = "^[[:space:]]*ALPHABET[[:space:]]*=" "[[:space:]]*(.*)[[:space:]]*$"; static const char * STRANDS_RE = "^[[:space:]]*strands[[:space:]]*:" "([-+[:space:]]*)$"; static const char * POS_STRAND_RE = "\\+"; static const char * NEG_STRAND_RE = "-"; static const char * LETTER_FREQ_RE = "^Letter frequencies in dataset:$"; static const char * BACKGROUND_RE = "^[[:space:]]*Background[[:space:]]+" "letter[[:space:]]+frequencies([[:space:]].*)?$"; static const char * BG_SOURCE_RE = "^[[:space:]]+\\(from[[:space:]]+" "(.*)\\):.*$"; static const char * FREQ_PAIR_RE = "^[[:space:]]*([A-Za-z])[[:space:]]+" POS_NUM_RE "([[:space:]].*)?$"; static const char * WHITESPACE_RE = "^[[:space:]]*$"; // Can't ignore case on this one as normal meme format repeatedly // uses Motif as the first word on a line static const char * MOTIF_INTRO_RE = "^[[:space:]]*(BL[[:space:]]+)?MOTIF" "[[:space:]]*([^[:space:]]+)([[:space:]].*)?$"; // don't allow the second id to contain a '=' or be directly followed by an '=' // as that probably means it's not an id static const char * MOTIF_ID2_RE = "^[[:space:]]*([^[:space:]=]+)" "([[:space:]]+([^[:space:]=].*)?)?$"; static const char * MOTIF_KV_PAIR_RE = "^[[:space:]]*([^[:space:]=]+)" "[[:space:]]*=[[:space:]]*([^[:space:]=]+)([[:space:]].*)?$"; static const char * MOTIF_PSPM_RE = "^[[:space:]]*letter[[:space:]]*-" "[[:space:]]*probability[[:space:]]+matrix[[:space:]]*:(.*)$"; static const char * MOTIF_PSSM_RE = "^[[:space:]]*log[[:space:]]*-" "[[:space:]]*odds[[:space:]]+matrix[[:space:]]*:(.*)$"; static const char * MOTIF_NUM_RE = "^[[:space:]]*" NUM_RE "([[:space:]].*)?$"; static const char * MOTIF_URL_RE = "^[[:space:]]*URL[[:space:]]*:[[:space:]]*" "(.*)[[:space:]]*$"; // as this only really comes up in MEME output then require an exact match static const char * C_SITES_HEADER_RE = "^\tCombined block diagrams: " "non-overlapping sites with p-value < " POS_NUM_RE "$"; static const char * C_SITES_DASHES_RE = "^-+$"; static const char * C_SITES_TITLES_RE = "^SEQUENCE NAME[[:space:]]+COMBINED " "P-VALUE[[:space:]]+MOTIF DIAGRAM$"; static const char * C_SITES_DIVIDER_RE = "^-+[[:space:]]+-+[[:space:]]+-+$"; static const char * C_SITES_SEQ_RE = "^[[:space:]]*([^[:space:]]+)[[:space:]]+" POS_NUM_RE "[[:space:]]+(.*)$"; static const char * C_SITES_BLOCK_RE = "^([[<])([-+]?)([0-9]+)([abc]?)\\(" POS_NUM_RE "\\)([]>])"; static const char * C_SITES_OFFSET_RE = "^([0-9]+)"; static void compile_patterns(struct patterns *re) { regcomp_or_die("html", &(re->html), HTML_RE, REG_ICASE); regcomp_or_die("version", &(re->version), VERSION_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("release date", &(re->release), RELEASE_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("data file", &(re->datafile), DATAFILE_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("alphabet", &(re->alphabet), ALPHABET_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("strands", &(re->strands), STRANDS_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("positive strand", &(re->pos_strand), POS_STRAND_RE, 0); regcomp_or_die("negative strand", &(re->neg_strand), NEG_STRAND_RE, 0); regcomp_or_die("letter frequency", &(re->letter_freq), LETTER_FREQ_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("background", &(re->background), BACKGROUND_RE, REG_EXTENDED | REG_ICASE); // note ICASE is off for HTML regcomp_or_die("bg source", &(re->bg_source), BG_SOURCE_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("freq pair", &(re->freq_pair), FREQ_PAIR_RE, REG_EXTENDED); regcomp_or_die("whitespace", &(re->whitespace), WHITESPACE_RE, REG_NOSUB); regcomp_or_die("motif intro", &(re->motif_intro), MOTIF_INTRO_RE, REG_EXTENDED); regcomp_or_die("motif id 2", &(re->motif_id2), MOTIF_ID2_RE, REG_EXTENDED); regcomp_or_die("motif kv pair", &(re->motif_kv_pair), MOTIF_KV_PAIR_RE, REG_EXTENDED); regcomp_or_die("motif pspm", &(re->motif_pspm), MOTIF_PSPM_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("motif pssm", &(re->motif_pssm), MOTIF_PSSM_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("motif num", &(re->motif_num), MOTIF_NUM_RE, REG_EXTENDED); regcomp_or_die("motif url", &(re->motif_url), MOTIF_URL_RE, REG_EXTENDED | REG_ICASE); regcomp_or_die("c sites header", &(re->s_sites_header), C_SITES_HEADER_RE, REG_EXTENDED); regcomp_or_die("c sites dashes", &(re->s_sites_dashes), C_SITES_DASHES_RE, REG_EXTENDED); regcomp_or_die("c sites titles", &(re->s_sites_titles), C_SITES_TITLES_RE, REG_EXTENDED); regcomp_or_die("c sites divider", &(re->s_sites_divider), C_SITES_DIVIDER_RE, REG_EXTENDED); regcomp_or_die("c sites seq", &(re->s_sites_seq), C_SITES_SEQ_RE, REG_EXTENDED); regcomp_or_die("c sites block", &(re->s_sites_block), C_SITES_BLOCK_RE, REG_EXTENDED); regcomp_or_die("c sites offset", &(re->s_sites_offset), C_SITES_OFFSET_RE, REG_EXTENDED); } static void free_patterns(struct patterns *re) { regfree(&(re->html)); regfree(&(re->version)); regfree(&(re->release)); regfree(&(re->datafile)); regfree(&(re->alphabet)); regfree(&(re->strands)); regfree(&(re->pos_strand)); regfree(&(re->neg_strand)); regfree(&(re->letter_freq)); regfree(&(re->background)); regfree(&(re->bg_source)); regfree(&(re->freq_pair)); regfree(&(re->whitespace)); regfree(&(re->motif_intro)); regfree(&(re->motif_id2)); regfree(&(re->motif_kv_pair)); regfree(&(re->motif_pspm)); regfree(&(re->motif_pssm)); regfree(&(re->motif_num)); regfree(&(re->motif_url)); regfree(&(re->s_sites_header)); regfree(&(re->s_sites_dashes)); regfree(&(re->s_sites_titles)); regfree(&(re->s_sites_divider)); regfree(&(re->s_sites_seq)); regfree(&(re->s_sites_block)); regfree(&(re->s_sites_offset)); } static void init_fscope(struct fscope *fscope) { fscope->alphabet = INVALID_ALPH; } static void free_fscope(struct fscope *fscope) { int i; if (fscope->release) free(fscope->release); if (fscope->datafile) free(fscope->datafile); if (fscope->bg_source) free(fscope->bg_source); if (fscope->background) { free_array(fscope->background); } if (fscope->letter_freqs) free_array(fscope->letter_freqs); if (fscope->motif_lookup) free(fscope->motif_lookup); } static void enqueue_motif(MTEXT_T *parser) { int num; struct minfo **lookup_ref, *item; if (parser->mscope.motif) { // set the alphabet parser->mscope.motif->alph = parser->fscope.alphabet; // set the flags parser->mscope.motif->flags = (parser->fscope.strands == 2 ? MOTIF_BOTH_STRANDS : 0); // give the url a default value of empty string if it is not present if (!parser->mscope.motif->url) parser->mscope.motif->url = strdup(""); // get the motif index and increment the motif count num = parser->fscope.motif_count++; // cache motif information for the scanned sites if (parser->options & SCANNED_SITES) { // to read the scanned sites I need to cache the length lookup_ref = &(parser->fscope.motif_lookup); *lookup_ref = mm_realloc(*lookup_ref, sizeof(struct minfo) * (num + 1)); item = (*lookup_ref)+num; item->length = parser->mscope.motif->length; } // make the motif avaliable linklst_add(parser->mscope.motif, parser->motif_queue); } memset(&(parser->mscope), 0, sizeof(struct mscope)); } /* * Create */ void* mtext_create(const char *optional_filename, int options) { MTEXT_T *parser; parser = mm_malloc(sizeof(MTEXT_T)); memset(parser, 0, sizeof(MTEXT_T)); parser->options = options; parser->file_name = (optional_filename ? strdup(optional_filename) : NULL); parser->format_match = file_name_match("meme", "txt", optional_filename); parser->state = MTEXT_FIND_VERSION; parser->errors = linklst_create(); parser->motif_queue = linklst_create(); parser->line = str_create(250); compile_patterns(&(parser->re)); init_fscope(&(parser->fscope)); return parser; } void mtext_destroy(void *data) { MTEXT_T *parser; parser = (MTEXT_T*)data; if (parser->file_name) free(parser->file_name); linklst_destroy_all(parser->errors, free); linklst_destroy_all(parser->motif_queue, destroy_motif); str_destroy(parser->line, FALSE); free_patterns(&(parser->re)); free_fscope(&(parser->fscope)); memset(parser, 0, sizeof(MTEXT_T)); free(parser); } static void error(MTEXT_T *parser, const char *format, ...) { va_list argp; int len; char dummy[1], *msg; va_start(argp, format); len = vsnprintf(dummy, 1, format, argp); va_end(argp); msg = mm_malloc(sizeof(char) * (len + 1)); va_start(argp, format); vsnprintf(msg, len + 1, format, argp); va_end(argp); linklst_add(msg, parser->errors); } static void size_freqs_array(ARRAY_T **array, ALPH_T alpha, int index) { // reallocate the array to the correct size if (alpha != INVALID_ALPH) { if (*array == NULL || get_array_length(*array) < alph_size(alpha, ALPH_SIZE)) { *array = resize_array(*array, alph_size(alpha, ALPH_SIZE)); } } else { *array = resize_array(*array, index + 1); } } static RBTREE_T* parse_keyvals(MTEXT_T *parser, const char *line) { regmatch_t matches[3]; const char *substr; RBTREE_T *kvpairs; char *key, *value; kvpairs = rbtree_create(rbtree_strcasecmp, NULL, free, NULL, free); substr = line; while(regexec_or_die("key/value", &(parser->re.motif_kv_pair), substr, 3, matches, 0)) { key = regex_str(matches+1, substr); value = regex_str(matches+2, substr); rbtree_put(kvpairs, key, value); substr = substr+(matches[2].rm_eo); } if (!regexec_or_die("whitespace", &(parser->re.whitespace), substr, 0, matches, 0)) { error(parser, "Couldn't parse \"%s\" as key = value pairs.", substr); } return kvpairs; } static void set_alphabet_from_params(MTEXT_T *parser, RBTREE_T *kv_pairs, const char *type, const char *name, const char *alpha_key) { RBNODE_T *node; char *value, *end; int alen; if ((node = rbtree_find(kv_pairs, alpha_key)) != NULL) { value = (char*)rbtree_value(node); errno = 0; alen = strtol(value, &end, 10); if (errno != 0 || *end != '\0') { error(parser, "The %s of motif %s has an alphabet length value \"%s\" " "which is not an integer.\n", type, name, value); } else { if (parser->fscope.alphabet != INVALID_ALPH) { if (alen != alph_size(parser->fscope.alphabet, ALPH_SIZE)) { error(parser, "The %s of motif %s has an alphabet length value %d " "which does not match the %s alphabet length of %d.\n", type, name, alen, alph_name(parser->fscope.alphabet), alph_size(parser->fscope.alphabet, ALPH_SIZE)); } } else { switch (alen) { case DNA_ASIZE: parser->fscope.alphabet = DNA_ALPH; break; case PROTEIN_ASIZE: parser->fscope.alphabet = PROTEIN_ALPH; break; default: error(parser, "The %s of motif %s has an alphabet length value %d " "which does not match the lengths of any supported alphabets.\n", type, name, alen); } } } } } static void set_motif_width_from_params(MTEXT_T *parser, RBTREE_T *kv_pairs, const char *type, const char *name, const char *width_key) { RBNODE_T *node; char *value, *end; int width; if ((node = rbtree_find(kv_pairs, width_key)) != NULL) { value = (char*)rbtree_value(node); errno = 0; width = strtol(value, &end, 10); if (errno != 0 || *end != '\0') { error(parser, "The %s of motif %s has an width value \"%s\" " "which is not an integer.\n", type, name, value); } else { if (parser->mscope.has_width) { if (width != get_motif_length(parser->mscope.motif)) { error(parser, "The %s of motif %s has a width value %d " "which does not match the existing width value of %d.\n", type, name, width, get_motif_length(parser->mscope.motif)); } } else { if (width <= 0) { error(parser, "The %s of motif %s has a width value %d " "which is invalid as it is not larger than zero.\n", type, name, width); } else { parser->mscope.has_width = TRUE; parser->mscope.motif->length = width; } } } } } static void set_motif_sites_from_params(MTEXT_T *parser, RBTREE_T *kv_pairs, const char *type, const char *name, const char *sites_key) { RBNODE_T *node; char *value, *end; double sites; if ((node = rbtree_find(kv_pairs, sites_key)) != NULL) { value = (char*)rbtree_value(node); errno = 0; sites = strtod(value, &end); if (*end != '\0') { error(parser, "The %s of motif %s has a number of sites value \"%s\" " "which is not a number.\n", type, name, value); } else if (errno == ERANGE) { if (sites == 0) { error(parser, "The %s of motif %s has a number of sites value \"%s\" " "which is too small to represent.\n", type, name, value); } else { error(parser, "The %s of motif %s has a number of sites value \"%s\" " "which is too large to represent.\n", type, name, value); } } else { if (parser->mscope.has_sites) { if (sites != get_motif_nsites(parser->mscope.motif)) { error(parser, "The %s of motif %s has a number of sites value %g " "which does not match the existing value of %g.\n", type, name, sites, get_motif_nsites(parser->mscope.motif)); } } else { if (sites < 0) { error(parser, "The %s of motif %s has a number of sites value %g " "which is invalid as it is smaller than zero.\n", type, name, sites); } else { parser->mscope.has_sites = TRUE; parser->mscope.motif->num_sites = sites; } } } } } static void set_motif_evalue_from_params(MTEXT_T *parser, RBTREE_T *kv_pairs, const char *type, const char *name, const char *evalue_key) { RBNODE_T *node; char *value, *end; double evalue; if ((node = rbtree_find(kv_pairs, evalue_key)) != NULL) { value = (char*)rbtree_value(node); errno = 0; evalue = strtod(value, &end); // note ERANGE is acceptable if (*end != '\0') { error(parser, "The %s of motif %s has an evalue value \"%s\" " "which is not a number.\n", type, name, value); } else { if (parser->mscope.has_evalue) { if (evalue != get_motif_evalue(parser->mscope.motif)) { error(parser, "The %s of motif %s has an evalue value %g " "which does not match the existing value of %g.\n", type, name, evalue, get_motif_evalue(parser->mscope.motif)); } } else { if (evalue < 0) { error(parser, "The %s of motif %s has an evalue value %g " "which is invalid as it is smaller than zero.\n", type, name, evalue); } else { parser->mscope.has_evalue = TRUE; parser->mscope.motif->evalue = evalue; } } } } } static void set_motif_params(MTEXT_T *parser, RBTREE_T *kv_pairs, const char *type, const char *alpha_key, const char *width_key, const char *sites_key, const char *evalue_key) { char *name; name = get_motif_id(parser->mscope.motif); set_alphabet_from_params(parser, kv_pairs, type, name, alpha_key); set_motif_width_from_params(parser, kv_pairs, type, name, width_key); set_motif_sites_from_params(parser, kv_pairs, type, name, sites_key); set_motif_evalue_from_params(parser, kv_pairs, type, name, evalue_key); } /* * Read an array of numbers which make up a row of either a PSPM or a PSSM */ static ARRAY_T* parse_nums(MTEXT_T *parser, BOOLEAN_T probabilities, const char *line) { regmatch_t matches[3]; int col, row; double num, sum, delta; const char *substr; char *name, *type; ARRAY_T *array; ALPH_T alpha; BOOLEAN_T problem; problem = FALSE; name = get_motif_id(parser->mscope.motif); type = (probabilities ? "PSPM" : "PSSM"); row = parser->counter; alpha = parser->fscope.alphabet; if (alpha != INVALID_ALPH) array = allocate_array(alph_size(alpha, ALPH_SIZE)); else array = NULL; substr = line; col = 0; sum = 0; while (regexec_or_die("motif num", &(parser->re.motif_num), substr, 3, matches, 0)) { if (alpha == INVALID_ALPH || col >= alph_size(alpha, ALPH_SIZE)) array = resize_array(array, col + 1); num = regex_dbl(matches+1, substr); if (probabilities) { if (num < 0 || num > 1) { error(parser, "The PSPM of motif %s has a number which isn't a " "probability on row %d column %d. The number should be in " "the range 0 to 1 but it was %g.\n", name, row+1, col+1, num); problem = TRUE; } sum += num; } set_array_item(col, num, array); substr = substr+(matches[1].rm_eo); col++; } if (!regexec_or_die("whitespace", &(parser->re.whitespace), substr, 0, matches, 0)) { error(parser, "The %s of motif %s has unparsable characters \"%s\" on " "the end of row %d. Only spaces or tabs are allowed on the same " "line as the numbers.\n", type, name, substr, row+1); problem = TRUE; } if (col == 0) return NULL; // empty row // try to guess the alphabet if it is unknown if (alpha == INVALID_ALPH) { switch(col) { case DNA_ASIZE: // assume DNA (could be RNA but DNA is more likely) parser->fscope.alphabet = DNA_ALPH; break; case PROTEIN_ASIZE: // AA is the only supported alphabet with 20 items parser->fscope.alphabet = PROTEIN_ALPH; break; default: error(parser, "The %s of motif %s has %d numbers in the %d row but " "this does not match with any alphabet.\n", type, name, col, row+1); problem = TRUE; } } else if (col != alph_size(alpha, ALPH_SIZE)) { error(parser, "The %s of motif %s has %d numbers in the %d row but " "this does not match with the %s alphabet which requires %d " "numbers.\n", type, name, col+1, row, alph_name(alpha), alph_size(alpha, ALPH_SIZE)); problem = TRUE; } if (probabilities) { delta = sum - 1; if (delta < 0) delta = -delta; if (delta > 0.1) { error(parser, "The PSPM of motif %s has probabilities which don't " "sum to 1 on row %d.\n", name, row+1); problem = TRUE; } } if (!problem) return array; else { if (array) free_array(array); return NULL; } } static BOOLEAN_T state_find_version(MTEXT_T *parser, const char *line) { regmatch_t matches[7]; if (regexec_or_die("html", &(parser->re.html), line, 0, matches, 0)) { parser->fscope.is_html = TRUE; } if (regexec_or_die("version", &(parser->re.version), line, 7, matches, 0)) { parser->fscope.vmajor = regex_int(matches+1, line, 0); parser->fscope.vminor = regex_int(matches+3, line, 0); parser->fscope.vpatch = regex_int(matches+5, line, 0); // see if the release date is avaliable as well if (regexec_or_die("release date", &(parser->re.release), line, 2, matches, 0)) { parser->fscope.release = regex_str(matches+1, line); } // check if this is a more modern html file to avoid // conflict with the preferred parser if (parser->fscope.is_html) { if (parser->fscope.vmajor > 4 || (parser->fscope.vmajor == 4 && parser->fscope.vminor > 3) || (parser->fscope.vmajor == 4 && parser->fscope.vminor == 3 && parser->fscope.vpatch > 2)) { error(parser, "Newer html format detected. Stopping parse to avoid " "conflicts with the preferred parser.\n"); } else { // Turn off ignore case for the background pattern regfree(&(parser->re.background)); regcomp_or_die("background", &(parser->re.background), BACKGROUND_RE, REG_EXTENDED); } } parser->state = MTEXT_PRE_MOTIF; } return TRUE; } static BOOLEAN_T state_pre_motif(MTEXT_T *parser, const char *line) { regmatch_t matches[2]; BOOLEAN_T pos, neg; const char *substr; char *temp; int len; if (regexec_or_die("data file", &(parser->re.datafile), line, 2, matches, 0)) { parser->fscope.datafile = regex_str(matches+1, line); } else if (regexec_or_die("alphabet", &(parser->re.alphabet), line, 2, matches, 0)) { substr = line+(matches[1].rm_so); len = matches[1].rm_eo - matches[1].rm_so; if ((parser->fscope.alphabet = alph_type(substr, len)) == INVALID_ALPH) { temp = regex_str(matches+1, line); error(parser, "Unrecognised alphabet %s\n", temp); free(temp); } } else if (regexec_or_die("strands", &(parser->re.strands), line, 2, matches, 0)) { substr = line+(matches[1].rm_so); pos = regexec_or_die("positive strand", &(parser->re.pos_strand), substr, 0, matches, 0); neg = regexec_or_die("negative strand", &(parser->re.neg_strand), substr, 0, matches, 0); if (pos && neg) { parser->fscope.strands = 2; } else if (pos) { parser->fscope.strands = 1; } } else if (regexec_or_die("letter frequencies", &(parser->re.letter_freq), line, 0, matches, 0)) { parser->counter = 0; parser->sum = 0; parser->state = MTEXT_IN_LETTER_FREQ; } else if (regexec_or_die("background", &(parser->re.background), line, 2, matches, 0)) { substr = line+(matches[1].rm_so); if (regexec_or_die("background source", &(parser->re.bg_source), substr, 2, matches, 0)) { parser->fscope.bg_source = regex_str(matches+1, substr); } parser->counter = 0; parser->sum = 0; parser->state = MTEXT_IN_BACKGROUND; } else if (regexec_or_die("motif intro", &(parser->re.motif_intro), line, 0, matches, 0)) { parser->state = MTEXT_IN_MOTIF; return FALSE;// need to reparse this line } else if (regexec_or_die("combined sites header", &(parser->re.s_sites_header), line, 0, matches, 0)) { parser->state = MTEXT_PRE_C_SITES; parser->counter = 0; return FALSE; } return TRUE; } static BOOLEAN_T state_in_freqs(MTEXT_T *parser, const char *line) { regmatch_t matches[5]; const char *substr; char letter; double freq, delta; ARRAY_T **array; if (parser->state == MTEXT_IN_LETTER_FREQ) { array = &(parser->fscope.letter_freqs); } else { array = &(parser->fscope.background); } substr = line; while (regexec_or_die("freq pair", &(parser->re.freq_pair), substr, 5, matches, 0)) { letter = regex_chr(matches+1, substr); freq = regex_dbl(matches+2, substr); substr = substr+(matches[2].rm_eo); // check the letter and determine alphabet if it is unknown if (!alph_test(&(parser->fscope.alphabet), parser->counter, letter)) { error(parser, "The frequency letter %c at position %d is invalid " "for the %s alphabet.\n", letter, parser->counter + 1, alph_name(parser->fscope.alphabet)); return TRUE; } // check the freq is valid if (freq < 0 || freq > 1) { error(parser, "The frequency %f associated with the letter %c at " "position %d is not valid as it is not in the range 0 to 1.\n", freq, letter, parser->counter + 1); return TRUE; } // the letter is correct so now process it // resize the array if required size_freqs_array(array, parser->fscope.alphabet, parser->counter); set_array_item(parser->counter, freq, *array); parser->sum += freq; // move on to the next parser->counter++; } // see if we're finished if (parser->fscope.alphabet != INVALID_ALPH && parser->counter >= alph_size(parser->fscope.alphabet, ALPH_SIZE)) { // check that the frequencies sum to 1 delta = parser->sum - 1; if (delta < 0) delta = -delta; if (delta > 0.1) { error(parser, "The letter frequencies do not sum to 1.\n"); return TRUE; } if (parser->format_match < 4) parser->format_match = 4; // continue searching for pre motif information parser->state = MTEXT_PRE_MOTIF; } if (!regexec_or_die("whitespace", &(parser->re.whitespace), substr, 0, matches, 0)) { error(parser, "Expected only space after the letter " "frequencies but found \"%s\".\n", substr); } return TRUE; } static BOOLEAN_T state_in_motif(MTEXT_T *parser, const char *line) { regmatch_t matches[4]; const char *substr; MOTIF_T *motif; RBTREE_T *kvpairs; if (regexec_or_die("motif intro", &(parser->re.motif_intro), line, 4, matches, 0)) { if (parser->mscope.motif && regex_cmp(matches+2, line, get_motif_id(parser->mscope.motif)) == 0) { // check if we've seen this motif intro before as the old html format // has a nasty habit of repeating the intro return TRUE; } enqueue_motif(parser);// enqueue existing motifs // create a new motif parser->mscope.motif = (MOTIF_T*)mm_malloc(sizeof(MOTIF_T)); motif = parser->mscope.motif; memset(motif, 0, sizeof(MOTIF_T)); // set defaults motif->complexity = -1; set_motif_strand('+', motif); // copy over the id set_motif_id(line+(matches[2].rm_so), matches[2].rm_eo - matches[2].rm_so, motif); // try to find a second id substr = line+(matches[2].rm_eo); if ((!parser->fscope.is_html) && regexec_or_die("motif id 2", &(parser->re.motif_id2), substr, 4, matches, 0)) { // copy of the second id set_motif_id2(substr+(matches[1].rm_so), matches[1].rm_eo - matches[1].rm_so, motif); substr = substr+(matches[1].rm_eo); } // should I bother with the key/value arguments? } else if (regexec_or_die("motif pspm", &(parser->re.motif_pspm), line, 2, matches, 0)) { substr = line+(matches[1].rm_so); // TODO parse key / value pairs kvpairs = parse_keyvals(parser, substr); set_motif_params(parser, kvpairs, "pspm", "alength", "w", "nsites", "E"); rbtree_destroy(kvpairs); parser->counter = 0; parser->state = MTEXT_IN_PSPM; } else if (regexec_or_die("motif pssm", &(parser->re.motif_pssm), line, 2, matches, 0)) { substr = line+(matches[1].rm_so); kvpairs = parse_keyvals(parser, substr); // n is not equalivent to nsites, n is weighted possible sites set_motif_params(parser, kvpairs, "pssm", "alength", "w", "", "E"); rbtree_destroy(kvpairs); parser->counter = 0; parser->state = MTEXT_IN_PSSM; } else if (regexec_or_die("motif url", &(parser->re.motif_url), line, 2, matches, 0)) { motif->url = regex_str(matches+1, line); } else if (regexec_or_die("combined sites header", &(parser->re.s_sites_header), line, 0, matches, 0)) { parser->state = MTEXT_PRE_C_SITES; parser->counter = 0; return FALSE; } return TRUE; } static BOOLEAN_T state_in_pspm(MTEXT_T *parser, const char *line) { ARRAY_T *row; MOTIF_T *motif; motif = parser->mscope.motif; row = parse_nums(parser, TRUE, line); if (row) { if (motif->freqs) { grow_matrix(row, motif->freqs); } else { motif->freqs = array_to_matrix(TRUE, row); } free_array(row); parser->counter++; } else if (!parser->mscope.has_width) { // assume the first blank line is the end of the numbers motif->length = parser->counter; parser->mscope.has_width = TRUE; parser->state = MTEXT_IN_MOTIF; } if (parser->mscope.has_width && parser->counter >= get_motif_length(parser->mscope.motif)) { parser->state = MTEXT_IN_MOTIF; } return TRUE; } static BOOLEAN_T state_in_pssm(MTEXT_T *parser, const char *line) { ARRAY_T *row; MOTIF_T *motif; motif = parser->mscope.motif; row = parse_nums(parser, FALSE, line); if (row) { if (motif->scores) { grow_matrix(row, motif->scores); } else { motif->scores = array_to_matrix(TRUE, row); } free_array(row); parser->counter++; } else if (!parser->mscope.has_width) { // assume the first blank line is the end of the numbers motif->length = parser->counter; parser->mscope.has_width = TRUE; parser->state = MTEXT_IN_MOTIF; } if (parser->mscope.has_width && parser->counter >= get_motif_length(parser->mscope.motif)) { parser->state = MTEXT_IN_MOTIF; } return TRUE; } static BOOLEAN_T state_pre_s_sites(MTEXT_T *parser, const char *line) { regmatch_t matches[3]; regex_t *patterns[4] = { &(parser->re.s_sites_header), &(parser->re.s_sites_dashes), &(parser->re.s_sites_titles), &(parser->re.s_sites_divider)}; if (regexec_or_die("c sites headers", patterns[parser->counter], line, 3, matches, 0)) { if (parser->counter == 0) { parser->fscope.scanned_sites_threshold = regex_dbl(matches+1, line); } parser->counter++; if (parser->counter >= 4) { enqueue_motif(parser); parser->state = MTEXT_IN_C_SITES; parser->counter = 0; } return TRUE; } else { parser->state = MTEXT_DONE; return FALSE; } } static BOOLEAN_T count_sites(MTEXT_T *parser, const char *blocks, int *seq_site_count, int* seq_length) { regmatch_t matches[8]; const char *substr; int motif_i; // count the number of sites and calculate the sequence length substr = blocks; *seq_site_count = 0; *seq_length = 0; while (*substr != '\0') { if (regexec_or_die("c sites block", &(parser->re.s_sites_block), substr, 8, matches, 0)) { if ((regex_chr(matches+1, substr) == '[' && regex_chr(matches+7, substr) == '>') || (regex_chr(matches+1, substr) == '<' && regex_chr(matches+7, substr) == ']')) { error(parser, "Brackets don't match!\n"); return FALSE; } motif_i = regex_int(matches+3, substr, 0) - 1; // turn into index if (motif_i < 0 || motif_i >= parser->fscope.motif_count) { error(parser, "Bad motif number %d in scanned sites.\n", motif_i +1); return FALSE; } if (parser->options & SCANNED_SITES) { // motif_lookup is not avaliable unless SCANNED_SITES is set *seq_length += parser->fscope.motif_lookup[motif_i].length; (*seq_site_count)++; } substr += matches[0].rm_eo; } else if (regexec_or_die("c sites offset", &(parser->re.s_sites_offset), substr, 2, matches, 0)) { if (parser->options & SCANNED_SITES) { *seq_length += regex_int(matches+1, substr, 0); } substr += matches[0].rm_eo; } else if (!regexec_or_die("whitespace", &(parser->re.whitespace), substr, 0, matches, 0)) { error(parser, "Expected whitespace at end of combined sites line but got: %s\n", substr); return FALSE; } if (*substr == '_') substr++; } return TRUE; } /* * Note this function is only called when SCANNED_SITES option is set */ static void parse_sites(MTEXT_T *parser, const char *blocks, int seq_site_count, int seq_length, SCANNED_SEQ_T *sseq) { regmatch_t matches[8]; const char *substr; int offset, site_i, motif_num; BOOLEAN_T site_strong; char site_strand, site_frame; double site_pvalue; // read the scanned sites this time substr = blocks; offset = 0; site_i = 0; while (*substr != '\0') { if (regexec_or_die("c sites block", &(parser->re.s_sites_block), substr, 8, matches, 0)) { motif_num = regex_int(matches+3, substr, 0); // turn into index site_strong = (regex_chr(matches+1, substr) == '['); site_strand = regex_chr(matches+2, substr); site_frame = regex_chr(matches+4, substr); site_pvalue = regex_dbl(matches+5, substr); sseq_set(sseq, site_i++, motif_num, site_strand, offset, site_pvalue); offset += parser->fscope.motif_lookup[motif_num - 1].length; substr += matches[0].rm_eo; } else if (regexec_or_die("c sites offset", &(parser->re.s_sites_offset), substr, 2, matches, 0)) { offset += regex_int(matches+1, substr, 0); substr += matches[0].rm_eo; } if (*substr == '_') substr++; } } static BOOLEAN_T state_in_s_sites(MTEXT_T *parser, const char *line) { regmatch_t matches[8]; const char *blocks, *substr; char *seq_id; double seq_pvalue; int seq_site_count, seq_length; SCANNED_SEQ_T *sseq; if (parser->options & SCANNED_SITES) { if (!parser->fscope.scanned_sites) parser->fscope.scanned_sites = arraylst_create(); } if (regexec_or_die("c sites seq", &(parser->re.s_sites_seq), line, 5, matches, 0)) { blocks = line+(matches[4].rm_so); if (!count_sites(parser, blocks, &seq_site_count, &seq_length)) return TRUE; if (parser->options & SCANNED_SITES) { seq_id = regex_str(matches+1, line); seq_pvalue = regex_dbl(matches+2, line); sseq = sseq_create(seq_id, seq_length, seq_pvalue, seq_site_count); free(seq_id); parse_sites(parser, blocks, seq_site_count, seq_length, sseq); arraylst_add(sseq, parser->fscope.scanned_sites); } } else if (regexec_or_die("dashes", &(parser->re.s_sites_dashes), line, 0, matches, 0)) { if (parser->options & SCANNED_SITES) parser->fscope.options_found |= SCANNED_SITES; parser->state = MTEXT_DONE; } else { error(parser, "Unrecognised line \"%s\" in scanned sites.\n", line); parser->state = MTEXT_DONE; } return TRUE; } static void dispatch_line(MTEXT_T *parser, const char *line, size_t size, BOOLEAN_T end) { BOOLEAN_T consumed; MTEXT_STATE_EN before; consumed = FALSE; while (!consumed) { before = parser->state; switch (parser->state) { case MTEXT_FIND_VERSION: consumed = state_find_version(parser, line); break; case MTEXT_PRE_MOTIF: consumed = state_pre_motif(parser, line); break; case MTEXT_IN_LETTER_FREQ: consumed = state_in_freqs(parser, line); break; case MTEXT_IN_BACKGROUND: consumed = state_in_freqs(parser, line); break; case MTEXT_IN_MOTIF: consumed = state_in_motif(parser, line); break; case MTEXT_IN_PSPM: consumed = state_in_pspm(parser, line); break; case MTEXT_IN_PSSM: consumed = state_in_pssm(parser, line); break; case MTEXT_PRE_C_SITES: consumed = state_pre_s_sites(parser, line); break; case MTEXT_IN_C_SITES: consumed = state_in_s_sites(parser, line); break; case MTEXT_DONE: consumed = TRUE; break;//do nothing default: die("Unknown state"); } if (!consumed && parser->state == before) { die("Infinite loop"); } } if (end) { enqueue_motif(parser); } } #define MAX_LINE_LEN 1000 void mtext_update(void *data, const char *chunk, size_t size, short end) { assert(data != NULL); assert(size >= 0); MTEXT_T *parser; int offset, i; parser = (MTEXT_T*)data; i = 0; while (i < size) { // handle old Mac style newlines (\n\r) if (str_len(parser->line) == 0 && chunk[i] == '\r') i++; offset = i; for (; i < size; ++i) { if (chunk[i] == '\n') break; } if (str_len(parser->line) + i > MAX_LINE_LEN) { error(parser, "Maximum line length exceeded.\n"); return; } str_append(parser->line, chunk+offset, (i - offset)); if (i < size || end) { // handle DOS style newlines (\r\n) if (str_len(parser->line) > 0 && str_char(parser->line, -1) == '\r') { str_truncate(parser->line, -1); } dispatch_line(parser, str_internal(parser->line), str_len(parser->line), (end && i >= size - 1)); // don't allow it to shrink the buffer str_ensure(parser->line, str_len(parser->line)); str_clear(parser->line); } i++; // step over the newline } } short mtext_has_format_match(void *data) { return ((MTEXT_T*)data)->format_match; } short mtext_has_error(void *data) { return linklst_size(((MTEXT_T*)data)->errors) > 0; } char* mtext_next_error(void *data) { return (char*)linklst_pop(((MTEXT_T*)data)->errors); } short mtext_has_motif(void *data) { return linklst_size(((MTEXT_T*)data)->motif_queue) > 0; } MOTIF_T* mtext_next_motif(void *data) { return (MOTIF_T*)linklst_pop(((MTEXT_T*)data)->motif_queue); } ALPH_T mtext_get_alphabet(void *data) { MTEXT_T *parser; parser = (MTEXT_T*)data; return parser->fscope.alphabet; } int mtext_get_strands(void *data) { MTEXT_T *parser; parser = (MTEXT_T*)data; return parser->fscope.strands; } BOOLEAN_T mtext_get_bg(void *data, ARRAY_T **bg) { MTEXT_T *parser; parser = (MTEXT_T*)data; if (parser->fscope.background == NULL) return FALSE; *bg = resize_array(*bg, get_array_length(parser->fscope.background)); copy_array(parser->fscope.background, *bg); return TRUE; } void* mtext_motif_optional(void *data, int option) { MTEXT_T *parser; parser = (MTEXT_T*)data; if (!(parser->options & option)) { die("Requested value of optional component which was not requested " "during construction.\n"); return NULL; } //TODO return NULL; } void* mtext_file_optional(void *data, int option) { MTEXT_T *parser; parser = (MTEXT_T*)data; if (!(parser->options & option)) { die("Requested value of optional component which was not requested " "during construction.\n"); return NULL; } if (parser->fscope.options_found & option) { if (parser->fscope.options_returned & option) { die("Sorry, optional values are only returned once. " "This is because we can not guarantee that the " "previous caller did not deallocate the memory. " "Hence this is a feature to avoid memory bugs.\n"); return NULL; } parser->fscope.options_returned |= option; } else { // Not yet found or unsupported return NULL; } switch (option) { case SCANNED_SITES: return parser->fscope.scanned_sites; default: die("Option code %d does not match any single option. " "This means that it must contain multiple options " "and only one is allowed at a time\n.", option); } return NULL; //unreachable! make compiler happy }