/************************************************************************** * FILE: string-match.c * AUTHOR: James Johnson * CREATE DATE: 24-November-2009 * PROJECT: shared * COPYRIGHT: TBA * VERSION: $Revision: 1.0 $ * DESCRIPTION: implementation of various string matching algorithms. * Currently only Boyer-Moore (A Fast String Searching Algorithm, 1977) * is avaliable **************************************************************************/ #include "string-match.h" #include "utils.h" #include #include #include /* * structure for doing a Boyer-Moore exact single string match */ struct bm_string { char *string; int length; int ignore_case; int *sshift; int lstart; int *lshift; int lshift_len; }; /* * Create a compiled string for a Boyer-Moore exact string match * allowing for case to be ignored if required */ BMSTR_T* bmstr_create2(const char *string, int ignore_case) { BMSTR_T *bmstr; char *str; int len, i, j, suffix_len, suffix_first, shift, offset; len = strlen(string); str = (char*)mm_malloc(sizeof(char)*(len+1)); if (ignore_case) { for (i = 0; i < len; ++i) { str[i] = tolower(string[i]); } str[len] = '\0'; } else { strncpy(str, string, len+1); } bmstr = (BMSTR_T*)mm_malloc(sizeof(BMSTR_T)); bmstr->string = str; bmstr->length = len; bmstr->ignore_case = ignore_case; //calculate the letter shift table if (len > 1) { bmstr->lstart = str[len-2]; bmstr->lshift = (int*)mm_malloc(sizeof(int)); bmstr->lshift[0] = 1; bmstr->lshift_len = 1; for (i = len-3; i >=0; --i) { if (str[i] < bmstr->lstart) { int diff; diff = bmstr->lstart - str[i]; bmstr->lshift = mm_realloc(bmstr->lshift, sizeof(int)*(bmstr->lshift_len + diff)); memmove((bmstr->lshift)+diff, bmstr->lshift, sizeof(int)*(bmstr->lshift_len)); bmstr->lshift[0] = len - i - 1; for (j = 1; j < diff; ++j) bmstr->lshift[j] = 0; bmstr->lstart = str[i]; bmstr->lshift_len += diff; } else if (str[i] >= (bmstr->lstart + bmstr->lshift_len)) { int extra, new_len; extra = str[i] - bmstr->lstart - bmstr->lshift_len + 1; new_len = bmstr->lshift_len + extra; bmstr->lshift = mm_realloc(bmstr->lshift, sizeof(int)*(new_len)); for (j = bmstr->lshift_len; j < new_len; ++j) bmstr->lshift[j] = 0; bmstr->lshift[new_len-1] = len - i - 1; bmstr->lshift_len = new_len; } else if (!bmstr->lshift[str[i] - bmstr->lstart]) { bmstr->lshift[str[i] - bmstr->lstart] = len - i -1; } } } else { bmstr->lstart = 0; bmstr->lshift = (int*)NULL; bmstr->lshift_len = 0; } //calculate the suffix shifts bmstr->sshift = mm_malloc(sizeof(int)*len); //calculate the suffix shift table for each good suffix length proceeded by a bad prefix for (suffix_len = 0; suffix_len < len; ++suffix_len) { suffix_first = len - suffix_len; //search for the right most substring that has the last i letters but not the one just before for (shift = 1; shift <= len; ++shift) { //test that the position before the suffix first does not match at this shift offset = suffix_first - shift; if (offset < 1) { i = -offset; } else { if (str[offset - 1] == str[suffix_first -1]) continue; // not here i = 0; } for (; i < suffix_len; ++i) { if (str[offset + i] != str[suffix_first + i]) break; } if (i == suffix_len) break;//subpattern matched so we know the shift } bmstr->sshift[suffix_len] = shift; } return bmstr; } /* * Create a compiled string for a Boyer-Moore exact string match. * The case is required to match. */ BMSTR_T* bmstr_create(const char *str) { return bmstr_create2(str, 0); } /* * Destroy a compiled string for a Boyer-Moore exact string match */ void bmstr_destroy(BMSTR_T *compiled_string) { free(compiled_string->string); if (compiled_string->lshift_len) free(compiled_string->lshift); free(compiled_string->sshift); free(compiled_string); } /* * Gets the length of the compiled string */ int bmstr_length(BMSTR_T *bmstr) { return bmstr->length; } /* * Gets the text of the compiled string */ char* bmstr_text(BMSTR_T *bmstr) { return bmstr->string; } /* * Find a compiled string in another string. * Returns the offset that the string is at. * Return the -(offset+1) for a possible cut-off match */ int bmstr_substring(BMSTR_T *bmstr, const char *string, int len) { int i, last, matched; last = bmstr->length-1; if (last < 0) return 0;//find empty string anywhere! i = last; //for the sake of speed sacrifice conciseness, the two branches //of this if are practically a copy-paste with the exception of the //character comparison. This is the only way I see to move the decision //out of the loop... if (bmstr->ignore_case) { while (1) { if (i >= len) { //if we go off the edge the assume all chars off the edge match //note that we can no longer use the suffix shift while (1) { matched = i - len + 1; if (matched > last) { return -len -1; } while (1) { if (tolower(string[i-matched]) == bmstr->string[last-matched]) { if (matched == last) { return -((i-last)+1); } ++matched; } else { char miss = tolower(string[i-matched]); int shift; if (bmstr->lstart > miss || (bmstr->lstart + bmstr->lshift_len) <= miss) { shift = bmstr->length - matched; //letter not in subseq so do best skip } else { shift = bmstr->lshift[miss - bmstr->lstart]; if (shift) { shift -= matched; if (shift < 1) shift = 1; //as we no longer have suffix table, must check minimum shift of 1 } else { shift = bmstr->length - matched;//letter not in subseq so do best skip } } i += shift; break; } } } } else { matched = 0; } while (1) { if (tolower(string[i-matched]) == bmstr->string[last-matched]) { if (matched == last) { return (i-last); } ++matched; } else { char miss = tolower(string[i-matched]); int shift; if (bmstr->lstart > miss || (bmstr->lstart + bmstr->lshift_len) <= miss) { shift = bmstr->length - matched; //letter not in subseq so do best skip } else { shift = bmstr->lshift[miss - bmstr->lstart]; if (shift) { shift -= matched; int temp = bmstr->sshift[matched]; if (temp > shift) shift = temp; } else { shift = bmstr->length - matched;//letter not in subseq so do best skip } } i += shift; break; } } } } else { // require case to match while (1) { if (i >= len) { //if we go off the edge the assume all chars off the edge match //note that we can no longer use the suffix shift while (1) { matched = i - len + 1; if (matched > last) { return -len -1; } while (1) { if (string[i-matched] == bmstr->string[last-matched]) { if (matched == last) { return -((i-last)+1); } ++matched; } else { char miss = string[i-matched]; int shift; if (bmstr->lstart > miss || (bmstr->lstart + bmstr->lshift_len) <= miss) { shift = bmstr->length - matched; //letter not in subseq so do best skip } else { shift = bmstr->lshift[miss - bmstr->lstart]; if (shift) { shift -= matched; if (shift < 1) shift = 1; //as we no longer have suffix table, must check minimum shift of 1 } else { shift = bmstr->length - matched;//letter not in subseq so do best skip } } i += shift; break; } } } } else { matched = 0; } while (1) { if (string[i-matched] == bmstr->string[last-matched]) { if (matched == last) { return (i-last); } ++matched; } else { char miss = string[i-matched]; int shift; if (bmstr->lstart > miss || (bmstr->lstart + bmstr->lshift_len) <= miss) { shift = bmstr->length - matched; //letter not in subseq so do best skip } else { shift = bmstr->lshift[miss - bmstr->lstart]; if (shift) { shift -= matched; int temp = bmstr->sshift[matched]; if (temp > shift) shift = temp; } else { shift = bmstr->length - matched;//letter not in subseq so do best skip } } i += shift; break; } } } } } /* * Finds the first match or partial match to one of the compiled strings in another string. * Returns the offset that the string is at. * Return the -(offset+1) for a possible cut-off match * sets the which variable (if not null) to the index + 1 of the compiled string * or sets it to zero if nothing is found */ int bmstr_multi_substring(int num, BMSTR_T **bmstrs, const char *string, int len, int *which) { int i, pos, left_p, left_i; BOOLEAN_T matched, left_m; left_p = len; left_m = FALSE; left_i = 0; for (i = 0; i < num; ++i) { pos = bmstr_substring(bmstrs[i], string, len); if (pos < 0) { pos = -(pos + 1); matched = FALSE; } else { matched = TRUE; } if (pos < left_p) { left_p = pos; left_i = i + 1; left_m = matched; } } if (which) *which = left_i; return (left_m ? left_p : -(left_p + 1)); }