/* The Boyer-Moore String Search Algorithm * * This blog post is a C89 program that you can compile and run. To build, do: * cc -x c -o boyer-moore post.txt * and then you can run it with: * ./boyer-moore needle haystack * * String searching has some interesting properties as a problem, since it is * very very commonly done and has an extremely obvious naive solution: for * each character index in haystack, test whether needle occurs there. * Unfortunately the naive solution takes O(mn) where m is the length of the * needle and n is the length of the haystack. Early in the history of computer * science (like, the 1970s) improvements on the naive algorithm were being * made. * * The first published one I know of was the Knuth-Morris-Pratt algorithm, * which is based on kind of the same idea as Boyer-Moore but lacked an * important practical optimization. KMP was the first linear-time string search * algorithm. It relies on the key insight that, with a bit of preprocessing of * the needle, we can "know" that the needle couldn't match at any of the next * few places in the haystack. * * For example, if the needle is "abcde", and the input string is "abfgh", * we start checking for a match at | and detect the mismatch at ^: * * "abcde" * "abfgh" * 01234 * | ^ * * And having done so, we *already know* that the pattern can't match starting * at indexes 1 or 2, so we can skip straight to index 3 to look for the second * match. This allows us to divide the length of the pattern out of the usual * quadratic complexity, since we get to skip by the length of the pattern! * * Boyer-Moore, published in the October 1977 Communications of the ACM (CACM) * has another improvement on top of that basic principle: if we start checking * for a match from the *end* of the pattern rather than the beginning, we're * much more likely to be able to skip further, since we're finding the largest * mismatched span of the string rather than the smallest. It does this by * searching backwards rather than forwards, but the algorithm is otherwise * pretty similar. * * But wait a moment: how do we "already know" that the pattern can't match * starting at indexes 1 or 2? By preprocessing the pattern before we start * matching! For Boyer-Moore, we precompute two tables, which the paper calls * delta1 and delta2. The delta1 table tells us, for any given character, how * far we need to shift the search window over to "match up" the next occurence * of that character. The delta2 table tells us, for any given position in the * pattern, what the next leftward position in the pattern is that has the * same suffix as that position. It's confusing and difficult to explain, and * the original paper's description is very opaque, but there are some examples * there that may help. * * One other note: the paper is written assuming 1-indexed strings, so there are * some stray -1s in this implementation to deal with that. * * Anyway, onward to the implementation! */ /* These are used as the sizes of delta1 and delta2, to avoid dynamic memory * allocation. MAXCHR is the size of the "alphabet" of characters being * searched, so if you knew you were searching ASCII text you could reduce it, * but this algorithm has a hard time with Unicode's enormous alphabet. * MAXPAT is the maximum length of a pattern, used to size delta2. It would be * possible to dynamically allocate delta2 and remove this limit but this * simplifies the code. */ #define MAXCHR 256 #define MAXPAT 512 /* Since this is an example, I didn't want to depend on string.h, and wanted * to match the paper's naming as much as possible. This computes the length * of the given string. */ int len(char *str) { int n = 0; while (str[n]) n++; return n; } void make_delta1(int *d1, char *pat, int patlen) { int i; /* The delta1 table maps each character to the rightmost index at which * it occurs in the pattern, counting *from the end*, or patlen (meaning * the entire patlen can be skipped) if it doesn't occur at all. Here, * we do this by making two passes over the table, first filling it * with patlen as a default for every character, then overwriting some * entries. */ for (i = 0; i < MAXCHR; i++) d1[i] = patlen; for (i = 0; pat[i]; i++) d1[pat[i]] = patlen - i; } /* This returns whether pat[a .. a + len] = pat[b .. b + len]. Since this is C * and we have pointer arithmetic, it could be phrased as simply: * strncmp(pat + a, pat + b, len) * but again I wanted to stay close to the form of the original paper. */ int unify(char *pat, int a, int b, int len) { int i; for (i = 0; i < len; i++) if (pat[a + i] != pat[b + i]) return 0; return 1; } /* The paper says: rightmost k such that the (patlen - (j + 1)) chars at * pat[j + 1] and pat[k] are equal, and k <= 1 or pat[k - 1] != pat[j]. * We compute that by walking backwards from j + 1, looking for the first k that * unifies with the end of the pattern. * * The paper asserts breezily that this can return a negative value, but it * manifestly cannot and I can't figure out why they thought it could. They did * not provide pseudocode for this function, only a prose description. :( */ int rpr(char *pat, int patlen, int j) { int k; for (k = j + 1; k > 1; k--) { if (unify(pat, j + 1, k, patlen - (j + 1))) break; } return k; } /* This definition is straight from the paper. */ void make_delta2(int *d2, char *pat, int patlen) { int i; for (i = 0; i < MAXPAT; i++) d2[i] = patlen + 1 - rpr(pat, patlen, i); } int max(int a, int b) { return a > b ? a : b; } /* I made one concession here: the string argument has been renamed 'str' so * that if for some reason you are compiling this as C++ it won't conflict * with the built-in string type. */ int boyer_moore(char *pat, char *str) { int delta1[MAXCHR]; int delta2[MAXPAT]; int i, j; int patlen = len(pat); int stringlen = len(str); /* These steps could be precomputed for a given pattern and then have * their results reused many times, if one wanted to do so (eg, * searching for the same pattern in every line of a file). */ make_delta1(delta1, pat, patlen); make_delta2(delta2, pat, patlen); i = patlen - 1; /* While we're not yet past the end of the string (remember that i * is where the *end* of the pattern will go, not the start), ... */ while (i <= stringlen) { /* Walk rightwards from i, trying to match the pattern. */ for (j = patlen - 1; j >= 0; j--) { if (str[i] != pat[j]) break; i--; } /* This means we walked the entire pattern and didn't find a * mismatch, so we're done. */ if (j == -1) return i + 1; /* Otherwise, we can skip ahead by whichever of delta1 or * delta2 is larger. */ i = i + max(delta1[str[i]], delta2[j]); } return -1; } int main(int argc, char *argv[]) { if (argc != 3) return 0; return boyer_moore(argv[1], argv[2]); }