/* The Boyer-Moore String Search Algorithm
*
* This blog post is a C89 program that you can compile and run. To build, do:
* cc -x c -o boyer-moore post.txt
* and then you can run it with:
* ./boyer-moore needle haystack
*
* String searching has some interesting properties as a problem, since it is
* very very commonly done and has an extremely obvious naive solution: for
* each character index in haystack, test whether needle occurs there.
* Unfortunately the naive solution takes O(mn) where m is the length of the
* needle and n is the length of the haystack. Early in the history of computer
* science (like, the 1970s) improvements on the naive algorithm were being
* made.
*
* The first published one I know of was the Knuth-Morris-Pratt algorithm,
* which is based on kind of the same idea as Boyer-Moore but lacked an
* important practical optimization. KMP was the first linear-time string search
* algorithm. It relies on the key insight that, with a bit of preprocessing of
* the needle, we can "know" that the needle couldn't match at any of the next
* few places in the haystack.
*
* For example, if the needle is "abcde", and the input string is "abfgh",
* we start checking for a match at | and detect the mismatch at ^:
*
* "abcde"
* "abfgh"
* 01234
* | ^
*
* And having done so, we *already know* that the pattern can't match starting
* at indexes 1 or 2, so we can skip straight to index 3 to look for the second
* match. This allows us to divide the length of the pattern out of the usual
* quadratic complexity, since we get to skip by the length of the pattern!
*
* Boyer-Moore, published in the October 1977 Communications of the ACM (CACM)
* has another improvement on top of that basic principle: if we start checking
* for a match from the *end* of the pattern rather than the beginning, we're
* much more likely to be able to skip further, since we're finding the largest
* mismatched span of the string rather than the smallest. It does this by
* searching backwards rather than forwards, but the algorithm is otherwise
* pretty similar.
*
* But wait a moment: how do we "already know" that the pattern can't match
* starting at indexes 1 or 2? By preprocessing the pattern before we start
* matching! For Boyer-Moore, we precompute two tables, which the paper calls
* delta1 and delta2. The delta1 table tells us, for any given character, how
* far we need to shift the search window over to "match up" the next occurence
* of that character. The delta2 table tells us, for any given position in the
* pattern, what the next leftward position in the pattern is that has the
* same suffix as that position. It's confusing and difficult to explain, and
* the original paper's description is very opaque, but there are some examples
* there that may help.
*
* One other note: the paper is written assuming 1-indexed strings, so there are
* some stray -1s in this implementation to deal with that.
*
* Anyway, onward to the implementation!
*/
/* These are used as the sizes of delta1 and delta2, to avoid dynamic memory
* allocation. MAXCHR is the size of the "alphabet" of characters being
* searched, so if you knew you were searching ASCII text you could reduce it,
* but this algorithm has a hard time with Unicode's enormous alphabet.
* MAXPAT is the maximum length of a pattern, used to size delta2. It would be
* possible to dynamically allocate delta2 and remove this limit but this
* simplifies the code. */
#define MAXCHR 256
#define MAXPAT 512
/* Since this is an example, I didn't want to depend on string.h, and wanted
* to match the paper's naming as much as possible. This computes the length
* of the given string. */
int len(char *str) {
int n = 0;
while (str[n])
n++;
return n;
}
void make_delta1(int *d1, char *pat, int patlen) {
int i;
/* The delta1 table maps each character to the rightmost index at which
* it occurs in the pattern, counting *from the end*, or patlen (meaning
* the entire patlen can be skipped) if it doesn't occur at all. Here,
* we do this by making two passes over the table, first filling it
* with patlen as a default for every character, then overwriting some
* entries. */
for (i = 0; i < MAXCHR; i++)
d1[i] = patlen;
for (i = 0; pat[i]; i++)
d1[pat[i]] = patlen - i;
}
/* This returns whether pat[a .. a + len] = pat[b .. b + len]. Since this is C
* and we have pointer arithmetic, it could be phrased as simply:
* strncmp(pat + a, pat + b, len)
* but again I wanted to stay close to the form of the original paper. */
int unify(char *pat, int a, int b, int len) {
int i;
for (i = 0; i < len; i++)
if (pat[a + i] != pat[b + i])
return 0;
return 1;
}
/* The paper says: rightmost k such that the (patlen - (j + 1)) chars at
* pat[j + 1] and pat[k] are equal, and k <= 1 or pat[k - 1] != pat[j].
* We compute that by walking backwards from j + 1, looking for the first k that
* unifies with the end of the pattern.
*
* The paper asserts breezily that this can return a negative value, but it
* manifestly cannot and I can't figure out why they thought it could. They did
* not provide pseudocode for this function, only a prose description. :( */
int rpr(char *pat, int patlen, int j) {
int k;
for (k = j + 1; k > 1; k--) {
if (unify(pat, j + 1, k, patlen - (j + 1)))
break;
}
return k;
}
/* This definition is straight from the paper. */
void make_delta2(int *d2, char *pat, int patlen) {
int i;
for (i = 0; i < MAXPAT; i++)
d2[i] = patlen + 1 - rpr(pat, patlen, i);
}
int max(int a, int b) {
return a > b ? a : b;
}
/* I made one concession here: the string argument has been renamed 'str' so
* that if for some reason you are compiling this as C++ it won't conflict
* with the built-in string type. */
int boyer_moore(char *pat, char *str) {
int delta1[MAXCHR];
int delta2[MAXPAT];
int i, j;
int patlen = len(pat);
int stringlen = len(str);
/* These steps could be precomputed for a given pattern and then have
* their results reused many times, if one wanted to do so (eg,
* searching for the same pattern in every line of a file). */
make_delta1(delta1, pat, patlen);
make_delta2(delta2, pat, patlen);
i = patlen - 1;
/* While we're not yet past the end of the string (remember that i
* is where the *end* of the pattern will go, not the start), ... */
while (i <= stringlen) {
/* Walk rightwards from i, trying to match the pattern. */
for (j = patlen - 1; j >= 0; j--) {
if (str[i] != pat[j])
break;
i--;
}
/* This means we walked the entire pattern and didn't find a
* mismatch, so we're done. */
if (j == -1)
return i + 1;
/* Otherwise, we can skip ahead by whichever of delta1 or
* delta2 is larger. */
i = i + max(delta1[str[i]], delta2[j]);
}
return -1;
}
int main(int argc, char *argv[]) {
if (argc != 3)
return 0;
return boyer_moore(argv[1], argv[2]);
}
/*
* $#t The Boyer-Moore String Search Algorithm
* $#s A fast algorithm for finding substring matches, implemented in C
* $#o c, computer-science, programming
* $#u cc236173-19de-448b-b0ba-ef912e609ce0
*/