/* The Boyer-Moore String Search Algorithm
 *
 * This blog post is a C89 program that you can compile and run. To build, do:
 *   cc -x c -o boyer-moore post.txt
 * and then you can run it with:
 *   ./boyer-moore needle haystack
 *
 * String searching has some interesting properties as a problem, since it is
 * very very commonly done and has an extremely obvious naive solution: for
 * each character index in haystack, test whether needle occurs there.
 * Unfortunately the naive solution takes O(mn) where m is the length of the
 * needle and n is the length of the haystack. Early in the history of computer
 * science (like, the 1970s) improvements on the naive algorithm were being
 * made.
 *
 * The first published one I know of was the Knuth-Morris-Pratt algorithm,
 * which is based on kind of the same idea as Boyer-Moore but lacked an
 * important practical optimization. KMP was the first linear-time string search
 * algorithm. It relies on the key insight that, with a bit of preprocessing of
 * the needle, we can "know" that the needle couldn't match at any of the next
 * few places in the haystack.
 *
 * For example, if the needle is "abcde", and the input string is "abfgh",
 * we start checking for a match at | and detect the mismatch at ^:
 *
 *    "abcde"
 *    "abfgh"
 *     01234
 *     | ^
 *
 * And having done so, we *already know* that the pattern can't match starting
 * at indexes 1 or 2, so we can skip straight to index 3 to look for the second
 * match. This allows us to divide the length of the pattern out of the usual
 * quadratic complexity, since we get to skip by the length of the pattern!
 *
 * Boyer-Moore, published in the October 1977 Communications of the ACM (CACM)
 * has another improvement on top of that basic principle: if we start checking
 * for a match from the *end* of the pattern rather than the beginning, we're
 * much more likely to be able to skip further, since we're finding the largest
 * mismatched span of the string rather than the smallest.  It does this by
 * searching backwards rather than forwards, but the algorithm is otherwise
 * pretty similar.
 *
 * But wait a moment: how do we "already know" that the pattern can't match
 * starting at indexes 1 or 2? By preprocessing the pattern before we start
 * matching! For Boyer-Moore, we precompute two tables, which the paper calls
 * delta1 and delta2. The delta1 table tells us, for any given character, how
 * far we need to shift the search window over to "match up" the next occurence
 * of that character. The delta2 table tells us, for any given position in the
 * pattern, what the next leftward position in the pattern is that has the
 * same suffix as that position. It's confusing and difficult to explain, and
 * the original paper's description is very opaque, but there are some examples
 * there that may help.
 *
 * One other note: the paper is written assuming 1-indexed strings, so there are
 * some stray -1s in this implementation to deal with that.
 *
 * Anyway, onward to the implementation!
 */

/* These are used as the sizes of delta1 and delta2, to avoid dynamic memory
 * allocation. MAXCHR is the size of the "alphabet" of characters being
 * searched, so if you knew you were searching ASCII text you could reduce it,
 * but this algorithm has a hard time with Unicode's enormous alphabet.
 * MAXPAT is the maximum length of a pattern, used to size delta2. It would be
 * possible to dynamically allocate delta2 and remove this limit but this
 * simplifies the code. */
#define MAXCHR 256
#define MAXPAT 512

/* Since this is an example, I didn't want to depend on string.h, and wanted
 * to match the paper's naming as much as possible. This computes the length
 * of the given string. */
int len(char *str) {
	int n = 0;
	while (str[n])
		n++;
	return n;
}

void make_delta1(int *d1, char *pat, int patlen) {
	int i;
	/* The delta1 table maps each character to the rightmost index at which
	 * it occurs in the pattern, counting *from the end*, or patlen (meaning
	 * the entire patlen can be skipped) if it doesn't occur at all. Here,
	 * we do this by making two passes over the table, first filling it
	 * with patlen as a default for every character, then overwriting some
	 * entries. */
	for (i = 0; i < MAXCHR; i++)
		d1[i] = patlen;
	for (i = 0; pat[i]; i++)
		d1[pat[i]] = patlen - i;
}

/* This returns whether pat[a .. a + len] = pat[b .. b + len]. Since this is C
 * and we have pointer arithmetic, it could be phrased as simply:
 *   strncmp(pat + a, pat + b, len)
 * but again I wanted to stay close to the form of the original paper. */
int unify(char *pat, int a, int b, int len) {
	int i;
	for (i = 0; i < len; i++)
		if (pat[a + i] != pat[b + i])
			return 0;
	return 1;
}

/* The paper says: rightmost k such that the (patlen - (j + 1)) chars at
 * pat[j + 1] and pat[k] are equal, and k <= 1 or pat[k - 1] != pat[j].
 * We compute that by walking backwards from j + 1, looking for the first k that
 * unifies with the end of the pattern.
 *
 * The paper asserts breezily that this can return a negative value, but it
 * manifestly cannot and I can't figure out why they thought it could. They did
 * not provide pseudocode for this function, only a prose description. :( */
int rpr(char *pat, int patlen, int j) {
	int k;
	for (k = j + 1; k > 1; k--) {
		if (unify(pat, j + 1, k, patlen - (j + 1)))
			break;
	}
	return k;
}

/* This definition is straight from the paper. */
void make_delta2(int *d2, char *pat, int patlen) {
	int i;
	for (i = 0; i < MAXPAT; i++)
		d2[i] = patlen + 1 - rpr(pat, patlen, i);
}

int max(int a, int b) {
	return a > b ? a : b;
}

/* I made one concession here: the string argument has been renamed 'str' so
 * that if for some reason you are compiling this as C++ it won't conflict
 * with the built-in string type. */
int boyer_moore(char *pat, char *str) {
	int delta1[MAXCHR];
	int delta2[MAXPAT];
	int i, j;

	int patlen = len(pat);
	int stringlen = len(str);

	/* These steps could be precomputed for a given pattern and then have
	 * their results reused many times, if one wanted to do so (eg,
	 * searching for the same pattern in every line of a file).  */
	make_delta1(delta1, pat, patlen);
	make_delta2(delta2, pat, patlen);

	i = patlen - 1;

	/* While we're not yet past the end of the string (remember that i
	 * is where the *end* of the pattern will go, not the start), ... */
	while (i <= stringlen) {
		/* Walk rightwards from i, trying to match the pattern. */
		for (j = patlen - 1; j >= 0; j--) {
			if (str[i] != pat[j])
				break;
			i--;
		}

		/* This means we walked the entire pattern and didn't find a
		 * mismatch, so we're done. */
		if (j == -1)
			return i + 1;

		/* Otherwise, we can skip ahead by whichever of delta1 or
		 * delta2 is larger. */
		i = i + max(delta1[str[i]], delta2[j]);
	}

	return -1;
}

int main(int argc, char *argv[]) {
	if (argc != 3)
		return 0;

	return boyer_moore(argv[1], argv[2]);
}

/*
 * $#t The Boyer-Moore String Search Algorithm
 * $#s A fast algorithm for finding substring matches, implemented in C
 * $#o c, computer-science, programming
 * $#u cc236173-19de-448b-b0ba-ef912e609ce0
 */