/* The Shunting-Yard Algorithm
 *
 * As with some of my other blog posts, this one is a valid C89 program, which
 * you can compile:
 *   cc -x c -o sy post.txt
 * and run:
 *   ./sy '1 + 2 * ( 3 + 4 )'
 *
 * ALGOL was arguably the first programming language with recognizeably modern
 * syntax, which supported writing mathematical expressions like they would be
 * usually written, rather than as a sequence of more primitive operations. For
 * example, for the expression given above:
 *   1 + 2 * (3 + 4)
 * an assembly-language implementation might look like:
 *   mov eax, 4
 *   add eax, 3
 *   mul eax, 2
 *   add eax, 1
 * and a COBOL-60 implementation (contemporary of ALGOL) would have looked[1],
 * with a great deal of cruft elided, like:
 *   MOVE 4 TO A
 *   ADD 3 TO A
 *   MULTIPLY 2 BY A         [2]
 *   ADD 1 TO A
 * the structural parallel to assembly is pretty obvious. On the other hand,
 * that same expression rendered in ALGOL-60 is:
 *   1 + 2 * (3 + 4)
 * which expresses a lot more clearly what is going on.
 *
 * In order to implement this, ALGOL compilers (then called "translators", since
 * they conceptually translated ALGOL to machine code) needed to parse these
 * expressions, while being aware of normal operator precedence. In a modern
 * compiler with unlimited memory at one's disposal the normal way to do this
 * is:
 *
 *   1. Parse the expression, turning it into an abstract syntax tree
 *   2. Generate code corresponding to that abstract syntax tree
 *
 * The ALGOL compiler, however, operated in an environment where it was not
 * given that the machine running the compiler even had enough memory to hold
 * the entire source code at once - never mind any fancy intermediate
 * structures. As such, it was very desirable to do a direct single-pass
 * translation straight from source to machine code.
 *
 * To do that, Dijkstra developed the so-called "shunting yard algorithm"[3],
 * which converts an infix expression, including operators with different
 * precedences and parentheticals, into a sequence of stack instructions, each
 * of which is either "push this number" or "pop the last n numbers, do some
 * operation, and push the result". Since these corresponded well with the
 * capabilities of the present hardware, this technique allowed direct
 * translation of expressions into machine code.
 *
 * The algorithm works as follows: we have an input stream of tokens, which
 * we'll call "in", an output stream of instructions which we'll call "out",
 * and an intermediate stack of operators which we'll call "yard" (in the
 * original paper, Dijkstra calls it the "translator stack" which is a bit
 * unwieldy). Conceptually, the algorithm works by using the "yard" to store
 * operators as they are seen, while passing values directly over to the output
 * stream, and only emitting an operator once a lower-precedence operator is
 * being processed. An animated view of this process looks somewhat like trains
 * being reordered in a railyard via shunting - hence the name.
 *
 * Below is an implementation of a simple expression evaluator using this
 * algorithm; it first translates infix expressions into simple instructions, as
 * the paper does, but then evaluates these instructions directly rather than
 * emitting them as machine code.
 *
 * A handful of caveats about this specific implementation:
 *
 * - It does not handle negative integers
 * - It requires spaces between *all* tokens, so you can write:
 *     "1 + ( 2 * 3 )"
 *   but not:
 *     "1 + (2 * 3)"
 *   this is not a fundamental problem to fix, it would just have complicated
 *   the tokenizer.
 * - Both the yard and out structures are limited to 512 entries, so this does
 *   not handle big expressions; obviously they could be dynamically allocated
 *   but they are not here for simplicity's sake
 * - The out stream is implemented as a stack here, even though it is logically
 *   a stream; I didn't want to add a second data structure and lengthen this
 *   example still further
 * - It returns the result in the exit status, so you do not get many bits of
 *   result; I did not want to depend on stdio to print it properly.
 *
 * Onward to the code!
 */

/* These are used for the type field of the out queue only; the entries in the
 * yard are all always operators. */
enum {
	T_OP = 0,
	T_INT = 1,
};

/* A single Entry is a (type, value) pair. */
typedef struct {
	char type;
	int val;
} Entry;

/* ... and a Stack contains up to 512 Entries. */
typedef struct {
	Entry data[512];
	int top;
} Stack;

/* These are functions for manipulating Entry and Stack, and are not really
 * relevant to the algorithm, so their implementations are below, after
 * main(). They are deeply uninteresting. */
Entry makeentry(char type, int val);
void initstack(Stack *stack);
int isempty(Stack *stack);
void push(Stack *stack, Entry e);
Entry pop(Stack *stack);
Entry peek(Stack *stack);

/* These functions duplicate the libc functions atoi() and strsep(), and are
 * likewise not interesting. They are implemented below. */
int asint(char *token);
char *tokenize(char **in);

/* Returns whether the supplied token is an operator or not. */
int isop(char *token) {
	/* This is rather hackish and you would not do this in a real compiler,
 	 * but it works well enough here. */
	return *token < '0' || *token > '9';
}

/* This moves items from the yard to the out queue until it finds a '('; it
 * is used to ensure that parentheses always group expressions, regardless of
 * precedence. */
void balance(Stack *out, Stack *yard) {
	/* If the yard looks like: '(' op0 op1 op2 ...
	 * then after this function, op2 op1 op0 will have been added to
	 * the out queue, and everything up to and including the '(' will have
	 * been removed from the yard. */
	while (peek(yard).val != '(')
		push(out, pop(yard));
	pop(yard);
}

/* In a real compiler this would be a lookup table, and you would also want to
 * handle associativity of operators, which requires another case in shunt(). */
int precedence(char op) {
	switch (op) {
		case '(': return 0;
		case ')': return 1;
		case '+': return 9;
		case '-': return 9;
		case '*': return 10;
		case '/': return 10;
	}
}

/* This function transfers operators from yard to out until it finds one with
 * lesser precedence than op, then pushes op into yard. This ensures that yard
 * is always in non-decreasing priority order. */
void shunt(Stack *out, Stack *yard, char op) {
	while (precedence(peek(yard).val) >= precedence(op))
		push(out, pop(yard));
	push(yard, makeentry(T_OP, op));
}

/* Do a single step of the shunting-yard algorithm. */
void parseone(Stack *out, Stack *yard, char *intoken) {
	if (!isop(intoken))
		push(out, makeentry(T_INT, asint(intoken)));
	else if (*intoken == '(')
		push(yard, makeentry(T_OP, '('));
	else if (*intoken == ')')
		balance(out, yard);
	else
		shunt(out, yard, *intoken);
}

/* Run the shunting-yard algorithm on in, producing a list of operations in
 * out (for which I've used the Stack type, but logically it is a Queue, I
 * guess). */
void parse(Stack *out, char *in) {
	Stack yard;
	char *t;

	initstack(&yard);
	while ((t = tokenize(&in)))
		parseone(out, &yard, t);

	while (!isempty(&yard))
		push(out, pop(&yard));
}

/* A helper function for evaluate(): given an operation and two values,
 * return the value of that operation as applied to those two values. */
int evalop(char op, int a, int b) {
	switch (op) {
		case '+': return a + b;
		case '-': return a - b;
		case '*': return a * b;
		case '/': return a / b;
	}
}

/* Given a list of instructions as generated by parse(), evaluate them in order:
 * T_INT pushes one value to the stack, T_OP pops two, operates on them, and
 * pushes the result. */
int evaluate(Stack *prog) {
	Stack vals;
	int i, a, b;

	initstack(&vals);
	for (i = 0; i < prog->top; i++) {
		Entry e = prog->data[i];
		if (e.type == T_INT) {
			push(&vals, e);
		} else {
			/* Note that these are popped "backwards", because
			 * we're getting them from a stack. For example, if we
			 * had:
			 *   3 - 1
			 * we'd have generated:
			 *   T_INT 3, T_INT 1, T_OP -
			 * so here the top of the stack is actually the right
			 * hand side, not the left hand side. */
			b = pop(&vals).val;
			a = pop(&vals).val;
			push(&vals, makeentry(T_INT, evalop(e.val, a, b)));
		}
	}

	return vals.data[0].val;
}

int main(int argc, char *argv[]) {
	Stack prog;
	initstack(&prog);

	parse(&prog, argv[1]);
	return evaluate(&prog);
}

/* And that's it! If we were emitting output instructions directly rather than
 * stashing them in an intermediate structure, this would only need memory
 * proportional to the depth of the most deeply-nested expression, which is what
 * we in the industry call "pretty darn cool". This kind of idea will crop up
 * again in the shift-reduce method of parsing, which I'll write about in some
 * later post.
 *
 * You're at the end of the interesting bit - as always, thanks for reading :)
 */

struct Entry {
	char type;
	int val;
};

Entry makeentry(char type, int val) {
	Entry e;
	e.type = type;
	e.val = val;
	return e;
}

struct Stack {
	Entry data[512];
	int top;
};

void initstack(Stack *s) {
	s->top = 0;
}

int isempty(Stack *s) {
	return s->top == 0;
}

void push(Stack *s, Entry e) {
	s->data[s->top] = e;
	s->top++;
}

Entry pop(Stack *s) {
	s->top--;
	return s->data[s->top];
}

Entry peek(Stack *s) {
	return s->data[s->top - 1];
}

/* This is more or less the idiomatic way to do this in C. */
int asint(char *token) {
	int v = 0;
	while (*token) {
		v *= 10;
		v += *token - '0';
		token++;
	}
	return v;
}

/* This is basically what strsep(3) does, down to modifying the input string,
 * but hardcoded to use ' ' as the separator. */
char *tokenize(char **in) {
	char *p = *in, *s = *in;

	while (*p && *p != ' ')
		p++;

	/* If not at the end of the string, null-terminate the token we found
	 * and step past the null terminator for next time. */
	if (*p) {
		*p = '\0';
		p++;
	}

	*in = p;

	/* Obviously this should be NULL, but that comes from stddef.h. */
	return s == p ? (void*)0 : s;
}

/* Footnotes!
 * [1]: I do not actually know COBOL, and certainly not COBOL-60; I had to read
 *      a COBOL manual to produce this code fragment. I hope you appreciate my
 *      sacrifice.
 *
 * [2]: I was agog to learn that it is, in fact, "MULTIPLY 2 BY A" and not the
 *      far more natural "MULTIPLY A BY 2". Presumably this is so that the
 *      grammar always has the out parameter second, as with MOVE and ADD, but
 *      also: yikes. So much for programming in English.
 *
 * [3]: The paper I'm referring to throughout is actually "ALGOL Bulletin,
 *      Supplement nr. 10: ALGOL-60 Translation", by E.W. Dijkstra. You can find
 *      it on the internet if you want to read it but it's quite difficult to
 *      decipher and spends a lot of words on things that are not concerns for
 *      modern readers. My copy has sha256sum:
 *        10ccdbcfa9bc399e72fe25f86f24a37935b6de7762c2b3233b4dccea8e8e66d6
 *
 * $#t The Shunting-Yard Algorithm
 * $#s An algorithm for doing precedence-aware parsing of mathematical
 * $#s expressions into a sequence of stack ops in a single pass
 * $#o c, computer-science, programming
 * $#u eaaf9aab-46f3-468f-b74c-43257a18e7ff
 */