;; Parser Combinators in Scheme ;; ;; This blog post is an executable Scheme program, written in "pure" ;; R7RS Scheme. You can run it with any R7RS Scheme implementation to ;; see the examples running. Specifically, we'll be using the "small" ;; R7RS standard: (import (scheme small)) ;; Note that if you're using Guile, which doesn't know about (scheme ;; small), you will instead need to run it with --r7rs and swap the ;; above line for: ;; (import (scheme base)) ;; because Scheme portability is fake. :( ;; That having been said, let's start this post off by writing a ;; regular parser. We'll skip over the entire lexing step and just ;; assume that we have a list of tokens that we want to turn into a ;; single parse tree. Tokens are represented as either a single ;; symbol, for literals, or a pair of symbol and value, for non-literals. (define (token-type t) (if (symbol? t) t (car t))) (define (token-value t) (if (symbol? t) #f (cdr t))) (define (token-is? t type) (symbol=? (token-type t) type)) ;; And a parser is a function that takes a list of tokens and returns ;; two things: a parse tree and a list of any remaining unparsed ;; tokens. ;; ;; The language we're going to parse is going to be just a subset of ;; C: ;; ;; program ::= statement* ;; statement ::= var '=' expr ;; | 'if' '(' expr ')' statement ['else' statement] ;; | 'return' expr ;; expr ::= int | var ;; ;; So for example, here's a valid program: ;; a = 1 ;; if (a) a = 2 ;; else a = 3 ;; return a ;; ;; Here's our sample program converted to a list of tokens, using the ;; representation from above. (define *sample-program* '((var . a) equals (int . 1) if lparen (var . a) rparen (var . a) equals (int . 2) else (var . a) equals (int . 3) return (var . a))) ;; Here's how we might write a parser for programs, given a list of ;; tokens: (define (parse-1-expr tokens) (cond ((null? tokens) (values #f tokens)) ((token-is? (car tokens) 'int) (values (token-value (car tokens)) (cdr tokens))) ((token-is? (car tokens) 'var) (values (token-value (car tokens)) (cdr tokens))) (else (values #f tokens)))) (define (parse-1-return tokens) (let-values (((e rest) (parse-1-expr (cdr tokens)))) (if e (values (list 'return e) rest) (values #f tokens)))) (define (parse-1-assignment tokens) (if (and (>= (length tokens) 3) (token-is? (car tokens) 'var) (token-is? (cadr tokens) 'equals)) (begin (let-values (((e rest) (parse-1-expr (cddr tokens)))) (if e (values (list 'assign (token-value (car tokens)) e) rest) (values #f tokens)))) (values #f tokens))) (define (parse-1-if tokens) (cond ((null? tokens) (values #f tokens)) ((and (token-is? (car tokens) 'if) (token-is? (cadr tokens) 'lparen)) (let-values (((condition rest) (parse-1-expr (cddr tokens)))) (cond ((and condition (not (null? rest)) (token-is? (car rest) 'rparen)) (let-values (((ifarm rest) (parse-1-statement (cdr rest)))) (cond ((and ifarm (not (null? rest)) (token-is? (car rest) 'else)) (let-values (((elsearm rest) (parse-1-statement (cdr rest)))) (if elsearm (values (list 'if condition ifarm elsearm) rest) (values #f tokens)))) (else (values (list 'if condition ifarm) rest))))) (else (values #f rest))))) (values #f tokens))) (define (parse-1-statement tokens) (if (null? tokens) (values #f tokens) (let ((head (car tokens))) (case head ((return) (parse-1-return tokens)) ((if) (parse-1-if tokens)) (else (parse-1-assignment tokens)))))) ;; To parse a program, we repeatedly try to parse a statement from ;; the list of tokens. As soon as the statement parser returns false ;; (indicating no more statements), we return the list we have so ;; far, and any unused tokens. (define (parse-1 tokens) (let loop ((statements '()) (tokens tokens)) (let-values (((statement rest) (parse-1-statement tokens))) (if statement (loop (cons statement statements) rest) (values (reverse statements) tokens))))) (define (run-parser p tokens) (let-values (((result rest) (p tokens))) (display result) (newline))) (run-parser parse-1 *sample-program*) ;; Hooray! Our parser is a hot mess, with a bunch of unhandled error ;; cases and a lot of nesting in parse-1-if; we don't have a good tool ;; to express conditions within our parser other than using actual ;; nested Scheme conditionals. For repetition, as in parse-1 itself, ;; we have to use a Scheme loop to represent repetition of a parser. ;; First off, let's see if we can get rid of the nesting in ;; parse-1-if. The reason we need the nesting in the first place is ;; that we need to run several parsers in sequence, and continue ;; running the parser sequence only as long as the parsers are ;; succeeding. Here's how we might express that as a "meta-parser" (or ;; parser combinator), which we'll call p/seq. This function takes a ;; list of parsers, and returns a function that runs all of them on ;; the input sequentially, returning either a list of all their ;; results (if they all succeed) or false (if any of them fails). (define (p/seq . parsers) (lambda (initial-tokens) (let loop ((results '()) (tokens initial-tokens) (parsers parsers)) (cond ;; success! ((null? parsers) (values (reverse results) tokens)) ;; failure - ran out of tokens while we still have parsers ((null? tokens) (values #f initial-tokens)) (else (let-values (((result rest) ((car parsers) tokens))) (if result ;; keep going... (loop (cons result results) rest (cdr parsers)) ;; a parser in the middle failed, fail the whole sequence (values #f initial-tokens)))))))) ;; Let's write another parser combinator which will grab as many ;; repetitions of a parser as it can: (define (p/many parser) (lambda (initial-tokens) (let loop ((results '()) (tokens initial-tokens)) (cond ;; If there are no parsed results, and we're already out of ;; tokens, we just fail. ((and (null? results) (null? tokens)) (values #f tokens)) ;; Otherwise, run the parser again: (else (let-values (((result rest) (parser tokens))) (if result ;; and if it succeeds, keep going: (loop (cons result results) rest) ;; otherwise, we're done! (values (reverse results) rest)))))))) ;; Lastly, we're going to want the ability to match alternatives, so ;; we want a combinator that tries several parsers and returns if any ;; of them succeeds: (define (p/any . parsers) (lambda (tokens) (let loop ((parsers parsers)) (if (null? parsers) ;; none of the parsers matched - return a failure. (values #f tokens) (let-values (((result rest) ((car parsers) tokens))) (if result ;; if that parser succeeds, we're done! Yay (values result rest) (loop (cdr parsers)))))))) ;; That's part of the solution, but the if parser is still annoying to ;; write - we need to introduce a couple of helpers to match primitive ;; things. Those look like this: (define (p/lit what) (lambda (tokens) (if (and (>= (length tokens) 1) (token-is? (car tokens) what)) (values (token-type (car tokens)) (cdr tokens)) (values #f tokens)))) (define (p/type what) (lambda (tokens) (if (and (>= (length tokens) 1) (token-is? (car tokens) what)) (values (token-value (car tokens)) (cdr tokens)) (values #f tokens)))) ;; With these tools in hand, let's try rewriting our parsers! (define parse-2-expr (p/any (p/type 'int) (p/type 'var))) (define parse-2-return (p/seq (p/lit 'return) parse-2-expr)) (define parse-2-assignment (p/seq (p/type 'var) (p/lit 'equals) parse-2-expr)) ;; Wow, this is going great! These are so easy to read. I can't wait to... ;; (define parse-2-if ;; (p/any ;; (p/seq (p/lit 'if) ;; (p/lit 'lparen) ;; parse-2-expr ;; (p/lit 'rparen) ;; parse-2-statement ;; (p/lit 'else) ;; parse-2-statement) ;; (p/seq (p/lit 'if) ;; (p/lit 'lparen) ;; parse-2-expr ;; (p/lit 'rparen) ;; parse-2-statement))) ;; ;; Unbound variable: parse-2-statement ;; ;; Well, rats. Our parsers are no longer a set of mutually recursive ;; functions - instead, they're a set of lambdas which are constructed ;; by calling parser combinators, which means that parse-2-statement's ;; value has to be available already in the body of parse-2-if. To ;; handle that, we're going to do something truly gross, and delay ;; evaluation of what parse-2-statement actually *is* until later, and ;; open-coding the body of p/any to allow for that: (define *parse-2-statement-parsers* '()) (define (parse-2-statement tokens) (let loop ((parsers *parse-2-statement-parsers*)) (if (or (null? tokens) (null? parsers)) (values #f tokens) (let-values (((result rest) ((car parsers) tokens))) (if result (values result rest) (loop (cdr parsers))))))) (define parse-2-if (p/any (p/seq (p/lit 'if) (p/lit 'lparen) parse-2-expr (p/lit 'rparen) parse-2-statement (p/lit 'else) parse-2-statement) (p/seq (p/lit 'if) (p/lit 'lparen) parse-2-expr (p/lit 'rparen) parse-2-statement))) (set! *parse-2-statement-parsers* (list parse-2-return parse-2-assignment parse-2-if)) (define parse-2 (p/many parse-2-statement)) (run-parser parse-2 *sample-program*) ;; Okay, that's... tolerable, but the resulting parse tree is going to ;; need some fixing up, and the body of parse-2-if is still quite ;; ugly. Let's make our combinators a little bit smarter by having ;; p/seq treat literals as being "self-matching": (define (p/seq-lit . parsers) (lambda (initial-tokens) (let loop ((results '()) (tokens initial-tokens) (parsers parsers)) (cond ((null? parsers) (values (reverse results) tokens)) ((null? tokens) (values #f initial-tokens)) ((and (symbol? (car parsers)) (token-is? (car tokens) (car parsers))) (loop (cons (car tokens) results) (cdr tokens) (cdr parsers))) ((symbol? (car parsers)) ;; the current parser is a symbol and it didn't match, or the ;; above branch would've been taken instead - fail (values #f initial-tokens)) (else (let-values (((result rest) ((car parsers) tokens))) (if result (loop (cons result results) rest (cdr parsers)) (values #f initial-tokens)))))))) ;; And with that, we can rewrite the if parser like this: (define parse-3-if (p/any (p/seq-lit 'if 'lparen parse-2-expr 'rparen parse-2-statement 'else parse-2-statement) (p/seq-lit 'if 'lparen parse-2-expr 'rparen parse-2-statement))) ;; There we go! The only remaining problem is making p/seq-lit a ;; little smarter yet, by giving it a "combiner" function that gets ;; given all the parser results from the sub-parsers. The reason we ;; might want that is because right now, an if statement parses like: ;; ;; (if lparen e rparen s1 else s2) ;; ;; and we really just want: ;; ;; (if e s1 s2) ;; ;; Fortunately that's very easy to do, but I won't bother doing it ;; here. ;; ;; This is all neat, but the hack we had to do with parse-2-statement ;; is irritating - we're breaking the parser abstraction *and* ;; introducing mutation. Icky. Fortunately, there's an answer, and it ;; involves playing to one of Scheme's true strengths: writing a ;; domain-specific language. That can be part 2 of this post! Thanks ;; for reading :D