;; Parser Combinators in Scheme
;;
;; This blog post is an executable Scheme program, written in "pure"
;; R7RS Scheme. You can run it with any R7RS Scheme implementation to
;; see the examples running. Specifically, we'll be using the "small"
;; R7RS standard:

(import (scheme small))

;; Note that if you're using Guile, which doesn't know about (scheme
;; small), you will instead need to run it with --r7rs and swap the
;; above line for:
;;   (import (scheme base))
;; because Scheme portability is fake. :(

;; That having been said, let's start this post off by writing a
;; regular parser. We'll skip over the entire lexing step and just
;; assume that we have a list of tokens that we want to turn into a
;; single parse tree. Tokens are represented as either a single
;; symbol, for literals, or a pair of symbol and value, for non-literals.

(define (token-type t) (if (symbol? t) t (car t)))
(define (token-value t) (if (symbol? t) #f (cdr t)))
(define (token-is? t type) (symbol=? (token-type t) type))

;; And a parser is a function that takes a list of tokens and returns
;; two things: a parse tree and a list of any remaining unparsed
;; tokens.
;;
;; The language we're going to parse is going to be just a subset of
;; C:
;;
;;   program   ::= statement*
;;   statement ::= var '=' expr
;;               | 'if' '(' expr ')' statement ['else' statement]
;;               | 'return' expr
;;        expr ::= int | var
;;
;; So for example, here's a valid program:
;;    a = 1
;;    if (a) a = 2
;;    else a = 3
;;    return a
;;
;; Here's our sample program converted to a list of tokens, using the
;; representation from above.
(define *sample-program*
  '((var . a) equals (int . 1)
    if lparen (var . a) rparen
    (var . a) equals (int . 2)
    else (var . a) equals (int . 3)
    return (var . a)))

;; Here's how we might write a parser for programs, given a list of
;; tokens:
(define (parse-1-expr tokens)
  (cond
   ((null? tokens) (values #f tokens))
   ((token-is? (car tokens) 'int)
    (values (token-value (car tokens)) (cdr tokens)))
   ((token-is? (car tokens) 'var)
    (values (token-value (car tokens)) (cdr tokens)))
   (else (values #f tokens))))

(define (parse-1-return tokens)
  (let-values (((e rest) (parse-1-expr (cdr tokens))))
    (if e
	(values (list 'return e) rest)
	(values #f tokens))))

(define (parse-1-assignment tokens)
  (if (and (>= (length tokens) 3)
	   (token-is? (car tokens) 'var)
	   (token-is? (cadr tokens) 'equals))
      (begin
	(let-values (((e rest) (parse-1-expr (cddr tokens))))
	  (if e
	      (values
	       (list 'assign (token-value (car tokens)) e)
	       rest)
	      (values #f tokens))))
      (values #f tokens)))

(define (parse-1-if tokens)
  (cond
   ((null? tokens) (values #f tokens))
   ((and (token-is? (car tokens) 'if)
	 (token-is? (cadr tokens) 'lparen))
    (let-values (((condition rest) (parse-1-expr (cddr tokens))))
      (cond
       ((and condition
	     (not (null? rest))
	     (token-is? (car rest) 'rparen))
	(let-values (((ifarm rest) (parse-1-statement (cdr rest))))
	  (cond
	   ((and ifarm
		 (not (null? rest))
		 (token-is? (car rest) 'else))
	    (let-values (((elsearm rest) (parse-1-statement (cdr
							   rest))))
	      (if elsearm
		  (values (list 'if condition ifarm elsearm) rest)
		  (values #f tokens))))
	   (else (values (list 'if condition ifarm) rest)))))
       (else (values #f rest)))))
   (values #f tokens)))
  
(define (parse-1-statement tokens)
  (if (null? tokens)
      (values #f tokens)
      (let ((head (car tokens)))
	(case head
	  ((return) (parse-1-return tokens))
	  ((if) (parse-1-if tokens))
	  (else (parse-1-assignment tokens))))))

;; To parse a program, we repeatedly try to parse a statement from
;; the list of tokens. As soon as the statement parser returns false
;; (indicating no more statements), we return the list we have so
;; far, and any unused tokens.
(define (parse-1 tokens)
  (let loop ((statements '()) (tokens tokens))
    (let-values (((statement rest) (parse-1-statement tokens)))
      (if statement
	  (loop (cons statement statements) rest)
	  (values (reverse statements) tokens)))))

(define (run-parser p tokens)
  (let-values (((result rest) (p tokens)))
    (display result)
    (newline)))

(run-parser parse-1 *sample-program*)

;; Hooray! Our parser is a hot mess, with a bunch of unhandled error
;; cases and a lot of nesting in parse-1-if; we don't have a good tool
;; to express conditions within our parser other than using actual
;; nested Scheme conditionals. For repetition, as in parse-1 itself,
;; we have to use a Scheme loop to represent repetition of a parser.

;; First off, let's see if we can get rid of the nesting in
;; parse-1-if. The reason we need the nesting in the first place is
;; that we need to run several parsers in sequence, and continue
;; running the parser sequence only as long as the parsers are
;; succeeding. Here's how we might express that as a "meta-parser" (or
;; parser combinator), which we'll call p/seq. This function takes a
;; list of parsers, and returns a function that runs all of them on
;; the input sequentially, returning either a list of all their
;; results (if they all succeed) or false (if any of them fails).
(define (p/seq . parsers)
  (lambda (initial-tokens)
    (let loop ((results '())
	       (tokens initial-tokens)
	       (parsers parsers))
      (cond
       ;; success!
       ((null? parsers) (values (reverse results) tokens))

       ;; failure - ran out of tokens while we still have parsers
       ((null? tokens) (values #f initial-tokens))

       (else
	(let-values (((result rest) ((car parsers) tokens)))
	  (if result
	      
	      ;; keep going...
	      (loop (cons result results) rest (cdr parsers))

	      ;; a parser in the middle failed, fail the whole sequence
	      (values #f initial-tokens))))))))

;; Let's write another parser combinator which will grab as many
;; repetitions of a parser as it can:
(define (p/many parser)
  (lambda (initial-tokens)
    (let loop ((results '()) (tokens initial-tokens))
      (cond
       ;; If there are no parsed results, and we're already out of
       ;; tokens, we just fail.
       ((and (null? results) (null? tokens)) (values #f tokens))

       ;; Otherwise, run the parser again:
       (else
	(let-values (((result rest) (parser tokens)))
	  (if result
	      ;; and if it succeeds, keep going:
	      (loop (cons result results) rest)

	      ;; otherwise, we're done!
	      (values (reverse results) rest))))))))

;; Lastly, we're going to want the ability to match alternatives, so
;; we want a combinator that tries several parsers and returns if any
;; of them succeeds:
(define (p/any . parsers)
  (lambda (tokens)
    (let loop ((parsers parsers))
      (if (null? parsers)

	  ;; none of the parsers matched - return a failure.
	  (values #f tokens)

	  (let-values (((result rest) ((car parsers) tokens)))
	    (if result
		;; if that parser succeeds, we're done! Yay
		(values result rest)
		(loop (cdr parsers))))))))

;; That's part of the solution, but the if parser is still annoying to
;; write - we need to introduce a couple of helpers to match primitive
;; things. Those look like this:
(define (p/lit what)
  (lambda (tokens)
    (if (and (>= (length tokens) 1)
	     (token-is? (car tokens) what))
	(values (token-type (car tokens)) (cdr tokens))
	(values #f tokens))))

(define (p/type what)
  (lambda (tokens)
    (if (and (>= (length tokens) 1)
	     (token-is? (car tokens) what))
	(values (token-value (car tokens)) (cdr tokens))
	(values #f tokens))))

;; With these tools in hand, let's try rewriting our parsers!

(define parse-2-expr (p/any (p/type 'int) (p/type 'var)))
(define parse-2-return (p/seq (p/lit 'return) parse-2-expr))
(define parse-2-assignment
  (p/seq (p/type 'var)
	 (p/lit 'equals)
	 parse-2-expr))

;; Wow, this is going great! These are so easy to read. I can't wait to...

;; (define parse-2-if
;;   (p/any
;;    (p/seq (p/lit 'if)
;; 	  (p/lit 'lparen)
;; 	  parse-2-expr
;; 	  (p/lit 'rparen)
;; 	  parse-2-statement
;; 	  (p/lit 'else)
;; 	  parse-2-statement)
;;    (p/seq (p/lit 'if)
;; 	  (p/lit 'lparen)
;; 	  parse-2-expr
;; 	  (p/lit 'rparen)
;; 	  parse-2-statement)))
;;
;; Unbound variable: parse-2-statement
;;
;; Well, rats. Our parsers are no longer a set of mutually recursive
;; functions - instead, they're a set of lambdas which are constructed
;; by calling parser combinators, which means that parse-2-statement's
;; value has to be available already in the body of parse-2-if. To
;; handle that, we're going to do something truly gross, and delay
;; evaluation of what parse-2-statement actually *is* until later, and
;; open-coding the body of p/any to allow for that:

(define *parse-2-statement-parsers* '())
(define (parse-2-statement tokens)
  (let loop ((parsers *parse-2-statement-parsers*))
    (if (or (null? tokens) (null? parsers))
	(values #f tokens)
	(let-values (((result rest) ((car parsers) tokens)))
	  (if result
	      (values result rest)
	      (loop (cdr parsers)))))))

(define parse-2-if
  (p/any
   (p/seq (p/lit 'if)
	  (p/lit 'lparen)
	  parse-2-expr
	  (p/lit 'rparen)
	  parse-2-statement
	  (p/lit 'else)
	  parse-2-statement)
   (p/seq (p/lit 'if)
	  (p/lit 'lparen)
	  parse-2-expr
	  (p/lit 'rparen)
	  parse-2-statement)))

(set! *parse-2-statement-parsers*
  (list parse-2-return
	parse-2-assignment
	parse-2-if))
(define parse-2
  (p/many parse-2-statement))

(run-parser parse-2 *sample-program*)

;; Okay, that's... tolerable, but the resulting parse tree is going to
;; need some fixing up, and the body of parse-2-if is still quite
;; ugly. Let's make our combinators a little bit smarter by having
;; p/seq treat literals as being "self-matching":

(define (p/seq-lit . parsers)
  (lambda (initial-tokens)
    (let loop ((results '())
	       (tokens initial-tokens)
	       (parsers parsers))
      (cond
       ((null? parsers) (values (reverse results) tokens))
       ((null? tokens) (values #f initial-tokens))
       ((and (symbol? (car parsers))
	     (token-is? (car tokens) (car parsers)))
	(loop (cons (car tokens) results) (cdr tokens) (cdr parsers)))
       ((symbol? (car parsers))
	;; the current parser is a symbol and it didn't match, or the
	;; above branch would've been taken instead - fail
	(values #f initial-tokens))
       (else
	(let-values (((result rest) ((car parsers) tokens)))
	  (if result
	      (loop (cons result results) rest (cdr parsers))
	      (values #f initial-tokens))))))))

;; And with that, we can rewrite the if parser like this:
(define parse-3-if
  (p/any
   (p/seq-lit 'if 'lparen parse-2-expr 'rparen parse-2-statement
	      'else parse-2-statement)
   (p/seq-lit 'if 'lparen parse-2-expr 'rparen parse-2-statement)))

;; There we go! The only remaining problem is making p/seq-lit a
;; little smarter yet, by giving it a "combiner" function that gets
;; given all the parser results from the sub-parsers. The reason we
;; might want that is because right now, an if statement parses like:
;;
;;   (if lparen e rparen s1 else s2)
;;
;; and we really just want:
;;
;;   (if e s1 s2)
;;
;; Fortunately that's very easy to do, but I won't bother doing it
;; here.
;;
;; This is all neat, but the hack we had to do with parse-2-statement
;; is irritating - we're breaking the parser abstraction *and*
;; introducing mutation. Icky. Fortunately, there's an answer, and it
;; involves playing to one of Scheme's true strengths: writing a
;; domain-specific language. That can be part 2 of this post! Thanks
;; for reading :D

;; $#t Parser Combinators
;; $#s A neat technique for building parsers out of smaller parsers
;; $#o compilers, computer-science, scheme
;; $#u 17970ff6-5f9e-4cdc-8740-dfada20a04b7