Skip to content
Next Next commit
Refactor clj-parse.el
- Make parser produce an AST by default. - Switch to lexical binding and fix some broken var references. - Reorganizes reduce functions' signatures a bit.
  • Loading branch information
volrath committed Jul 5, 2017
commit 7ae887b1de6bc209958ee1fb69583a819a3c4ed2
163 changes: 85 additions & 78 deletions clj-parse.el
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
;;; clj-parse.el --- Clojure/EDN parser
;;; clj-parse.el --- Clojure/EDN parser -*- lexical-binding: t; -*-

;; Copyright (C) 2017 Arne Brasseur

Expand All @@ -25,6 +25,8 @@
;;; Code:

;; Before emacs 25.1 it's an ELPA package

(require 'a)
(require 'let-alist)
(require 'cl-lib)
(require 'clj-lex)
Expand All @@ -40,6 +42,24 @@
:character)
"Tokens that represent leaf nodes in the AST.")

(defvar clj-parse--closer-tokens '(:rparen
:rbracket
:rbrace)
"Tokens that represent closing of an AST branch.")

(defun clj-parse--is-leaf? (el)
(member (clj-lex-token-type el) clj-parse--leaf-tokens))

(defun clj-parse--is-node? (el)
(a-has-key el 'subnodes))

(defun clj-parse--is-open-prefix? (el)
(and (member (clj-lex-token-type el) '(:discard :tag))
(not (clj-parse--is-node? el))))

(defun clj-parse--make-node (type subnodes &rest kvs)
(apply 'a-list 'type type 'subnodes subnodes kvs))

;; The EDN spec is not clear about wether \u0123 and \o012 are supported in
;; strings. They are described as character literals, but not as string escape
;; codes. In practice all implementations support them (mostly with broken
Expand Down Expand Up @@ -69,97 +89,84 @@
(substring s 1 -1)))))

(defun clj-parse-character (c)
(let* ((form (cdr (assq 'form token)))
(first-char (elt form 1)))
(let ((first-char (elt c 1)))
(cond
((equal form "\\newline") ?\n)
((equal form "\\return") ?\r)
((equal form "\\space") ?\ )
((equal form "\\tab") ?\t)
((eq first-char ?u) (string-to-number (substring form 2) 16))
((eq first-char ?o) (string-to-number (substring form 2) 8))
((equal c "\\newline") ?\n)
((equal c "\\return") ?\r)
((equal c "\\space") ?\ )
((equal c "\\tab") ?\t)
((eq first-char ?u) (string-to-number (substring c 2) 16))
((eq first-char ?o) (string-to-number (substring c 2) 8))
(t first-char))))

(defun clj-parse-edn-reduce1 (stack token)
(cl-case (cdr (assq 'type token))
(:whitespace stack)
(:number (cons (string-to-number (cdr (assq 'form token))) stack))
(:nil (cons nil stack))
(:true (cons t stack))
(:false (cons nil stack))
(:symbol (cons (intern (cdr (assq 'form token))) stack))
(:keyword (cons (intern (cdr (assq 'form token))) stack))
(:string (cons (clj-parse-string (cdr (assq 'form token))) stack))
(:character (cons (clj-parse-character (cdr (assq 'form token))) stack))))

(defun clj-parse-edn-reduceN (stack type coll)
(if (eq :discard type)
stack
(cons
(cl-case type
(:whitespace :ws)
(:number coll)
(:list (-butlast (cdr coll)))
(:set (-butlast (cdr coll)))
(:vector (apply #'vector (-butlast (cdr coll))))
(:map (mapcar (lambda (pair)
(cons (car pair) (cadr pair)))
(-partition 2 (-butlast (cdr coll))))))
stack)))

(defun clj-parse--reduce-coll (stack open-token coll-type reducN)
(let ((coll nil))
(defun clj-parse--next ()
(setq next (clj-lex-next))
(while (eq (clj-lex-token-type next) :whitespace)
(setq next (clj-parse--next)))
next)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made this to simplify clj-parse--ast-reduce1, and it's ok as long as we only use this parser to produce AST, but if we want to use the clj-parse--reduce algorithm to produce CST (not entirely sure we would want to do that) then we can get rid of it and handle whitespaces when reducing leafs.


(defun clj-parse--ast-reduce1 (stack leaf)
(push leaf stack))

(defun clj-parse--ast-reduceN (stack node subnodes)
(push
(cl-case (clj-lex-token-type node)
(:lparen (clj-parse--make-node :list subnodes))
(:lbracket (clj-parse--make-node :vector subnodes))
(:set (clj-parse--make-node :set subnodes))
(:lbrace (clj-parse--make-node :map subnodes))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left maps as a collection of subnodes so that printers would transform them into whatever they need, but maybe we want kv alists in our AST instead?

(:discard (clj-parse--make-node :discard subnodes)))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For :discard, the subnodes list only has one element, always. Right now the resulting node for a discard token includes this 1-sized list as the list of subnodes, but we could also get rid of the list and just add the its internal node (the discarded element). I'm guessing this depends on implementation details for the AST transversing funtions/API.

Same for :tag nodes.

stack))

(defun clj-parse--find-opener (stack closer-token)
(cl-case (clj-lex-token-type closer-token)
(:rparen :lparen)
(:rbracket :lbracket)
(:rbrace (clj-lex-token-type
(-find (lambda (token) (member (clj-lex-token-type token) '(:lbrace :set))) stack)))))

(defun clj-parse--reduce-coll (stack closer-token reduceN)
"Reduce collection based on the top of the stack"
(let ((opener-type (clj-parse--find-opener stack closer-token))
(coll nil))
(while (and stack
(not (eq (clj-lex-token-type (car stack)) open-token)))
(not (eq (clj-lex-token-type (car stack)) opener-type)))
(push (pop stack) coll))
(if (eq (clj-lex-token-type (car stack)) open-token)
(progn
(push (pop stack) coll)
(funcall reduceN stack coll-type coll))
;; Unwound the stack without finding a matching paren: return the original stack
(reverse list))))

(if (eq (clj-lex-token-type (car stack)) opener-type)
(let ((node (pop stack)))
(funcall reduceN stack node coll))
;; Syntax error
(error "Syntax Error"))))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser should not throw syntax errors. Instead it should continue parsing and return a partial result. You can inspect the result afterwards to find syntax errors, or raise them in the reducer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you say "the reducer", do you mean the reduce1/reduceN functions? or the "to-elisp" / "to-clojure-string" functions?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, reduce1 and reduceN, which maybe should be called reduce-leaf and reduce-node. Or even shift-leaf and reduce-node, since the first one will typically "shift" a single leaf node onto the stack, whereas the second one "reduces" a number of nodes into a single node.

you can think of parsing as three layers working together: the lexer, "the parser", "the reducer". The last part is pluggable to support different outputs.

to-clojure-string would be "the printer"


(defun clj-parse-reduce (reduce1 reduceN)
(let ((stack nil)
(token (clj-lex-next)))
(let ((stack nil))

(while (not (eq (clj-lex-token-type token) :eof))
(while (not (eq (clj-lex-token-type (setq token (clj-parse--next))) :eof))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser should not throw away whitespace, that's the job of the reducing functions (if they choose to do so). Otherwise the parser is no longer able to generate an AST that can round-trip.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, I left you a comment about it in the clj-parse--next function definition. I was on the fence about this as well.

(message "STACK: %S" stack)
(message "TOKEN: %S\n" token)

(setf stack
(if (member (clj-lex-token-type token)
clj-parse--leaf-tokens)
(funcall reduce1 stack token)
(cons token stack)))

;; Reduce based on the top item on the stack (collections)
(cl-case (clj-lex-token-type (car stack))
(:rparen (setf stack (clj-parse--reduce-coll stack :lparen :list reduceN)))
(:rbracket (setf stack (clj-parse--reduce-coll stack :lbracket :vector reduceN)))
(:rbrace
(let ((open-token (-find (lambda (token)
(member (clj-lex-token-type token) '(:lbrace :set)))
stack)))

(cl-case (clj-lex-token-type open-token)
(:lbrace
(setf stack (clj-parse--reduce-coll stack :lbrace :map reduceN)))
(:set
(setf stack (clj-parse--reduce-coll stack :set :set reduceN)))))))

;; Reduce based on top two items on the stack
(if (not (clj-lex-token? (car stack))) ;; top is fully reduced
(cl-case (clj-lex-token-type (cadr stack))
(:discard (setf stack (funcall reduceN (cddr stack) :discard (-take 2 stack))))))

(setq token (clj-lex-next)))

(message "RESULT: %S" stack)
stack))
(let ((token-type (clj-lex-token-type token)))
(cond
((member token-type clj-parse--leaf-tokens) (setf stack (funcall reduce1 stack token)))
((member token-type clj-parse--closer-tokens) (setf stack (clj-parse--reduce-coll stack token reduceN)))
(t (push token stack))))

;; Reduce based on top two items on the stack (special prefixed elements)
(seq-let [top lookup] stack
(when (and (clj-parse--is-open-prefix? lookup)
(or (clj-parse--is-node? top)
(clj-parse--is-leaf? top))) ;; top is fully reduced
(setf stack (funcall reduceN (cddr stack) lookup (list top))))))

;; reduce root
(let ((root (clj-parse--make-node :root stack)))
(message "RESULT: %S" root)
root)))

(defun clj-parse ()
(clj-parse-reduce 'clj-parse-edn-reduce1 'clj-parse-edn-reduceN))
(clj-parse-reduce #'clj-parse--ast-reduce1 #'clj-parse--ast-reduceN))

(provide 'clj-parse)

Expand Down