;;; parseclj-lex.el --- Clojure/EDN Lexer ;; Copyright (C) 2017-2018 Arne Brasseur ;; Author: Arne Brasseur ;; This file is not part of GNU Emacs. ;; This file is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 3, or (at your option) ;; any later version. ;; This file is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ;; Boston, MA 02110-1301, USA. ;;; Commentary: ;; A reader for EDN data files and parser for Clojure source files. ;;; Code: (defvar parseclj-lex--leaf-tokens '(:whitespace :comment :number :nil :true :false :symbol :keyword :string :regex :character) "Types of tokens that represent leaf nodes in the AST.") (defvar parseclj-lex--closing-tokens '(:rparen :rbracket :rbrace) "Types of tokens that mark the end of a non-atomic form.") (defvar parseclj-lex--prefix-tokens '(:quote :backquote :unquote :unquote-splice :discard :tag :reader-conditional :reader-conditional-splice :var :deref :map-prefix :eval) "Tokens that modify the form that follows.") (defvar parseclj-lex--prefix-2-tokens '(:metadata) "Tokens that modify the two forms that follow.") ;; Token interface (defun parseclj-lex-token (type form pos &rest attributes) "Create a lexer token with the specified attributes. Tokens at a mimimum have these attributes - TYPE: the type of token, like :whitespace or :lparen - FORM: the source form, a string - POS: the position in the input, starts from 1 (like point) Other ATTRIBUTES can be given as a flat list of key-value pairs." (apply 'a-list :token-type type :form form :pos pos attributes)) (defun parseclj-lex-error-token (pos &optional error-type) "Create a lexer error token starting at POS. ERROR-TYPE is an optional keyword to attach to the created token, as the means for providing more information on the error." (apply #'parseclj-lex-token :lex-error (buffer-substring-no-properties pos (point)) pos (when error-type (list :error-type error-type)))) (defun parseclj-lex-token-p (token) "Is the given TOKEN a parseclj-lex TOKEN. A token is an association list with :token-type as its first key." (and (consp token) (consp (car token)) (eq :token-type (caar token)))) (defun parseclj-lex-token-type (token) "Get the type of TOKEN." (and (consp token) (cdr (assq :token-type token)))) (defun parseclj-lex-token-form (token) "Get the form of TOKEN." (and (consp token) (cdr (assq :form token)))) (defun parseclj-lex-leaf-token-p (token) "Return t if the given AST TOKEN is a leaf node." (member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens)) (defun parseclj-lex-closing-token-p (token) "Return t if the given ast TOKEN is a closing token." (member (parseclj-lex-token-type token) parseclj-lex--closing-tokens)) (defun parseclj-lex-error-p (token) "Return t if the TOKEN represents a lexing error token." (eq (parseclj-lex-token-type token) :lex-error)) ;; Elisp values from tokens (defun parseclj-lex--string-value (s) "Parse an EDN string S into a regular string. S goes through three transformations: - Escaped characters in S are transformed into Elisp escaped characters. - Unicode escaped characters are decoded into its corresponding unicode character counterpart. - Octal escaped characters are decoded into its corresponding character counterpart." (replace-regexp-in-string "\\\\o[0-8]\\{3\\}" (lambda (x) (make-string 1 (string-to-number (substring x 2) 8))) (replace-regexp-in-string "\\\\u[0-9a-fA-F]\\{4\\}" (lambda (x) (make-string 1 (string-to-number (substring x 2) 16))) (replace-regexp-in-string "\\\\[tbnrf'\"\\]" (lambda (x) (cl-case (elt x 1) (?t "\t") (?f "\f") (?\" "\"") (?r "\r") (?n "\n") (?\\ "\\\\") (t (substring x 1)))) (substring s 1 -1))))) (defun parseclj-lex--character-value (c) "Parse an EDN character C into an Emacs Lisp character." (let ((first-char (elt c 1))) (cond ((equal c "\\newline") ?\n) ((equal c "\\return") ?\r) ((equal c "\\space") ?\ ) ((equal c "\\tab") ?\t) ((eq first-char ?u) (string-to-number (substring c 2) 16)) ((eq first-char ?o) (string-to-number (substring c 2) 8)) (t first-char)))) (defun parseclj-lex--leaf-token-value (token) "Parse the given leaf TOKEN to an Emacs Lisp value." (cl-case (parseclj-lex-token-type token) (:number (string-to-number (alist-get :form token))) (:nil nil) (:true t) (:false nil) (:symbol (intern (alist-get :form token))) (:keyword (intern (alist-get :form token))) (:string (parseclj-lex--string-value (alist-get :form token))) (:character (parseclj-lex--character-value (alist-get :form token))))) ;; Stream tokenization (defun parseclj-lex-at-whitespace-p () "Return t if char at point is white space." (let ((char (char-after (point)))) (or (equal char ?\ ) (equal char ?\t) (equal char ?\n) (equal char ?\r) (equal char ?,)))) (defun parseclj-lex-at-eof-p () "Return t if point is at the end of file." (eq (point) (point-max))) (defun parseclj-lex-whitespace () "Consume all consecutive white space as possible and return an :whitespace token." (let ((pos (point))) (while (parseclj-lex-at-whitespace-p) (right-char)) (parseclj-lex-token :whitespace (buffer-substring-no-properties pos (point)) pos))) (defun parseclj-lex-skip-digits () "Skip all consecutive digits after point." (while (and (char-after (point)) (<= ?0 (char-after (point))) (<= (char-after (point)) ?9)) (right-char))) (defun parseclj-lex-skip-hex () "Skip all consecutive hex digits after point." (while (and (char-after (point)) (or (<= ?0 (char-after (point)) ?9) (<= ?a (char-after (point)) ?f) (<= ?A (char-after (point)) ?F))) (right-char))) (defun parseclj-lex-skip-number () "Skip a number at point." ;; [\+\-]?\d+\.\d+ (if (and (eq ?0 (char-after (point))) (eq ?x (char-after (1+ (point))))) (progn (right-char 2) (parseclj-lex-skip-hex)) (progn (when (member (char-after (point)) '(?+ ?-)) (right-char)) (parseclj-lex-skip-digits) (when (eq (char-after (point)) ?.) (right-char)) (parseclj-lex-skip-digits)))) (defun parseclj-lex-number () "Consume a number and return a `:number' token representing it." (let ((pos (point))) (parseclj-lex-skip-number) ;; 10110r2 or 4.3e+22 (when (member (char-after (point)) '(?E ?e ?r)) (right-char)) (parseclj-lex-skip-number) ;; trailing M (when (eq (char-after (point)) ?M) (right-char)) (let ((char (char-after (point)))) (if (and char (or (and (<= ?a char) (<= char ?z)) (and (<= ?A char) (<= char ?Z)) (and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/))))) (progn (right-char) (parseclj-lex-error-token pos :invalid-number-format)) (parseclj-lex-token :number (buffer-substring-no-properties pos (point)) pos))))) (defun parseclj-lex-digit-p (char) "Return t if CHAR is a digit." (and char (<= ?0 char) (<= char ?9))) (defun parseclj-lex-at-number-p () "Return t if point is at a number." (let ((char (char-after (point)))) (or (parseclj-lex-digit-p char) (and (member char '(?- ?+ ?.)) (parseclj-lex-digit-p (char-after (1+ (point)))))))) (defun parseclj-lex-symbol-start-p (char &optional alpha-only) "Return t if CHAR is a valid start for a symbol. Symbols begin with a non-numeric character and can contain alphanumeric characters and . * + ! - _ ? $ % & = < > '. If - + or . are the first character, the second character (if any) must be non-numeric. In some cases, like in tagged elements, symbols are required to start with alphabetic characters only. ALPHA-ONLY ensures this behavior." (not (not (and char (or (and (<= ?a char) (<= char ?z)) (and (<= ?A char) (<= char ?Z)) (and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?')))))))) (defun parseclj-lex-symbol-rest-p (char) "Return t if CHAR is a valid character in a symbol. For more information on what determines a valid symbol, see `parseclj-lex-symbol-start-p'" (or (parseclj-lex-symbol-start-p char) (parseclj-lex-digit-p char) (eq ?: char) (eq ?# char))) (defun parseclj-lex-get-symbol-at-point (pos) "Return the symbol at POS as a string." (while (parseclj-lex-symbol-rest-p (char-after (point))) (right-char)) (buffer-substring-no-properties pos (point))) (defun parseclj-lex-symbol () "Return a lex token representing a symbol. Because of their special meaning, symbols \"nil\", \"true\", and \"false\" are returned as their own lex tokens." (let ((pos (point))) (right-char) (let ((sym (parseclj-lex-get-symbol-at-point pos))) (cond ((equal sym "nil") (parseclj-lex-token :nil "nil" pos)) ((equal sym "true") (parseclj-lex-token :true "true" pos)) ((equal sym "false") (parseclj-lex-token :false "false" pos)) (t (parseclj-lex-token :symbol sym pos)))))) (defun parseclj-lex-string* () "Helper for string/regex lexing. Returns either the string, or an error token" (let ((pos (point))) (right-char) (while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p))) (if (equal (char-after (point)) ?\\) (right-char 2) (right-char))) (when (equal (char-after (point)) ?\") (right-char) (buffer-substring-no-properties pos (point))))) (defun parseclj-lex-string () "Return a lex token representing a string. If EOF is reached without finding a closing double quote, a :lex-error token is returned." (let ((pos (point)) (str (parseclj-lex-string*))) (if str (parseclj-lex-token :string str pos) (parseclj-lex-error-token pos :invalid-string)))) (defun parseclj-lex-regex () "Return a lex token representing a regular expression. If EOF is reached without finding a closing double quote, a :lex-error token is returned." (let ((pos (1- (point))) (str (parseclj-lex-string*))) (if str (parseclj-lex-token :regex (concat "#" str) pos) (parseclj-lex-error-token pos :invalid-regex)))) (defun parseclj-lex-lookahead (n) "Return a lookahead string of N characters after point." (buffer-substring-no-properties (point) (min (+ (point) n) (point-max)))) (defun parseclj-lex-character () "Return a lex token representing a character." (let ((pos (point))) (right-char) (cond ((equal (parseclj-lex-lookahead 3) "tab") (right-char 3) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) ((equal (parseclj-lex-lookahead 5) "space") (right-char 5) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) ((equal (parseclj-lex-lookahead 6) "return") (right-char 6) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) ((equal (parseclj-lex-lookahead 7) "newline") (right-char 7) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) ((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5)) (right-char 5) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) ((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4)) (right-char 4) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) (t (right-char) (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))))) (defun parseclj-lex-keyword () "Return a lex token representing a keyword. Keywords follow the same rules as symbols, except they start with one or two colon characters. See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'." (let ((pos (point))) (right-char) (when (equal (char-after (point)) ?:) ;; same-namespace keyword (right-char)) (if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error (progn (right-char) (parseclj-lex-error-token pos :invalid-keyword)) (progn (while (or (parseclj-lex-symbol-rest-p (char-after (point))) (equal (char-after (point)) ?#)) (right-char)) (parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos))))) (defun parseclj-lex-comment () "Return a lex token representing a comment." (let ((pos (point))) (goto-char (line-end-position)) (when (equal (char-after (point)) ?\n) (right-char)) (parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos))) (defun parseclj-lex-map-prefix () "Return a lex token representing a map prefix." (let ((pos (1- (point)))) (right-char) (when (equal (char-after (point)) ?:) (right-char)) (while (parseclj-lex-symbol-rest-p (char-after (point))) (right-char)) (parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos))) (defun parseclj-lex-next () "Consume characters at point and return the next lexical token. See `parseclj-lex-token'." (if (parseclj-lex-at-eof-p) (parseclj-lex-token :eof nil (point)) (let ((char (char-after (point))) (pos (point))) (cond ((parseclj-lex-at-whitespace-p) (parseclj-lex-whitespace)) ((equal char ?\() (right-char) (parseclj-lex-token :lparen "(" pos)) ((equal char ?\)) (right-char) (parseclj-lex-token :rparen ")" pos)) ((equal char ?\[) (right-char) (parseclj-lex-token :lbracket "[" pos)) ((equal char ?\]) (right-char) (parseclj-lex-token :rbracket "]" pos)) ((equal char ?{) (right-char) (parseclj-lex-token :lbrace "{" pos)) ((equal char ?}) (right-char) (parseclj-lex-token :rbrace "}" pos)) ((equal char ?') (right-char) (parseclj-lex-token :quote "'" pos)) ((equal char ?`) (right-char) (parseclj-lex-token :backquote "`" pos)) ((equal char ?~) (right-char) (if (eq ?@ (char-after (point))) (progn (right-char) (parseclj-lex-token :unquote-splice "~@" pos)) (parseclj-lex-token :unquote "~" pos))) ((parseclj-lex-at-number-p) (parseclj-lex-number)) ((parseclj-lex-symbol-start-p char) (parseclj-lex-symbol)) ((equal char ?\") (parseclj-lex-string)) ((equal char ?\\) (parseclj-lex-character)) ((equal char ?:) (parseclj-lex-keyword)) ((equal char ?\;) (parseclj-lex-comment)) ((equal char ?^) (right-char) (parseclj-lex-token :metadata "^" pos)) ((equal char ?@) (right-char) (parseclj-lex-token :deref "@" pos)) ((equal char ?#) (right-char) (let ((char (char-after (point)))) (cond ((equal char ?{) (right-char) (parseclj-lex-token :set "#{" pos)) ((equal char ?_) (right-char) (parseclj-lex-token :discard "#_" pos)) ((equal char ?\() (right-char) (parseclj-lex-token :lambda "#(" pos)) ((equal char ?') (right-char) (parseclj-lex-token :var "#'" pos)) ((equal char ?=) (right-char) (parseclj-lex-token :eval "#=" pos)) ((equal char ?\") (parseclj-lex-regex)) ((equal char ?:) (parseclj-lex-map-prefix)) ((equal char ?\?) (right-char) (if (eq ?@ (char-after (point))) (progn (right-char) (parseclj-lex-token :reader-conditional-splice "#?@" pos)) (parseclj-lex-token :reader-conditional "#?" pos))) ((parseclj-lex-symbol-start-p char t) (right-char) (parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos)) (t (while (not (or (parseclj-lex-at-whitespace-p) (parseclj-lex-at-eof-p))) (right-char)) (parseclj-lex-error-token pos :invalid-hashtag-dispatcher))))) (t (progn (right-char) (parseclj-lex-error-token pos))))))) (provide 'parseclj-lex) ;;; parseclj-lex.el ends here