|
|
- ;;; parseclj-lex.el --- Clojure/EDN Lexer
-
- ;; Copyright (C) 2017-2018 Arne Brasseur
-
- ;; Author: Arne Brasseur <arne@arnebrasseur.net>
-
- ;; This file is not part of GNU Emacs.
-
- ;; This file is free software; you can redistribute it and/or modify
- ;; it under the terms of the GNU General Public License as published by
- ;; the Free Software Foundation; either version 3, or (at your option)
- ;; any later version.
-
- ;; This file is distributed in the hope that it will be useful,
- ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;; GNU General Public License for more details.
-
- ;; You should have received a copy of the GNU General Public License
- ;; along with GNU Emacs; see the file COPYING. If not, write to
- ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- ;; Boston, MA 02110-1301, USA.
-
- ;;; Commentary:
-
- ;; A reader for EDN data files and parser for Clojure source files.
-
- ;;; Code:
-
- (defvar parseclj-lex--leaf-tokens '(:whitespace
- :comment
- :number
- :nil
- :true
- :false
- :symbol
- :keyword
- :string
- :regex
- :character)
- "Types of tokens that represent leaf nodes in the AST.")
-
- (defvar parseclj-lex--closing-tokens '(:rparen
- :rbracket
- :rbrace)
- "Types of tokens that mark the end of a non-atomic form.")
-
- (defvar parseclj-lex--prefix-tokens '(:quote
- :backquote
- :unquote
- :unquote-splice
- :discard
- :tag
- :reader-conditional
- :reader-conditional-splice
- :var
- :deref
- :map-prefix
- :eval)
- "Tokens that modify the form that follows.")
-
- (defvar parseclj-lex--prefix-2-tokens '(:metadata)
- "Tokens that modify the two forms that follow.")
-
- ;; Token interface
-
- (defun parseclj-lex-token (type form pos &rest attributes)
- "Create a lexer token with the specified attributes.
-
- Tokens at a mimimum have these attributes
- - TYPE: the type of token, like :whitespace or :lparen
- - FORM: the source form, a string
- - POS: the position in the input, starts from 1 (like point)
-
- Other ATTRIBUTES can be given as a flat list of key-value pairs."
- (apply 'a-list :token-type type :form form :pos pos attributes))
-
- (defun parseclj-lex-error-token (pos &optional error-type)
- "Create a lexer error token starting at POS.
- ERROR-TYPE is an optional keyword to attach to the created token,
- as the means for providing more information on the error."
- (apply #'parseclj-lex-token
- :lex-error
- (buffer-substring-no-properties pos (point))
- pos
- (when error-type
- (list :error-type error-type))))
-
- (defun parseclj-lex-token-p (token)
- "Is the given TOKEN a parseclj-lex TOKEN.
-
- A token is an association list with :token-type as its first key."
- (and (consp token)
- (consp (car token))
- (eq :token-type (caar token))))
-
- (defun parseclj-lex-token-type (token)
- "Get the type of TOKEN."
- (and (consp token)
- (cdr (assq :token-type token))))
-
- (defun parseclj-lex-token-form (token)
- "Get the form of TOKEN."
- (and (consp token)
- (cdr (assq :form token))))
-
- (defun parseclj-lex-leaf-token-p (token)
- "Return t if the given AST TOKEN is a leaf node."
- (member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens))
-
- (defun parseclj-lex-closing-token-p (token)
- "Return t if the given ast TOKEN is a closing token."
- (member (parseclj-lex-token-type token) parseclj-lex--closing-tokens))
-
- (defun parseclj-lex-error-p (token)
- "Return t if the TOKEN represents a lexing error token."
- (eq (parseclj-lex-token-type token) :lex-error))
-
- ;; Elisp values from tokens
-
- (defun parseclj-lex--string-value (s)
- "Parse an EDN string S into a regular string.
- S goes through three transformations:
- - Escaped characters in S are transformed into Elisp escaped
- characters.
- - Unicode escaped characters are decoded into its corresponding
- unicode character counterpart.
- - Octal escaped characters are decoded into its corresponding
- character counterpart."
- (replace-regexp-in-string
- "\\\\o[0-8]\\{3\\}"
- (lambda (x)
- (make-string 1 (string-to-number (substring x 2) 8)))
- (replace-regexp-in-string
- "\\\\u[0-9a-fA-F]\\{4\\}"
- (lambda (x)
- (make-string 1 (string-to-number (substring x 2) 16)))
- (replace-regexp-in-string "\\\\[tbnrf'\"\\]"
- (lambda (x)
- (cl-case (elt x 1)
- (?t "\t")
- (?f "\f")
- (?\" "\"")
- (?r "\r")
- (?n "\n")
- (?\\ "\\\\")
- (t (substring x 1))))
- (substring s 1 -1)))))
-
- (defun parseclj-lex--character-value (c)
- "Parse an EDN character C into an Emacs Lisp character."
- (let ((first-char (elt c 1)))
- (cond
- ((equal c "\\newline") ?\n)
- ((equal c "\\return") ?\r)
- ((equal c "\\space") ?\ )
- ((equal c "\\tab") ?\t)
- ((eq first-char ?u) (string-to-number (substring c 2) 16))
- ((eq first-char ?o) (string-to-number (substring c 2) 8))
- (t first-char))))
-
- (defun parseclj-lex--leaf-token-value (token)
- "Parse the given leaf TOKEN to an Emacs Lisp value."
- (cl-case (parseclj-lex-token-type token)
- (:number (string-to-number (alist-get :form token)))
- (:nil nil)
- (:true t)
- (:false nil)
- (:symbol (intern (alist-get :form token)))
- (:keyword (intern (alist-get :form token)))
- (:string (parseclj-lex--string-value (alist-get :form token)))
- (:character (parseclj-lex--character-value (alist-get :form token)))))
-
-
- ;; Stream tokenization
-
- (defun parseclj-lex-at-whitespace-p ()
- "Return t if char at point is white space."
- (let ((char (char-after (point))))
- (or (equal char ?\ )
- (equal char ?\t)
- (equal char ?\n)
- (equal char ?\r)
- (equal char ?,))))
-
- (defun parseclj-lex-at-eof-p ()
- "Return t if point is at the end of file."
- (eq (point) (point-max)))
-
- (defun parseclj-lex-whitespace ()
- "Consume all consecutive white space as possible and return an :whitespace token."
- (let ((pos (point)))
- (while (parseclj-lex-at-whitespace-p)
- (right-char))
- (parseclj-lex-token :whitespace
- (buffer-substring-no-properties pos (point))
- pos)))
-
- (defun parseclj-lex-skip-digits ()
- "Skip all consecutive digits after point."
- (while (and (char-after (point))
- (<= ?0 (char-after (point)))
- (<= (char-after (point)) ?9))
- (right-char)))
-
- (defun parseclj-lex-skip-hex ()
- "Skip all consecutive hex digits after point."
- (while (and (char-after (point))
- (or (<= ?0 (char-after (point)) ?9)
- (<= ?a (char-after (point)) ?f)
- (<= ?A (char-after (point)) ?F)))
- (right-char)))
-
- (defun parseclj-lex-skip-number ()
- "Skip a number at point."
- ;; [\+\-]?\d+\.\d+
- (if (and (eq ?0 (char-after (point)))
- (eq ?x (char-after (1+ (point)))))
- (progn
- (right-char 2)
- (parseclj-lex-skip-hex))
- (progn
- (when (member (char-after (point)) '(?+ ?-))
- (right-char))
-
- (parseclj-lex-skip-digits)
-
- (when (eq (char-after (point)) ?.)
- (right-char))
-
- (parseclj-lex-skip-digits))))
-
- (defun parseclj-lex-number ()
- "Consume a number and return a `:number' token representing it."
- (let ((pos (point)))
- (parseclj-lex-skip-number)
-
- ;; 10110r2 or 4.3e+22
- (when (member (char-after (point)) '(?E ?e ?r))
- (right-char))
-
- (parseclj-lex-skip-number)
-
- ;; trailing M
- (when (eq (char-after (point)) ?M)
- (right-char))
-
- (let ((char (char-after (point))))
- (if (and char (or (and (<= ?a char) (<= char ?z))
- (and (<= ?A char) (<= char ?Z))
- (and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/)))))
- (progn
- (right-char)
- (parseclj-lex-error-token pos :invalid-number-format))
- (parseclj-lex-token :number
- (buffer-substring-no-properties pos (point))
- pos)))))
-
-
- (defun parseclj-lex-digit-p (char)
- "Return t if CHAR is a digit."
- (and char (<= ?0 char) (<= char ?9)))
-
- (defun parseclj-lex-at-number-p ()
- "Return t if point is at a number."
- (let ((char (char-after (point))))
- (or (parseclj-lex-digit-p char)
- (and (member char '(?- ?+ ?.))
- (parseclj-lex-digit-p (char-after (1+ (point))))))))
-
- (defun parseclj-lex-symbol-start-p (char &optional alpha-only)
- "Return t if CHAR is a valid start for a symbol.
-
- Symbols begin with a non-numeric character and can contain alphanumeric
- characters and . * + ! - _ ? $ % & = < > '. If - + or . are the first
- character, the second character (if any) must be non-numeric.
-
- In some cases, like in tagged elements, symbols are required to start with
- alphabetic characters only. ALPHA-ONLY ensures this behavior."
- (not (not (and char
- (or (and (<= ?a char) (<= char ?z))
- (and (<= ?A char) (<= char ?Z))
- (and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?'))))))))
-
- (defun parseclj-lex-symbol-rest-p (char)
- "Return t if CHAR is a valid character in a symbol.
- For more information on what determines a valid symbol, see
- `parseclj-lex-symbol-start-p'"
- (or (parseclj-lex-symbol-start-p char)
- (parseclj-lex-digit-p char)
- (eq ?: char)
- (eq ?# char)))
-
- (defun parseclj-lex-get-symbol-at-point (pos)
- "Return the symbol at POS as a string."
- (while (parseclj-lex-symbol-rest-p (char-after (point)))
- (right-char))
- (buffer-substring-no-properties pos (point)))
-
- (defun parseclj-lex-symbol ()
- "Return a lex token representing a symbol.
- Because of their special meaning, symbols \"nil\", \"true\", and \"false\"
- are returned as their own lex tokens."
- (let ((pos (point)))
- (right-char)
- (let ((sym (parseclj-lex-get-symbol-at-point pos)))
- (cond
- ((equal sym "nil") (parseclj-lex-token :nil "nil" pos))
- ((equal sym "true") (parseclj-lex-token :true "true" pos))
- ((equal sym "false") (parseclj-lex-token :false "false" pos))
- (t (parseclj-lex-token :symbol sym pos))))))
-
- (defun parseclj-lex-string* ()
- "Helper for string/regex lexing.
- Returns either the string, or an error token"
- (let ((pos (point)))
- (right-char)
- (while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p)))
- (if (equal (char-after (point)) ?\\)
- (right-char 2)
- (right-char)))
- (when (equal (char-after (point)) ?\")
- (right-char)
- (buffer-substring-no-properties pos (point)))))
-
- (defun parseclj-lex-string ()
- "Return a lex token representing a string.
- If EOF is reached without finding a closing double quote, a :lex-error
- token is returned."
- (let ((pos (point))
- (str (parseclj-lex-string*)))
- (if str
- (parseclj-lex-token :string str pos)
- (parseclj-lex-error-token pos :invalid-string))))
-
- (defun parseclj-lex-regex ()
- "Return a lex token representing a regular expression.
- If EOF is reached without finding a closing double quote, a :lex-error
- token is returned."
- (let ((pos (1- (point)))
- (str (parseclj-lex-string*)))
- (if str
- (parseclj-lex-token :regex (concat "#" str) pos)
- (parseclj-lex-error-token pos :invalid-regex))))
-
- (defun parseclj-lex-lookahead (n)
- "Return a lookahead string of N characters after point."
- (buffer-substring-no-properties (point) (min (+ (point) n) (point-max))))
-
- (defun parseclj-lex-character ()
- "Return a lex token representing a character."
- (let ((pos (point)))
- (right-char)
- (cond
- ((equal (parseclj-lex-lookahead 3) "tab")
- (right-char 3)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- ((equal (parseclj-lex-lookahead 5) "space")
- (right-char 5)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- ((equal (parseclj-lex-lookahead 6) "return")
- (right-char 6)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- ((equal (parseclj-lex-lookahead 7) "newline")
- (right-char 7)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- ((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5))
- (right-char 5)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- ((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4))
- (right-char 4)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
-
- (t
- (right-char)
- (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)))))
-
- (defun parseclj-lex-keyword ()
- "Return a lex token representing a keyword.
- Keywords follow the same rules as symbols, except they start with one or
- two colon characters.
-
- See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'."
- (let ((pos (point)))
- (right-char)
- (when (equal (char-after (point)) ?:) ;; same-namespace keyword
- (right-char))
- (if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error
- (progn
- (right-char)
- (parseclj-lex-error-token pos :invalid-keyword))
- (progn
- (while (or (parseclj-lex-symbol-rest-p (char-after (point)))
- (equal (char-after (point)) ?#))
- (right-char))
- (parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos)))))
-
- (defun parseclj-lex-comment ()
- "Return a lex token representing a comment."
- (let ((pos (point)))
- (goto-char (line-end-position))
- (when (equal (char-after (point)) ?\n)
- (right-char))
- (parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos)))
-
- (defun parseclj-lex-map-prefix ()
- "Return a lex token representing a map prefix."
- (let ((pos (1- (point))))
- (right-char)
- (when (equal (char-after (point)) ?:)
- (right-char))
- (while (parseclj-lex-symbol-rest-p (char-after (point)))
- (right-char))
- (parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos)))
-
- (defun parseclj-lex-next ()
- "Consume characters at point and return the next lexical token.
-
- See `parseclj-lex-token'."
- (if (parseclj-lex-at-eof-p)
- (parseclj-lex-token :eof nil (point))
- (let ((char (char-after (point)))
- (pos (point)))
- (cond
- ((parseclj-lex-at-whitespace-p)
- (parseclj-lex-whitespace))
-
- ((equal char ?\()
- (right-char)
- (parseclj-lex-token :lparen "(" pos))
-
- ((equal char ?\))
- (right-char)
- (parseclj-lex-token :rparen ")" pos))
-
- ((equal char ?\[)
- (right-char)
- (parseclj-lex-token :lbracket "[" pos))
-
- ((equal char ?\])
- (right-char)
- (parseclj-lex-token :rbracket "]" pos))
-
- ((equal char ?{)
- (right-char)
- (parseclj-lex-token :lbrace "{" pos))
-
- ((equal char ?})
- (right-char)
- (parseclj-lex-token :rbrace "}" pos))
-
- ((equal char ?')
- (right-char)
- (parseclj-lex-token :quote "'" pos))
-
- ((equal char ?`)
- (right-char)
- (parseclj-lex-token :backquote "`" pos))
-
- ((equal char ?~)
- (right-char)
- (if (eq ?@ (char-after (point)))
- (progn
- (right-char)
- (parseclj-lex-token :unquote-splice "~@" pos))
- (parseclj-lex-token :unquote "~" pos)))
-
- ((parseclj-lex-at-number-p)
- (parseclj-lex-number))
-
- ((parseclj-lex-symbol-start-p char)
- (parseclj-lex-symbol))
-
- ((equal char ?\")
- (parseclj-lex-string))
-
- ((equal char ?\\)
- (parseclj-lex-character))
-
- ((equal char ?:)
- (parseclj-lex-keyword))
-
- ((equal char ?\;)
- (parseclj-lex-comment))
-
- ((equal char ?^)
- (right-char)
- (parseclj-lex-token :metadata "^" pos))
-
- ((equal char ?@)
- (right-char)
- (parseclj-lex-token :deref "@" pos))
-
- ((equal char ?#)
- (right-char)
- (let ((char (char-after (point))))
- (cond
- ((equal char ?{)
- (right-char)
- (parseclj-lex-token :set "#{" pos))
- ((equal char ?_)
- (right-char)
- (parseclj-lex-token :discard "#_" pos))
- ((equal char ?\()
- (right-char)
- (parseclj-lex-token :lambda "#(" pos))
- ((equal char ?')
- (right-char)
- (parseclj-lex-token :var "#'" pos))
- ((equal char ?=)
- (right-char)
- (parseclj-lex-token :eval "#=" pos))
- ((equal char ?\")
- (parseclj-lex-regex))
- ((equal char ?:)
- (parseclj-lex-map-prefix))
- ((equal char ?\?)
- (right-char)
- (if (eq ?@ (char-after (point)))
- (progn
- (right-char)
- (parseclj-lex-token :reader-conditional-splice "#?@" pos))
- (parseclj-lex-token :reader-conditional "#?" pos)))
- ((parseclj-lex-symbol-start-p char t)
- (right-char)
- (parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos))
- (t
- (while (not (or (parseclj-lex-at-whitespace-p)
- (parseclj-lex-at-eof-p)))
- (right-char))
- (parseclj-lex-error-token pos :invalid-hashtag-dispatcher)))))
-
- (t
- (progn
- (right-char)
- (parseclj-lex-error-token pos)))))))
-
- (provide 'parseclj-lex)
-
- ;;; parseclj-lex.el ends here
|