Klimi's new dotfiles with stow.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

545 lines
18 KiB

;;; parseclj-lex.el --- Clojure/EDN Lexer
;; Copyright (C) 2017-2018 Arne Brasseur
;; Author: Arne Brasseur <arne@arnebrasseur.net>
;; This file is not part of GNU Emacs.
;; This file is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;; This file is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING. If not, write to
;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
;;; Commentary:
;; A reader for EDN data files and parser for Clojure source files.
;;; Code:
(defvar parseclj-lex--leaf-tokens '(:whitespace
:comment
:number
:nil
:true
:false
:symbol
:keyword
:string
:regex
:character)
"Types of tokens that represent leaf nodes in the AST.")
(defvar parseclj-lex--closing-tokens '(:rparen
:rbracket
:rbrace)
"Types of tokens that mark the end of a non-atomic form.")
(defvar parseclj-lex--prefix-tokens '(:quote
:backquote
:unquote
:unquote-splice
:discard
:tag
:reader-conditional
:reader-conditional-splice
:var
:deref
:map-prefix
:eval)
"Tokens that modify the form that follows.")
(defvar parseclj-lex--prefix-2-tokens '(:metadata)
"Tokens that modify the two forms that follow.")
;; Token interface
(defun parseclj-lex-token (type form pos &rest attributes)
"Create a lexer token with the specified attributes.
Tokens at a mimimum have these attributes
- TYPE: the type of token, like :whitespace or :lparen
- FORM: the source form, a string
- POS: the position in the input, starts from 1 (like point)
Other ATTRIBUTES can be given as a flat list of key-value pairs."
(apply 'a-list :token-type type :form form :pos pos attributes))
(defun parseclj-lex-error-token (pos &optional error-type)
"Create a lexer error token starting at POS.
ERROR-TYPE is an optional keyword to attach to the created token,
as the means for providing more information on the error."
(apply #'parseclj-lex-token
:lex-error
(buffer-substring-no-properties pos (point))
pos
(when error-type
(list :error-type error-type))))
(defun parseclj-lex-token-p (token)
"Is the given TOKEN a parseclj-lex TOKEN.
A token is an association list with :token-type as its first key."
(and (consp token)
(consp (car token))
(eq :token-type (caar token))))
(defun parseclj-lex-token-type (token)
"Get the type of TOKEN."
(and (consp token)
(cdr (assq :token-type token))))
(defun parseclj-lex-token-form (token)
"Get the form of TOKEN."
(and (consp token)
(cdr (assq :form token))))
(defun parseclj-lex-leaf-token-p (token)
"Return t if the given AST TOKEN is a leaf node."
(member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens))
(defun parseclj-lex-closing-token-p (token)
"Return t if the given ast TOKEN is a closing token."
(member (parseclj-lex-token-type token) parseclj-lex--closing-tokens))
(defun parseclj-lex-error-p (token)
"Return t if the TOKEN represents a lexing error token."
(eq (parseclj-lex-token-type token) :lex-error))
;; Elisp values from tokens
(defun parseclj-lex--string-value (s)
"Parse an EDN string S into a regular string.
S goes through three transformations:
- Escaped characters in S are transformed into Elisp escaped
characters.
- Unicode escaped characters are decoded into its corresponding
unicode character counterpart.
- Octal escaped characters are decoded into its corresponding
character counterpart."
(replace-regexp-in-string
"\\\\o[0-8]\\{3\\}"
(lambda (x)
(make-string 1 (string-to-number (substring x 2) 8)))
(replace-regexp-in-string
"\\\\u[0-9a-fA-F]\\{4\\}"
(lambda (x)
(make-string 1 (string-to-number (substring x 2) 16)))
(replace-regexp-in-string "\\\\[tbnrf'\"\\]"
(lambda (x)
(cl-case (elt x 1)
(?t "\t")
(?f "\f")
(?\" "\"")
(?r "\r")
(?n "\n")
(?\\ "\\\\")
(t (substring x 1))))
(substring s 1 -1)))))
(defun parseclj-lex--character-value (c)
"Parse an EDN character C into an Emacs Lisp character."
(let ((first-char (elt c 1)))
(cond
((equal c "\\newline") ?\n)
((equal c "\\return") ?\r)
((equal c "\\space") ?\ )
((equal c "\\tab") ?\t)
((eq first-char ?u) (string-to-number (substring c 2) 16))
((eq first-char ?o) (string-to-number (substring c 2) 8))
(t first-char))))
(defun parseclj-lex--leaf-token-value (token)
"Parse the given leaf TOKEN to an Emacs Lisp value."
(cl-case (parseclj-lex-token-type token)
(:number (string-to-number (alist-get :form token)))
(:nil nil)
(:true t)
(:false nil)
(:symbol (intern (alist-get :form token)))
(:keyword (intern (alist-get :form token)))
(:string (parseclj-lex--string-value (alist-get :form token)))
(:character (parseclj-lex--character-value (alist-get :form token)))))
;; Stream tokenization
(defun parseclj-lex-at-whitespace-p ()
"Return t if char at point is white space."
(let ((char (char-after (point))))
(or (equal char ?\ )
(equal char ?\t)
(equal char ?\n)
(equal char ?\r)
(equal char ?,))))
(defun parseclj-lex-at-eof-p ()
"Return t if point is at the end of file."
(eq (point) (point-max)))
(defun parseclj-lex-whitespace ()
"Consume all consecutive white space as possible and return an :whitespace token."
(let ((pos (point)))
(while (parseclj-lex-at-whitespace-p)
(right-char))
(parseclj-lex-token :whitespace
(buffer-substring-no-properties pos (point))
pos)))
(defun parseclj-lex-skip-digits ()
"Skip all consecutive digits after point."
(while (and (char-after (point))
(<= ?0 (char-after (point)))
(<= (char-after (point)) ?9))
(right-char)))
(defun parseclj-lex-skip-hex ()
"Skip all consecutive hex digits after point."
(while (and (char-after (point))
(or (<= ?0 (char-after (point)) ?9)
(<= ?a (char-after (point)) ?f)
(<= ?A (char-after (point)) ?F)))
(right-char)))
(defun parseclj-lex-skip-number ()
"Skip a number at point."
;; [\+\-]?\d+\.\d+
(if (and (eq ?0 (char-after (point)))
(eq ?x (char-after (1+ (point)))))
(progn
(right-char 2)
(parseclj-lex-skip-hex))
(progn
(when (member (char-after (point)) '(?+ ?-))
(right-char))
(parseclj-lex-skip-digits)
(when (eq (char-after (point)) ?.)
(right-char))
(parseclj-lex-skip-digits))))
(defun parseclj-lex-number ()
"Consume a number and return a `:number' token representing it."
(let ((pos (point)))
(parseclj-lex-skip-number)
;; 10110r2 or 4.3e+22
(when (member (char-after (point)) '(?E ?e ?r))
(right-char))
(parseclj-lex-skip-number)
;; trailing M
(when (eq (char-after (point)) ?M)
(right-char))
(let ((char (char-after (point))))
(if (and char (or (and (<= ?a char) (<= char ?z))
(and (<= ?A char) (<= char ?Z))
(and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/)))))
(progn
(right-char)
(parseclj-lex-error-token pos :invalid-number-format))
(parseclj-lex-token :number
(buffer-substring-no-properties pos (point))
pos)))))
(defun parseclj-lex-digit-p (char)
"Return t if CHAR is a digit."
(and char (<= ?0 char) (<= char ?9)))
(defun parseclj-lex-at-number-p ()
"Return t if point is at a number."
(let ((char (char-after (point))))
(or (parseclj-lex-digit-p char)
(and (member char '(?- ?+ ?.))
(parseclj-lex-digit-p (char-after (1+ (point))))))))
(defun parseclj-lex-symbol-start-p (char &optional alpha-only)
"Return t if CHAR is a valid start for a symbol.
Symbols begin with a non-numeric character and can contain alphanumeric
characters and . * + ! - _ ? $ % & = < > '. If - + or . are the first
character, the second character (if any) must be non-numeric.
In some cases, like in tagged elements, symbols are required to start with
alphabetic characters only. ALPHA-ONLY ensures this behavior."
(not (not (and char
(or (and (<= ?a char) (<= char ?z))
(and (<= ?A char) (<= char ?Z))
(and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?'))))))))
(defun parseclj-lex-symbol-rest-p (char)
"Return t if CHAR is a valid character in a symbol.
For more information on what determines a valid symbol, see
`parseclj-lex-symbol-start-p'"
(or (parseclj-lex-symbol-start-p char)
(parseclj-lex-digit-p char)
(eq ?: char)
(eq ?# char)))
(defun parseclj-lex-get-symbol-at-point (pos)
"Return the symbol at POS as a string."
(while (parseclj-lex-symbol-rest-p (char-after (point)))
(right-char))
(buffer-substring-no-properties pos (point)))
(defun parseclj-lex-symbol ()
"Return a lex token representing a symbol.
Because of their special meaning, symbols \"nil\", \"true\", and \"false\"
are returned as their own lex tokens."
(let ((pos (point)))
(right-char)
(let ((sym (parseclj-lex-get-symbol-at-point pos)))
(cond
((equal sym "nil") (parseclj-lex-token :nil "nil" pos))
((equal sym "true") (parseclj-lex-token :true "true" pos))
((equal sym "false") (parseclj-lex-token :false "false" pos))
(t (parseclj-lex-token :symbol sym pos))))))
(defun parseclj-lex-string* ()
"Helper for string/regex lexing.
Returns either the string, or an error token"
(let ((pos (point)))
(right-char)
(while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p)))
(if (equal (char-after (point)) ?\\)
(right-char 2)
(right-char)))
(when (equal (char-after (point)) ?\")
(right-char)
(buffer-substring-no-properties pos (point)))))
(defun parseclj-lex-string ()
"Return a lex token representing a string.
If EOF is reached without finding a closing double quote, a :lex-error
token is returned."
(let ((pos (point))
(str (parseclj-lex-string*)))
(if str
(parseclj-lex-token :string str pos)
(parseclj-lex-error-token pos :invalid-string))))
(defun parseclj-lex-regex ()
"Return a lex token representing a regular expression.
If EOF is reached without finding a closing double quote, a :lex-error
token is returned."
(let ((pos (1- (point)))
(str (parseclj-lex-string*)))
(if str
(parseclj-lex-token :regex (concat "#" str) pos)
(parseclj-lex-error-token pos :invalid-regex))))
(defun parseclj-lex-lookahead (n)
"Return a lookahead string of N characters after point."
(buffer-substring-no-properties (point) (min (+ (point) n) (point-max))))
(defun parseclj-lex-character ()
"Return a lex token representing a character."
(let ((pos (point)))
(right-char)
(cond
((equal (parseclj-lex-lookahead 3) "tab")
(right-char 3)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
((equal (parseclj-lex-lookahead 5) "space")
(right-char 5)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
((equal (parseclj-lex-lookahead 6) "return")
(right-char 6)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
((equal (parseclj-lex-lookahead 7) "newline")
(right-char 7)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5))
(right-char 5)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4))
(right-char 4)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
(t
(right-char)
(parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)))))
(defun parseclj-lex-keyword ()
"Return a lex token representing a keyword.
Keywords follow the same rules as symbols, except they start with one or
two colon characters.
See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'."
(let ((pos (point)))
(right-char)
(when (equal (char-after (point)) ?:) ;; same-namespace keyword
(right-char))
(if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error
(progn
(right-char)
(parseclj-lex-error-token pos :invalid-keyword))
(progn
(while (or (parseclj-lex-symbol-rest-p (char-after (point)))
(equal (char-after (point)) ?#))
(right-char))
(parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos)))))
(defun parseclj-lex-comment ()
"Return a lex token representing a comment."
(let ((pos (point)))
(goto-char (line-end-position))
(when (equal (char-after (point)) ?\n)
(right-char))
(parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos)))
(defun parseclj-lex-map-prefix ()
"Return a lex token representing a map prefix."
(let ((pos (1- (point))))
(right-char)
(when (equal (char-after (point)) ?:)
(right-char))
(while (parseclj-lex-symbol-rest-p (char-after (point)))
(right-char))
(parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos)))
(defun parseclj-lex-next ()
"Consume characters at point and return the next lexical token.
See `parseclj-lex-token'."
(if (parseclj-lex-at-eof-p)
(parseclj-lex-token :eof nil (point))
(let ((char (char-after (point)))
(pos (point)))
(cond
((parseclj-lex-at-whitespace-p)
(parseclj-lex-whitespace))
((equal char ?\()
(right-char)
(parseclj-lex-token :lparen "(" pos))
((equal char ?\))
(right-char)
(parseclj-lex-token :rparen ")" pos))
((equal char ?\[)
(right-char)
(parseclj-lex-token :lbracket "[" pos))
((equal char ?\])
(right-char)
(parseclj-lex-token :rbracket "]" pos))
((equal char ?{)
(right-char)
(parseclj-lex-token :lbrace "{" pos))
((equal char ?})
(right-char)
(parseclj-lex-token :rbrace "}" pos))
((equal char ?')
(right-char)
(parseclj-lex-token :quote "'" pos))
((equal char ?`)
(right-char)
(parseclj-lex-token :backquote "`" pos))
((equal char ?~)
(right-char)
(if (eq ?@ (char-after (point)))
(progn
(right-char)
(parseclj-lex-token :unquote-splice "~@" pos))
(parseclj-lex-token :unquote "~" pos)))
((parseclj-lex-at-number-p)
(parseclj-lex-number))
((parseclj-lex-symbol-start-p char)
(parseclj-lex-symbol))
((equal char ?\")
(parseclj-lex-string))
((equal char ?\\)
(parseclj-lex-character))
((equal char ?:)
(parseclj-lex-keyword))
((equal char ?\;)
(parseclj-lex-comment))
((equal char ?^)
(right-char)
(parseclj-lex-token :metadata "^" pos))
((equal char ?@)
(right-char)
(parseclj-lex-token :deref "@" pos))
((equal char ?#)
(right-char)
(let ((char (char-after (point))))
(cond
((equal char ?{)
(right-char)
(parseclj-lex-token :set "#{" pos))
((equal char ?_)
(right-char)
(parseclj-lex-token :discard "#_" pos))
((equal char ?\()
(right-char)
(parseclj-lex-token :lambda "#(" pos))
((equal char ?')
(right-char)
(parseclj-lex-token :var "#'" pos))
((equal char ?=)
(right-char)
(parseclj-lex-token :eval "#=" pos))
((equal char ?\")
(parseclj-lex-regex))
((equal char ?:)
(parseclj-lex-map-prefix))
((equal char ?\?)
(right-char)
(if (eq ?@ (char-after (point)))
(progn
(right-char)
(parseclj-lex-token :reader-conditional-splice "#?@" pos))
(parseclj-lex-token :reader-conditional "#?" pos)))
((parseclj-lex-symbol-start-p char t)
(right-char)
(parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos))
(t
(while (not (or (parseclj-lex-at-whitespace-p)
(parseclj-lex-at-eof-p)))
(right-char))
(parseclj-lex-error-token pos :invalid-hashtag-dispatcher)))))
(t
(progn
(right-char)
(parseclj-lex-error-token pos)))))))
(provide 'parseclj-lex)
;;; parseclj-lex.el ends here