Klimi's new dotfiles with stow.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

545 lines
18 KiB

4 years ago
  1. ;;; parseclj-lex.el --- Clojure/EDN Lexer
  2. ;; Copyright (C) 2017-2018 Arne Brasseur
  3. ;; Author: Arne Brasseur <arne@arnebrasseur.net>
  4. ;; This file is not part of GNU Emacs.
  5. ;; This file is free software; you can redistribute it and/or modify
  6. ;; it under the terms of the GNU General Public License as published by
  7. ;; the Free Software Foundation; either version 3, or (at your option)
  8. ;; any later version.
  9. ;; This file is distributed in the hope that it will be useful,
  10. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. ;; GNU General Public License for more details.
  13. ;; You should have received a copy of the GNU General Public License
  14. ;; along with GNU Emacs; see the file COPYING. If not, write to
  15. ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  16. ;; Boston, MA 02110-1301, USA.
  17. ;;; Commentary:
  18. ;; A reader for EDN data files and parser for Clojure source files.
  19. ;;; Code:
  20. (defvar parseclj-lex--leaf-tokens '(:whitespace
  21. :comment
  22. :number
  23. :nil
  24. :true
  25. :false
  26. :symbol
  27. :keyword
  28. :string
  29. :regex
  30. :character)
  31. "Types of tokens that represent leaf nodes in the AST.")
  32. (defvar parseclj-lex--closing-tokens '(:rparen
  33. :rbracket
  34. :rbrace)
  35. "Types of tokens that mark the end of a non-atomic form.")
  36. (defvar parseclj-lex--prefix-tokens '(:quote
  37. :backquote
  38. :unquote
  39. :unquote-splice
  40. :discard
  41. :tag
  42. :reader-conditional
  43. :reader-conditional-splice
  44. :var
  45. :deref
  46. :map-prefix
  47. :eval)
  48. "Tokens that modify the form that follows.")
  49. (defvar parseclj-lex--prefix-2-tokens '(:metadata)
  50. "Tokens that modify the two forms that follow.")
  51. ;; Token interface
  52. (defun parseclj-lex-token (type form pos &rest attributes)
  53. "Create a lexer token with the specified attributes.
  54. Tokens at a mimimum have these attributes
  55. - TYPE: the type of token, like :whitespace or :lparen
  56. - FORM: the source form, a string
  57. - POS: the position in the input, starts from 1 (like point)
  58. Other ATTRIBUTES can be given as a flat list of key-value pairs."
  59. (apply 'a-list :token-type type :form form :pos pos attributes))
  60. (defun parseclj-lex-error-token (pos &optional error-type)
  61. "Create a lexer error token starting at POS.
  62. ERROR-TYPE is an optional keyword to attach to the created token,
  63. as the means for providing more information on the error."
  64. (apply #'parseclj-lex-token
  65. :lex-error
  66. (buffer-substring-no-properties pos (point))
  67. pos
  68. (when error-type
  69. (list :error-type error-type))))
  70. (defun parseclj-lex-token-p (token)
  71. "Is the given TOKEN a parseclj-lex TOKEN.
  72. A token is an association list with :token-type as its first key."
  73. (and (consp token)
  74. (consp (car token))
  75. (eq :token-type (caar token))))
  76. (defun parseclj-lex-token-type (token)
  77. "Get the type of TOKEN."
  78. (and (consp token)
  79. (cdr (assq :token-type token))))
  80. (defun parseclj-lex-token-form (token)
  81. "Get the form of TOKEN."
  82. (and (consp token)
  83. (cdr (assq :form token))))
  84. (defun parseclj-lex-leaf-token-p (token)
  85. "Return t if the given AST TOKEN is a leaf node."
  86. (member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens))
  87. (defun parseclj-lex-closing-token-p (token)
  88. "Return t if the given ast TOKEN is a closing token."
  89. (member (parseclj-lex-token-type token) parseclj-lex--closing-tokens))
  90. (defun parseclj-lex-error-p (token)
  91. "Return t if the TOKEN represents a lexing error token."
  92. (eq (parseclj-lex-token-type token) :lex-error))
  93. ;; Elisp values from tokens
  94. (defun parseclj-lex--string-value (s)
  95. "Parse an EDN string S into a regular string.
  96. S goes through three transformations:
  97. - Escaped characters in S are transformed into Elisp escaped
  98. characters.
  99. - Unicode escaped characters are decoded into its corresponding
  100. unicode character counterpart.
  101. - Octal escaped characters are decoded into its corresponding
  102. character counterpart."
  103. (replace-regexp-in-string
  104. "\\\\o[0-8]\\{3\\}"
  105. (lambda (x)
  106. (make-string 1 (string-to-number (substring x 2) 8)))
  107. (replace-regexp-in-string
  108. "\\\\u[0-9a-fA-F]\\{4\\}"
  109. (lambda (x)
  110. (make-string 1 (string-to-number (substring x 2) 16)))
  111. (replace-regexp-in-string "\\\\[tbnrf'\"\\]"
  112. (lambda (x)
  113. (cl-case (elt x 1)
  114. (?t "\t")
  115. (?f "\f")
  116. (?\" "\"")
  117. (?r "\r")
  118. (?n "\n")
  119. (?\\ "\\\\")
  120. (t (substring x 1))))
  121. (substring s 1 -1)))))
  122. (defun parseclj-lex--character-value (c)
  123. "Parse an EDN character C into an Emacs Lisp character."
  124. (let ((first-char (elt c 1)))
  125. (cond
  126. ((equal c "\\newline") ?\n)
  127. ((equal c "\\return") ?\r)
  128. ((equal c "\\space") ?\ )
  129. ((equal c "\\tab") ?\t)
  130. ((eq first-char ?u) (string-to-number (substring c 2) 16))
  131. ((eq first-char ?o) (string-to-number (substring c 2) 8))
  132. (t first-char))))
  133. (defun parseclj-lex--leaf-token-value (token)
  134. "Parse the given leaf TOKEN to an Emacs Lisp value."
  135. (cl-case (parseclj-lex-token-type token)
  136. (:number (string-to-number (alist-get :form token)))
  137. (:nil nil)
  138. (:true t)
  139. (:false nil)
  140. (:symbol (intern (alist-get :form token)))
  141. (:keyword (intern (alist-get :form token)))
  142. (:string (parseclj-lex--string-value (alist-get :form token)))
  143. (:character (parseclj-lex--character-value (alist-get :form token)))))
  144. ;; Stream tokenization
  145. (defun parseclj-lex-at-whitespace-p ()
  146. "Return t if char at point is white space."
  147. (let ((char (char-after (point))))
  148. (or (equal char ?\ )
  149. (equal char ?\t)
  150. (equal char ?\n)
  151. (equal char ?\r)
  152. (equal char ?,))))
  153. (defun parseclj-lex-at-eof-p ()
  154. "Return t if point is at the end of file."
  155. (eq (point) (point-max)))
  156. (defun parseclj-lex-whitespace ()
  157. "Consume all consecutive white space as possible and return an :whitespace token."
  158. (let ((pos (point)))
  159. (while (parseclj-lex-at-whitespace-p)
  160. (right-char))
  161. (parseclj-lex-token :whitespace
  162. (buffer-substring-no-properties pos (point))
  163. pos)))
  164. (defun parseclj-lex-skip-digits ()
  165. "Skip all consecutive digits after point."
  166. (while (and (char-after (point))
  167. (<= ?0 (char-after (point)))
  168. (<= (char-after (point)) ?9))
  169. (right-char)))
  170. (defun parseclj-lex-skip-hex ()
  171. "Skip all consecutive hex digits after point."
  172. (while (and (char-after (point))
  173. (or (<= ?0 (char-after (point)) ?9)
  174. (<= ?a (char-after (point)) ?f)
  175. (<= ?A (char-after (point)) ?F)))
  176. (right-char)))
  177. (defun parseclj-lex-skip-number ()
  178. "Skip a number at point."
  179. ;; [\+\-]?\d+\.\d+
  180. (if (and (eq ?0 (char-after (point)))
  181. (eq ?x (char-after (1+ (point)))))
  182. (progn
  183. (right-char 2)
  184. (parseclj-lex-skip-hex))
  185. (progn
  186. (when (member (char-after (point)) '(?+ ?-))
  187. (right-char))
  188. (parseclj-lex-skip-digits)
  189. (when (eq (char-after (point)) ?.)
  190. (right-char))
  191. (parseclj-lex-skip-digits))))
  192. (defun parseclj-lex-number ()
  193. "Consume a number and return a `:number' token representing it."
  194. (let ((pos (point)))
  195. (parseclj-lex-skip-number)
  196. ;; 10110r2 or 4.3e+22
  197. (when (member (char-after (point)) '(?E ?e ?r))
  198. (right-char))
  199. (parseclj-lex-skip-number)
  200. ;; trailing M
  201. (when (eq (char-after (point)) ?M)
  202. (right-char))
  203. (let ((char (char-after (point))))
  204. (if (and char (or (and (<= ?a char) (<= char ?z))
  205. (and (<= ?A char) (<= char ?Z))
  206. (and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/)))))
  207. (progn
  208. (right-char)
  209. (parseclj-lex-error-token pos :invalid-number-format))
  210. (parseclj-lex-token :number
  211. (buffer-substring-no-properties pos (point))
  212. pos)))))
  213. (defun parseclj-lex-digit-p (char)
  214. "Return t if CHAR is a digit."
  215. (and char (<= ?0 char) (<= char ?9)))
  216. (defun parseclj-lex-at-number-p ()
  217. "Return t if point is at a number."
  218. (let ((char (char-after (point))))
  219. (or (parseclj-lex-digit-p char)
  220. (and (member char '(?- ?+ ?.))
  221. (parseclj-lex-digit-p (char-after (1+ (point))))))))
  222. (defun parseclj-lex-symbol-start-p (char &optional alpha-only)
  223. "Return t if CHAR is a valid start for a symbol.
  224. Symbols begin with a non-numeric character and can contain alphanumeric
  225. characters and . * + ! - _ ? $ % & = < > '. If - + or . are the first
  226. character, the second character (if any) must be non-numeric.
  227. In some cases, like in tagged elements, symbols are required to start with
  228. alphabetic characters only. ALPHA-ONLY ensures this behavior."
  229. (not (not (and char
  230. (or (and (<= ?a char) (<= char ?z))
  231. (and (<= ?A char) (<= char ?Z))
  232. (and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?'))))))))
  233. (defun parseclj-lex-symbol-rest-p (char)
  234. "Return t if CHAR is a valid character in a symbol.
  235. For more information on what determines a valid symbol, see
  236. `parseclj-lex-symbol-start-p'"
  237. (or (parseclj-lex-symbol-start-p char)
  238. (parseclj-lex-digit-p char)
  239. (eq ?: char)
  240. (eq ?# char)))
  241. (defun parseclj-lex-get-symbol-at-point (pos)
  242. "Return the symbol at POS as a string."
  243. (while (parseclj-lex-symbol-rest-p (char-after (point)))
  244. (right-char))
  245. (buffer-substring-no-properties pos (point)))
  246. (defun parseclj-lex-symbol ()
  247. "Return a lex token representing a symbol.
  248. Because of their special meaning, symbols \"nil\", \"true\", and \"false\"
  249. are returned as their own lex tokens."
  250. (let ((pos (point)))
  251. (right-char)
  252. (let ((sym (parseclj-lex-get-symbol-at-point pos)))
  253. (cond
  254. ((equal sym "nil") (parseclj-lex-token :nil "nil" pos))
  255. ((equal sym "true") (parseclj-lex-token :true "true" pos))
  256. ((equal sym "false") (parseclj-lex-token :false "false" pos))
  257. (t (parseclj-lex-token :symbol sym pos))))))
  258. (defun parseclj-lex-string* ()
  259. "Helper for string/regex lexing.
  260. Returns either the string, or an error token"
  261. (let ((pos (point)))
  262. (right-char)
  263. (while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p)))
  264. (if (equal (char-after (point)) ?\\)
  265. (right-char 2)
  266. (right-char)))
  267. (when (equal (char-after (point)) ?\")
  268. (right-char)
  269. (buffer-substring-no-properties pos (point)))))
  270. (defun parseclj-lex-string ()
  271. "Return a lex token representing a string.
  272. If EOF is reached without finding a closing double quote, a :lex-error
  273. token is returned."
  274. (let ((pos (point))
  275. (str (parseclj-lex-string*)))
  276. (if str
  277. (parseclj-lex-token :string str pos)
  278. (parseclj-lex-error-token pos :invalid-string))))
  279. (defun parseclj-lex-regex ()
  280. "Return a lex token representing a regular expression.
  281. If EOF is reached without finding a closing double quote, a :lex-error
  282. token is returned."
  283. (let ((pos (1- (point)))
  284. (str (parseclj-lex-string*)))
  285. (if str
  286. (parseclj-lex-token :regex (concat "#" str) pos)
  287. (parseclj-lex-error-token pos :invalid-regex))))
  288. (defun parseclj-lex-lookahead (n)
  289. "Return a lookahead string of N characters after point."
  290. (buffer-substring-no-properties (point) (min (+ (point) n) (point-max))))
  291. (defun parseclj-lex-character ()
  292. "Return a lex token representing a character."
  293. (let ((pos (point)))
  294. (right-char)
  295. (cond
  296. ((equal (parseclj-lex-lookahead 3) "tab")
  297. (right-char 3)
  298. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  299. ((equal (parseclj-lex-lookahead 5) "space")
  300. (right-char 5)
  301. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  302. ((equal (parseclj-lex-lookahead 6) "return")
  303. (right-char 6)
  304. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  305. ((equal (parseclj-lex-lookahead 7) "newline")
  306. (right-char 7)
  307. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  308. ((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5))
  309. (right-char 5)
  310. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  311. ((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4))
  312. (right-char 4)
  313. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
  314. (t
  315. (right-char)
  316. (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)))))
  317. (defun parseclj-lex-keyword ()
  318. "Return a lex token representing a keyword.
  319. Keywords follow the same rules as symbols, except they start with one or
  320. two colon characters.
  321. See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'."
  322. (let ((pos (point)))
  323. (right-char)
  324. (when (equal (char-after (point)) ?:) ;; same-namespace keyword
  325. (right-char))
  326. (if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error
  327. (progn
  328. (right-char)
  329. (parseclj-lex-error-token pos :invalid-keyword))
  330. (progn
  331. (while (or (parseclj-lex-symbol-rest-p (char-after (point)))
  332. (equal (char-after (point)) ?#))
  333. (right-char))
  334. (parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos)))))
  335. (defun parseclj-lex-comment ()
  336. "Return a lex token representing a comment."
  337. (let ((pos (point)))
  338. (goto-char (line-end-position))
  339. (when (equal (char-after (point)) ?\n)
  340. (right-char))
  341. (parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos)))
  342. (defun parseclj-lex-map-prefix ()
  343. "Return a lex token representing a map prefix."
  344. (let ((pos (1- (point))))
  345. (right-char)
  346. (when (equal (char-after (point)) ?:)
  347. (right-char))
  348. (while (parseclj-lex-symbol-rest-p (char-after (point)))
  349. (right-char))
  350. (parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos)))
  351. (defun parseclj-lex-next ()
  352. "Consume characters at point and return the next lexical token.
  353. See `parseclj-lex-token'."
  354. (if (parseclj-lex-at-eof-p)
  355. (parseclj-lex-token :eof nil (point))
  356. (let ((char (char-after (point)))
  357. (pos (point)))
  358. (cond
  359. ((parseclj-lex-at-whitespace-p)
  360. (parseclj-lex-whitespace))
  361. ((equal char ?\()
  362. (right-char)
  363. (parseclj-lex-token :lparen "(" pos))
  364. ((equal char ?\))
  365. (right-char)
  366. (parseclj-lex-token :rparen ")" pos))
  367. ((equal char ?\[)
  368. (right-char)
  369. (parseclj-lex-token :lbracket "[" pos))
  370. ((equal char ?\])
  371. (right-char)
  372. (parseclj-lex-token :rbracket "]" pos))
  373. ((equal char ?{)
  374. (right-char)
  375. (parseclj-lex-token :lbrace "{" pos))
  376. ((equal char ?})
  377. (right-char)
  378. (parseclj-lex-token :rbrace "}" pos))
  379. ((equal char ?')
  380. (right-char)
  381. (parseclj-lex-token :quote "'" pos))
  382. ((equal char ?`)
  383. (right-char)
  384. (parseclj-lex-token :backquote "`" pos))
  385. ((equal char ?~)
  386. (right-char)
  387. (if (eq ?@ (char-after (point)))
  388. (progn
  389. (right-char)
  390. (parseclj-lex-token :unquote-splice "~@" pos))
  391. (parseclj-lex-token :unquote "~" pos)))
  392. ((parseclj-lex-at-number-p)
  393. (parseclj-lex-number))
  394. ((parseclj-lex-symbol-start-p char)
  395. (parseclj-lex-symbol))
  396. ((equal char ?\")
  397. (parseclj-lex-string))
  398. ((equal char ?\\)
  399. (parseclj-lex-character))
  400. ((equal char ?:)
  401. (parseclj-lex-keyword))
  402. ((equal char ?\;)
  403. (parseclj-lex-comment))
  404. ((equal char ?^)
  405. (right-char)
  406. (parseclj-lex-token :metadata "^" pos))
  407. ((equal char ?@)
  408. (right-char)
  409. (parseclj-lex-token :deref "@" pos))
  410. ((equal char ?#)
  411. (right-char)
  412. (let ((char (char-after (point))))
  413. (cond
  414. ((equal char ?{)
  415. (right-char)
  416. (parseclj-lex-token :set "#{" pos))
  417. ((equal char ?_)
  418. (right-char)
  419. (parseclj-lex-token :discard "#_" pos))
  420. ((equal char ?\()
  421. (right-char)
  422. (parseclj-lex-token :lambda "#(" pos))
  423. ((equal char ?')
  424. (right-char)
  425. (parseclj-lex-token :var "#'" pos))
  426. ((equal char ?=)
  427. (right-char)
  428. (parseclj-lex-token :eval "#=" pos))
  429. ((equal char ?\")
  430. (parseclj-lex-regex))
  431. ((equal char ?:)
  432. (parseclj-lex-map-prefix))
  433. ((equal char ?\?)
  434. (right-char)
  435. (if (eq ?@ (char-after (point)))
  436. (progn
  437. (right-char)
  438. (parseclj-lex-token :reader-conditional-splice "#?@" pos))
  439. (parseclj-lex-token :reader-conditional "#?" pos)))
  440. ((parseclj-lex-symbol-start-p char t)
  441. (right-char)
  442. (parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos))
  443. (t
  444. (while (not (or (parseclj-lex-at-whitespace-p)
  445. (parseclj-lex-at-eof-p)))
  446. (right-char))
  447. (parseclj-lex-error-token pos :invalid-hashtag-dispatcher)))))
  448. (t
  449. (progn
  450. (right-char)
  451. (parseclj-lex-error-token pos)))))))
  452. (provide 'parseclj-lex)
  453. ;;; parseclj-lex.el ends here