;;; haskell-lexeme.el --- haskell lexical tokens -*- coding: utf-8; lexical-binding: t -*-
|
|
|
|
;; Copyright (C) 2015 Gracjan Polak
|
|
|
|
;; This file is free software; you can redistribute it and/or modify
|
|
;; it under the terms of the GNU General Public License as published by
|
|
;; the Free Software Foundation; either version 3, or (at your option)
|
|
;; any later version.
|
|
|
|
;; This file is distributed in the hope that it will be useful,
|
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
;; GNU General Public License for more details.
|
|
|
|
;; You should have received a copy of the GNU General Public License
|
|
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
;;; Commentary:
|
|
|
|
;;; Code:
|
|
|
|
(require 'rx)
|
|
|
|
(unless (category-docstring ?P)
|
|
(define-category ?P "Haskell symbol constituent characters")
|
|
(map-char-table
|
|
#'(lambda (key val)
|
|
(if (or
|
|
(and (consp key) (> (car key) 128))
|
|
(and (numberp key) (> key 128)))
|
|
(if (member val '(Pc Pd Po Sm Sc Sk So))
|
|
(modify-category-entry key ?P))))
|
|
unicode-category-table)
|
|
|
|
(dolist (key (string-to-list "!#$%&*+./<=>?@^|~\\-:"))
|
|
(modify-category-entry key ?P)))
|
|
|
|
(defconst haskell-lexeme-modid
|
|
"[[:upper:]][[:alnum:]'_]*"
|
|
"Regexp matching a valid Haskell module identifier.
|
|
|
|
Note that GHC accepts Unicode category UppercaseLetter as a first
|
|
character. Following letters are from Unicode categories
|
|
UppercaseLetter, LowercaseLetter, OtherLetter, TitlecaseLetter,
|
|
ModifierLetter, DecimalNumber, OtherNumber, backslash or
|
|
underscore.")
|
|
|
|
(defconst haskell-lexeme-id
|
|
"[[:alpha:]_][[:alnum:]'_]*"
|
|
"Regexp matching a valid Haskell identifier.
|
|
|
|
GHC accepts a string starting with any alphabetic character or
|
|
underscore followed by any alphanumeric character or underscore
|
|
or apostrophe.")
|
|
|
|
(defconst haskell-lexeme-sym
|
|
"\\cP+"
|
|
"Regexp matching a valid Haskell variable or constructor symbol.
|
|
|
|
GHC accepts a string of chars from the set
|
|
[:!#$%&*+./<=>?@^|~\\-] or Unicode category Symbol for chars with
|
|
codes larger than 128 only.")
|
|
|
|
(defconst haskell-lexeme-idsym-first-char
|
|
"\\(?:[[:alpha:]_]\\|\\cP\\)"
|
|
"Regexp matching first character of a qualified or unqualified
|
|
identifier or symbol.
|
|
|
|
Useful for `re-search-forward'.")
|
|
|
|
(defconst haskell-lexeme-modid-opt-prefix
|
|
(concat "\\(?:" haskell-lexeme-modid "\\.\\)*")
|
|
"Regexp matching a valid Haskell module prefix, potentially empty.
|
|
|
|
Module path prefix is separated by dots and finishes with a
|
|
dot. For path component syntax see `haskell-lexeme-modid'.")
|
|
|
|
(defconst haskell-lexeme-qid-or-qsym
|
|
(rx-to-string `(: (regexp ,haskell-lexeme-modid-opt-prefix)
|
|
(group (| (regexp ,haskell-lexeme-id) (regexp ,haskell-lexeme-sym)
|
|
))))
|
|
"Regexp matching a valid qualified identifier or symbol.
|
|
|
|
Note that (match-string 1) returns the unqualified part.")
|
|
|
|
(defun haskell-lexeme-looking-at-qidsym ()
|
|
"Non-nil when point is just in front of an optionally qualified
|
|
identifier or symbol.
|
|
|
|
Using this function is more efficient than matching against the
|
|
regexp `haskell-lexeme-qid-or-qsym'.
|
|
|
|
Returns:
|
|
'qid - if matched a qualified id: 'Data.Map' or 'Map'
|
|
'qsym - if matched a qualified id: 'Monad.>>=' or '>>='
|
|
'qprefix - if matched only modid prefix: 'Data.'
|
|
|
|
After successful 'qid or 'qsym match (match-string 1) will return
|
|
the unqualified part (if any)."
|
|
(let ((begin (point))
|
|
(match-data-old (match-data)))
|
|
(save-excursion
|
|
(while (looking-at (concat haskell-lexeme-modid "\\."))
|
|
(goto-char (match-end 0)))
|
|
(cond
|
|
((looking-at haskell-lexeme-id)
|
|
(let ((beg (match-beginning 0))
|
|
(end (match-end 0)))
|
|
|
|
;; check is MagicHash is present at the end of the token
|
|
(goto-char end)
|
|
(when (looking-at "#+")
|
|
(setq end (match-end 0)))
|
|
|
|
(set-match-data
|
|
(list begin end
|
|
beg end)))
|
|
'qid)
|
|
((looking-at haskell-lexeme-sym)
|
|
(set-match-data
|
|
(list begin (match-end 0)
|
|
(match-beginning 0) (match-end 0)))
|
|
'qsym)
|
|
((equal begin (point))
|
|
(set-match-data match-data-old)
|
|
nil)
|
|
(t
|
|
(set-match-data
|
|
(list begin (point)
|
|
nil nil))
|
|
'qprefix)))))
|
|
|
|
(defun haskell-lexeme-looking-at-backtick ()
|
|
"Non-nil when point is just in front of an identifier quoted with backticks.
|
|
|
|
When match is successful, match-data will contain:
|
|
(match-text 1) - opening backtick
|
|
(match-text 2) - whole qualified identifier
|
|
(match-text 3) - unqualified part of identifier
|
|
(match-text 4) - closing backtick"
|
|
(let ((match-data-old (match-data))
|
|
first-backtick-start
|
|
last-backtick-start
|
|
qid-start
|
|
id-start
|
|
id-end
|
|
result)
|
|
(save-excursion
|
|
(when (looking-at "`")
|
|
(setq first-backtick-start (match-beginning 0))
|
|
(goto-char (match-end 0))
|
|
(forward-comment (buffer-size))
|
|
(when (haskell-lexeme-looking-at-qidsym)
|
|
(setq qid-start (match-beginning 0))
|
|
(setq id-start (match-beginning 1))
|
|
(setq id-end (match-end 1))
|
|
(goto-char (match-end 0))
|
|
(forward-comment (buffer-size))
|
|
(when (looking-at "`")
|
|
(setq last-backtick-start (match-beginning 0))
|
|
(set-match-data
|
|
(mapcar
|
|
(lambda (p)
|
|
(set-marker (make-marker) p))
|
|
(list
|
|
first-backtick-start (1+ last-backtick-start)
|
|
first-backtick-start (1+ first-backtick-start)
|
|
qid-start id-end
|
|
id-start id-end
|
|
last-backtick-start (1+ last-backtick-start))))
|
|
(setq result t)))))
|
|
(unless result
|
|
(set-match-data match-data-old))
|
|
result))
|
|
|
|
(defconst haskell-lexeme-qid
|
|
(rx-to-string `(: (regexp "'*")
|
|
(regexp ,haskell-lexeme-modid-opt-prefix)
|
|
(group (regexp ,haskell-lexeme-id))))
|
|
"Regexp matching a valid qualified identifier.
|
|
|
|
Note that (match-string 1) returns the unqualified part.")
|
|
|
|
(defconst haskell-lexeme-qsym
|
|
(rx-to-string `(: (regexp "'*")
|
|
(regexp ,haskell-lexeme-modid-opt-prefix)
|
|
(group (regexp ,haskell-lexeme-id))))
|
|
"Regexp matching a valid qualified symbol.
|
|
|
|
Note that (match-string 1) returns the unqualified part.")
|
|
|
|
(defconst haskell-lexeme-number
|
|
(rx (| (: (regexp "[0-9]+\\.[0-9]+") (opt (regexp "[eE][-+]?[0-9]+")))
|
|
(regexp "[0-9]+[eE][-+]?[0-9]+")
|
|
(regexp "0[xX][0-9a-fA-F]+")
|
|
(regexp "0[oO][0-7]+")
|
|
(regexp "[0-9]+")))
|
|
"Regexp matching a floating point, decimal, octal or hexadecimal number.
|
|
|
|
Note that negative sign char is not part of a number.")
|
|
|
|
(defconst haskell-lexeme-char-literal-inside
|
|
(rx (| (not (any "\n'\\"))
|
|
(: "\\"
|
|
(| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'"
|
|
"NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
|
|
"BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
|
|
"DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
|
|
"EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
|
|
(regexp "[0-9]+")
|
|
(: "x" (regexp "[0-9a-fA-F]+"))
|
|
(: "o" (regexp "[0-7]+"))
|
|
(: "^" (regexp "[]A-Z@^_\\[]"))))))
|
|
"Regexp matching an inside of a character literal.
|
|
|
|
Note that `haskell-lexeme-char-literal-inside' matches strictly
|
|
only escape sequences defined in Haskell Report.")
|
|
|
|
(defconst haskell-lexeme--char-literal-rx
|
|
(rx-to-string `(: (group "'")
|
|
(| (: (group (regexp "[[:alpha:]_:([]")) (group "'")) ; exactly one char
|
|
(: (group (| (regexp "\\\\[^\n][^'\n]*") ; allow quote just after first backslash
|
|
(regexp "[^[:alpha:]_:(['\n][^'\n]*")))
|
|
(| (group "'") "\n" (regexp "\\'"))))))
|
|
"Regexp matching a character literal lookalike.
|
|
|
|
Note that `haskell-lexeme--char-literal-rx' matches more than
|
|
Haskell Report specifies because we want to support also code
|
|
under edit.
|
|
|
|
Character literals end with a quote or a newline or end of
|
|
buffer.
|
|
|
|
Regexp has subgroup expressions:
|
|
(match-text 1) matches the opening quote.
|
|
(match-text 2) matches the inside of the character literal.
|
|
(match-text 3) matches the closing quote or an empty string
|
|
at the end of line or the end buffer.")
|
|
|
|
(defun haskell-lexeme-looking-at-char-literal ()
|
|
"Non-nil when point is at a char literal lookalike.
|
|
|
|
Note that this function matches more than Haskell Report
|
|
specifies because we want to support also code under edit.
|
|
|
|
Char literals end with a quote or an unescaped newline or end
|
|
of buffer.
|
|
|
|
After successful match:
|
|
(match-text 1) matches the opening quote.
|
|
(match-text 2) matches the inside of the char literla.
|
|
(match-text 3) matches the closing quote, or a closing
|
|
newline or is nil when at the end of the buffer."
|
|
(when (looking-at haskell-lexeme--char-literal-rx)
|
|
(set-match-data
|
|
(list (match-beginning 0) (match-end 0)
|
|
(match-beginning 1) (match-end 1)
|
|
(or (match-beginning 2) (match-beginning 4)) (or (match-end 2) (match-end 4))
|
|
(or (match-beginning 3) (match-beginning 5)) (or (match-end 3) (match-end 5))))
|
|
t))
|
|
|
|
(defconst haskell-lexeme-string-literal-inside-item
|
|
(rx (| (not (any "\n\"\\"))
|
|
(: "\\"
|
|
(| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'" "&"
|
|
"NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
|
|
"BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
|
|
"DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
|
|
"EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
|
|
(regexp "[0-9]+")
|
|
(: "x" (regexp "[0-9a-fA-F]+"))
|
|
(: "o" (regexp "[0-7]+"))
|
|
(: "^" (regexp "[]A-Z@^_\\[]"))
|
|
(regexp "[ \t\n\r\v\f]*\\\\")))))
|
|
"Regexp matching an item that is a single character or a single
|
|
escape sequence inside of a string literal.
|
|
|
|
Note that `haskell-lexeme-string-literal-inside-item' matches
|
|
strictly only escape sequences defined in Haskell Report.")
|
|
|
|
(defconst haskell-lexeme-string-literal
|
|
(rx (: (group "\"")
|
|
(group (* (| (regexp "\\\\[ \t\n\r\v\f]*\\\\")
|
|
(regexp "\\\\[ \t\n\r\v\f]+")
|
|
(regexp "\\\\[^ \t\n\r\v\f]")
|
|
(* (regexp "[^\"\n\\]")))))
|
|
(group (| "\"" (regexp "$") (regexp "\\\\?\\'")
|
|
))))
|
|
"Regexp matching a string literal lookalike.
|
|
|
|
Note that `haskell-lexeme-string-literal' matches more than
|
|
Haskell Report specifies because we want to support also code
|
|
under edit.
|
|
|
|
String literals end with double quote or unescaped newline or end
|
|
of buffer.
|
|
|
|
Regexp has subgroup expressions:
|
|
(match-text 1) matches the opening double quote.
|
|
(match-text 2) matches the inside of the string.
|
|
(match-text 3) matches the closing double quote or an empty string
|
|
at the end of line or the end buffer.")
|
|
|
|
(defun haskell-lexeme-looking-at-string-literal ()
|
|
"Non-nil when point is at a string literal lookalike.
|
|
|
|
Note that this function matches more than Haskell Report
|
|
specifies because we want to support also code under edit.
|
|
|
|
String literals end with double quote or unescaped newline or end
|
|
of buffer.
|
|
|
|
After successful match:
|
|
(match-text 1) matches the opening doublequote.
|
|
(match-text 2) matches the inside of the string.
|
|
(match-text 3) matches the closing quote, or a closing
|
|
newline or is nil when at the end of the buffer."
|
|
(when (looking-at "\"")
|
|
(save-excursion
|
|
(let ((begin (point)))
|
|
(goto-char (match-end 0))
|
|
(let (finish)
|
|
(while (and (not finish)
|
|
(re-search-forward "[\"\n\\]" nil 'goto-eob))
|
|
(cond
|
|
((equal (match-string 0) "\\")
|
|
(if (looking-at "[ \t\n\r\v\f]+\\\\?")
|
|
(goto-char (match-end 0))
|
|
(goto-char (1+ (point)))))
|
|
|
|
((equal (match-string 0) "\"")
|
|
(set-match-data
|
|
(list begin (match-end 0)
|
|
begin (1+ begin)
|
|
(1+ begin) (match-beginning 0)
|
|
(match-beginning 0) (match-end 0)))
|
|
(setq finish t))
|
|
|
|
((equal (match-string 0) "\n")
|
|
(set-match-data
|
|
(list begin (match-beginning 0)
|
|
begin (1+ begin)
|
|
(1+ begin) (match-beginning 0)
|
|
nil nil))
|
|
(setq finish t))))
|
|
(unless finish
|
|
;; string closed by end of buffer
|
|
(set-match-data
|
|
(list begin (point)
|
|
begin (1+ begin)
|
|
(1+ begin) (point)
|
|
nil nil))))))
|
|
;; there was a match
|
|
t))
|
|
|
|
(defun haskell-lexeme-looking-at-quasi-quote-literal ()
|
|
"Non-nil when point is just in front of Template Haskell
|
|
quaisquote literal.
|
|
|
|
Quasi quotes start with '[xxx|' or '[$xxx|' sequence and end with
|
|
'|]'. The 'xxx' is a quoter name. There is no escaping mechanism
|
|
provided for the ending sequence.
|
|
|
|
Regexp has subgroup expressions:
|
|
(match-text 1) matches the quoter name (without $ sign if present).
|
|
(match-text 2) matches the opening vertical bar.
|
|
(match-text 3) matches the inside of the quoted string.
|
|
(match-text 4) matches the closing vertical bar
|
|
or nil if at the end of the buffer.
|
|
|
|
Note that this function excludes 'e', 't', 'd', 'p' as quoter
|
|
names according to Template Haskell specification."
|
|
(let ((match-data-old (match-data)))
|
|
(if (and
|
|
(looking-at (rx-to-string `(: "[" (optional "$")
|
|
(group (regexp ,haskell-lexeme-id))
|
|
(group "|"))))
|
|
(equal (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
|
|
'varid)
|
|
(not (member (match-string 1) '("e" "t" "d" "p"))))
|
|
(save-excursion
|
|
;; note that quasi quote syntax does not have any escaping
|
|
;; mechanism and if not closed it will span til lthe end of buffer
|
|
(goto-char (match-end 0))
|
|
(let ((match-data (match-data))
|
|
(match-data-2 (and (re-search-forward "|]" nil t)
|
|
(match-data))))
|
|
(if match-data-2
|
|
(set-match-data
|
|
(list
|
|
(nth 0 match-data) (nth 1 match-data-2) ;; whole match
|
|
(nth 2 match-data) (nth 3 match-data) ;; quoter name
|
|
(nth 4 match-data) (nth 5 match-data) ;; opening bar
|
|
(nth 5 match-data) (nth 0 match-data-2) ;; inner string
|
|
(nth 0 match-data-2) (1+ (nth 0 match-data-2)))) ;; closing bar
|
|
|
|
(set-match-data
|
|
(list
|
|
(nth 0 match-data) (point-max) ;; whole match
|
|
(nth 2 match-data) (nth 3 match-data) ;; quoter name
|
|
(nth 4 match-data) (nth 5 match-data) ;; opening bar
|
|
(nth 5 match-data) (point-max) ;; inner string
|
|
nil nil)) ;; closing bar
|
|
))
|
|
t)
|
|
;; restore old match data if not matched
|
|
(set-match-data match-data-old)
|
|
nil)))
|
|
|
|
(defun haskell-lexeme-classify-by-first-char (char)
|
|
"Classify token by CHAR.
|
|
|
|
CHAR is a chararacter that is assumed to be the first character
|
|
of a token."
|
|
(let ((category (get-char-code-property (or char ?\ ) 'general-category)))
|
|
|
|
(cond
|
|
((or (member char '(?! ?# ?$ ?% ?& ?* ?+ ?. ?/ ?< ?= ?> ?? ?@ ?^ ?| ?~ ?\\ ?-))
|
|
(and (> char 127)
|
|
(member category '(Pc Pd Po Sm Sc Sk So))))
|
|
'varsym)
|
|
((equal char ?:)
|
|
'consym)
|
|
((equal char ?\')
|
|
'char)
|
|
((equal char ?\")
|
|
'string)
|
|
((member category '(Lu Lt))
|
|
'conid)
|
|
((or (equal char ?_)
|
|
(member category '(Ll Lo)))
|
|
'varid)
|
|
((and (>= char ?0) (<= char ?9))
|
|
'number)
|
|
((member char '(?\] ?\[ ?\( ?\) ?\{ ?\} ?\` ?\, ?\;))
|
|
'special))))
|
|
|
|
(defun haskell-lexeme-looking-at-token (&rest flags)
|
|
"Like `looking-at' but understands Haskell lexemes.
|
|
|
|
Moves point forward over whitespace. Returns a symbol describing
|
|
type of Haskell token recognized. Use `match-string',
|
|
`match-beginning' and `match-end' with argument 0 to query match
|
|
result.
|
|
|
|
Possible results are:
|
|
- 'special: for chars [](){}`,;
|
|
- 'comment: for single line comments
|
|
- 'nested-comment: for multiline comments
|
|
- 'qsymid: for qualified identifiers or symbols
|
|
- 'string: for strings literals
|
|
- 'char: for char literals
|
|
- 'number: for decimal, float, hexadecimal and octal number literals
|
|
- 'template-haskell-quote: for a string of apostrophes for template haskell
|
|
- 'template-haskell-quasi-quote: for a string of apostrophes for template haskell
|
|
|
|
Note that for qualified symbols (match-string 1) returns the
|
|
unqualified identifier or symbol. Further qualification for
|
|
symbol or identifier can be done with:
|
|
|
|
(haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))
|
|
|
|
See `haskell-lexeme-classify-by-first-char' for details."
|
|
(while
|
|
;; Due to how unterminated strings terminate at newline, some
|
|
;; newlines have syntax set to generic string delimeter. We want
|
|
;; those to be treated as whitespace anyway
|
|
(or
|
|
(> (skip-syntax-forward "-") 0)
|
|
(and (not (member 'newline flags))
|
|
(> (skip-chars-forward "\n") 0))))
|
|
(let
|
|
((case-fold-search nil)
|
|
(point (point-marker)))
|
|
(or
|
|
(and
|
|
(equal (string-to-syntax "<")
|
|
(get-char-property (point) 'syntax-table))
|
|
(progn
|
|
(set-match-data (list point (set-marker (make-marker) (line-end-position))))
|
|
'literate-comment))
|
|
(and (looking-at "\n")
|
|
'newline)
|
|
(and (looking-at "{-")
|
|
(save-excursion
|
|
(forward-comment 1)
|
|
(set-match-data (list point (point-marker)))
|
|
'nested-comment))
|
|
(and (haskell-lexeme-looking-at-char-literal)
|
|
'char)
|
|
(and (haskell-lexeme-looking-at-string-literal)
|
|
'string)
|
|
(and (looking-at "[][(){}`,;]")
|
|
(if (haskell-lexeme-looking-at-quasi-quote-literal)
|
|
'template-haskell-quasi-quote
|
|
'special))
|
|
(and (haskell-lexeme-looking-at-qidsym)
|
|
(if (save-match-data
|
|
(string-match "\\`---*\\'" (match-string-no-properties 0)))
|
|
(progn
|
|
(set-match-data (list point (set-marker (make-marker) (line-end-position))))
|
|
'comment)
|
|
'qsymid))
|
|
(and (looking-at haskell-lexeme-number)
|
|
'number)
|
|
(and (looking-at "'+")
|
|
'template-haskell-quote)
|
|
(and (looking-at ".")
|
|
'illegal))))
|
|
|
|
(provide 'haskell-lexeme)
|
|
|
|
;;; haskell-lexeme.el ends here
|