Klimi's new dotfiles with stow.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

523 lines
20 KiB

4 years ago
  1. ;;;
  2. ;;; This code was written by:
  3. ;;;
  4. ;;; Lawrence E. Freil <lef@freil.com>
  5. ;;; National Science Center Foundation
  6. ;;; Augusta, Georgia 30909
  7. ;;;
  8. ;;; This program was released into the public domain on 2005-08-31.
  9. ;;; (See the slime-devel mailing list archive for details.)
  10. ;;;
  11. ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
  12. ;;; parser.
  13. ;;;
  14. ;;; This regular expression parser operates by taking a
  15. ;;; regular expression and breaking it down into a list
  16. ;;; consisting of lisp expressions and flags. The list
  17. ;;; of lisp expressions is then taken in turned into a
  18. ;;; lambda expression that can be later applied to a
  19. ;;; string argument for parsing.
  20. ;;;;
  21. ;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
  22. ;;;; to get working with Corman Lisp 1.42, add package statement and export
  23. ;;;; relevant functions.
  24. ;;;;
  25. (in-package :cl-user)
  26. ;; Renamed to slime-nregex avoid name clashes with other versions of
  27. ;; this file. -- he
  28. ;;;; CND - 6/3/2001
  29. (defpackage slime-nregex
  30. (:use #:common-lisp)
  31. (:export
  32. #:regex
  33. #:regex-compile
  34. ))
  35. ;;;; CND - 6/3/2001
  36. (in-package :slime-nregex)
  37. ;;;
  38. ;;; First we create a copy of macros to help debug the beast
  39. (eval-when (:compile-toplevel :load-toplevel :execute)
  40. (defvar *regex-debug* nil) ; Set to nil for no debugging code
  41. )
  42. (defmacro info (message &rest args)
  43. (if *regex-debug*
  44. `(format *standard-output* ,message ,@args)))
  45. ;;;
  46. ;;; Declare the global variables for storing the paren index list.
  47. ;;;
  48. (defvar *regex-groups* (make-array 10))
  49. (defvar *regex-groupings* 0)
  50. ;;;
  51. ;;; Declare a simple interface for testing. You probably wouldn't want
  52. ;;; to use this interface unless you were just calling this once.
  53. ;;;
  54. (defun regex (expression string)
  55. "Usage: (regex <expression> <string)
  56. This function will call regex-compile on the expression and then apply
  57. the string to the returned lambda list."
  58. (let ((findit (cond ((stringp expression)
  59. (regex-compile expression))
  60. ((listp expression)
  61. expression)))
  62. (result nil))
  63. (if (not (funcall (if (functionp findit)
  64. findit
  65. (eval `(function ,findit))) string))
  66. (return-from regex nil))
  67. (if (= *regex-groupings* 0)
  68. (return-from regex t))
  69. (dotimes (i *regex-groupings*)
  70. (push (funcall 'subseq
  71. string
  72. (car (aref *regex-groups* i))
  73. (cadr (aref *regex-groups* i)))
  74. result))
  75. (reverse result)))
  76. ;;;
  77. ;;; Declare some simple macros to make the code more readable.
  78. ;;;
  79. (defvar *regex-special-chars* "?*+.()[]\\${}")
  80. (defmacro add-exp (list)
  81. "Add an item to the end of expression"
  82. `(setf expression (append expression ,list)))
  83. ;;;
  84. ;;; Define a function that will take a quoted character and return
  85. ;;; what the real character should be plus how much of the source
  86. ;;; string was used. If the result is a set of characters, return an
  87. ;;; array of bits indicating which characters should be set. If the
  88. ;;; expression is one of the sub-group matches return a
  89. ;;; list-expression that will provide the match.
  90. ;;;
  91. (defun regex-quoted (char-string &optional (invert nil))
  92. "Usage: (regex-quoted <char-string> &optional invert)
  93. Returns either the quoted character or a simple bit vector of bits set for
  94. the matching values"
  95. (let ((first (char char-string 0))
  96. (result (char char-string 0))
  97. (used-length 1))
  98. (cond ((eql first #\n)
  99. (setf result #\NewLine))
  100. ((eql first #\c)
  101. (setf result #\Return))
  102. ((eql first #\t)
  103. (setf result #\Tab))
  104. ((eql first #\d)
  105. (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
  106. ((eql first #\D)
  107. (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
  108. ((eql first #\w)
  109. (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
  110. ((eql first #\W)
  111. (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
  112. ((eql first #\b)
  113. (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
  114. ((eql first #\B)
  115. (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
  116. ((eql first #\s)
  117. (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
  118. ((eql first #\S)
  119. (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
  120. ((and (>= (char-code first) (char-code #\0))
  121. (<= (char-code first) (char-code #\9)))
  122. (if (and (> (length char-string) 2)
  123. (and (>= (char-code (char char-string 1)) (char-code #\0))
  124. (<= (char-code (char char-string 1)) (char-code #\9))
  125. (>= (char-code (char char-string 2)) (char-code #\0))
  126. (<= (char-code (char char-string 2)) (char-code #\9))))
  127. ;;
  128. ;; It is a single character specified in octal
  129. ;;
  130. (progn
  131. (setf result (do ((x 0 (1+ x))
  132. (return 0))
  133. ((= x 2) return)
  134. (setf return (+ (* return 8)
  135. (- (char-code (char char-string x))
  136. (char-code #\0))))))
  137. (setf used-length 3))
  138. ;;
  139. ;; We have a group number replacement.
  140. ;;
  141. (let ((group (- (char-code first) (char-code #\0))))
  142. (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
  143. (cadr (aref *regex-groups* ,group)))))
  144. (if (< length (+ index (length nstring)))
  145. (return-from compare nil))
  146. (if (not (string= string nstring
  147. :start1 index
  148. :end1 (+ index (length nstring))))
  149. (return-from compare nil)
  150. (incf index (length nstring)))))))))
  151. (t
  152. (setf result first)))
  153. (if (and (vectorp result) invert)
  154. (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
  155. (values result used-length)))
  156. ;;;
  157. ;;; Now for the main regex compiler routine.
  158. ;;;
  159. (defun regex-compile (source &key (anchored nil))
  160. "Usage: (regex-compile <expression> [ :anchored (t/nil) ])
  161. This function take a regular expression (supplied as source) and
  162. compiles this into a lambda list that a string argument can then
  163. be applied to. It is also possible to compile this lambda list
  164. for better performance or to save it as a named function for later
  165. use"
  166. (info "Now entering regex-compile with \"~A\"~%" source)
  167. ;;
  168. ;; This routine works in two parts.
  169. ;; The first pass take the regular expression and produces a list of
  170. ;; operators and lisp expressions for the entire regular expression.
  171. ;; The second pass takes this list and produces the lambda expression.
  172. (let ((expression '()) ; holder for expressions
  173. (group 1) ; Current group index
  174. (group-stack nil) ; Stack of current group endings
  175. (result nil) ; holder for built expression.
  176. (fast-first nil)) ; holder for quick unanchored scan
  177. ;;
  178. ;; If the expression was an empty string then it alway
  179. ;; matches (so lets leave early)
  180. ;;
  181. (if (= (length source) 0)
  182. (return-from regex-compile
  183. '(lambda (&rest args)
  184. (declare (ignore args))
  185. t)))
  186. ;;
  187. ;; If the first character is a caret then set the anchored
  188. ;; flags and remove if from the expression string.
  189. ;;
  190. (cond ((eql (char source 0) #\^)
  191. (setf source (subseq source 1))
  192. (setf anchored t)))
  193. ;;
  194. ;; If the first sequence is .* then also set the anchored flags.
  195. ;; (This is purely for optimization, it will work without this).
  196. ;;
  197. (if (>= (length source) 2)
  198. (if (string= source ".*" :start1 0 :end1 2)
  199. (setf anchored t)))
  200. ;;
  201. ;; Also, If this is not an anchored search and the first character is
  202. ;; a literal, then do a quick scan to see if it is even in the string.
  203. ;; If not then we can issue a quick nil,
  204. ;; otherwise we can start the search at the matching character to skip
  205. ;; the checks of the non-matching characters anyway.
  206. ;;
  207. ;; If I really wanted to speed up this section of code it would be
  208. ;; easy to recognize the case of a fairly long multi-character literal
  209. ;; and generate a Boyer-Moore search for the entire literal.
  210. ;;
  211. ;; I generate the code to do a loop because on CMU Lisp this is about
  212. ;; twice as fast a calling position.
  213. ;;
  214. (if (and (not anchored)
  215. (not (position (char source 0) *regex-special-chars*))
  216. (not (and (> (length source) 1)
  217. (position (char source 1) *regex-special-chars*))))
  218. (setf fast-first `((if (not (dotimes (i length nil)
  219. (if (eql (char string i)
  220. ,(char source 0))
  221. (return (setf start i)))))
  222. (return-from final-return nil)))))
  223. ;;
  224. ;; Generate the very first expression to save the starting index
  225. ;; so that group 0 will be the entire string matched always
  226. ;;
  227. (add-exp '((setf (aref *regex-groups* 0)
  228. (list index nil))))
  229. ;;
  230. ;; Loop over each character in the regular expression building the
  231. ;; expression list as we go.
  232. ;;
  233. (do ((eindex 0 (1+ eindex)))
  234. ((= eindex (length source)))
  235. (let ((current (char source eindex)))
  236. (info "Now processing character ~A index = ~A~%" current eindex)
  237. (case current
  238. ((#\.)
  239. ;;
  240. ;; Generate code for a single wild character
  241. ;;
  242. (add-exp '((if (>= index length)
  243. (return-from compare nil)
  244. (incf index)))))
  245. ((#\$)
  246. ;;
  247. ;; If this is the last character of the expression then
  248. ;; anchor the end of the expression, otherwise let it slide
  249. ;; as a standard character (even though it should be quoted).
  250. ;;
  251. (if (= eindex (1- (length source)))
  252. (add-exp '((if (not (= index length))
  253. (return-from compare nil))))
  254. (add-exp '((if (not (and (< index length)
  255. (eql (char string index) #\$)))
  256. (return-from compare nil)
  257. (incf index))))))
  258. ((#\*)
  259. (add-exp '(ASTRISK)))
  260. ((#\+)
  261. (add-exp '(PLUS)))
  262. ((#\?)
  263. (add-exp '(QUESTION)))
  264. ((#\()
  265. ;;
  266. ;; Start a grouping.
  267. ;;
  268. (incf group)
  269. (push group group-stack)
  270. (add-exp `((setf (aref *regex-groups* ,(1- group))
  271. (list index nil))))
  272. (add-exp `(,group)))
  273. ((#\))
  274. ;;
  275. ;; End a grouping
  276. ;;
  277. (let ((group (pop group-stack)))
  278. (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
  279. index)))
  280. (add-exp `(,(- group)))))
  281. ((#\[)
  282. ;;
  283. ;; Start of a range operation.
  284. ;; Generate a bit-vector that has one bit per possible character
  285. ;; and then on each character or range, set the possible bits.
  286. ;;
  287. ;; If the first character is carat then invert the set.
  288. (let* ((invert (eql (char source (1+ eindex)) #\^))
  289. (bitstring (make-array 256 :element-type 'bit
  290. :initial-element
  291. (if invert 1 0)))
  292. (set-char (if invert 0 1)))
  293. (if invert (incf eindex))
  294. (do ((x (1+ eindex) (1+ x)))
  295. ((eql (char source x) #\]) (setf eindex x))
  296. (info "Building range with character ~A~%" (char source x))
  297. (cond ((and (eql (char source (1+ x)) #\-)
  298. (not (eql (char source (+ x 2)) #\])))
  299. (if (>= (char-code (char source x))
  300. (char-code (char source (+ 2 x))))
  301. (error "Invalid range \"~A-~A\". Ranges must be in acending order"
  302. (char source x) (char source (+ 2 x))))
  303. (do ((j (char-code (char source x)) (1+ j)))
  304. ((> j (char-code (char source (+ 2 x))))
  305. (incf x 2))
  306. (info "Setting bit for char ~A code ~A~%" (code-char j) j)
  307. (setf (sbit bitstring j) set-char)))
  308. (t
  309. (cond ((not (eql (char source x) #\]))
  310. (let ((char (char source x)))
  311. ;;
  312. ;; If the character is quoted then find out what
  313. ;; it should have been
  314. ;;
  315. (if (eql (char source x) #\\ )
  316. (let ((length))
  317. (multiple-value-setq (char length)
  318. (regex-quoted (subseq source x) invert))
  319. (incf x length)))
  320. (info "Setting bit for char ~A code ~A~%" char (char-code char))
  321. (if (not (vectorp char))
  322. (setf (sbit bitstring (char-code (char source x))) set-char)
  323. (bit-ior bitstring char t))))))))
  324. (add-exp `((let ((range ,bitstring))
  325. (if (>= index length)
  326. (return-from compare nil))
  327. (if (= 1 (sbit range (char-code (char string index))))
  328. (incf index)
  329. (return-from compare nil)))))))
  330. ((#\\ )
  331. ;;
  332. ;; Intreprete the next character as a special, range, octal, group or
  333. ;; just the character itself.
  334. ;;
  335. (let ((length)
  336. (value))
  337. (multiple-value-setq (value length)
  338. (regex-quoted (subseq source (1+ eindex)) nil))
  339. (cond ((listp value)
  340. (add-exp value))
  341. ((characterp value)
  342. (add-exp `((if (not (and (< index length)
  343. (eql (char string index)
  344. ,value)))
  345. (return-from compare nil)
  346. (incf index)))))
  347. ((vectorp value)
  348. (add-exp `((let ((range ,value))
  349. (if (>= index length)
  350. (return-from compare nil))
  351. (if (= 1 (sbit range (char-code (char string index))))
  352. (incf index)
  353. (return-from compare nil)))))))
  354. (incf eindex length)))
  355. (t
  356. ;;
  357. ;; We have a literal character.
  358. ;; Scan to see how many we have and if it is more than one
  359. ;; generate a string= verses as single eql.
  360. ;;
  361. (let* ((lit "")
  362. (term (dotimes (litindex (- (length source) eindex) nil)
  363. (let ((litchar (char source (+ eindex litindex))))
  364. (if (position litchar *regex-special-chars*)
  365. (return litchar)
  366. (progn
  367. (info "Now adding ~A index ~A to lit~%" litchar
  368. litindex)
  369. (setf lit (concatenate 'string lit
  370. (string litchar)))))))))
  371. (if (= (length lit) 1)
  372. (add-exp `((if (not (and (< index length)
  373. (eql (char string index) ,current)))
  374. (return-from compare nil)
  375. (incf index))))
  376. ;;
  377. ;; If we have a multi-character literal then we must
  378. ;; check to see if the next character (if there is one)
  379. ;; is an astrisk or a plus or a question mark. If so then we must not use this
  380. ;; character in the big literal.
  381. (progn
  382. (if (or (eql term #\*)
  383. (eql term #\+)
  384. (eql term #\?))
  385. (setf lit (subseq lit 0 (1- (length lit)))))
  386. (add-exp `((if (< length (+ index ,(length lit)))
  387. (return-from compare nil))
  388. (if (not (string= string ,lit :start1 index
  389. :end1 (+ index ,(length lit))))
  390. (return-from compare nil)
  391. (incf index ,(length lit)))))))
  392. (incf eindex (1- (length lit))))))))
  393. ;;
  394. ;; Plug end of list to return t. If we made it this far then
  395. ;; We have matched!
  396. (add-exp '((setf (cadr (aref *regex-groups* 0))
  397. index)))
  398. (add-exp '((return-from final-return t)))
  399. ;;
  400. ;;; (print expression)
  401. ;;
  402. ;; Now take the expression list and turn it into a lambda expression
  403. ;; replacing the special flags with lisp code.
  404. ;; For example: A BEGIN needs to be replace by an expression that
  405. ;; saves the current index, then evaluates everything till it gets to
  406. ;; the END then save the new index if it didn't fail.
  407. ;; On an ASTRISK I need to take the previous expression and wrap
  408. ;; it in a do that will evaluate the expression till an error
  409. ;; occurs and then another do that encompases the remainder of the
  410. ;; regular expression and iterates decrementing the index by one
  411. ;; of the matched expression sizes and then returns nil. After
  412. ;; the last expression insert a form that does a return t so that
  413. ;; if the entire nested sub-expression succeeds then the loop
  414. ;; is broken manually.
  415. ;;
  416. (setf result (copy-tree nil))
  417. ;;
  418. ;; Reversing the current expression makes building up the
  419. ;; lambda list easier due to the nexting of expressions when
  420. ;; and astrisk has been encountered.
  421. (setf expression (reverse expression))
  422. (do ((elt 0 (1+ elt)))
  423. ((>= elt (length expression)))
  424. (let ((piece (nth elt expression)))
  425. ;;
  426. ;; Now check for PLUS, if so then ditto the expression and then let the
  427. ;; ASTRISK below handle the rest.
  428. ;;
  429. (cond ((eql piece 'PLUS)
  430. (cond ((listp (nth (1+ elt) expression))
  431. (setf result (append (list (nth (1+ elt) expression))
  432. result)))
  433. ;;
  434. ;; duplicate the entire group
  435. ;; NOTE: This hasn't been implemented yet!!
  436. (t
  437. (error "GROUP repeat hasn't been implemented yet~%")))))
  438. (cond ((listp piece) ;Just append the list
  439. (setf result (append (list piece) result)))
  440. ((eql piece 'QUESTION) ; Wrap it in a block that won't fail
  441. (cond ((listp (nth (1+ elt) expression))
  442. (setf result
  443. (append `((progn (block compare
  444. ,(nth (1+ elt)
  445. expression))
  446. t))
  447. result))
  448. (incf elt))
  449. ;;
  450. ;; This is a QUESTION on an entire group which
  451. ;; hasn't been implemented yet!!!
  452. ;;
  453. (t
  454. (error "Optional groups not implemented yet~%"))))
  455. ((or (eql piece 'ASTRISK) ; Do the wild thing!
  456. (eql piece 'PLUS))
  457. (cond ((listp (nth (1+ elt) expression))
  458. ;;
  459. ;; This is a single character wild card so
  460. ;; do the simple form.
  461. ;;
  462. (setf result
  463. `((let ((oindex index))
  464. (block compare
  465. (do ()
  466. (nil)
  467. ,(nth (1+ elt) expression)))
  468. (do ((start index (1- start)))
  469. ((< start oindex) nil)
  470. (let ((index start))
  471. (block compare
  472. ,@result))))))
  473. (incf elt))
  474. (t
  475. ;;
  476. ;; This is a subgroup repeated so I must build
  477. ;; the loop using several values.
  478. ;;
  479. ))
  480. )
  481. (t t)))) ; Just ignore everything else.
  482. ;;
  483. ;; Now wrap the result in a lambda list that can then be
  484. ;; invoked or compiled, however the user wishes.
  485. ;;
  486. (if anchored
  487. (setf result
  488. `(lambda (string &key (start 0) (end (length string)))
  489. (setf *regex-groupings* ,group)
  490. (block final-return
  491. (block compare
  492. (let ((index start)
  493. (length end))
  494. ,@result)))))
  495. (setf result
  496. `(lambda (string &key (start 0) (end (length string)))
  497. (setf *regex-groupings* ,group)
  498. (block final-return
  499. (let ((length end))
  500. ,@fast-first
  501. (do ((marker start (1+ marker)))
  502. ((> marker end) nil)
  503. (let ((index marker))
  504. (if (block compare
  505. ,@result)
  506. (return t)))))))))))
  507. ;; (provide 'nregex)