| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.34 2008/07/06 22:36:30 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; The lexer's responsibility is to convert the regex string into a |
|---|
| 5 |
;;; sequence of tokens which are in turn consumed by the parser. |
|---|
| 6 |
;;; |
|---|
| 7 |
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows' |
|---|
| 8 |
;;; (with a little help from the parser) how many register groups it |
|---|
| 9 |
;;; has opened so far. (The latter is necessary for interpreting |
|---|
| 10 |
;;; strings like "\\10" correctly.) |
|---|
| 11 |
|
|---|
| 12 |
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 13 |
|
|---|
| 14 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 15 |
;;; modification, are permitted provided that the following conditions |
|---|
| 16 |
;;; are met: |
|---|
| 17 |
|
|---|
| 18 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 19 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 20 |
|
|---|
| 21 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 22 |
;;; copyright notice, this list of conditions and the following |
|---|
| 23 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 24 |
;;; provided with the distribution. |
|---|
| 25 |
|
|---|
| 26 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 27 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 28 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 29 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 30 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 31 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 32 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 33 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 34 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 35 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 36 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 37 |
|
|---|
| 38 |
(in-package :cl-ppcre) |
|---|
| 39 |
|
|---|
| 40 |
(declaim (inline map-char-to-special-class)) |
|---|
| 41 |
(defun map-char-to-special-char-class (chr) |
|---|
| 42 |
(declare #.*standard-optimize-settings*) |
|---|
| 43 |
"Maps escaped characters like \"\\d\" to the tokens which represent |
|---|
| 44 |
their associated character classes." |
|---|
| 45 |
(case chr |
|---|
| 46 |
((#\d) |
|---|
| 47 |
:digit-class) |
|---|
| 48 |
((#\D) |
|---|
| 49 |
:non-digit-class) |
|---|
| 50 |
((#\w) |
|---|
| 51 |
:word-char-class) |
|---|
| 52 |
((#\W) |
|---|
| 53 |
:non-word-char-class) |
|---|
| 54 |
((#\s) |
|---|
| 55 |
:whitespace-char-class) |
|---|
| 56 |
((#\S) |
|---|
| 57 |
:non-whitespace-char-class))) |
|---|
| 58 |
|
|---|
| 59 |
(defstruct (lexer (:constructor make-lexer-internal)) |
|---|
| 60 |
"LEXER structures are used to hold the regex string which is |
|---|
| 61 |
currently lexed and to keep track of the lexer's state." |
|---|
| 62 |
(str "" :type string :read-only t) |
|---|
| 63 |
(len 0 :type fixnum :read-only t) |
|---|
| 64 |
(reg 0 :type fixnum) |
|---|
| 65 |
(pos 0 :type fixnum) |
|---|
| 66 |
(last-pos nil :type list)) |
|---|
| 67 |
|
|---|
| 68 |
(defun make-lexer (string) |
|---|
| 69 |
(declare (inline make-lexer-internal) |
|---|
| 70 |
#-:genera (string string)) |
|---|
| 71 |
(make-lexer-internal :str (maybe-coerce-to-simple-string string) |
|---|
| 72 |
:len (length string))) |
|---|
| 73 |
|
|---|
| 74 |
(declaim (inline end-of-string-p)) |
|---|
| 75 |
(defun end-of-string-p (lexer) |
|---|
| 76 |
(declare #.*standard-optimize-settings*) |
|---|
| 77 |
"Tests whether we're at the end of the regex string." |
|---|
| 78 |
(<= (lexer-len lexer) |
|---|
| 79 |
(lexer-pos lexer))) |
|---|
| 80 |
|
|---|
| 81 |
(declaim (inline looking-at-p)) |
|---|
| 82 |
(defun looking-at-p (lexer chr) |
|---|
| 83 |
(declare #.*standard-optimize-settings*) |
|---|
| 84 |
"Tests whether the next character the lexer would see is CHR. |
|---|
| 85 |
Does not respect extended mode." |
|---|
| 86 |
(and (not (end-of-string-p lexer)) |
|---|
| 87 |
(char= (schar (lexer-str lexer) (lexer-pos lexer)) |
|---|
| 88 |
chr))) |
|---|
| 89 |
|
|---|
| 90 |
(declaim (inline next-char-non-extended)) |
|---|
| 91 |
(defun next-char-non-extended (lexer) |
|---|
| 92 |
(declare #.*standard-optimize-settings*) |
|---|
| 93 |
"Returns the next character which is to be examined and updates the |
|---|
| 94 |
POS slot. Does not respect extended mode." |
|---|
| 95 |
(cond ((end-of-string-p lexer) nil) |
|---|
| 96 |
(t (prog1 |
|---|
| 97 |
(schar (lexer-str lexer) (lexer-pos lexer)) |
|---|
| 98 |
(incf (lexer-pos lexer)))))) |
|---|
| 99 |
|
|---|
| 100 |
(defun next-char (lexer) |
|---|
| 101 |
(declare #.*standard-optimize-settings*) |
|---|
| 102 |
"Returns the next character which is to be examined and updates the |
|---|
| 103 |
POS slot. Respects extended mode, i.e. whitespace, comments, and also |
|---|
| 104 |
nested comments are skipped if applicable." |
|---|
| 105 |
(let ((next-char (next-char-non-extended lexer)) |
|---|
| 106 |
last-loop-pos) |
|---|
| 107 |
(loop |
|---|
| 108 |
;; remember where we started |
|---|
| 109 |
(setq last-loop-pos (lexer-pos lexer)) |
|---|
| 110 |
;; first we look for nested comments like (?#foo) |
|---|
| 111 |
(when (and next-char |
|---|
| 112 |
(char= next-char #\() |
|---|
| 113 |
(looking-at-p lexer #\?)) |
|---|
| 114 |
(incf (lexer-pos lexer)) |
|---|
| 115 |
(cond ((looking-at-p lexer #\#) |
|---|
| 116 |
;; must be a nested comment - so we have to search for |
|---|
| 117 |
;; the closing parenthesis |
|---|
| 118 |
(let ((error-pos (- (lexer-pos lexer) 2))) |
|---|
| 119 |
(unless |
|---|
| 120 |
;; loop 'til ')' or end of regex string and |
|---|
| 121 |
;; return NIL if ')' wasn't encountered |
|---|
| 122 |
(loop for skip-char = next-char |
|---|
| 123 |
then (next-char-non-extended lexer) |
|---|
| 124 |
while (and skip-char |
|---|
| 125 |
(char/= skip-char #\))) |
|---|
| 126 |
finally (return skip-char)) |
|---|
| 127 |
(signal-syntax-error* error-pos "Comment group not closed."))) |
|---|
| 128 |
(setq next-char (next-char-non-extended lexer))) |
|---|
| 129 |
(t |
|---|
| 130 |
;; undo effect of previous INCF if we didn't see a # |
|---|
| 131 |
(decf (lexer-pos lexer))))) |
|---|
| 132 |
(when *extended-mode-p* |
|---|
| 133 |
;; now - if we're in extended mode - we skip whitespace and |
|---|
| 134 |
;; comments; repeat the following loop while we look at |
|---|
| 135 |
;; whitespace or #\# |
|---|
| 136 |
(loop while (and next-char |
|---|
| 137 |
(or (char= next-char #\#) |
|---|
| 138 |
(whitespacep next-char))) |
|---|
| 139 |
do (setq next-char |
|---|
| 140 |
(if (char= next-char #\#) |
|---|
| 141 |
;; if we saw a comment marker skip until |
|---|
| 142 |
;; we're behind #\Newline... |
|---|
| 143 |
(loop for skip-char = next-char |
|---|
| 144 |
then (next-char-non-extended lexer) |
|---|
| 145 |
while (and skip-char |
|---|
| 146 |
(char/= skip-char #\Newline)) |
|---|
| 147 |
finally (return (next-char-non-extended lexer))) |
|---|
| 148 |
;; ...otherwise (whitespace) skip until we |
|---|
| 149 |
;; see the next non-whitespace character |
|---|
| 150 |
(loop for skip-char = next-char |
|---|
| 151 |
then (next-char-non-extended lexer) |
|---|
| 152 |
while (and skip-char |
|---|
| 153 |
(whitespacep skip-char)) |
|---|
| 154 |
finally (return skip-char)))))) |
|---|
| 155 |
;; if the position has moved we have to repeat our tests |
|---|
| 156 |
;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which |
|---|
| 157 |
;; would be equivalent to /^a{3}c/ in Perl |
|---|
| 158 |
(unless (> (lexer-pos lexer) last-loop-pos) |
|---|
| 159 |
(return next-char))))) |
|---|
| 160 |
|
|---|
| 161 |
(declaim (inline fail)) |
|---|
| 162 |
(defun fail (lexer) |
|---|
| 163 |
(declare #.*standard-optimize-settings*) |
|---|
| 164 |
"Moves (LEXER-POS LEXER) back to the last position stored in |
|---|
| 165 |
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack." |
|---|
| 166 |
(unless (lexer-last-pos lexer) |
|---|
| 167 |
(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer)) |
|---|
| 168 |
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer))) |
|---|
| 169 |
nil) |
|---|
| 170 |
|
|---|
| 171 |
(defun get-number (lexer &key (radix 10) max-length no-whitespace-p) |
|---|
| 172 |
(declare #.*standard-optimize-settings*) |
|---|
| 173 |
"Read and consume the number the lexer is currently looking at and |
|---|
| 174 |
return it. Returns NIL if no number could be identified. |
|---|
| 175 |
RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read |
|---|
| 176 |
at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL |
|---|
| 177 |
we don't tolerate whitespace in front of the number." |
|---|
| 178 |
(when (or (end-of-string-p lexer) |
|---|
| 179 |
(and no-whitespace-p |
|---|
| 180 |
(whitespacep (schar (lexer-str lexer) (lexer-pos lexer))))) |
|---|
| 181 |
(return-from get-number nil)) |
|---|
| 182 |
(multiple-value-bind (integer new-pos) |
|---|
| 183 |
(parse-integer (lexer-str lexer) |
|---|
| 184 |
:start (lexer-pos lexer) |
|---|
| 185 |
:end (if max-length |
|---|
| 186 |
(let ((end-pos (+ (lexer-pos lexer) |
|---|
| 187 |
(the fixnum max-length))) |
|---|
| 188 |
(lexer-len (lexer-len lexer))) |
|---|
| 189 |
(if (< end-pos lexer-len) |
|---|
| 190 |
end-pos |
|---|
| 191 |
lexer-len)) |
|---|
| 192 |
(lexer-len lexer)) |
|---|
| 193 |
:radix radix |
|---|
| 194 |
:junk-allowed t) |
|---|
| 195 |
(cond ((and integer (>= (the fixnum integer) 0)) |
|---|
| 196 |
(setf (lexer-pos lexer) new-pos) |
|---|
| 197 |
integer) |
|---|
| 198 |
(t nil)))) |
|---|
| 199 |
|
|---|
| 200 |
(declaim (inline try-number)) |
|---|
| 201 |
(defun try-number (lexer &key (radix 10) max-length no-whitespace-p) |
|---|
| 202 |
(declare #.*standard-optimize-settings*) |
|---|
| 203 |
"Like GET-NUMBER but won't consume anything if no number is seen." |
|---|
| 204 |
;; remember current position |
|---|
| 205 |
(push (lexer-pos lexer) (lexer-last-pos lexer)) |
|---|
| 206 |
(let ((number (get-number lexer |
|---|
| 207 |
:radix radix |
|---|
| 208 |
:max-length max-length |
|---|
| 209 |
:no-whitespace-p no-whitespace-p))) |
|---|
| 210 |
(or number (fail lexer)))) |
|---|
| 211 |
|
|---|
| 212 |
(declaim (inline make-char-from-code)) |
|---|
| 213 |
(defun make-char-from-code (number error-pos) |
|---|
| 214 |
(declare #.*standard-optimize-settings*) |
|---|
| 215 |
"Create character from char-code NUMBER. NUMBER can be NIL |
|---|
| 216 |
which is interpreted as 0. ERROR-POS is the position where |
|---|
| 217 |
the corresponding number started within the regex string." |
|---|
| 218 |
;; only look at rightmost eight bits in compliance with Perl |
|---|
| 219 |
(let ((code (logand #o377 (the fixnum (or number 0))))) |
|---|
| 220 |
(or (and (< code char-code-limit) |
|---|
| 221 |
(code-char code)) |
|---|
| 222 |
(signal-syntax-error* error-pos "No character for hex-code ~X." number)))) |
|---|
| 223 |
|
|---|
| 224 |
(defun unescape-char (lexer) |
|---|
| 225 |
(declare #.*standard-optimize-settings*) |
|---|
| 226 |
"Convert the characters\(s) following a backslash into a token |
|---|
| 227 |
which is returned. This function is to be called when the backslash |
|---|
| 228 |
has already been consumed. Special character classes like \\W are |
|---|
| 229 |
handled elsewhere." |
|---|
| 230 |
(when (end-of-string-p lexer) |
|---|
| 231 |
(signal-syntax-error "String ends with backslash.")) |
|---|
| 232 |
(let ((chr (next-char-non-extended lexer))) |
|---|
| 233 |
(case chr |
|---|
| 234 |
((#\E) |
|---|
| 235 |
;; if \Q quoting is on this is ignored, otherwise it's just an |
|---|
| 236 |
;; #\E |
|---|
| 237 |
(if *allow-quoting* |
|---|
| 238 |
:void |
|---|
| 239 |
#\E)) |
|---|
| 240 |
((#\c) |
|---|
| 241 |
;; \cx means control-x in Perl |
|---|
| 242 |
(let ((next-char (next-char-non-extended lexer))) |
|---|
| 243 |
(unless next-char |
|---|
| 244 |
(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A.")) |
|---|
| 245 |
(code-char (logxor #x40 (char-code (char-upcase next-char)))))) |
|---|
| 246 |
((#\x) |
|---|
| 247 |
;; \x should be followed by a hexadecimal char code, |
|---|
| 248 |
;; two digits or less |
|---|
| 249 |
(let* ((error-pos (lexer-pos lexer)) |
|---|
| 250 |
(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t))) |
|---|
| 251 |
;; note that it is OK if \x is followed by zero digits |
|---|
| 252 |
(make-char-from-code number error-pos))) |
|---|
| 253 |
((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) |
|---|
| 254 |
;; \x should be followed by an octal char code, |
|---|
| 255 |
;; three digits or less |
|---|
| 256 |
(let* ((error-pos (decf (lexer-pos lexer))) |
|---|
| 257 |
(number (get-number lexer :radix 8 :max-length 3))) |
|---|
| 258 |
(make-char-from-code number error-pos))) |
|---|
| 259 |
;; the following five character names are 'semi-standard' |
|---|
| 260 |
;; according to the CLHS but I'm not aware of any implementation |
|---|
| 261 |
;; that doesn't implement them |
|---|
| 262 |
((#\t) |
|---|
| 263 |
#\Tab) |
|---|
| 264 |
((#\n) |
|---|
| 265 |
#\Newline) |
|---|
| 266 |
((#\r) |
|---|
| 267 |
#\Return) |
|---|
| 268 |
((#\f) |
|---|
| 269 |
#\Page) |
|---|
| 270 |
((#\b) |
|---|
| 271 |
#\Backspace) |
|---|
| 272 |
((#\a) |
|---|
| 273 |
(code-char 7)) ; ASCII bell |
|---|
| 274 |
((#\e) |
|---|
| 275 |
(code-char 27)) ; ASCII escape |
|---|
| 276 |
(otherwise |
|---|
| 277 |
;; all other characters aren't affected by a backslash |
|---|
| 278 |
chr)))) |
|---|
| 279 |
|
|---|
| 280 |
(defun read-char-property (lexer first-char) |
|---|
| 281 |
(declare #.*standard-optimize-settings*) |
|---|
| 282 |
(unless (eql (next-char-non-extended lexer) #\{) |
|---|
| 283 |
(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char)) |
|---|
| 284 |
(let ((name (with-output-to-string (out nil :element-type |
|---|
| 285 |
#+:lispworks 'lw:simple-char #-:lispworks 'character) |
|---|
| 286 |
(loop |
|---|
| 287 |
(let ((char (or (next-char-non-extended lexer) |
|---|
| 288 |
(signal-syntax-error "Unexpected EOF after \\~A{." first-char)))) |
|---|
| 289 |
(when (char= char #\}) |
|---|
| 290 |
(return)) |
|---|
| 291 |
(write-char char out)))))) |
|---|
| 292 |
(list (if (char= first-char #\p) :property :inverted-property) |
|---|
| 293 |
;; we must reverse here because of what PARSE-STRING does |
|---|
| 294 |
(nreverse name)))) |
|---|
| 295 |
|
|---|
| 296 |
(defun collect-char-class (lexer) |
|---|
| 297 |
"Reads and consumes characters from regex string until a right |
|---|
| 298 |
bracket is seen. Assembles them into a list \(which is returned) of |
|---|
| 299 |
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and |
|---|
| 300 |
tokens representing special character classes." |
|---|
| 301 |
(declare #.*standard-optimize-settings*) |
|---|
| 302 |
(let ((start-pos (lexer-pos lexer)) ; remember start for error message |
|---|
| 303 |
hyphen-seen |
|---|
| 304 |
last-char |
|---|
| 305 |
list) |
|---|
| 306 |
(flet ((handle-char (c) |
|---|
| 307 |
"Do the right thing with character C depending on whether |
|---|
| 308 |
we're inside a range or not." |
|---|
| 309 |
(cond ((and hyphen-seen last-char) |
|---|
| 310 |
(setf (car list) (list :range last-char c) |
|---|
| 311 |
last-char nil)) |
|---|
| 312 |
(t |
|---|
| 313 |
(push c list) |
|---|
| 314 |
(setq last-char c))) |
|---|
| 315 |
(setq hyphen-seen nil))) |
|---|
| 316 |
(loop for first = t then nil |
|---|
| 317 |
for c = (next-char-non-extended lexer) |
|---|
| 318 |
;; leave loop if at end of string |
|---|
| 319 |
while c |
|---|
| 320 |
do (cond |
|---|
| 321 |
((char= c #\\) |
|---|
| 322 |
;; we've seen a backslash |
|---|
| 323 |
(let ((next-char (next-char-non-extended lexer))) |
|---|
| 324 |
(case next-char |
|---|
| 325 |
((#\d #\D #\w #\W #\s #\S) |
|---|
| 326 |
;; a special character class |
|---|
| 327 |
(push (map-char-to-special-char-class next-char) list) |
|---|
| 328 |
;; if the last character was a hyphen |
|---|
| 329 |
;; just collect it literally |
|---|
| 330 |
(when hyphen-seen |
|---|
| 331 |
(push #\- list)) |
|---|
| 332 |
;; if the next character is a hyphen do the same |
|---|
| 333 |
(when (looking-at-p lexer #\-) |
|---|
| 334 |
(push #\- list) |
|---|
| 335 |
(incf (lexer-pos lexer))) |
|---|
| 336 |
(setq hyphen-seen nil)) |
|---|
| 337 |
((#\P #\p) |
|---|
| 338 |
;; maybe a character property |
|---|
| 339 |
(cond ((null *property-resolver*) |
|---|
| 340 |
(handle-char next-char)) |
|---|
| 341 |
(t |
|---|
| 342 |
(push (read-char-property lexer next-char) list) |
|---|
| 343 |
;; if the last character was a hyphen |
|---|
| 344 |
;; just collect it literally |
|---|
| 345 |
(when hyphen-seen |
|---|
| 346 |
(push #\- list)) |
|---|
| 347 |
;; if the next character is a hyphen do the same |
|---|
| 348 |
(when (looking-at-p lexer #\-) |
|---|
| 349 |
(push #\- list) |
|---|
| 350 |
(incf (lexer-pos lexer))) |
|---|
| 351 |
(setq hyphen-seen nil)))) |
|---|
| 352 |
((#\E) |
|---|
| 353 |
;; if \Q quoting is on we ignore \E, |
|---|
| 354 |
;; otherwise it's just a plain #\E |
|---|
| 355 |
(unless *allow-quoting* |
|---|
| 356 |
(handle-char #\E))) |
|---|
| 357 |
(otherwise |
|---|
| 358 |
;; otherwise unescape the following character(s) |
|---|
| 359 |
(decf (lexer-pos lexer)) |
|---|
| 360 |
(handle-char (unescape-char lexer)))))) |
|---|
| 361 |
(first |
|---|
| 362 |
;; the first character must not be a right bracket |
|---|
| 363 |
;; and isn't treated specially if it's a hyphen |
|---|
| 364 |
(handle-char c)) |
|---|
| 365 |
((char= c #\]) |
|---|
| 366 |
;; end of character class |
|---|
| 367 |
;; make sure we collect a pending hyphen |
|---|
| 368 |
(when hyphen-seen |
|---|
| 369 |
(setq hyphen-seen nil) |
|---|
| 370 |
(handle-char #\-)) |
|---|
| 371 |
;; reverse the list to preserve the order intended |
|---|
| 372 |
;; by the author of the regex string |
|---|
| 373 |
(return-from collect-char-class (nreverse list))) |
|---|
| 374 |
((and (char= c #\-) |
|---|
| 375 |
last-char |
|---|
| 376 |
(not hyphen-seen)) |
|---|
| 377 |
;; if the last character was 'just a character' |
|---|
| 378 |
;; we expect to be in the middle of a range |
|---|
| 379 |
(setq hyphen-seen t)) |
|---|
| 380 |
((char= c #\-) |
|---|
| 381 |
;; otherwise this is just an ordinary hyphen |
|---|
| 382 |
(handle-char #\-)) |
|---|
| 383 |
(t |
|---|
| 384 |
;; default case - just collect the character |
|---|
| 385 |
(handle-char c)))) |
|---|
| 386 |
;; we can only exit the loop normally if we've reached the end |
|---|
| 387 |
;; of the regex string without seeing a right bracket |
|---|
| 388 |
(signal-syntax-error* start-pos "Missing right bracket to close character class.")))) |
|---|
| 389 |
|
|---|
| 390 |
(defun maybe-parse-flags (lexer) |
|---|
| 391 |
(declare #.*standard-optimize-settings*) |
|---|
| 392 |
"Reads a sequence of modifiers \(including #\\- to reverse their |
|---|
| 393 |
meaning) and returns a corresponding list of \"flag\" tokens. The |
|---|
| 394 |
\"x\" modifier is treated specially in that it dynamically modifies |
|---|
| 395 |
the behaviour of the lexer itself via the special variable |
|---|
| 396 |
*EXTENDED-MODE-P*." |
|---|
| 397 |
(prog1 |
|---|
| 398 |
(loop with set = t |
|---|
| 399 |
for chr = (next-char-non-extended lexer) |
|---|
| 400 |
unless chr |
|---|
| 401 |
do (signal-syntax-error "Unexpected end of string.") |
|---|
| 402 |
while (find chr "-imsx" :test #'char=) |
|---|
| 403 |
;; the first #\- will invert the meaning of all modifiers |
|---|
| 404 |
;; following it |
|---|
| 405 |
if (char= chr #\-) |
|---|
| 406 |
do (setq set nil) |
|---|
| 407 |
else if (char= chr #\x) |
|---|
| 408 |
do (setq *extended-mode-p* set) |
|---|
| 409 |
else collect (if set |
|---|
| 410 |
(case chr |
|---|
| 411 |
((#\i) |
|---|
| 412 |
:case-insensitive-p) |
|---|
| 413 |
((#\m) |
|---|
| 414 |
:multi-line-mode-p) |
|---|
| 415 |
((#\s) |
|---|
| 416 |
:single-line-mode-p)) |
|---|
| 417 |
(case chr |
|---|
| 418 |
((#\i) |
|---|
| 419 |
:case-sensitive-p) |
|---|
| 420 |
((#\m) |
|---|
| 421 |
:not-multi-line-mode-p) |
|---|
| 422 |
((#\s) |
|---|
| 423 |
:not-single-line-mode-p)))) |
|---|
| 424 |
(decf (lexer-pos lexer)))) |
|---|
| 425 |
|
|---|
| 426 |
(defun get-quantifier (lexer) |
|---|
| 427 |
(declare #.*standard-optimize-settings*) |
|---|
| 428 |
"Returns a list of two values (min max) if what the lexer is looking |
|---|
| 429 |
at can be interpreted as a quantifier. Otherwise returns NIL and |
|---|
| 430 |
resets the lexer to its old position." |
|---|
| 431 |
;; remember starting position for FAIL and UNGET-TOKEN functions |
|---|
| 432 |
(push (lexer-pos lexer) (lexer-last-pos lexer)) |
|---|
| 433 |
(let ((next-char (next-char lexer))) |
|---|
| 434 |
(case next-char |
|---|
| 435 |
((#\*) |
|---|
| 436 |
;; * (Kleene star): match 0 or more times |
|---|
| 437 |
'(0 nil)) |
|---|
| 438 |
((#\+) |
|---|
| 439 |
;; +: match 1 or more times |
|---|
| 440 |
'(1 nil)) |
|---|
| 441 |
((#\?) |
|---|
| 442 |
;; ?: match 0 or 1 times |
|---|
| 443 |
'(0 1)) |
|---|
| 444 |
((#\{) |
|---|
| 445 |
;; one of |
|---|
| 446 |
;; {n}: match exactly n times |
|---|
| 447 |
;; {n,}: match at least n times |
|---|
| 448 |
;; {n,m}: match at least n but not more than m times |
|---|
| 449 |
;; note that anything not matching one of these patterns will |
|---|
| 450 |
;; be interpreted literally - even whitespace isn't allowed |
|---|
| 451 |
(let ((num1 (get-number lexer :no-whitespace-p t))) |
|---|
| 452 |
(if num1 |
|---|
| 453 |
(let ((next-char (next-char-non-extended lexer))) |
|---|
| 454 |
(case next-char |
|---|
| 455 |
((#\,) |
|---|
| 456 |
(let* ((num2 (get-number lexer :no-whitespace-p t)) |
|---|
| 457 |
(next-char (next-char-non-extended lexer))) |
|---|
| 458 |
(case next-char |
|---|
| 459 |
((#\}) |
|---|
| 460 |
;; this is the case {n,} (NUM2 is NIL) or {n,m} |
|---|
| 461 |
(list num1 num2)) |
|---|
| 462 |
(otherwise |
|---|
| 463 |
(fail lexer))))) |
|---|
| 464 |
((#\}) |
|---|
| 465 |
;; this is the case {n} |
|---|
| 466 |
(list num1 num1)) |
|---|
| 467 |
(otherwise |
|---|
| 468 |
(fail lexer)))) |
|---|
| 469 |
;; no number following left curly brace, so we treat it |
|---|
| 470 |
;; like a normal character |
|---|
| 471 |
(fail lexer)))) |
|---|
| 472 |
;; cannot be a quantifier |
|---|
| 473 |
(otherwise |
|---|
| 474 |
(fail lexer))))) |
|---|
| 475 |
|
|---|
| 476 |
(defun parse-register-name-aux (lexer) |
|---|
| 477 |
"Reads and returns the name in a named register group. It is |
|---|
| 478 |
assumed that the starting #\< character has already been read. The |
|---|
| 479 |
closing #\> will also be consumed." |
|---|
| 480 |
;; we have to look for an ending > character now |
|---|
| 481 |
(let ((end-name (position #\> |
|---|
| 482 |
(lexer-str lexer) |
|---|
| 483 |
:start (lexer-pos lexer) |
|---|
| 484 |
:test #'char=))) |
|---|
| 485 |
(unless end-name |
|---|
| 486 |
;; there has to be > somewhere, syntax error otherwise |
|---|
| 487 |
(signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>.")) |
|---|
| 488 |
(let ((name (subseq (lexer-str lexer) |
|---|
| 489 |
(lexer-pos lexer) |
|---|
| 490 |
end-name))) |
|---|
| 491 |
(unless (every #'(lambda (char) |
|---|
| 492 |
(or (alphanumericp char) |
|---|
| 493 |
(char= #\- char))) |
|---|
| 494 |
name) |
|---|
| 495 |
;; register name can contain only alphanumeric characters or #\- |
|---|
| 496 |
(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group.")) |
|---|
| 497 |
;; advance lexer beyond "<name>" part |
|---|
| 498 |
(setf (lexer-pos lexer) (1+ end-name)) |
|---|
| 499 |
name))) |
|---|
| 500 |
|
|---|
| 501 |
(defun get-token (lexer) |
|---|
| 502 |
(declare #.*standard-optimize-settings*) |
|---|
| 503 |
"Returns and consumes the next token from the regex string \(or NIL)." |
|---|
| 504 |
;; remember starting position for UNGET-TOKEN function |
|---|
| 505 |
(push (lexer-pos lexer) |
|---|
| 506 |
(lexer-last-pos lexer)) |
|---|
| 507 |
(let ((next-char (next-char lexer))) |
|---|
| 508 |
(cond (next-char |
|---|
| 509 |
(case next-char |
|---|
| 510 |
;; the easy cases first - the following six characters |
|---|
| 511 |
;; always have a special meaning and get translated |
|---|
| 512 |
;; into tokens immediately |
|---|
| 513 |
((#\)) |
|---|
| 514 |
:close-paren) |
|---|
| 515 |
((#\|) |
|---|
| 516 |
:vertical-bar) |
|---|
| 517 |
((#\?) |
|---|
| 518 |
:question-mark) |
|---|
| 519 |
((#\.) |
|---|
| 520 |
:everything) |
|---|
| 521 |
((#\^) |
|---|
| 522 |
:start-anchor) |
|---|
| 523 |
((#\$) |
|---|
| 524 |
:end-anchor) |
|---|
| 525 |
((#\+ #\*) |
|---|
| 526 |
;; quantifiers will always be consumend by |
|---|
| 527 |
;; GET-QUANTIFIER, they must not appear here |
|---|
| 528 |
(signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char)) |
|---|
| 529 |
((#\{) |
|---|
| 530 |
;; left brace isn't a special character in it's own |
|---|
| 531 |
;; right but we must check if what follows might |
|---|
| 532 |
;; look like a quantifier |
|---|
| 533 |
(let ((this-pos (lexer-pos lexer)) |
|---|
| 534 |
(this-last-pos (lexer-last-pos lexer))) |
|---|
| 535 |
(unget-token lexer) |
|---|
| 536 |
(when (get-quantifier lexer) |
|---|
| 537 |
(signal-syntax-error* (car this-last-pos) |
|---|
| 538 |
"Quantifier '~A' not allowed." |
|---|
| 539 |
(subseq (lexer-str lexer) |
|---|
| 540 |
(car this-last-pos) |
|---|
| 541 |
(lexer-pos lexer)))) |
|---|
| 542 |
(setf (lexer-pos lexer) this-pos |
|---|
| 543 |
(lexer-last-pos lexer) this-last-pos) |
|---|
| 544 |
next-char)) |
|---|
| 545 |
((#\[) |
|---|
| 546 |
;; left bracket always starts a character class |
|---|
| 547 |
(cons (cond ((looking-at-p lexer #\^) |
|---|
| 548 |
(incf (lexer-pos lexer)) |
|---|
| 549 |
:inverted-char-class) |
|---|
| 550 |
(t |
|---|
| 551 |
:char-class)) |
|---|
| 552 |
(collect-char-class lexer))) |
|---|
| 553 |
((#\\) |
|---|
| 554 |
;; backslash might mean different things so we have |
|---|
| 555 |
;; to peek one char ahead: |
|---|
| 556 |
(let ((next-char (next-char-non-extended lexer))) |
|---|
| 557 |
(case next-char |
|---|
| 558 |
((#\A) |
|---|
| 559 |
:modeless-start-anchor) |
|---|
| 560 |
((#\Z) |
|---|
| 561 |
:modeless-end-anchor) |
|---|
| 562 |
((#\z) |
|---|
| 563 |
:modeless-end-anchor-no-newline) |
|---|
| 564 |
((#\b) |
|---|
| 565 |
:word-boundary) |
|---|
| 566 |
((#\B) |
|---|
| 567 |
:non-word-boundary) |
|---|
| 568 |
((#\k) |
|---|
| 569 |
(cond ((and *allow-named-registers* |
|---|
| 570 |
(looking-at-p lexer #\<)) |
|---|
| 571 |
;; back-referencing a named register |
|---|
| 572 |
(incf (lexer-pos lexer)) |
|---|
| 573 |
(list :back-reference |
|---|
| 574 |
(nreverse (parse-register-name-aux lexer)))) |
|---|
| 575 |
(t |
|---|
| 576 |
;; false alarm, just unescape \k |
|---|
| 577 |
#\k))) |
|---|
| 578 |
((#\d #\D #\w #\W #\s #\S) |
|---|
| 579 |
;; these will be treated like character classes |
|---|
| 580 |
(map-char-to-special-char-class next-char)) |
|---|
| 581 |
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) |
|---|
| 582 |
;; uh, a digit... |
|---|
| 583 |
(let* ((old-pos (decf (lexer-pos lexer))) |
|---|
| 584 |
;; ...so let's get the whole number first |
|---|
| 585 |
(backref-number (get-number lexer))) |
|---|
| 586 |
(declare (fixnum backref-number)) |
|---|
| 587 |
(cond ((and (> backref-number (lexer-reg lexer)) |
|---|
| 588 |
(<= 10 backref-number)) |
|---|
| 589 |
;; \10 and higher are treated as octal |
|---|
| 590 |
;; character codes if we haven't |
|---|
| 591 |
;; opened that much register groups |
|---|
| 592 |
;; yet |
|---|
| 593 |
(setf (lexer-pos lexer) old-pos) |
|---|
| 594 |
;; re-read the number from the old |
|---|
| 595 |
;; position and convert it to its |
|---|
| 596 |
;; corresponding character |
|---|
| 597 |
(make-char-from-code (get-number lexer :radix 8 :max-length 3) |
|---|
| 598 |
old-pos)) |
|---|
| 599 |
(t |
|---|
| 600 |
;; otherwise this must refer to a |
|---|
| 601 |
;; backreference |
|---|
| 602 |
(list :back-reference backref-number))))) |
|---|
| 603 |
((#\0) |
|---|
| 604 |
;; this always means an octal character code |
|---|
| 605 |
;; (at most three digits) |
|---|
| 606 |
(let ((old-pos (decf (lexer-pos lexer)))) |
|---|
| 607 |
(make-char-from-code (get-number lexer :radix 8 :max-length 3) |
|---|
| 608 |
old-pos))) |
|---|
| 609 |
((#\P #\p) |
|---|
| 610 |
;; might be a named property |
|---|
| 611 |
(cond (*property-resolver* (read-char-property lexer next-char)) |
|---|
| 612 |
(t next-char))) |
|---|
| 613 |
(otherwise |
|---|
| 614 |
;; in all other cases just unescape the |
|---|
| 615 |
;; character |
|---|
| 616 |
(decf (lexer-pos lexer)) |
|---|
| 617 |
(unescape-char lexer))))) |
|---|
| 618 |
((#\() |
|---|
| 619 |
;; an open parenthesis might mean different things |
|---|
| 620 |
;; depending on what follows... |
|---|
| 621 |
(cond ((looking-at-p lexer #\?) |
|---|
| 622 |
;; this is the case '(?' (and probably more behind) |
|---|
| 623 |
(incf (lexer-pos lexer)) |
|---|
| 624 |
;; we have to check for modifiers first |
|---|
| 625 |
;; because a colon might follow |
|---|
| 626 |
(let* ((flags (maybe-parse-flags lexer)) |
|---|
| 627 |
(next-char (next-char-non-extended lexer))) |
|---|
| 628 |
;; modifiers are only allowed if a colon |
|---|
| 629 |
;; or a closing parenthesis are following |
|---|
| 630 |
(when (and flags |
|---|
| 631 |
(not (find next-char ":)" :test #'char=))) |
|---|
| 632 |
(signal-syntax-error* (car (lexer-last-pos lexer)) |
|---|
| 633 |
"Sequence '~A' not recognized." |
|---|
| 634 |
(subseq (lexer-str lexer) |
|---|
| 635 |
(car (lexer-last-pos lexer)) |
|---|
| 636 |
(lexer-pos lexer)))) |
|---|
| 637 |
(case next-char |
|---|
| 638 |
((nil) |
|---|
| 639 |
;; syntax error |
|---|
| 640 |
(signal-syntax-error "End of string following '(?'.")) |
|---|
| 641 |
((#\)) |
|---|
| 642 |
;; an empty group except for the flags |
|---|
| 643 |
;; (if there are any) |
|---|
| 644 |
(or (and flags |
|---|
| 645 |
(cons :flags flags)) |
|---|
| 646 |
:void)) |
|---|
| 647 |
((#\() |
|---|
| 648 |
;; branch |
|---|
| 649 |
:open-paren-paren) |
|---|
| 650 |
((#\>) |
|---|
| 651 |
;; standalone |
|---|
| 652 |
:open-paren-greater) |
|---|
| 653 |
((#\=) |
|---|
| 654 |
;; positive look-ahead |
|---|
| 655 |
:open-paren-equal) |
|---|
| 656 |
((#\!) |
|---|
| 657 |
;; negative look-ahead |
|---|
| 658 |
:open-paren-exclamation) |
|---|
| 659 |
((#\:) |
|---|
| 660 |
;; non-capturing group - return flags as |
|---|
| 661 |
;; second value |
|---|
| 662 |
(values :open-paren-colon flags)) |
|---|
| 663 |
((#\<) |
|---|
| 664 |
;; might be a look-behind assertion or a named group, so |
|---|
| 665 |
;; check next character |
|---|
| 666 |
(let ((next-char (next-char-non-extended lexer))) |
|---|
| 667 |
(if (alpha-char-p next-char) |
|---|
| 668 |
(progn |
|---|
| 669 |
;; we have encountered a named group |
|---|
| 670 |
;; are we supporting register naming? |
|---|
| 671 |
(unless *allow-named-registers* |
|---|
| 672 |
(signal-syntax-error* (1- (lexer-pos lexer)) |
|---|
| 673 |
"Character '~A' may not follow '(?<'." |
|---|
| 674 |
next-char)) |
|---|
| 675 |
;; put the letter back |
|---|
| 676 |
(decf (lexer-pos lexer)) |
|---|
| 677 |
;; named group |
|---|
| 678 |
:open-paren-less-letter) |
|---|
| 679 |
(case next-char |
|---|
| 680 |
((#\=) |
|---|
| 681 |
;; positive look-behind |
|---|
| 682 |
:open-paren-less-equal) |
|---|
| 683 |
((#\!) |
|---|
| 684 |
;; negative look-behind |
|---|
| 685 |
:open-paren-less-exclamation) |
|---|
| 686 |
((#\)) |
|---|
| 687 |
;; Perl allows "(?<)" and treats |
|---|
| 688 |
;; it like a null string |
|---|
| 689 |
:void) |
|---|
| 690 |
((nil) |
|---|
| 691 |
;; syntax error |
|---|
| 692 |
(signal-syntax-error "End of string following '(?<'.")) |
|---|
| 693 |
(t |
|---|
| 694 |
;; also syntax error |
|---|
| 695 |
(signal-syntax-error* (1- (lexer-pos lexer)) |
|---|
| 696 |
"Character '~A' may not follow '(?<'." |
|---|
| 697 |
next-char )))))) |
|---|
| 698 |
(otherwise |
|---|
| 699 |
(signal-syntax-error* (1- (lexer-pos lexer)) |
|---|
| 700 |
"Character '~A' may not follow '(?'." |
|---|
| 701 |
next-char))))) |
|---|
| 702 |
(t |
|---|
| 703 |
;; if next-char was not #\? (this is within |
|---|
| 704 |
;; the first COND), we've just seen an opening |
|---|
| 705 |
;; parenthesis and leave it like that |
|---|
| 706 |
:open-paren))) |
|---|
| 707 |
(otherwise |
|---|
| 708 |
;; all other characters are their own tokens |
|---|
| 709 |
next-char))) |
|---|
| 710 |
;; we didn't get a character (this if the "else" branch from |
|---|
| 711 |
;; the first IF), so we don't return a token but NIL |
|---|
| 712 |
(t |
|---|
| 713 |
(pop (lexer-last-pos lexer)) |
|---|
| 714 |
nil)))) |
|---|
| 715 |
|
|---|
| 716 |
(declaim (inline unget-token)) |
|---|
| 717 |
(defun unget-token (lexer) |
|---|
| 718 |
(declare #.*standard-optimize-settings*) |
|---|
| 719 |
"Moves the lexer back to the last position stored in the LAST-POS stack." |
|---|
| 720 |
(if (lexer-last-pos lexer) |
|---|
| 721 |
(setf (lexer-pos lexer) |
|---|
| 722 |
(pop (lexer-last-pos lexer))) |
|---|
| 723 |
(error "No token to unget \(this should not happen)"))) |
|---|
| 724 |
|
|---|
| 725 |
(declaim (inline start-of-subexpr-p)) |
|---|
| 726 |
(defun start-of-subexpr-p (lexer) |
|---|
| 727 |
(declare #.*standard-optimize-settings*) |
|---|
| 728 |
"Tests whether the next token can start a valid sub-expression, i.e. |
|---|
| 729 |
a stand-alone regex." |
|---|
| 730 |
(let* ((pos (lexer-pos lexer)) |
|---|
| 731 |
(next-char (next-char lexer))) |
|---|
| 732 |
(not (or (null next-char) |
|---|
| 733 |
(prog1 |
|---|
| 734 |
(member (the character next-char) |
|---|
| 735 |
'(#\) #\|) |
|---|
| 736 |
:test #'char=) |
|---|
| 737 |
(setf (lexer-pos lexer) pos)))))) |
|---|