root/trunk/thirdparty/cl-ppcre/lexer.lisp

Revision 3581, 32.8 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.34 2008/07/06 22:36:30 edi Exp $
3
4 ;;; The lexer's responsibility is to convert the regex string into a
5 ;;; sequence of tokens which are in turn consumed by the parser.
6 ;;;
7 ;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
8 ;;; (with a little help from the parser) how many register groups it
9 ;;; has opened so far.  (The latter is necessary for interpreting
10 ;;; strings like "\\10" correctly.)
11
12 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
13
14 ;;; Redistribution and use in source and binary forms, with or without
15 ;;; modification, are permitted provided that the following conditions
16 ;;; are met:
17
18 ;;;   * Redistributions of source code must retain the above copyright
19 ;;;     notice, this list of conditions and the following disclaimer.
20
21 ;;;   * Redistributions in binary form must reproduce the above
22 ;;;     copyright notice, this list of conditions and the following
23 ;;;     disclaimer in the documentation and/or other materials
24 ;;;     provided with the distribution.
25
26 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
27 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
30 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
32 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
34 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37
38 (in-package :cl-ppcre)
39
40 (declaim (inline map-char-to-special-class))
41 (defun map-char-to-special-char-class (chr)
42   (declare #.*standard-optimize-settings*)
43   "Maps escaped characters like \"\\d\" to the tokens which represent
44 their associated character classes."
45   (case chr
46     ((#\d)
47       :digit-class)
48     ((#\D)
49       :non-digit-class)
50     ((#\w)
51       :word-char-class)
52     ((#\W)
53       :non-word-char-class)
54     ((#\s)
55       :whitespace-char-class)
56     ((#\S)
57       :non-whitespace-char-class)))
58
59 (defstruct (lexer (:constructor make-lexer-internal))
60   "LEXER structures are used to hold the regex string which is
61 currently lexed and to keep track of the lexer's state."
62   (str "" :type string :read-only t)
63   (len 0 :type fixnum :read-only t)
64   (reg 0 :type fixnum)
65   (pos 0 :type fixnum)
66   (last-pos nil :type list))
67
68 (defun make-lexer (string)
69   (declare (inline make-lexer-internal)
70            #-:genera (string string))
71   (make-lexer-internal :str (maybe-coerce-to-simple-string string)
72                        :len (length string)))
73
74 (declaim (inline end-of-string-p))
75 (defun end-of-string-p (lexer)
76   (declare #.*standard-optimize-settings*)
77   "Tests whether we're at the end of the regex string."
78   (<= (lexer-len lexer)
79       (lexer-pos lexer)))
80
81 (declaim (inline looking-at-p))
82 (defun looking-at-p (lexer chr)
83   (declare #.*standard-optimize-settings*)
84   "Tests whether the next character the lexer would see is CHR.
85 Does not respect extended mode."
86   (and (not (end-of-string-p lexer))
87        (char= (schar (lexer-str lexer) (lexer-pos lexer))
88               chr)))
89
90 (declaim (inline next-char-non-extended))
91 (defun next-char-non-extended (lexer)
92   (declare #.*standard-optimize-settings*)
93   "Returns the next character which is to be examined and updates the
94 POS slot. Does not respect extended mode."
95   (cond ((end-of-string-p lexer) nil)
96         (t (prog1
97                (schar (lexer-str lexer) (lexer-pos lexer))
98              (incf (lexer-pos lexer))))))
99
100 (defun next-char (lexer)
101   (declare #.*standard-optimize-settings*)
102   "Returns the next character which is to be examined and updates the
103 POS slot. Respects extended mode, i.e.  whitespace, comments, and also
104 nested comments are skipped if applicable."
105   (let ((next-char (next-char-non-extended lexer))
106         last-loop-pos)
107     (loop
108       ;; remember where we started
109       (setq last-loop-pos (lexer-pos lexer))
110       ;; first we look for nested comments like (?#foo)
111       (when (and next-char
112                  (char= next-char #\()
113                  (looking-at-p lexer #\?))
114         (incf (lexer-pos lexer))
115         (cond ((looking-at-p lexer #\#)
116                 ;; must be a nested comment - so we have to search for
117                 ;; the closing parenthesis
118                 (let ((error-pos (- (lexer-pos lexer) 2)))
119                   (unless
120                       ;; loop 'til ')' or end of regex string and
121                       ;; return NIL if ')' wasn't encountered
122                       (loop for skip-char = next-char
123                             then (next-char-non-extended lexer)
124                             while (and skip-char
125                                        (char/= skip-char #\)))
126                             finally (return skip-char))
127                     (signal-syntax-error* error-pos "Comment group not closed.")))
128                 (setq next-char (next-char-non-extended lexer)))
129               (t
130                 ;; undo effect of previous INCF if we didn't see a #
131                 (decf (lexer-pos lexer)))))
132       (when *extended-mode-p*
133         ;; now - if we're in extended mode - we skip whitespace and
134         ;; comments; repeat the following loop while we look at
135         ;; whitespace or #\#
136         (loop while (and next-char
137                          (or (char= next-char #\#)
138                              (whitespacep next-char)))
139               do (setq next-char
140                          (if (char= next-char #\#)
141                            ;; if we saw a comment marker skip until
142                            ;; we're behind #\Newline...
143                            (loop for skip-char = next-char
144                                  then (next-char-non-extended lexer)
145                                  while (and skip-char
146                                             (char/= skip-char #\Newline))
147                                  finally (return (next-char-non-extended lexer)))
148                            ;; ...otherwise (whitespace) skip until we
149                            ;; see the next non-whitespace character
150                            (loop for skip-char = next-char
151                                  then (next-char-non-extended lexer)
152                                  while (and skip-char
153                                             (whitespacep skip-char))
154                                  finally (return skip-char))))))
155       ;; if the position has moved we have to repeat our tests
156       ;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which
157       ;; would be equivalent to /^a{3}c/ in Perl
158       (unless (> (lexer-pos lexer) last-loop-pos)
159         (return next-char)))))
160
161 (declaim (inline fail))
162 (defun fail (lexer)
163   (declare #.*standard-optimize-settings*)
164   "Moves (LEXER-POS LEXER) back to the last position stored in
165 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
166   (unless (lexer-last-pos lexer)
167     (signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))
168   (setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
169   nil)
170
171 (defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
172   (declare #.*standard-optimize-settings*)
173   "Read and consume the number the lexer is currently looking at and
174 return it. Returns NIL if no number could be identified.
175 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
176 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
177 we don't tolerate whitespace in front of the number."
178   (when (or (end-of-string-p lexer)
179             (and no-whitespace-p
180                  (whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))
181     (return-from get-number nil))
182   (multiple-value-bind (integer new-pos)
183       (parse-integer (lexer-str lexer)
184                      :start (lexer-pos lexer)
185                      :end (if max-length
186                             (let ((end-pos (+ (lexer-pos lexer)
187                                               (the fixnum max-length)))
188                                   (lexer-len (lexer-len lexer)))
189                               (if (< end-pos lexer-len)
190                                 end-pos
191                                 lexer-len))
192                             (lexer-len lexer))
193                      :radix radix
194                      :junk-allowed t)
195     (cond ((and integer (>= (the fixnum integer) 0))
196             (setf (lexer-pos lexer) new-pos)
197             integer)
198           (t nil))))
199
200 (declaim (inline try-number))
201 (defun try-number (lexer &key (radix 10) max-length no-whitespace-p)
202   (declare #.*standard-optimize-settings*)
203   "Like GET-NUMBER but won't consume anything if no number is seen."
204   ;; remember current position
205   (push (lexer-pos lexer) (lexer-last-pos lexer))
206   (let ((number (get-number lexer
207                             :radix radix
208                             :max-length max-length
209                             :no-whitespace-p no-whitespace-p)))
210     (or number (fail lexer))))
211
212 (declaim (inline make-char-from-code))
213 (defun make-char-from-code (number error-pos)
214   (declare #.*standard-optimize-settings*)
215   "Create character from char-code NUMBER. NUMBER can be NIL
216 which is interpreted as 0. ERROR-POS is the position where
217 the corresponding number started within the regex string."
218   ;; only look at rightmost eight bits in compliance with Perl
219   (let ((code (logand #o377 (the fixnum (or number 0)))))
220     (or (and (< code char-code-limit)
221              (code-char code))
222         (signal-syntax-error* error-pos "No character for hex-code ~X." number))))
223
224 (defun unescape-char (lexer)
225   (declare #.*standard-optimize-settings*)
226   "Convert the characters\(s) following a backslash into a token
227 which is returned. This function is to be called when the backslash
228 has already been consumed. Special character classes like \\W are
229 handled elsewhere."
230   (when (end-of-string-p lexer)
231     (signal-syntax-error "String ends with backslash."))
232   (let ((chr (next-char-non-extended lexer)))
233     (case chr
234       ((#\E)
235         ;; if \Q quoting is on this is ignored, otherwise it's just an
236         ;; #\E
237         (if *allow-quoting*
238           :void
239           #\E))
240       ((#\c)
241         ;; \cx means control-x in Perl
242         (let ((next-char (next-char-non-extended lexer)))
243           (unless next-char
244             (signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A."))
245           (code-char (logxor #x40 (char-code (char-upcase next-char))))))
246       ((#\x)
247         ;; \x should be followed by a hexadecimal char code,
248         ;; two digits or less
249         (let* ((error-pos (lexer-pos lexer))
250                (number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))
251           ;; note that it is OK if \x is followed by zero digits
252           (make-char-from-code number error-pos)))
253       ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
254         ;; \x should be followed by an octal char code,
255         ;; three digits or less
256         (let* ((error-pos (decf (lexer-pos lexer)))
257                (number (get-number lexer :radix 8 :max-length 3)))
258           (make-char-from-code number error-pos)))
259       ;; the following five character names are 'semi-standard'
260       ;; according to the CLHS but I'm not aware of any implementation
261       ;; that doesn't implement them
262       ((#\t)
263         #\Tab)
264       ((#\n)
265         #\Newline)
266       ((#\r)
267         #\Return)
268       ((#\f)
269         #\Page)
270       ((#\b)
271         #\Backspace)
272       ((#\a)
273         (code-char 7))                  ; ASCII bell
274       ((#\e)
275         (code-char 27))                 ; ASCII escape
276       (otherwise
277         ;; all other characters aren't affected by a backslash
278         chr))))
279
280 (defun read-char-property (lexer first-char)
281   (declare #.*standard-optimize-settings*)
282   (unless (eql (next-char-non-extended lexer) #\{)
283     (signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))
284   (let ((name (with-output-to-string (out nil :element-type
285                                           #+:lispworks 'lw:simple-char #-:lispworks 'character)
286                   (loop
287                    (let ((char (or (next-char-non-extended lexer)
288                                    (signal-syntax-error "Unexpected EOF after \\~A{." first-char))))
289                      (when (char= char #\})
290                        (return))
291                      (write-char char out))))))
292     (list (if (char= first-char #\p) :property :inverted-property)
293           ;; we must reverse here because of what PARSE-STRING does
294           (nreverse name))))
295
296 (defun collect-char-class (lexer)
297   "Reads and consumes characters from regex string until a right
298 bracket is seen.  Assembles them into a list \(which is returned) of
299 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
300 tokens representing special character classes."
301   (declare #.*standard-optimize-settings*)
302   (let ((start-pos (lexer-pos lexer))         ; remember start for error message
303         hyphen-seen
304         last-char
305         list)
306     (flet ((handle-char (c)
307              "Do the right thing with character C depending on whether
308 we're inside a range or not."
309              (cond ((and hyphen-seen last-char)
310                     (setf (car list) (list :range last-char c)
311                           last-char nil))
312                    (t
313                     (push c list)
314                     (setq last-char c)))
315              (setq hyphen-seen nil)))
316       (loop for first = t then nil
317             for c = (next-char-non-extended lexer)
318             ;; leave loop if at end of string
319             while c
320             do (cond
321                 ((char= c #\\)
322                  ;; we've seen a backslash
323                  (let ((next-char (next-char-non-extended lexer)))
324                    (case next-char
325                      ((#\d #\D #\w #\W #\s #\S)
326                       ;; a special character class
327                       (push (map-char-to-special-char-class next-char) list)
328                       ;; if the last character was a hyphen
329                       ;; just collect it literally
330                       (when hyphen-seen
331                         (push #\- list))
332                       ;; if the next character is a hyphen do the same
333                       (when (looking-at-p lexer #\-)
334                         (push #\- list)
335                         (incf (lexer-pos lexer)))
336                       (setq hyphen-seen nil))
337                      ((#\P #\p)
338                       ;; maybe a character property
339                       (cond ((null *property-resolver*)
340                              (handle-char next-char))
341                             (t
342                              (push (read-char-property lexer next-char) list)
343                              ;; if the last character was a hyphen
344                              ;; just collect it literally
345                              (when hyphen-seen
346                                (push #\- list))
347                              ;; if the next character is a hyphen do the same
348                              (when (looking-at-p lexer #\-)
349                                (push #\- list)
350                                (incf (lexer-pos lexer)))
351                              (setq hyphen-seen nil))))                       
352                      ((#\E)
353                       ;; if \Q quoting is on we ignore \E,
354                       ;; otherwise it's just a plain #\E
355                       (unless *allow-quoting*
356                         (handle-char #\E)))
357                      (otherwise
358                       ;; otherwise unescape the following character(s)
359                       (decf (lexer-pos lexer))
360                       (handle-char (unescape-char lexer))))))
361                 (first
362                  ;; the first character must not be a right bracket
363                  ;; and isn't treated specially if it's a hyphen
364                  (handle-char c))
365                 ((char= c #\])
366                  ;; end of character class
367                  ;; make sure we collect a pending hyphen
368                  (when hyphen-seen
369                    (setq hyphen-seen nil)
370                    (handle-char #\-))
371                  ;; reverse the list to preserve the order intended
372                  ;; by the author of the regex string
373                  (return-from collect-char-class (nreverse list)))
374                 ((and (char= c #\-)
375                       last-char
376                       (not hyphen-seen))
377                  ;; if the last character was 'just a character'
378                  ;; we expect to be in the middle of a range
379                  (setq hyphen-seen t))
380                 ((char= c #\-)
381                  ;; otherwise this is just an ordinary hyphen
382                  (handle-char #\-))
383                 (t
384                  ;; default case - just collect the character
385                  (handle-char c))))
386       ;; we can only exit the loop normally if we've reached the end
387       ;; of the regex string without seeing a right bracket
388       (signal-syntax-error* start-pos "Missing right bracket to close character class."))))
389
390 (defun maybe-parse-flags (lexer)
391   (declare #.*standard-optimize-settings*)
392   "Reads a sequence of modifiers \(including #\\- to reverse their
393 meaning) and returns a corresponding list of \"flag\" tokens.  The
394 \"x\" modifier is treated specially in that it dynamically modifies
395 the behaviour of the lexer itself via the special variable
396 *EXTENDED-MODE-P*."
397   (prog1
398     (loop with set = t
399           for chr = (next-char-non-extended lexer)
400           unless chr
401             do (signal-syntax-error "Unexpected end of string.")
402           while (find chr "-imsx" :test #'char=)
403           ;; the first #\- will invert the meaning of all modifiers
404           ;; following it
405           if (char= chr #\-)
406             do (setq set nil)
407           else if (char= chr #\x)
408             do (setq *extended-mode-p* set)
409           else collect (if set
410                          (case chr
411                            ((#\i)
412                              :case-insensitive-p)
413                            ((#\m)
414                              :multi-line-mode-p)
415                            ((#\s)
416                              :single-line-mode-p))
417                          (case chr
418                            ((#\i)
419                              :case-sensitive-p)
420                            ((#\m)
421                              :not-multi-line-mode-p)
422                            ((#\s)
423                              :not-single-line-mode-p))))
424     (decf (lexer-pos lexer))))
425
426 (defun get-quantifier (lexer)
427   (declare #.*standard-optimize-settings*)
428   "Returns a list of two values (min max) if what the lexer is looking
429 at can be interpreted as a quantifier. Otherwise returns NIL and
430 resets the lexer to its old position."
431   ;; remember starting position for FAIL and UNGET-TOKEN functions
432   (push (lexer-pos lexer) (lexer-last-pos lexer))
433   (let ((next-char (next-char lexer)))
434     (case next-char
435       ((#\*)
436         ;; * (Kleene star): match 0 or more times
437         '(0 nil))
438       ((#\+)
439         ;; +: match 1 or more times
440         '(1 nil))
441       ((#\?)
442         ;; ?: match 0 or 1 times
443         '(0 1))
444       ((#\{)
445         ;; one of
446         ;;   {n}:   match exactly n times
447         ;;   {n,}:  match at least n times
448         ;;   {n,m}: match at least n but not more than m times
449         ;; note that anything not matching one of these patterns will
450         ;; be interpreted literally - even whitespace isn't allowed
451         (let ((num1 (get-number lexer :no-whitespace-p t)))
452           (if num1
453             (let ((next-char (next-char-non-extended lexer)))
454               (case next-char
455                 ((#\,)
456                   (let* ((num2 (get-number lexer :no-whitespace-p t))
457                          (next-char (next-char-non-extended lexer)))
458                     (case next-char
459                       ((#\})
460                         ;; this is the case {n,} (NUM2 is NIL) or {n,m}
461                         (list num1 num2))
462                       (otherwise
463                         (fail lexer)))))
464                 ((#\})
465                   ;; this is the case {n}
466                   (list num1 num1))
467                 (otherwise
468                   (fail lexer))))
469             ;; no number following left curly brace, so we treat it
470             ;; like a normal character
471             (fail lexer))))
472       ;; cannot be a quantifier
473       (otherwise
474         (fail lexer)))))
475
476 (defun parse-register-name-aux (lexer)
477   "Reads and returns the name in a named register group.  It is
478 assumed that the starting #\< character has already been read.  The
479 closing #\> will also be consumed."
480   ;; we have to look for an ending > character now
481   (let ((end-name (position #\>
482                             (lexer-str lexer)
483                             :start (lexer-pos lexer)
484                             :test #'char=)))
485     (unless end-name
486       ;; there has to be > somewhere, syntax error otherwise
487       (signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))
488     (let ((name (subseq (lexer-str lexer)
489                         (lexer-pos lexer)
490                         end-name)))
491       (unless (every #'(lambda (char)
492                          (or (alphanumericp char)
493                              (char= #\- char)))
494                      name)
495         ;; register name can contain only alphanumeric characters or #\-
496         (signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))
497       ;; advance lexer beyond "<name>" part
498       (setf (lexer-pos lexer) (1+ end-name))
499       name)))
500
501 (defun get-token (lexer)
502   (declare #.*standard-optimize-settings*)
503   "Returns and consumes the next token from the regex string \(or NIL)."
504   ;; remember starting position for UNGET-TOKEN function
505   (push (lexer-pos lexer)
506         (lexer-last-pos lexer))
507   (let ((next-char (next-char lexer)))
508     (cond (next-char
509            (case next-char
510              ;; the easy cases first - the following six characters
511              ;; always have a special meaning and get translated
512              ;; into tokens immediately
513              ((#\))
514               :close-paren)
515              ((#\|)
516               :vertical-bar)
517              ((#\?)
518               :question-mark)
519              ((#\.)
520               :everything)
521              ((#\^)
522               :start-anchor)
523              ((#\$)
524               :end-anchor)
525              ((#\+ #\*)
526               ;; quantifiers will always be consumend by
527               ;; GET-QUANTIFIER, they must not appear here
528               (signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))
529              ((#\{)
530               ;; left brace isn't a special character in it's own
531               ;; right but we must check if what follows might
532               ;; look like a quantifier
533               (let ((this-pos (lexer-pos lexer))
534                     (this-last-pos (lexer-last-pos lexer)))
535                 (unget-token lexer)
536                 (when (get-quantifier lexer)
537                   (signal-syntax-error* (car this-last-pos)
538                                         "Quantifier '~A' not allowed."
539                                         (subseq (lexer-str lexer)
540                                                 (car this-last-pos)
541                                                 (lexer-pos lexer))))
542                 (setf (lexer-pos lexer) this-pos
543                       (lexer-last-pos lexer) this-last-pos)
544                 next-char))
545              ((#\[)
546               ;; left bracket always starts a character class
547               (cons  (cond ((looking-at-p lexer #\^)
548                             (incf (lexer-pos lexer))
549                             :inverted-char-class)
550                            (t
551                             :char-class))
552                      (collect-char-class lexer)))
553              ((#\\)
554               ;; backslash might mean different things so we have
555               ;; to peek one char ahead:
556               (let ((next-char (next-char-non-extended lexer)))
557                 (case next-char
558                   ((#\A)
559                    :modeless-start-anchor)
560                   ((#\Z)
561                    :modeless-end-anchor)
562                   ((#\z)
563                    :modeless-end-anchor-no-newline)
564                   ((#\b)
565                    :word-boundary)
566                   ((#\B)
567                    :non-word-boundary)
568                   ((#\k)
569                    (cond ((and *allow-named-registers*
570                                (looking-at-p lexer #\<))
571                           ;; back-referencing a named register
572                           (incf (lexer-pos lexer))
573                           (list :back-reference
574                                 (nreverse (parse-register-name-aux lexer))))
575                          (t
576                           ;; false alarm, just unescape \k
577                           #\k)))
578                   ((#\d #\D #\w #\W #\s #\S)
579                    ;; these will be treated like character classes
580                    (map-char-to-special-char-class next-char))
581                   ((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
582                    ;; uh, a digit...
583                    (let* ((old-pos (decf (lexer-pos lexer)))
584                           ;; ...so let's get the whole number first
585                           (backref-number (get-number lexer)))
586                      (declare (fixnum backref-number))
587                      (cond ((and (> backref-number (lexer-reg lexer))
588                                  (<= 10 backref-number))
589                             ;; \10 and higher are treated as octal
590                             ;; character codes if we haven't
591                             ;; opened that much register groups
592                             ;; yet
593                             (setf (lexer-pos lexer) old-pos)
594                             ;; re-read the number from the old
595                             ;; position and convert it to its
596                             ;; corresponding character
597                             (make-char-from-code (get-number lexer :radix 8 :max-length 3)
598                                                  old-pos))
599                            (t
600                             ;; otherwise this must refer to a
601                             ;; backreference
602                             (list :back-reference backref-number)))))
603                   ((#\0)
604                    ;; this always means an octal character code
605                    ;; (at most three digits)
606                    (let ((old-pos (decf (lexer-pos lexer))))
607                      (make-char-from-code (get-number lexer :radix 8 :max-length 3)
608                                           old-pos)))
609                   ((#\P #\p)
610                    ;; might be a named property
611                    (cond (*property-resolver* (read-char-property lexer next-char))
612                          (t next-char)))
613                   (otherwise
614                    ;; in all other cases just unescape the
615                    ;; character
616                    (decf (lexer-pos lexer))
617                    (unescape-char lexer)))))
618              ((#\()
619               ;; an open parenthesis might mean different things
620               ;; depending on what follows...
621               (cond ((looking-at-p lexer #\?)
622                      ;; this is the case '(?' (and probably more behind)
623                      (incf (lexer-pos lexer))
624                      ;; we have to check for modifiers first
625                      ;; because a colon might follow
626                      (let* ((flags (maybe-parse-flags lexer))
627                             (next-char (next-char-non-extended lexer)))
628                        ;; modifiers are only allowed if a colon
629                        ;; or a closing parenthesis are following
630                        (when (and flags
631                                   (not (find next-char ":)" :test #'char=)))
632                          (signal-syntax-error* (car (lexer-last-pos lexer))
633                                                "Sequence '~A' not recognized."
634                                                (subseq (lexer-str lexer)
635                                                        (car (lexer-last-pos lexer))
636                                                        (lexer-pos lexer))))
637                        (case next-char
638                          ((nil)
639                           ;; syntax error
640                           (signal-syntax-error "End of string following '(?'."))
641                          ((#\))
642                           ;; an empty group except for the flags
643                           ;; (if there are any)
644                           (or (and flags
645                                    (cons :flags flags))
646                               :void))
647                          ((#\()
648                           ;; branch
649                           :open-paren-paren)
650                          ((#\>)
651                           ;; standalone
652                           :open-paren-greater)
653                          ((#\=)
654                           ;; positive look-ahead
655                           :open-paren-equal)
656                          ((#\!)
657                           ;; negative look-ahead
658                           :open-paren-exclamation)
659                          ((#\:)
660                           ;; non-capturing group - return flags as
661                           ;; second value
662                           (values :open-paren-colon flags))
663                          ((#\<)
664                           ;; might be a look-behind assertion or a named group, so
665                           ;; check next character
666                           (let ((next-char (next-char-non-extended lexer)))
667                             (if (alpha-char-p next-char)
668                                 (progn
669                                   ;; we have encountered a named group
670                                   ;; are we supporting register naming?
671                                   (unless *allow-named-registers*
672                                     (signal-syntax-error* (1- (lexer-pos lexer))
673                                                           "Character '~A' may not follow '(?<'."
674                                                           next-char))
675                                   ;; put the letter back
676                                   (decf (lexer-pos lexer))
677                                   ;; named group
678                                   :open-paren-less-letter)
679                                 (case next-char
680                                   ((#\=)
681                                    ;; positive look-behind
682                                    :open-paren-less-equal)
683                                   ((#\!)
684                                    ;; negative look-behind
685                                    :open-paren-less-exclamation)
686                                   ((#\))
687                                    ;; Perl allows "(?<)" and treats
688                                    ;; it like a null string
689                                    :void)
690                                   ((nil)
691                                    ;; syntax error
692                                    (signal-syntax-error "End of string following '(?<'."))
693                                   (t
694                                    ;; also syntax error
695                                    (signal-syntax-error* (1- (lexer-pos lexer))
696                                                          "Character '~A' may not follow '(?<'."
697                                                          next-char ))))))
698                          (otherwise
699                           (signal-syntax-error* (1- (lexer-pos lexer))
700                                                 "Character '~A' may not follow '(?'."
701                                                 next-char)))))
702                     (t
703                      ;; if next-char was not #\? (this is within
704                      ;; the first COND), we've just seen an opening
705                      ;; parenthesis and leave it like that
706                      :open-paren)))
707              (otherwise
708               ;; all other characters are their own tokens
709               next-char)))
710           ;; we didn't get a character (this if the "else" branch from
711           ;; the first IF), so we don't return a token but NIL
712           (t
713            (pop (lexer-last-pos lexer))
714            nil))))
715
716 (declaim (inline unget-token))
717 (defun unget-token (lexer)
718   (declare #.*standard-optimize-settings*)
719   "Moves the lexer back to the last position stored in the LAST-POS stack."
720   (if (lexer-last-pos lexer)
721     (setf (lexer-pos lexer)
722             (pop (lexer-last-pos lexer)))
723     (error "No token to unget \(this should not happen)")))
724
725 (declaim (inline start-of-subexpr-p))
726 (defun start-of-subexpr-p (lexer)
727   (declare #.*standard-optimize-settings*)
728   "Tests whether the next token can start a valid sub-expression, i.e.
729 a stand-alone regex."
730   (let* ((pos (lexer-pos lexer))
731          (next-char (next-char lexer)))
732     (not (or (null next-char)
733              (prog1
734                (member (the character next-char)
735                        '(#\) #\|)
736                        :test #'char=)
737                (setf (lexer-pos lexer) pos))))))
Note: See TracBrowser for help on using the browser.