root/trunk/thirdparty/cl-ppcre/convert.lisp

Revision 3581, 39.7 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.54 2008/07/23 02:14:06 edi Exp $
3
4 ;;; Here the parse tree is converted into its internal representation
5 ;;; using REGEX objects.  At the same time some optimizations are
6 ;;; already applied.
7
8 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
9
10 ;;; Redistribution and use in source and binary forms, with or without
11 ;;; modification, are permitted provided that the following conditions
12 ;;; are met:
13
14 ;;;   * Redistributions of source code must retain the above copyright
15 ;;;     notice, this list of conditions and the following disclaimer.
16
17 ;;;   * Redistributions in binary form must reproduce the above
18 ;;;     copyright notice, this list of conditions and the following
19 ;;;     disclaimer in the documentation and/or other materials
20 ;;;     provided with the distribution.
21
22 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
23 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
26 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
28 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
30 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34 (in-package :cl-ppcre)
35
36 ;;; The flags that represent the "ism" modifiers are always kept
37 ;;; together in a three-element list. We use the following macros to
38 ;;; access individual elements.
39
40 (defmacro case-insensitive-mode-p (flags)
41   "Accessor macro to extract the first flag out of a three-element flag list."
42   `(first ,flags))
43
44 (defmacro multi-line-mode-p (flags)
45   "Accessor macro to extract the second flag out of a three-element flag list."
46   `(second ,flags))
47
48 (defmacro single-line-mode-p (flags)
49   "Accessor macro to extract the third flag out of a three-element flag list."
50   `(third ,flags))
51
52 (defun set-flag (token)
53   "Reads a flag token and sets or unsets the corresponding entry in
54 the special FLAGS list."
55   (declare #.*standard-optimize-settings*)
56   (declare (special flags))
57   (case token
58     ((:case-insensitive-p)
59       (setf (case-insensitive-mode-p flags) t))
60     ((:case-sensitive-p)
61       (setf (case-insensitive-mode-p flags) nil))
62     ((:multi-line-mode-p)
63       (setf (multi-line-mode-p flags) t))
64     ((:not-multi-line-mode-p)
65       (setf (multi-line-mode-p flags) nil))
66     ((:single-line-mode-p)
67       (setf (single-line-mode-p flags) t))
68     ((:not-single-line-mode-p)
69       (setf (single-line-mode-p flags) nil))
70     (otherwise
71       (signal-syntax-error "Unknown flag token ~A." token))))
72
73 (defgeneric resolve-property (property)
74   (:documentation "Resolves PROPERTY to a unary character test
75 function.  PROPERTY can either be a function designator or it can be a
76 string which is resolved using *PROPERTY-RESOLVER*.")
77   (:method ((property-name string))
78    (funcall *property-resolver* property-name))
79   (:method ((function-name symbol))
80    function-name)
81   (:method ((test-function function))
82    test-function))
83
84 (defun convert-char-class-to-test-function (list invertedp case-insensitive-p)
85   "Combines all items in LIST into test function and returns a
86 logical-OR combination of these functions.  Items can be single
87 characters, character ranges like \(:RANGE #\\A #\\E), or special
88 character classes like :DIGIT-CLASS.  Does the right thing with
89 respect to case-\(in)sensitivity as specified by the special variable
90 FLAGS."
91   (declare #.*standard-optimize-settings*)
92   (declare (special flags))
93   (let ((test-functions
94          (loop for item in list
95                collect (cond ((characterp item)
96                               ;; rebind so closure captures the right one
97                               (let ((this-char item))
98                                 (lambda (char)
99                                   (declare (character char this-char))
100                                   (char= char this-char))))
101                              ((symbolp item)
102                               (case item
103                                 ((:digit-class) #'digit-char-p)
104                                 ((:non-digit-class) (complement* #'digit-char-p))
105                                 ((:whitespace-char-class) #'whitespacep)
106                                 ((:non-whitespace-char-class) (complement* #'whitespacep))
107                                 ((:word-char-class) #'word-char-p)
108                                 ((:non-word-char-class) (complement* #'word-char-p))
109                                 (otherwise
110                                  (signal-syntax-error "Unknown symbol ~A in character class." item))))
111                              ((and (consp item)
112                                    (eq (first item) :property))
113                               (resolve-property (second item)))
114                              ((and (consp item)
115                                    (eq (first item) :inverted-property))
116                               (complement* (resolve-property (second item))))
117                              ((and (consp item)
118                                    (eq (first item) :range))
119                               (let ((from (second item))
120                                     (to (third item)))
121                                 (when (char> from to)
122                                   (signal-syntax-error "Invalid range from ~S to ~S in char-class." from to))
123                                 (lambda (char)
124                                   (declare (character char from to))
125                                   (char<= from char to))))
126                              (t (signal-syntax-error "Unknown item ~A in char-class list." item))))))
127     (unless test-functions
128       (signal-syntax-error "Empty character class."))
129     (cond ((cdr test-functions)           
130            (cond ((and invertedp case-insensitive-p)
131                   (lambda (char)
132                     (declare (character char))
133                     (loop with both-case-p = (both-case-p char)
134                           with char-down = (if both-case-p (char-downcase char) char)
135                           with char-up = (if both-case-p (char-upcase char) nil)
136                           for test-function in test-functions
137                           never (or (funcall test-function char-down)
138                                     (and char-up (funcall test-function char-up))))))
139                  (case-insensitive-p
140                   (lambda (char)
141                     (declare (character char))
142                     (loop with both-case-p = (both-case-p char)
143                           with char-down = (if both-case-p (char-downcase char) char)
144                           with char-up = (if both-case-p (char-upcase char) nil)
145                           for test-function in test-functions
146                           thereis (or (funcall test-function char-down)
147                                       (and char-up (funcall test-function char-up))))))
148                  (invertedp
149                   (lambda (char)
150                     (loop for test-function in test-functions
151                           never (funcall test-function char))))
152                  (t
153                   (lambda (char)
154                     (loop for test-function in test-functions
155                           thereis (funcall test-function char))))))
156           ;; there's only one test-function
157           (t (let ((test-function (first test-functions)))
158                (cond ((and invertedp case-insensitive-p)
159                       (lambda (char)
160                         (declare (character char))
161                         (not (or (funcall test-function (char-downcase char))
162                                  (and (both-case-p char)
163                                       (funcall test-function (char-upcase char)))))))
164                      (case-insensitive-p
165                       (lambda (char)
166                         (declare (character char))
167                         (or (funcall test-function (char-downcase char))
168                             (and (both-case-p char)
169                                  (funcall test-function (char-upcase char))))))
170                      (invertedp (complement* test-function))
171                      (t test-function)))))))
172
173 (defun maybe-split-repetition (regex
174                                greedyp
175                                minimum
176                                maximum
177                                min-len
178                                length
179                                reg-seen)
180   "Splits a REPETITION object into a constant and a varying part if
181 applicable, i.e. something like
182   a{3,} -> a{3}a*
183 The arguments to this function correspond to the REPETITION slots of
184 the same name."
185   (declare #.*standard-optimize-settings*)
186   (declare (fixnum minimum)
187            ((or fixnum null) maximum))
188   ;; note the usage of COPY-REGEX here; we can't use the same REGEX
189   ;; object in both REPETITIONS because they will have different
190   ;; offsets
191   (when maximum
192     (when (zerop maximum)
193       ;; trivial case: don't repeat at all
194       (return-from maybe-split-repetition
195         (make-instance 'void)))
196     (when (= 1 minimum maximum)
197       ;; another trivial case: "repeat" exactly once
198       (return-from maybe-split-repetition
199         regex)))
200   ;; first set up the constant part of the repetition
201   ;; maybe that's all we need
202   (let ((constant-repetition (if (plusp minimum)
203                                (make-instance 'repetition
204                                               :regex (copy-regex regex)
205                                               :greedyp greedyp
206                                               :minimum minimum
207                                               :maximum minimum
208                                               :min-len min-len
209                                               :len length
210                                               :contains-register-p reg-seen)
211                                ;; don't create garbage if minimum is 0
212                                nil)))
213     (when (and maximum
214                (= maximum minimum))
215       (return-from maybe-split-repetition
216         ;; no varying part needed because min = max
217         constant-repetition))
218     ;; now construct the varying part
219     (let ((varying-repetition
220             (make-instance 'repetition
221                            :regex regex
222                            :greedyp greedyp
223                            :minimum 0
224                            :maximum (if maximum (- maximum minimum) nil)
225                            :min-len min-len
226                            :len length
227                            :contains-register-p reg-seen)))
228       (cond ((zerop minimum)
229               ;; min = 0, no constant part needed
230               varying-repetition)
231             ((= 1 minimum)
232               ;; min = 1, constant part needs no REPETITION wrapped around
233               (make-instance 'seq
234                              :elements (list (copy-regex regex)
235                                              varying-repetition)))
236             (t
237               ;; general case
238               (make-instance 'seq
239                              :elements (list constant-repetition
240                                              varying-repetition)))))))
241
242 ;; During the conversion of the parse tree we keep track of the start
243 ;; of the parse tree in the special variable STARTS-WITH which'll
244 ;; either hold a STR object or an EVERYTHING object. The latter is the
245 ;; case if the regex starts with ".*" which implicitly anchors the
246 ;; regex at the start (perhaps modulo #\Newline).
247
248 (defun maybe-accumulate (str)
249   "Accumulate STR into the special variable STARTS-WITH if
250 ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
251 NIL or a STR object of the same case mode. Always returns NIL."
252   (declare #.*standard-optimize-settings*)
253   (declare (special accumulate-start-p starts-with))
254   (declare (ftype (function (t) fixnum) len))
255   (when accumulate-start-p
256     (etypecase starts-with
257       (str
258         ;; STARTS-WITH already holds a STR, so we check if we can
259         ;; concatenate
260         (cond ((eq (case-insensitive-p starts-with)
261                    (case-insensitive-p str))
262                 ;; we modify STARTS-WITH in place
263                 (setf (len starts-with)
264                         (+ (len starts-with) (len str)))
265                 ;; note that we use SLOT-VALUE because the accessor
266                 ;; STR has a declared FTYPE which doesn't fit here
267                 (adjust-array (slot-value starts-with 'str)
268                               (len starts-with)
269                               :fill-pointer t)
270                 (setf (subseq (slot-value starts-with 'str)
271                               (- (len starts-with) (len str)))
272                         (str str)
273                       ;; STR objects that are parts of STARTS-WITH
274                       ;; always have their SKIP slot set to true
275                       ;; because the SCAN function will take care of
276                       ;; them, i.e. the matcher can ignore them
277                       (skip str) t))
278               (t (setq accumulate-start-p nil))))
279       (null
280         ;; STARTS-WITH is still empty, so we create a new STR object
281         (setf starts-with
282                 (make-instance 'str
283                                :str ""
284                                :case-insensitive-p (case-insensitive-p str))
285               ;; INITIALIZE-INSTANCE will coerce the STR to a simple
286               ;; string, so we have to fill it afterwards
287               (slot-value starts-with 'str)
288                 (make-array (len str)
289                             :initial-contents (str str)
290                             :element-type 'character
291                             :fill-pointer t
292                             :adjustable t)
293               (len starts-with)
294                 (len str)
295               ;; see remark about SKIP above
296               (skip str) t))
297       (everything
298         ;; STARTS-WITH already holds an EVERYTHING object - we can't
299         ;; concatenate
300         (setq accumulate-start-p nil))))
301   nil)
302
303 (declaim (inline convert-aux))
304 (defun convert-aux (parse-tree)
305   "Converts the parse tree PARSE-TREE into a REGEX object and returns
306 it.  Will also
307
308   - split and optimize repetitions,
309   - accumulate strings or EVERYTHING objects into the special variable
310     STARTS-WITH,
311   - keep track of all registers seen in the special variable REG-NUM,
312   - keep track of all named registers seen in the special variable REG-NAMES
313   - keep track of the highest backreference seen in the special
314     variable MAX-BACK-REF,
315   - maintain and adher to the currently applicable modifiers in the special
316     variable FLAGS, and
317   - maybe even wash your car..."
318   (declare #.*standard-optimize-settings*)
319   (if (consp parse-tree)
320     (convert-compound-parse-tree (first parse-tree) parse-tree)
321     (convert-simple-parse-tree parse-tree)))
322
323 (defgeneric convert-compound-parse-tree (token parse-tree &key)
324   (declare #.*standard-optimize-settings*)
325   (:documentation "Helper function for CONVERT-AUX which converts
326 parse trees which are conses and dispatches on TOKEN which is the
327 first element of the parse tree.")
328   (:method (token parse-tree &key)
329    (signal-syntax-error "Unknown token ~A in parse-tree." token)))
330
331 (defmethod convert-compound-parse-tree ((token (eql :sequence)) parse-tree &key)
332   "The case for parse trees like \(:SEQUENCE {<regex>}*)."
333   (declare #.*standard-optimize-settings*)
334   (cond ((cddr parse-tree)
335          ;; this is essentially like
336          ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
337          ;; but we don't cons a new list
338          (loop for parse-tree-rest on (rest parse-tree)
339                while parse-tree-rest
340                do (setf (car parse-tree-rest)
341                         (convert-aux (car parse-tree-rest))))
342          (make-instance 'seq :elements (rest parse-tree)))
343         (t (convert-aux (second parse-tree)))))
344
345 (defmethod convert-compound-parse-tree ((token (eql :group)) parse-tree &key)
346   "The case for parse trees like \(:GROUP {<regex>}*).
347
348 This is a syntactical construct equivalent to :SEQUENCE intended to
349 keep the effect of modifiers local."
350   (declare #.*standard-optimize-settings*)
351   (declare (special flags))
352   ;; make a local copy of FLAGS and shadow the global value while we
353   ;; descend into the enclosed regexes
354   (let ((flags (copy-list flags)))
355     (declare (special flags))
356     (cond ((cddr parse-tree)
357            (loop for parse-tree-rest on (rest parse-tree)
358                  while parse-tree-rest
359                  do (setf (car parse-tree-rest)
360                           (convert-aux (car parse-tree-rest))))
361            (make-instance 'seq :elements (rest parse-tree)))
362           (t (convert-aux (second parse-tree))))))
363
364 (defmethod convert-compound-parse-tree ((token (eql :alternation)) parse-tree &key)
365   "The case for \(:ALTERNATION {<regex>}*)."
366   (declare #.*standard-optimize-settings*)
367   (declare (special accumulate-start-p))
368   ;; we must stop accumulating objects into STARTS-WITH once we reach
369   ;; an alternation
370   (setq accumulate-start-p nil)
371   (loop for parse-tree-rest on (rest parse-tree)
372         while parse-tree-rest
373         do (setf (car parse-tree-rest)
374                  (convert-aux (car parse-tree-rest))))
375   (make-instance 'alternation :choices (rest parse-tree)))
376
377 (defmethod convert-compound-parse-tree ((token (eql :branch)) parse-tree &key)
378   "The case for \(:BRANCH <test> <regex>).
379
380 Here, <test> must be look-ahead, look-behind or number; if <regex> is
381 an alternation it must have one or two choices."
382   (declare #.*standard-optimize-settings*)
383   (declare (special accumulate-start-p))
384   (setq accumulate-start-p nil)
385   (let* ((test-candidate (second parse-tree))
386          (test (cond ((numberp test-candidate)
387                       (when (zerop (the fixnum test-candidate))
388                         (signal-syntax-error "Register 0 doesn't exist: ~S." parse-tree))
389                       (1- (the fixnum test-candidate)))
390                      (t (convert-aux test-candidate))))
391          (alternations (convert-aux (third parse-tree))))
392     (when (and (not (numberp test))
393                (not (typep test 'lookahead))
394                (not (typep test 'lookbehind)))
395       (signal-syntax-error "Branch test must be look-ahead, look-behind or number: ~S." parse-tree))
396     (typecase alternations
397       (alternation
398        (case (length (choices alternations))
399          ((0)
400           (signal-syntax-error "No choices in branch: ~S." parse-tree))
401          ((1)
402           (make-instance 'branch
403                          :test test
404                          :then-regex (first
405                                       (choices alternations))))
406          ((2)
407           (make-instance 'branch
408                          :test test
409                          :then-regex (first
410                                       (choices alternations))
411                          :else-regex (second
412                                       (choices alternations))))
413          (otherwise
414           (signal-syntax-error "Too much choices in branch: ~S." parse-tree))))
415       (t
416        (make-instance 'branch
417                       :test test
418                       :then-regex alternations)))))
419
420 (defmethod convert-compound-parse-tree ((token (eql :positive-lookahead)) parse-tree &key)
421   "The case for \(:POSITIVE-LOOKAHEAD <regex>)."
422   (declare #.*standard-optimize-settings*)
423   (declare (special flags accumulate-start-p))
424   ;; keep the effect of modifiers local to the enclosed regex and stop
425   ;; accumulating into STARTS-WITH
426   (setq accumulate-start-p nil)
427   (let ((flags (copy-list flags)))
428     (declare (special flags))
429     (make-instance 'lookahead
430                    :regex (convert-aux (second parse-tree))
431                    :positivep t)))
432
433 (defmethod convert-compound-parse-tree ((token (eql :negative-lookahead)) parse-tree &key)
434   "The case for \(:NEGATIVE-LOOKAHEAD <regex>)."
435   (declare #.*standard-optimize-settings*)
436   ;; do the same as for positive look-aheads and just switch afterwards
437   (let ((regex (convert-compound-parse-tree :positive-lookahead parse-tree)))
438     (setf (slot-value regex 'positivep) nil)
439     regex))
440
441 (defmethod convert-compound-parse-tree ((token (eql :positive-lookbehind)) parse-tree &key)
442   "The case for \(:POSITIVE-LOOKBEHIND <regex>)."
443   (declare #.*standard-optimize-settings*)
444   (declare (special flags accumulate-start-p))
445   ;; keep the effect of modifiers local to the enclosed regex and stop
446   ;; accumulating into STARTS-WITH
447   (setq accumulate-start-p nil)
448   (let* ((flags (copy-list flags))
449          (regex (convert-aux (second parse-tree)))
450          (len (regex-length regex)))
451     (declare (special flags))
452     ;; lookbehind assertions must be of fixed length
453     (unless len
454       (signal-syntax-error "Variable length look-behind not implemented \(yet): ~S." parse-tree))
455     (make-instance 'lookbehind
456                    :regex regex
457                    :positivep t
458                    :len len)))
459
460 (defmethod convert-compound-parse-tree ((token (eql :negative-lookbehind)) parse-tree &key)
461   "The case for \(:NEGATIVE-LOOKBEHIND <regex>)."
462   (declare #.*standard-optimize-settings*)
463   ;; do the same as for positive look-behinds and just switch afterwards
464   (let ((regex (convert-compound-parse-tree :positive-lookbehind parse-tree)))
465     (setf (slot-value regex 'positivep) nil)
466     regex))
467
468 (defmethod convert-compound-parse-tree ((token (eql :greedy-repetition)) parse-tree &key (greedyp t))
469   "The case for \(:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>).
470
471 This function is also used for the non-greedy case in which case it is
472 called with GREEDYP set to NIL as you would expect."
473   (declare #.*standard-optimize-settings*)
474   (declare (special accumulate-start-p starts-with))
475   ;; remember the value of ACCUMULATE-START-P upon entering
476   (let ((local-accumulate-start-p accumulate-start-p))
477     (let ((minimum (second parse-tree))
478           (maximum (third parse-tree)))
479       (declare (fixnum minimum))
480       (declare (type (or null fixnum) maximum))
481       (unless (and maximum
482                    (= 1 minimum maximum))
483         ;; set ACCUMULATE-START-P to NIL for the rest of
484         ;; the conversion because we can't continue to
485         ;; accumulate inside as well as after a proper
486         ;; repetition
487         (setq accumulate-start-p nil))
488       (let* (reg-seen
489              (regex (convert-aux (fourth parse-tree)))
490              (min-len (regex-min-length regex))
491              (length (regex-length regex)))
492         ;; note that this declaration already applies to
493         ;; the call to CONVERT-AUX above
494         (declare (special reg-seen))
495         (when (and local-accumulate-start-p
496                    (not starts-with)
497                    (zerop minimum)
498                    (not maximum))
499           ;; if this repetition is (equivalent to) ".*"
500           ;; and if we're at the start of the regex we
501           ;; remember it for ADVANCE-FN (see the SCAN
502           ;; function)
503           (setq starts-with (everythingp regex)))
504         (if (or (not reg-seen)
505                 (not greedyp)
506                 (not length)
507                 (zerop length)
508                 (and maximum (= minimum maximum)))
509           ;; the repetition doesn't enclose a register, or
510           ;; it's not greedy, or we can't determine it's
511           ;; (inner) length, or the length is zero, or the
512           ;; number of repetitions is fixed; in all of
513           ;; these cases we don't bother to optimize
514           (maybe-split-repetition regex
515                                   greedyp
516                                   minimum
517                                   maximum
518                                   min-len
519                                   length
520                                   reg-seen)
521           ;; otherwise we make a transformation that looks
522           ;; roughly like one of
523           ;;   <regex>* -> (?:<regex'>*<regex>)?
524           ;;   <regex>+ -> <regex'>*<regex>
525           ;; where the trick is that as much as possible
526           ;; registers from <regex> are removed in
527           ;; <regex'>
528           (let* (reg-seen ; new instance for REMOVE-REGISTERS
529                  (remove-registers-p t)
530                  (inner-regex (remove-registers regex))
531                  (inner-repetition
532                   ;; this is the "<regex'>" part
533                   (maybe-split-repetition inner-regex
534                                           ;; always greedy
535                                           t
536                                           ;; reduce minimum by 1
537                                           ;; unless it's already 0
538                                           (if (zerop minimum)
539                                             0
540                                             (1- minimum))
541                                           ;; reduce maximum by 1
542                                           ;; unless it's NIL
543                                           (and maximum
544                                                (1- maximum))
545                                           min-len
546                                           length
547                                           reg-seen))
548                  (inner-seq
549                   ;; this is the "<regex'>*<regex>" part
550                   (make-instance 'seq
551                                  :elements (list inner-repetition
552                                                  regex))))
553             ;; note that this declaration already applies
554             ;; to the call to REMOVE-REGISTERS above
555             (declare (special remove-registers-p reg-seen))
556             ;; wrap INNER-SEQ with a greedy
557             ;; {0,1}-repetition (i.e. "?") if necessary
558             (if (plusp minimum)
559               inner-seq
560               (maybe-split-repetition inner-seq
561                                       t
562                                       0
563                                       1
564                                       min-len
565                                       nil
566                                       t))))))))
567
568 (defmethod convert-compound-parse-tree ((token (eql :non-greedy-repetition)) parse-tree &key)
569   "The case for \(:NON-GREEDY-REPETITION <min> <max> <regex>)."
570   (declare #.*standard-optimize-settings*)
571   ;; just dispatch to the method above with GREEDYP explicitly set to NIL
572   (convert-compound-parse-tree :greedy-repetition parse-tree :greedyp nil))
573
574 (defmethod convert-compound-parse-tree ((token (eql :register)) parse-tree &key name)
575   "The case for \(:REGISTER <regex>).  Also used for named registers
576 when NAME is not NIL."
577   (declare #.*standard-optimize-settings*)
578   (declare (special flags reg-num reg-names))
579   ;; keep the effect of modifiers local to the enclosed regex; also,
580   ;; assign the current value of REG-NUM to the corresponding slot of
581   ;; the REGISTER object and increase this counter afterwards; for
582   ;; named register update REG-NAMES and set the corresponding name
583   ;; slot of the REGISTER object too
584   (let ((flags (copy-list flags))
585         (stored-reg-num reg-num))
586     (declare (special flags reg-seen named-reg-seen))
587     (setq reg-seen t)
588     (when name (setq named-reg-seen t))
589     (incf (the fixnum reg-num))
590     (push name reg-names)
591     (make-instance 'register
592                    :regex (convert-aux (if name (third parse-tree) (second parse-tree)))
593                    :num stored-reg-num
594                    :name name)))
595
596 (defmethod convert-compound-parse-tree ((token (eql :named-register)) parse-tree &key)
597   "The case for \(:NAMED-REGISTER <regex>)."
598   (declare #.*standard-optimize-settings*)
599   ;; call the method above and use the :NAME keyword argument
600   (convert-compound-parse-tree :register parse-tree :name (copy-seq (second parse-tree))))
601
602 (defmethod convert-compound-parse-tree ((token (eql :filter)) parse-tree &key)
603   "The case for \(:FILTER <function> &optional <length>)."
604   (declare #.*standard-optimize-settings*)
605   (declare (special accumulate-start-p))
606   ;; stop accumulating into STARTS-WITH
607   (setq accumulate-start-p nil)
608   (make-instance 'filter
609                  :fn (second parse-tree)
610                  :len (third parse-tree)))
611
612 (defmethod convert-compound-parse-tree ((token (eql :standalone)) parse-tree &key)
613   "The case for \(:STANDALONE <regex>)."
614   (declare #.*standard-optimize-settings*)
615   (declare (special flags accumulate-start-p))
616   ;; stop accumulating into STARTS-WITH
617   (setq accumulate-start-p nil)
618   ;; keep the effect of modifiers local to the enclosed regex
619   (let ((flags (copy-list flags)))
620     (declare (special flags))
621     (make-instance 'standalone :regex (convert-aux (second parse-tree)))))
622
623 (defmethod convert-compound-parse-tree ((token (eql :back-reference)) parse-tree &key)
624   "The case for \(:BACK-REFERENCE <number>|<name>)."
625   (declare #.*standard-optimize-settings*)
626   (declare (special flags accumulate-start-p reg-num reg-names max-back-ref))
627   (let* ((backref-name (and (stringp (second parse-tree))
628                             (second parse-tree)))
629          (referred-regs
630           (when backref-name
631             ;; find which register corresponds to the given name
632             ;; we have to deal with case where several registers share
633             ;; the same name and collect their respective numbers
634             (loop for name in reg-names
635                   for reg-index from 0
636                   when (string= name backref-name)
637                   ;; NOTE: REG-NAMES stores register names in reversed
638                   ;; order REG-NUM contains number of (any) registers
639                   ;; seen so far; 1- will be done later
640                   collect (- reg-num reg-index))))
641          ;; store the register number for the simple case
642          (backref-number (or (first referred-regs) (second parse-tree))))
643     (declare (type (or fixnum null) backref-number))
644     (when (or (not (typep backref-number 'fixnum))
645               (<= backref-number 0))
646       (signal-syntax-error "Illegal back-reference: ~S." parse-tree))
647     ;; stop accumulating into STARTS-WITH and increase MAX-BACK-REF if
648     ;; necessary
649     (setq accumulate-start-p nil
650           max-back-ref (max (the fixnum max-back-ref)
651                             backref-number))
652     (flet ((make-back-ref (backref-number)
653              (make-instance 'back-reference
654                             ;; we start counting from 0 internally
655                             :num (1- backref-number)
656                             :case-insensitive-p (case-insensitive-mode-p flags)
657                             ;; backref-name is NIL or string, safe to copy
658                             :name (copy-seq backref-name))))
659       (cond
660        ((cdr referred-regs)
661         ;; several registers share the same name we will try to match
662         ;; any of them, starting with the most recent first
663         ;; alternation is used to accomplish matching
664         (make-instance 'alternation
665                        :choices (loop
666                                  for reg-index in referred-regs
667                                  collect (make-back-ref reg-index))))
668        ;; simple case - backref corresponds to only one register
669        (t
670         (make-back-ref backref-number))))))
671
672 (defmethod convert-compound-parse-tree ((token (eql :regex)) parse-tree &key)
673   "The case for \(:REGEX <string>)."
674   (declare #.*standard-optimize-settings*)
675   (convert-aux (parse-string (second parse-tree))))
676
677 (defmethod convert-compound-parse-tree ((token (eql :char-class)) parse-tree &key invertedp)
678   "The case for \(:CHAR-CLASS {<item>}*) where item is one of
679
680 - a character,
681 - a character range: \(:RANGE <char1> <char2>), or
682 - a special char class symbol like :DIGIT-CHAR-CLASS.
683
684 Also used for inverted char classes when INVERTEDP is true."
685   (declare #.*standard-optimize-settings*)
686   (declare (special flags accumulate-start-p))
687   (let ((test-function
688          (create-optimized-test-function
689           (convert-char-class-to-test-function (rest parse-tree)
690                                                invertedp
691                                                (case-insensitive-mode-p flags)))))
692     (setq accumulate-start-p nil)
693     (make-instance 'char-class :test-function test-function)))
694
695 (defmethod convert-compound-parse-tree ((token (eql :inverted-char-class)) parse-tree &key)
696   "The case for \(:INVERTED-CHAR-CLASS {<item>}*)."
697   (declare #.*standard-optimize-settings*)
698   ;; just dispatch to the "real" method
699   (convert-compound-parse-tree :char-class parse-tree :invertedp t))
700
701 (defmethod convert-compound-parse-tree ((token (eql :property)) parse-tree &key)
702   "The case for \(:PROPERTY <name>) where <name> is a string."
703   (declare #.*standard-optimize-settings*)
704   (make-instance 'char-class :test-function (resolve-property (second parse-tree))))
705
706 (defmethod convert-compound-parse-tree ((token (eql :inverted-property)) parse-tree &key)
707   "The case for \(:INVERTED-PROPERTY <name>) where <name> is a string."
708   (declare #.*standard-optimize-settings*)
709   (make-instance 'char-class :test-function (complement* (resolve-property (second parse-tree)))))
710
711 (defmethod convert-compound-parse-tree ((token (eql :flags)) parse-tree &key)
712   "The case for \(:FLAGS {<flag>}*) where flag is a modifier symbol
713 like :CASE-INSENSITIVE-P."
714   (declare #.*standard-optimize-settings*)
715   ;; set/unset the flags corresponding to the symbols
716   ;; following :FLAGS
717   (mapc #'set-flag (rest parse-tree))
718   ;; we're only interested in the side effect of
719   ;; setting/unsetting the flags and turn this syntactical
720   ;; construct into a VOID object which'll be optimized
721   ;; away when creating the matcher
722   (make-instance 'void))
723
724 (defgeneric convert-simple-parse-tree (parse-tree)
725   (declare #.*standard-optimize-settings*)
726   (:documentation "Helper function for CONVERT-AUX which converts
727 parse trees which are atoms.")
728   (:method ((parse-tree (eql :void)))
729    (declare #.*standard-optimize-settings*)
730    (make-instance 'void))
731   (:method ((parse-tree (eql :word-boundary)))
732    (declare #.*standard-optimize-settings*)
733    (make-instance 'word-boundary :negatedp nil))
734   (:method ((parse-tree (eql :non-word-boundary)))
735    (declare #.*standard-optimize-settings*)
736    (make-instance 'word-boundary :negatedp t))
737   (:method ((parse-tree (eql :everything)))
738    (declare #.*standard-optimize-settings*)
739    (declare (special flags accumulate-start-p))
740    (setq accumulate-start-p nil)
741    (make-instance 'everything :single-line-p (single-line-mode-p flags)))
742   (:method ((parse-tree (eql :digit-class)))
743    (declare #.*standard-optimize-settings*)
744    (declare (special accumulate-start-p))
745    (setq accumulate-start-p nil)
746    (make-instance 'char-class :test-function #'digit-char-p))
747   (:method ((parse-tree (eql :word-char-class)))
748    (declare #.*standard-optimize-settings*)
749    (declare (special accumulate-start-p))
750    (setq accumulate-start-p nil)
751    (make-instance 'char-class :test-function #'word-char-p))
752   (:method ((parse-tree (eql :whitespace-char-class)))
753    (declare #.*standard-optimize-settings*)
754    (declare (special accumulate-start-p))
755    (setq accumulate-start-p nil)
756    (make-instance 'char-class :test-function #'whitespacep))
757   (:method ((parse-tree (eql :non-digit-class)))
758    (declare #.*standard-optimize-settings*)
759    (declare (special accumulate-start-p))
760    (setq accumulate-start-p nil)
761    (make-instance 'char-class :test-function (complement* #'digit-char-p)))
762   (:method ((parse-tree (eql :non-word-char-class)))
763    (declare #.*standard-optimize-settings*)
764    (declare (special accumulate-start-p))
765    (setq accumulate-start-p nil)
766    (make-instance 'char-class :test-function (complement* #'word-char-p)))
767   (:method ((parse-tree (eql :non-whitespace-char-class)))
768    (declare #.*standard-optimize-settings*)
769    (declare (special accumulate-start-p))
770    (setq accumulate-start-p nil)
771    (make-instance 'char-class :test-function (complement* #'whitespacep)))
772   (:method ((parse-tree (eql :start-anchor)))
773    ;; Perl's "^"
774    (declare #.*standard-optimize-settings*)
775    (declare (special flags))
776    (make-instance 'anchor :startp t :multi-line-p (multi-line-mode-p flags)))
777   (:method ((parse-tree (eql :end-anchor)))
778    ;; Perl's "$"
779    (declare #.*standard-optimize-settings*)
780    (declare (special flags))
781    (make-instance 'anchor :startp nil :multi-line-p (multi-line-mode-p flags)))
782   (:method ((parse-tree (eql :modeless-start-anchor)))
783    ;; Perl's "\A"
784    (declare #.*standard-optimize-settings*)
785    (make-instance 'anchor :startp t))
786   (:method ((parse-tree (eql :modeless-end-anchor)))
787    ;; Perl's "$\Z"
788    (declare #.*standard-optimize-settings*)
789    (make-instance 'anchor :startp nil))
790   (:method ((parse-tree (eql :modeless-end-anchor-no-newline)))
791    ;; Perl's "$\z"
792    (declare #.*standard-optimize-settings*)
793    (make-instance 'anchor :startp nil :no-newline-p t))
794   (:method ((parse-tree (eql :case-insensitive-p)))
795    (declare #.*standard-optimize-settings*)
796    (set-flag parse-tree)
797    (make-instance 'void))
798   (:method ((parse-tree (eql :case-sensitive-p)))
799    (declare #.*standard-optimize-settings*)
800    (set-flag parse-tree)
801    (make-instance 'void))
802   (:method ((parse-tree (eql :multi-line-mode-p)))
803    (declare #.*standard-optimize-settings*)
804    (set-flag parse-tree)
805    (make-instance 'void))
806   (:method ((parse-tree (eql :not-multi-line-mode-p)))
807    (declare #.*standard-optimize-settings*)
808    (set-flag parse-tree)
809    (make-instance 'void))
810   (:method ((parse-tree (eql :single-line-mode-p)))
811    (declare #.*standard-optimize-settings*)
812    (set-flag parse-tree)
813    (make-instance 'void))
814   (:method ((parse-tree (eql :not-single-line-mode-p)))
815    (declare #.*standard-optimize-settings*)
816    (set-flag parse-tree)
817    (make-instance 'void)))
818
819 (defmethod convert-simple-parse-tree ((parse-tree string))
820   (declare #.*standard-optimize-settings*)
821   (declare (special flags))
822   ;; turn strings into STR objects and try to accumulate into
823   ;; STARTS-WITH
824   (let ((str (make-instance 'str
825                             :str parse-tree
826                             :case-insensitive-p (case-insensitive-mode-p flags))))
827     (maybe-accumulate str)
828     str))
829
830 (defmethod convert-simple-parse-tree ((parse-tree character))
831   (declare #.*standard-optimize-settings*)
832   ;; dispatch to the method for strings
833   (convert-simple-parse-tree (string parse-tree)))
834        
835 (defmethod convert-simple-parse-tree (parse-tree)
836   "The default method - check if there's a translation."
837   (declare #.*standard-optimize-settings*)
838   (let ((translation (and (symbolp parse-tree) (parse-tree-synonym parse-tree))))
839     (if translation
840       (convert-aux (copy-tree translation))
841       (signal-syntax-error "Unknown token ~A in parse tree." parse-tree))))
842
843 (defun convert (parse-tree)
844   "Converts the parse tree PARSE-TREE into an equivalent REGEX object
845 and returns three values: the REGEX object, the number of registers
846 seen and an object the regex starts with which is either a STR object
847 or an EVERYTHING object \(if the regex starts with something like
848 \".*\") or NIL."
849   (declare #.*standard-optimize-settings*)
850   ;; this function basically just initializes the special variables
851   ;; and then calls CONVERT-AUX to do all the work
852   (let* ((flags (list nil nil nil))
853          (reg-num 0)
854          reg-names
855          named-reg-seen
856          (accumulate-start-p t)
857          starts-with
858          (max-back-ref 0)
859          (converted-parse-tree (convert-aux parse-tree)))
860     (declare (special flags reg-num reg-names named-reg-seen
861                       accumulate-start-p starts-with max-back-ref))
862     ;; make sure we don't reference registers which aren't there
863     (when (> (the fixnum max-back-ref)
864              (the fixnum reg-num))
865       (signal-syntax-error "Backreference to register ~A which has not been defined." max-back-ref))
866     (when (typep starts-with 'str)
867       (setf (slot-value starts-with 'str)
868               (coerce (slot-value starts-with 'str)
869                       #+:lispworks 'lw:simple-text-string
870                       #-:lispworks 'simple-string)))
871     (values converted-parse-tree reg-num starts-with
872             ;; we can't simply use *ALLOW-NAMED-REGISTERS*
873             ;; since parse-tree syntax ignores it
874             (when named-reg-seen
875               (nreverse reg-names)))))
Note: See TracBrowser for help on using the browser.