root/trunk/thirdparty/cl-ppcre/optimize.lisp

Revision 3581, 24.7 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.35 2008/07/06 18:12:04 edi Exp $
3
4 ;;; This file contains optimizations which can be applied to converted
5 ;;; parse trees.
6
7 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
8
9 ;;; Redistribution and use in source and binary forms, with or without
10 ;;; modification, are permitted provided that the following conditions
11 ;;; are met:
12
13 ;;;   * Redistributions of source code must retain the above copyright
14 ;;;     notice, this list of conditions and the following disclaimer.
15
16 ;;;   * Redistributions in binary form must reproduce the above
17 ;;;     copyright notice, this list of conditions and the following
18 ;;;     disclaimer in the documentation and/or other materials
19 ;;;     provided with the distribution.
20
21 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
22 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
25 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 (in-package :cl-ppcre)
34
35 (defgeneric flatten (regex)
36   (declare #.*standard-optimize-settings*)
37   (:documentation "Merges adjacent sequences and alternations, i.e. it
38 transforms #<SEQ #<STR \"a\"> #<SEQ #<STR \"b\"> #<STR \"c\">>> to
39 #<SEQ #<STR \"a\"> #<STR \"b\"> #<STR \"c\">>. This is a destructive
40 operation on REGEX."))
41
42 (defmethod flatten ((seq seq))
43   (declare #.*standard-optimize-settings*)
44   ;; this looks more complicated than it is because we modify SEQ in
45   ;; place to avoid unnecessary consing
46   (let ((elements-rest (elements seq)))
47     (loop
48       (unless elements-rest
49         (return))
50       (let ((flattened-element (flatten (car elements-rest)))
51             (next-elements-rest (cdr elements-rest)))
52         (cond ((typep flattened-element 'seq)
53                 ;; FLATTENED-ELEMENT is a SEQ object, so we "splice"
54                 ;; it into out list of elements
55                 (let ((flattened-element-elements
56                         (elements flattened-element)))
57                   (setf (car elements-rest)
58                           (car flattened-element-elements)
59                         (cdr elements-rest)
60                           (nconc (cdr flattened-element-elements)
61                                  (cdr elements-rest)))))
62               (t
63                 ;; otherwise we just replace the current element with
64                 ;; its flattened counterpart
65                 (setf (car elements-rest) flattened-element)))
66         (setq elements-rest next-elements-rest))))
67   (let ((elements (elements seq)))
68     (cond ((cadr elements)
69             seq)
70           ((cdr elements)
71             (first elements))
72           (t (make-instance 'void)))))
73
74 (defmethod flatten ((alternation alternation))
75   (declare #.*standard-optimize-settings*)
76   ;; same algorithm as above
77   (let ((choices-rest (choices alternation)))
78     (loop
79       (unless choices-rest
80         (return))
81       (let ((flattened-choice (flatten (car choices-rest)))
82             (next-choices-rest (cdr choices-rest)))
83         (cond ((typep flattened-choice 'alternation)
84                 (let ((flattened-choice-choices
85                         (choices flattened-choice)))
86                   (setf (car choices-rest)
87                           (car flattened-choice-choices)
88                         (cdr choices-rest)
89                           (nconc (cdr flattened-choice-choices)
90                                  (cdr choices-rest)))))
91               (t
92                 (setf (car choices-rest) flattened-choice)))
93         (setq choices-rest next-choices-rest))))
94   (let ((choices (choices alternation)))
95     (cond ((cadr choices)
96             alternation)
97           ((cdr choices)
98             (first choices))
99           (t (signal-syntax-error "Encountered alternation without choices.")))))
100
101 (defmethod flatten ((branch branch))
102   (declare #.*standard-optimize-settings*)
103   (with-slots (test then-regex else-regex)
104       branch
105     (setq test
106             (if (numberp test)
107               test
108               (flatten test))
109           then-regex (flatten then-regex)
110           else-regex (flatten else-regex))
111     branch))
112
113 (defmethod flatten ((regex regex))
114   (declare #.*standard-optimize-settings*)
115   (typecase regex
116     ((or repetition register lookahead lookbehind standalone)
117       ;; if REGEX contains exactly one inner REGEX object flatten it
118       (setf (regex regex)
119               (flatten (regex regex)))
120       regex)
121     (t
122       ;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
123       ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
124       ;; do nothing
125       regex)))
126
127 (defgeneric gather-strings (regex)
128   (declare #.*standard-optimize-settings*)
129   (:documentation "Collects adjacent strings or characters into one
130 string provided they have the same case mode. This is a destructive
131 operation on REGEX."))
132
133 (defmethod gather-strings ((seq seq))
134   (declare #.*standard-optimize-settings*)
135   ;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it
136   ;; expects SEQ to be flattened already; in particular, SEQ cannot be
137   ;; empty and cannot contain embedded SEQ objects
138   (let* ((start-point (cons nil (elements seq)))
139          (curr-point start-point)
140          old-case-mode
141          collector
142          collector-start
143          (collector-length 0)
144          skip)
145     (declare (fixnum collector-length))
146     (loop
147       (let ((elements-rest (cdr curr-point)))
148         (unless elements-rest
149           (return))
150         (let* ((element (car elements-rest))
151                (case-mode (case-mode element old-case-mode)))
152           (cond ((and case-mode
153                       (eq case-mode old-case-mode))
154                   ;; if ELEMENT is a STR and we have collected a STR of
155                   ;; the same case mode in the last iteration we
156                   ;; concatenate ELEMENT onto COLLECTOR and remember the
157                   ;; value of its SKIP slot
158                   (let ((old-collector-length collector-length))
159                     (unless (and (adjustable-array-p collector)
160                                  (array-has-fill-pointer-p collector))
161                       (setq collector
162                               (make-array collector-length
163                                           :initial-contents collector
164                                           :element-type 'character
165                                           :fill-pointer t
166                                           :adjustable t)
167                             collector-start nil))
168                     (adjust-array collector
169                                   (incf collector-length (len element))
170                                   :fill-pointer t)
171                     (setf (subseq collector
172                                   old-collector-length)
173                             (str element)
174                           ;; it suffices to remember the last SKIP slot
175                           ;; because due to the way MAYBE-ACCUMULATE
176                           ;; works adjacent STR objects have the same
177                           ;; SKIP value
178                           skip (skip element)))
179                   (setf (cdr curr-point) (cdr elements-rest)))
180                 (t
181                   (let ((collected-string
182                           (cond (collector-start
183                                   collector-start)
184                                 (collector
185                                   ;; if we have collected something already
186                                   ;; we convert it into a STR
187                                   (make-instance 'str
188                                                  :skip skip
189                                                  :str collector
190                                                  :case-insensitive-p
191                                                  (eq old-case-mode
192                                                      :case-insensitive)))
193                                 (t nil))))
194                     (cond (case-mode
195                             ;; if ELEMENT is a string with a different case
196                             ;; mode than the last one we have either just
197                             ;; converted COLLECTOR into a STR or COLLECTOR
198                             ;; is still empty; in both cases we can now
199                             ;; begin to fill it anew
200                             (setq collector (str element)
201                                   collector-start element
202                                   ;; and we remember the SKIP value as above
203                                   skip (skip element)
204                                   collector-length (len element))
205                             (cond (collected-string
206                                     (setf (car elements-rest)
207                                             collected-string
208                                           curr-point
209                                             (cdr curr-point)))
210                                   (t
211                                     (setf (cdr curr-point)
212                                             (cdr elements-rest)))))
213                           (t
214                             ;; otherwise this is not a STR so we apply
215                             ;; GATHER-STRINGS to it and collect it directly
216                             ;; into RESULT
217                             (cond (collected-string
218                                     (setf (car elements-rest)
219                                             collected-string
220                                           curr-point
221                                             (cdr curr-point)
222                                           (cdr curr-point)
223                                             (cons (gather-strings element)
224                                                   (cdr curr-point))
225                                           curr-point
226                                             (cdr curr-point)))
227                                   (t
228                                     (setf (car elements-rest)
229                                             (gather-strings element)
230                                           curr-point
231                                             (cdr curr-point))))
232                             ;; we also have to empty COLLECTOR here in case
233                             ;; it was still filled from the last iteration
234                             (setq collector nil
235                                   collector-start nil))))))
236           (setq old-case-mode case-mode))))
237     (when collector
238       (setf (cdr curr-point)
239               (cons
240                (make-instance 'str
241                               :skip skip
242                               :str collector
243                               :case-insensitive-p
244                               (eq old-case-mode
245                                   :case-insensitive))
246                nil)))
247     (setf (elements seq) (cdr start-point))
248     seq))
249
250 (defmethod gather-strings ((alternation alternation))
251   (declare #.*standard-optimize-settings*)
252   ;; loop ON the choices of ALTERNATION so we can modify them directly
253   (loop for choices-rest on (choices alternation)
254         while choices-rest
255         do (setf (car choices-rest)
256                    (gather-strings (car choices-rest))))
257   alternation)
258
259 (defmethod gather-strings ((branch branch))
260   (declare #.*standard-optimize-settings*)
261   (with-slots (test then-regex else-regex)
262       branch
263     (setq test
264             (if (numberp test)
265               test
266               (gather-strings test))
267           then-regex (gather-strings then-regex)
268           else-regex (gather-strings else-regex))
269     branch))
270
271 (defmethod gather-strings ((regex regex))
272   (declare #.*standard-optimize-settings*)
273   (typecase regex
274     ((or repetition register lookahead lookbehind standalone)
275       ;; if REGEX contains exactly one inner REGEX object apply
276       ;; GATHER-STRINGS to it
277       (setf (regex regex)
278               (gather-strings (regex regex)))
279       regex)
280     (t
281       ;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
282       ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
283       ;; do nothing
284       regex)))
285
286 ;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS.
287
288 (defgeneric start-anchored-p (regex &optional in-seq-p)
289   (declare #.*standard-optimize-settings*)
290   (:documentation "Returns T if REGEX starts with a \"real\" start
291 anchor, i.e. one that's not in multi-line mode, NIL otherwise. If
292 IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a
293 zero-length assertion."))
294
295 (defmethod start-anchored-p ((seq seq) &optional in-seq-p)
296   (declare (ignore in-seq-p))
297   ;; note that START-ANCHORED-P is to be applied after FLATTEN and
298   ;; GATHER-STRINGS, i.e. SEQ cannot be empty and cannot contain
299   ;; embedded SEQ objects
300   (loop for element in (elements seq)
301         for anchored-p = (start-anchored-p element t)
302         ;; skip zero-length elements because they won't affect the
303         ;; "anchoredness" of the sequence
304         while (eq anchored-p :zero-length)
305         finally (return (and anchored-p (not (eq anchored-p :zero-length))))))
306
307 (defmethod start-anchored-p ((alternation alternation) &optional in-seq-p)
308   (declare #.*standard-optimize-settings*)
309   (declare (ignore in-seq-p))
310   ;; clearly an alternation can only be start-anchored if all of its
311   ;; choices are start-anchored
312   (loop for choice in (choices alternation)
313         always (start-anchored-p choice)))
314
315 (defmethod start-anchored-p ((branch branch) &optional in-seq-p)
316   (declare #.*standard-optimize-settings*)
317   (declare (ignore in-seq-p))
318   (and (start-anchored-p (then-regex branch))
319        (start-anchored-p (else-regex branch))))
320
321 (defmethod start-anchored-p ((repetition repetition) &optional in-seq-p)
322   (declare #.*standard-optimize-settings*)
323   (declare (ignore in-seq-p))
324   ;; well, this wouldn't make much sense, but anyway...
325   (and (plusp (minimum repetition))
326        (start-anchored-p (regex repetition))))
327
328 (defmethod start-anchored-p ((register register) &optional in-seq-p)
329   (declare #.*standard-optimize-settings*)
330   (declare (ignore in-seq-p))
331   (start-anchored-p (regex register)))
332
333 (defmethod start-anchored-p ((standalone standalone) &optional in-seq-p)
334   (declare #.*standard-optimize-settings*)
335   (declare (ignore in-seq-p))
336   (start-anchored-p (regex standalone)))
337
338 (defmethod start-anchored-p ((anchor anchor) &optional in-seq-p)
339   (declare #.*standard-optimize-settings*)
340   (declare (ignore in-seq-p))
341   (and (startp anchor)
342        (not (multi-line-p anchor))))
343
344 (defmethod start-anchored-p ((regex regex) &optional in-seq-p)
345   (declare #.*standard-optimize-settings*)
346   (typecase regex
347     ((or lookahead lookbehind word-boundary void)
348       ;; zero-length assertions
349       (if in-seq-p
350         :zero-length
351         nil))
352     (filter
353       (if (and in-seq-p
354                (len regex)
355                (zerop (len regex)))
356         :zero-length
357         nil))
358     (t
359       ;; BACK-REFERENCE, CHAR-CLASS, EVERYTHING, and STR
360       nil)))
361
362 ;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS.
363
364 (defgeneric end-string-aux (regex &optional old-case-insensitive-p)
365   (declare #.*standard-optimize-settings*)
366   (:documentation "Returns the constant string (if it exists) REGEX
367 ends with wrapped into a STR object, otherwise NIL.
368 OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR
369 collected or :VOID if no STR has been collected yet. (This is a helper
370 function called by END-STRIN.)"))
371
372 (defmethod end-string-aux ((str str)
373                            &optional (old-case-insensitive-p :void))
374   (declare #.*standard-optimize-settings*)
375   (declare (special last-str))
376   (cond ((and (not (skip str))          ; avoid constituents of STARTS-WITH
377               ;; only use STR if nothing has been collected yet or if
378               ;; the collected string has the same value for
379               ;; CASE-INSENSITIVE-P
380               (or (eq old-case-insensitive-p :void)
381                   (eq (case-insensitive-p str) old-case-insensitive-p)))
382           (setf last-str str
383                 ;; set the SKIP property of this STR
384                 (skip str) t)
385           str)
386         (t nil)))
387
388 (defmethod end-string-aux ((seq seq)
389                            &optional (old-case-insensitive-p :void))
390   (declare #.*standard-optimize-settings*)
391   (declare (special continuep))
392   (let (case-insensitive-p
393         concatenated-string
394         concatenated-start
395         (concatenated-length 0))
396     (declare (fixnum concatenated-length))
397     (loop for element in (reverse (elements seq))
398           ;; remember the case-(in)sensitivity of the last relevant
399           ;; STR object
400           for loop-old-case-insensitive-p = old-case-insensitive-p
401             then (if skip
402                    loop-old-case-insensitive-p
403                    (case-insensitive-p element-end))
404           ;; the end-string of the current element
405           for element-end = (end-string-aux element
406                                             loop-old-case-insensitive-p)
407           ;; whether we encountered a zero-length element
408           for skip = (if element-end
409                        (zerop (len element-end))
410                        nil)
411           ;; set CONTINUEP to NIL if we have to stop collecting to
412           ;; alert END-STRING-AUX methods on enclosing SEQ objects
413           unless element-end
414             do (setq continuep nil)
415           ;; end loop if we neither got a STR nor a zero-length
416           ;; element
417           while element-end
418           ;; only collect if not zero-length
419           unless skip
420             do (cond (concatenated-string
421                        (when concatenated-start
422                          (setf concatenated-string
423                                  (make-array concatenated-length
424                                              :initial-contents (reverse (str concatenated-start))
425                                              :element-type 'character
426                                              :fill-pointer t
427                                              :adjustable t)
428                                concatenated-start nil))
429                        (let ((len (len element-end))
430                              (str (str element-end)))
431                          (declare (fixnum len))
432                          (incf concatenated-length len)
433                          (loop for i of-type fixnum downfrom (1- len) to 0
434                                do (vector-push-extend (char str i)
435                                                       concatenated-string))))
436                      (t
437                        (setf concatenated-string
438                                t
439                              concatenated-start
440                                element-end
441                              concatenated-length
442                                (len element-end)
443                              case-insensitive-p
444                                (case-insensitive-p element-end))))
445           ;; stop collecting if END-STRING-AUX on inner SEQ has said so
446           while continuep)
447     (cond ((zerop concatenated-length)
448             ;; don't bother to return zero-length strings
449             nil)
450           (concatenated-start
451             concatenated-start)
452           (t
453             (make-instance 'str
454                            :str (nreverse concatenated-string)
455                            :case-insensitive-p case-insensitive-p)))))
456
457 (defmethod end-string-aux ((register register)
458                            &optional (old-case-insensitive-p :void))
459   (declare #.*standard-optimize-settings*)
460   (end-string-aux (regex register) old-case-insensitive-p))
461    
462 (defmethod end-string-aux ((standalone standalone)
463                            &optional (old-case-insensitive-p :void))
464   (declare #.*standard-optimize-settings*)
465   (end-string-aux (regex standalone) old-case-insensitive-p))
466    
467 (defmethod end-string-aux ((regex regex)
468                            &optional (old-case-insensitive-p :void))
469   (declare #.*standard-optimize-settings*)
470   (declare (special last-str end-anchored-p continuep))
471   (typecase regex
472     ((or anchor lookahead lookbehind word-boundary void)
473       ;; a zero-length REGEX object - for the sake of END-STRING-AUX
474       ;; this is a zero-length string
475       (when (and (typep regex 'anchor)
476                  (not (startp regex))
477                  (or (no-newline-p regex)
478                      (not (multi-line-p regex)))
479                  (eq old-case-insensitive-p :void))
480         ;; if this is a "real" end-anchor and we haven't collected
481         ;; anything so far we can set END-ANCHORED-P (where 1 or 0
482         ;; indicate whether we accept a #\Newline at the end or not)
483         (setq end-anchored-p (if (no-newline-p regex) 0 1)))
484       (make-instance 'str
485                      :str ""
486                      :case-insensitive-p :void))
487     (t
488       ;; (ALTERNATION, BACK-REFERENCE, BRANCH, CHAR-CLASS, EVERYTHING,
489       ;; REPETITION, FILTER)
490       nil)))
491
492 (defun end-string (regex)
493   (declare (special end-string-offset))
494   (declare #.*standard-optimize-settings*)
495   "Returns the constant string (if it exists) REGEX ends with wrapped
496 into a STR object, otherwise NIL."
497   ;; LAST-STR points to the last STR object (seen from the end) that's
498   ;; part of END-STRING; CONTINUEP is set to T if we stop collecting
499   ;; in the middle of a SEQ
500   (let ((continuep t)
501         last-str)
502     (declare (special continuep last-str))
503     (prog1
504       (end-string-aux regex)
505       (when last-str
506         ;; if we've found something set the START-OF-END-STRING-P of
507         ;; the leftmost STR collected accordingly and remember the
508         ;; OFFSET of this STR (in a special variable provided by the
509         ;; caller of this function)
510         (setf (start-of-end-string-p last-str) t
511               end-string-offset (offset last-str))))))
512
513 (defgeneric compute-min-rest (regex current-min-rest)
514   (declare #.*standard-optimize-settings*)
515   (:documentation "Returns the minimal length of REGEX plus
516 CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it
517 recurses down into REGEX and sets the MIN-REST slots of REPETITION
518 objects."))
519
520 (defmethod compute-min-rest ((seq seq) current-min-rest)
521   (declare #.*standard-optimize-settings*)
522   (loop for element in (reverse (elements seq))
523         for last-min-rest = current-min-rest then this-min-rest
524         for this-min-rest = (compute-min-rest element last-min-rest)
525         finally (return this-min-rest)))
526    
527 (defmethod compute-min-rest ((alternation alternation) current-min-rest)
528   (declare #.*standard-optimize-settings*)
529   (loop for choice in (choices alternation)
530         minimize (compute-min-rest choice current-min-rest)))
531
532 (defmethod compute-min-rest ((branch branch) current-min-rest)
533   (declare #.*standard-optimize-settings*)
534   (min (compute-min-rest (then-regex branch) current-min-rest)
535        (compute-min-rest (else-regex branch) current-min-rest)))
536
537 (defmethod compute-min-rest ((str str) current-min-rest)
538   (declare #.*standard-optimize-settings*)
539   (+ current-min-rest (len str)))
540    
541 (defmethod compute-min-rest ((filter filter) current-min-rest)
542   (declare #.*standard-optimize-settings*)
543   (+ current-min-rest (or (len filter) 0)))
544    
545 (defmethod compute-min-rest ((repetition repetition) current-min-rest)
546   (declare #.*standard-optimize-settings*)
547   (setf (min-rest repetition) current-min-rest)
548   (compute-min-rest (regex repetition) current-min-rest)
549   (+ current-min-rest (* (minimum repetition) (min-len repetition))))
550
551 (defmethod compute-min-rest ((register register) current-min-rest)
552   (declare #.*standard-optimize-settings*)
553   (compute-min-rest (regex register) current-min-rest))
554    
555 (defmethod compute-min-rest ((standalone standalone) current-min-rest)
556   (declare #.*standard-optimize-settings*)
557   (declare (ignore current-min-rest))
558   (compute-min-rest (regex standalone) 0))
559    
560 (defmethod compute-min-rest ((lookahead lookahead) current-min-rest)
561   (declare #.*standard-optimize-settings*)
562   (compute-min-rest (regex lookahead) 0)
563   current-min-rest)
564    
565 (defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest)
566   (declare #.*standard-optimize-settings*)
567   (compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind)))
568   current-min-rest)
569    
570 (defmethod compute-min-rest ((regex regex) current-min-rest)
571   (declare #.*standard-optimize-settings*)
572   (typecase regex
573     ((or char-class everything)
574       (1+ current-min-rest))
575     (t
576       ;; zero min-len and no embedded regexes (ANCHOR,
577       ;; BACK-REFERENCE, VOID, and WORD-BOUNDARY)
578       current-min-rest)))
Note: See TracBrowser for help on using the browser.