root/trunk/thirdparty/cl-ppcre/regex-class-util.lisp

Revision 3581, 19.9 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class-util.lisp,v 1.8 2008/07/22 22:38:05 edi Exp $
3
4 ;;; This file contains some utility methods for REGEX objects.
5
6 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
7
8 ;;; Redistribution and use in source and binary forms, with or without
9 ;;; modification, are permitted provided that the following conditions
10 ;;; are met:
11
12 ;;;   * Redistributions of source code must retain the above copyright
13 ;;;     notice, this list of conditions and the following disclaimer.
14
15 ;;;   * Redistributions in binary form must reproduce the above
16 ;;;     copyright notice, this list of conditions and the following
17 ;;;     disclaimer in the documentation and/or other materials
18 ;;;     provided with the distribution.
19
20 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
21 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
24 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
26 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32 (in-package :cl-ppcre)
33
34 ;;; The following four methods allow a VOID object to behave like a
35 ;;; zero-length STR object (only readers needed)
36
37 (defmethod len ((void void))
38   (declare #.*standard-optimize-settings*)
39   0)
40
41 (defmethod str ((void void))
42   (declare #.*standard-optimize-settings*)
43   "")
44
45 (defmethod skip ((void void))
46   (declare #.*standard-optimize-settings*)
47   nil)
48
49 (defmethod start-of-end-string-p ((void void))
50   (declare #.*standard-optimize-settings*)
51   nil)
52
53 (defgeneric case-mode (regex old-case-mode)
54   (declare #.*standard-optimize-settings*)
55   (:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
56 Returns a keyword denoting the case-(in)sensitivity of a STR or its
57 second argument if the STR has length 0. Returns NIL for REGEX objects
58 which are not of type STR."))
59
60 (defmethod case-mode ((str str) old-case-mode)
61   (declare #.*standard-optimize-settings*)
62   (cond ((zerop (len str))
63           old-case-mode)
64         ((case-insensitive-p str)
65           :case-insensitive)
66         (t
67           :case-sensitive)))
68
69 (defmethod case-mode ((regex regex) old-case-mode)
70   (declare #.*standard-optimize-settings*)
71   (declare (ignore old-case-mode))
72   nil)
73
74 (defgeneric copy-regex (regex)
75   (declare #.*standard-optimize-settings*)
76   (:documentation "Implements a deep copy of a REGEX object."))
77
78 (defmethod copy-regex ((anchor anchor))
79   (declare #.*standard-optimize-settings*)
80   (make-instance 'anchor
81                  :startp (startp anchor)
82                  :multi-line-p (multi-line-p anchor)
83                  :no-newline-p (no-newline-p anchor)))
84
85 (defmethod copy-regex ((everything everything))
86   (declare #.*standard-optimize-settings*)
87   (make-instance 'everything
88                  :single-line-p (single-line-p everything)))
89
90 (defmethod copy-regex ((word-boundary word-boundary))
91   (declare #.*standard-optimize-settings*)
92   (make-instance 'word-boundary
93                  :negatedp (negatedp word-boundary)))
94
95 (defmethod copy-regex ((void void))
96   (declare #.*standard-optimize-settings*)
97   (make-instance 'void))
98
99 (defmethod copy-regex ((lookahead lookahead))
100   (declare #.*standard-optimize-settings*)
101   (make-instance 'lookahead
102                  :regex (copy-regex (regex lookahead))
103                  :positivep (positivep lookahead)))
104
105 (defmethod copy-regex ((seq seq))
106   (declare #.*standard-optimize-settings*)
107   (make-instance 'seq
108                  :elements (mapcar #'copy-regex (elements seq))))
109
110 (defmethod copy-regex ((alternation alternation))
111   (declare #.*standard-optimize-settings*)
112   (make-instance 'alternation
113                  :choices (mapcar #'copy-regex (choices alternation))))
114
115 (defmethod copy-regex ((branch branch))
116   (declare #.*standard-optimize-settings*)
117   (with-slots (test)
118       branch
119     (make-instance 'branch
120                    :test (if (typep test 'regex)
121                            (copy-regex test)
122                            test)
123                    :then-regex (copy-regex (then-regex branch))
124                    :else-regex (copy-regex (else-regex branch)))))
125
126 (defmethod copy-regex ((lookbehind lookbehind))
127   (declare #.*standard-optimize-settings*)
128   (make-instance 'lookbehind
129                  :regex (copy-regex (regex lookbehind))
130                  :positivep (positivep lookbehind)
131                  :len (len lookbehind)))
132
133 (defmethod copy-regex ((repetition repetition))
134   (declare #.*standard-optimize-settings*)
135   (make-instance 'repetition
136                  :regex (copy-regex (regex repetition))
137                  :greedyp (greedyp repetition)
138                  :minimum (minimum repetition)
139                  :maximum (maximum repetition)
140                  :min-len (min-len repetition)
141                  :len (len repetition)
142                  :contains-register-p (contains-register-p repetition)))
143
144 (defmethod copy-regex ((register register))
145   (declare #.*standard-optimize-settings*)
146   (make-instance 'register
147                  :regex (copy-regex (regex register))
148                  :num (num register)
149                  :name (name register)))
150
151 (defmethod copy-regex ((standalone standalone))
152   (declare #.*standard-optimize-settings*)
153   (make-instance 'standalone
154                  :regex (copy-regex (regex standalone))))
155
156 (defmethod copy-regex ((back-reference back-reference))
157   (declare #.*standard-optimize-settings*)
158   (make-instance 'back-reference
159                  :num (num back-reference)
160                  :case-insensitive-p (case-insensitive-p back-reference)))
161
162 (defmethod copy-regex ((char-class char-class))
163   (declare #.*standard-optimize-settings*)
164   (make-instance 'char-class
165                  :test-function (test-function char-class)))
166
167 (defmethod copy-regex ((str str))
168   (declare #.*standard-optimize-settings*)
169   (make-instance 'str
170                  :str (str str)
171                  :case-insensitive-p (case-insensitive-p str)))
172
173 (defmethod copy-regex ((filter filter))
174   (declare #.*standard-optimize-settings*)
175   (make-instance 'filter
176                  :fn (fn filter)
177                  :len (len filter)))
178
179 ;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
180 ;;; wrapped into one function. Maybe in the next release...
181
182 ;;; Further note that this function is used by CONVERT to factor out
183 ;;; complicated repetitions, i.e. cases like
184 ;;;   (a)* -> (?:a*(a))?
185 ;;; This won't work for, say,
186 ;;;   ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
187 ;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
188
189 (defgeneric remove-registers (regex)
190   (declare #.*standard-optimize-settings*)
191   (:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
192 optionally removes embedded REGISTER objects if possible and if the
193 special variable REMOVE-REGISTERS-P is true."))
194
195 (defmethod remove-registers ((register register))
196   (declare #.*standard-optimize-settings*)
197   (declare (special remove-registers-p reg-seen))
198   (cond (remove-registers-p
199           (remove-registers (regex register)))
200         (t
201           ;; mark REG-SEEN as true so enclosing REPETITION objects
202           ;; (see method below) know if they contain a register or not
203           (setq reg-seen t)
204           (copy-regex register))))
205
206 (defmethod remove-registers ((repetition repetition))
207   (declare #.*standard-optimize-settings*)
208   (let* (reg-seen
209          (inner-regex (remove-registers (regex repetition))))
210     ;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
211     ;; (REGEX REPETITION) contains a REGISTER
212     (declare (special reg-seen))
213     (make-instance 'repetition
214                    :regex inner-regex
215                    :greedyp (greedyp repetition)
216                    :minimum (minimum repetition)
217                    :maximum (maximum repetition)
218                    :min-len (min-len repetition)
219                    :len (len repetition)
220                    :contains-register-p reg-seen)))
221
222 (defmethod remove-registers ((standalone standalone))
223   (declare #.*standard-optimize-settings*)
224   (make-instance 'standalone
225                  :regex (remove-registers (regex standalone))))
226
227 (defmethod remove-registers ((lookahead lookahead))
228   (declare #.*standard-optimize-settings*)
229   (make-instance 'lookahead
230                  :regex (remove-registers (regex lookahead))
231                  :positivep (positivep lookahead)))
232
233 (defmethod remove-registers ((lookbehind lookbehind))
234   (declare #.*standard-optimize-settings*)
235   (make-instance 'lookbehind
236                  :regex (remove-registers (regex lookbehind))
237                  :positivep (positivep lookbehind)
238                  :len (len lookbehind)))
239
240 (defmethod remove-registers ((branch branch))
241   (declare #.*standard-optimize-settings*)
242   (with-slots (test)
243       branch
244     (make-instance 'branch
245                    :test (if (typep test 'regex)
246                            (remove-registers test)
247                            test)
248                    :then-regex (remove-registers (then-regex branch))
249                    :else-regex (remove-registers (else-regex branch)))))
250
251 (defmethod remove-registers ((alternation alternation))
252   (declare #.*standard-optimize-settings*)
253   (declare (special remove-registers-p))
254   ;; an ALTERNATION, so we can't remove REGISTER objects further down
255   (setq remove-registers-p nil)
256   (copy-regex alternation))
257
258 (defmethod remove-registers ((regex regex))
259   (declare #.*standard-optimize-settings*)
260   (copy-regex regex))
261
262 (defmethod remove-registers ((seq seq))
263   (declare #.*standard-optimize-settings*)
264   (make-instance 'seq
265                  :elements (mapcar #'remove-registers (elements seq))))
266
267 (defgeneric everythingp (regex)
268   (declare #.*standard-optimize-settings*)
269   (:documentation "Returns an EVERYTHING object if REGEX is equivalent
270 to this object, otherwise NIL.  So, \"(.){1}\" would return true
271 \(i.e. the object corresponding to \".\", for example."))
272
273 (defmethod everythingp ((seq seq))
274   (declare #.*standard-optimize-settings*)
275   ;; we might have degenerate cases like (:SEQUENCE :VOID ...)
276   ;; due to the parsing process
277   (let ((cleaned-elements (remove-if #'(lambda (element)
278                                          (typep element 'void))
279                                      (elements seq))))
280     (and (= 1 (length cleaned-elements))
281          (everythingp (first cleaned-elements)))))
282
283 (defmethod everythingp ((alternation alternation))
284   (declare #.*standard-optimize-settings*)
285   (with-slots (choices)
286       alternation
287     (and (= 1 (length choices))
288          ;; this is unlikely to happen for human-generated regexes,
289          ;; but machine-generated ones might look like this
290          (everythingp (first choices)))))
291
292 (defmethod everythingp ((repetition repetition))
293   (declare #.*standard-optimize-settings*)
294   (with-slots (maximum minimum regex)
295       repetition
296     (and maximum
297          (= 1 minimum maximum)
298          ;; treat "<regex>{1,1}" like "<regex>"
299          (everythingp regex))))
300
301 (defmethod everythingp ((register register))
302   (declare #.*standard-optimize-settings*)
303   (everythingp (regex register)))
304
305 (defmethod everythingp ((standalone standalone))
306   (declare #.*standard-optimize-settings*)
307   (everythingp (regex standalone)))
308
309 (defmethod everythingp ((everything everything))
310   (declare #.*standard-optimize-settings*)
311   everything)
312
313 (defmethod everythingp ((regex regex))
314   (declare #.*standard-optimize-settings*)
315   ;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
316   ;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
317   nil)
318
319 (defgeneric regex-length (regex)
320   (declare #.*standard-optimize-settings*)
321   (:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
322
323 (defmethod regex-length ((seq seq))
324   (declare #.*standard-optimize-settings*)
325   ;; simply add all inner lengths unless one of them is NIL
326   (loop for sub-regex in (elements seq)
327         for len = (regex-length sub-regex)
328         if (not len) do (return nil)
329         sum len))
330
331 (defmethod regex-length ((alternation alternation))
332   (declare #.*standard-optimize-settings*)
333   ;; only return a true value if all inner lengths are non-NIL and
334   ;; mutually equal
335   (loop for sub-regex in (choices alternation)
336         for old-len = nil then len
337         for len = (regex-length sub-regex)
338         if (or (not len)
339                (and old-len (/= len old-len))) do (return nil)
340         finally (return len)))
341
342 (defmethod regex-length ((branch branch))
343   (declare #.*standard-optimize-settings*)
344   ;; only return a true value if both alternations have a length and
345   ;; if they're equal
346   (let ((then-length (regex-length (then-regex branch))))
347     (and then-length
348          (eql then-length (regex-length (else-regex branch)))
349          then-length)))
350
351 (defmethod regex-length ((repetition repetition))
352   (declare #.*standard-optimize-settings*)
353   ;; we can only compute the length of a REPETITION object if the
354   ;; number of repetitions is fixed; note that we don't call
355   ;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
356   ;; always set correctly
357   (with-slots (len minimum maximum)
358       repetition
359     (if (and len
360              (eql minimum maximum))
361       (* minimum len)
362       nil)))
363
364 (defmethod regex-length ((register register))
365   (declare #.*standard-optimize-settings*)
366   (regex-length (regex register)))
367
368 (defmethod regex-length ((standalone standalone))
369   (declare #.*standard-optimize-settings*)
370   (regex-length (regex standalone)))
371
372 (defmethod regex-length ((back-reference back-reference))
373   (declare #.*standard-optimize-settings*)
374   ;; with enough effort we could possibly do better here, but
375   ;; currently we just give up and return NIL
376   nil)
377    
378 (defmethod regex-length ((char-class char-class))
379   (declare #.*standard-optimize-settings*)
380   1)
381
382 (defmethod regex-length ((everything everything))
383   (declare #.*standard-optimize-settings*)
384   1)
385
386 (defmethod regex-length ((str str))
387   (declare #.*standard-optimize-settings*)
388   (len str))
389
390 (defmethod regex-length ((filter filter))
391   (declare #.*standard-optimize-settings*)
392   (len filter))
393
394 (defmethod regex-length ((regex regex))
395   (declare #.*standard-optimize-settings*)
396   ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
397   ;; WORD-BOUNDARY (which all have zero-length)
398   0)
399
400 (defgeneric regex-min-length (regex)
401   (declare #.*standard-optimize-settings*)
402   (:documentation "Returns the minimal length of REGEX."))
403
404 (defmethod regex-min-length ((seq seq))
405   (declare #.*standard-optimize-settings*)
406   ;; simply add all inner minimal lengths
407   (loop for sub-regex in (elements seq)
408         for len = (regex-min-length sub-regex)
409         sum len))
410
411 (defmethod regex-min-length ((alternation alternation))
412   (declare #.*standard-optimize-settings*)
413   ;; minimal length of an alternation is the minimal length of the
414   ;; "shortest" element
415   (loop for sub-regex in (choices alternation)
416         for len = (regex-min-length sub-regex)
417         minimize len))
418
419 (defmethod regex-min-length ((branch branch))
420   (declare #.*standard-optimize-settings*)
421   ;; minimal length of both alternations
422   (min (regex-min-length (then-regex branch))
423        (regex-min-length (else-regex branch))))
424
425 (defmethod regex-min-length ((repetition repetition))
426   (declare #.*standard-optimize-settings*)
427   ;; obviously the product of the inner minimal length and the minimal
428   ;; number of repetitions
429   (* (minimum repetition) (min-len repetition)))
430    
431 (defmethod regex-min-length ((register register))
432   (declare #.*standard-optimize-settings*)
433   (regex-min-length (regex register)))
434    
435 (defmethod regex-min-length ((standalone standalone))
436   (declare #.*standard-optimize-settings*)
437   (regex-min-length (regex standalone)))
438    
439 (defmethod regex-min-length ((char-class char-class))
440   (declare #.*standard-optimize-settings*)
441   1)
442
443 (defmethod regex-min-length ((everything everything))
444   (declare #.*standard-optimize-settings*)
445   1)
446
447 (defmethod regex-min-length ((str str))
448   (declare #.*standard-optimize-settings*)
449   (len str))
450    
451 (defmethod regex-min-length ((filter filter))
452   (declare #.*standard-optimize-settings*)
453   (or (len filter)
454       0))
455
456 (defmethod regex-min-length ((regex regex))
457   (declare #.*standard-optimize-settings*)
458   ;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
459   ;; LOOKBEHIND, VOID, and WORD-BOUNDARY
460   0)
461
462 (defgeneric compute-offsets (regex start-pos)
463   (declare #.*standard-optimize-settings*)
464   (:documentation "Returns the offset the following regex would have
465 relative to START-POS or NIL if we can't compute it. Sets the OFFSET
466 slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
467 slots of STR objects further down the tree."))
468
469 ;; note that we're actually only interested in the offset of
470 ;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
471 ;; can stop at variable-length alternations and don't need to descend
472 ;; into repetitions
473
474 (defmethod compute-offsets ((seq seq) start-pos)
475   (declare #.*standard-optimize-settings*)
476   (loop for element in (elements seq)
477         ;; advance offset argument for next call while looping through
478         ;; the elements
479         for pos = start-pos then curr-offset
480         for curr-offset = (compute-offsets element pos)
481         while curr-offset
482         finally (return curr-offset)))
483
484 (defmethod compute-offsets ((alternation alternation) start-pos)
485   (declare #.*standard-optimize-settings*)
486   (loop for choice in (choices alternation)
487         for old-offset = nil then curr-offset
488         for curr-offset = (compute-offsets choice start-pos)
489         ;; we stop immediately if two alternations don't result in the
490         ;; same offset
491         if (or (not curr-offset)
492                (and old-offset (/= curr-offset old-offset)))
493           do (return nil)
494         finally (return curr-offset)))
495
496 (defmethod compute-offsets ((branch branch) start-pos)
497   (declare #.*standard-optimize-settings*)
498   ;; only return offset if both alternations have equal value
499   (let ((then-offset (compute-offsets (then-regex branch) start-pos)))
500     (and then-offset
501          (eql then-offset (compute-offsets (else-regex branch) start-pos))
502          then-offset)))
503
504 (defmethod compute-offsets ((repetition repetition) start-pos)
505   (declare #.*standard-optimize-settings*)
506   ;; no need to descend into the inner regex
507   (with-slots (len minimum maximum)
508       repetition
509     (if (and len
510              (eq minimum maximum))
511       ;; fixed number of repetitions, so we know how to proceed
512       (+ start-pos (* minimum len))
513       ;; otherwise return NIL
514       nil)))
515
516 (defmethod compute-offsets ((register register) start-pos)
517   (declare #.*standard-optimize-settings*)
518   (compute-offsets (regex register) start-pos))
519    
520 (defmethod compute-offsets ((standalone standalone) start-pos)
521   (declare #.*standard-optimize-settings*)
522   (compute-offsets (regex standalone) start-pos))
523    
524 (defmethod compute-offsets ((char-class char-class) start-pos)
525   (declare #.*standard-optimize-settings*)
526   (1+ start-pos))
527    
528 (defmethod compute-offsets ((everything everything) start-pos)
529   (declare #.*standard-optimize-settings*)
530   (1+ start-pos))
531    
532 (defmethod compute-offsets ((str str) start-pos)
533   (declare #.*standard-optimize-settings*)
534   (setf (offset str) start-pos)
535   (+ start-pos (len str)))
536
537 (defmethod compute-offsets ((back-reference back-reference) start-pos)
538   (declare #.*standard-optimize-settings*)
539   ;; with enough effort we could possibly do better here, but
540   ;; currently we just give up and return NIL
541   (declare (ignore start-pos))
542   nil)
543
544 (defmethod compute-offsets ((filter filter) start-pos)
545   (declare #.*standard-optimize-settings*)
546   (let ((len (len filter)))
547     (if len
548       (+ start-pos len)
549       nil)))
550
551 (defmethod compute-offsets ((regex regex) start-pos)
552   (declare #.*standard-optimize-settings*)
553   ;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
554   ;; WORD-BOUNDARY (which all have zero-length)
555   start-pos)
Note: See TracBrowser for help on using the browser.