root/trunk/thirdparty/cl-ppcre/scanner.lisp

Revision 3601, 25.5 kB (checked in by edi, 6 months ago)

Update to release version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.35 2008/07/23 22:25:15 edi Exp $
3
4 ;;; Here the scanner for the actual regex as well as utility scanners
5 ;;; for the constant start and end strings are created.
6
7 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
8
9 ;;; Redistribution and use in source and binary forms, with or without
10 ;;; modification, are permitted provided that the following conditions
11 ;;; are met:
12
13 ;;;   * Redistributions of source code must retain the above copyright
14 ;;;     notice, this list of conditions and the following disclaimer.
15
16 ;;;   * Redistributions in binary form must reproduce the above
17 ;;;     copyright notice, this list of conditions and the following
18 ;;;     disclaimer in the documentation and/or other materials
19 ;;;     provided with the distribution.
20
21 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
22 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
25 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 (in-package :cl-ppcre)
34
35 (defmacro bmh-matcher-aux (&key case-insensitive-p)
36   "Auxiliary macro used by CREATE-BMH-MATCHER."
37   (let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
38     `(lambda (start-pos)
39        (declare (fixnum start-pos))
40        (if (or (minusp start-pos)
41                (> (the fixnum (+ start-pos m)) *end-pos*))
42          nil
43          (loop named bmh-matcher
44                for k of-type fixnum = (+ start-pos m -1)
45                then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
46                while (< k *end-pos*)
47                do (loop for j of-type fixnum downfrom (1- m)
48                         for i of-type fixnum downfrom k
49                         while (and (>= j 0)
50                                    (,char-compare (schar *string* i)
51                                                   (schar pattern j)))
52                         finally (if (minusp j)
53                                   (return-from bmh-matcher (1+ i)))))))))
54
55 (defun create-bmh-matcher (pattern case-insensitive-p)
56   "Returns a Boyer-Moore-Horspool matcher which searches the (special)
57 simple-string *STRING* for the first occurence of the substring
58 PATTERN.  The search starts at the position START-POS within *STRING*
59 and stops before *END-POS* is reached.  Depending on the second
60 argument the search is case-insensitive or not.  If the special
61 variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
62 instead.  \(BMH matchers are faster but need much more space.)"
63   (declare #.*standard-optimize-settings*)
64   ;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
65   ;; details
66   (unless *use-bmh-matchers*
67     (let ((test (if case-insensitive-p #'char-equal #'char=)))
68       (return-from create-bmh-matcher
69         (lambda (start-pos)
70           (declare (fixnum start-pos))
71           (and (not (minusp start-pos))
72                (search pattern
73                        *string*
74                        :start2 start-pos
75                        :end2 *end-pos*
76                        :test test))))))
77   (let* ((m (length pattern))
78          (skip (make-array *regex-char-code-limit*
79                            :element-type 'fixnum
80                            :initial-element m)))
81     (declare (fixnum m))
82     (loop for k of-type fixnum below m
83           if case-insensitive-p
84           do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
85                    (aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1))
86           else
87           do (setf (aref skip (char-code (schar pattern k))) (- m k 1)))
88     (if case-insensitive-p
89       (bmh-matcher-aux :case-insensitive-p t)
90       (bmh-matcher-aux))))
91
92 (defmacro char-searcher-aux (&key case-insensitive-p)
93   "Auxiliary macro used by CREATE-CHAR-SEARCHER."
94   (let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
95     `(lambda (start-pos)
96       (declare (fixnum start-pos))
97       (and (not (minusp start-pos))
98            (loop for i of-type fixnum from start-pos below *end-pos*
99                  thereis (and (,char-compare (schar *string* i) chr) i))))))
100
101 (defun create-char-searcher (chr case-insensitive-p)
102   "Returns a function which searches the (special) simple-string
103 *STRING* for the first occurence of the character CHR. The search
104 starts at the position START-POS within *STRING* and stops before
105 *END-POS* is reached.  Depending on the second argument the search is
106 case-insensitive or not."
107   (declare #.*standard-optimize-settings*)
108   (if case-insensitive-p
109     (char-searcher-aux :case-insensitive-p t)
110     (char-searcher-aux)))
111
112 (declaim (inline newline-skipper))
113 (defun newline-skipper (start-pos)
114   "Finds the next occurence of a character in *STRING* which is behind
115 a #\Newline."
116   (declare #.*standard-optimize-settings*)
117   (declare (fixnum start-pos))
118   ;; we can start with (1- START-POS) without testing for (PLUSP
119   ;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
120   ;; the first iteration
121   (loop for i of-type fixnum from (1- start-pos) below *end-pos*
122         thereis (and (char= (schar *string* i)
123                             #\Newline)
124                      (1+ i))))
125
126 (defmacro insert-advance-fn (advance-fn)
127   "Creates the actual closure returned by CREATE-SCANNER-AUX by
128 replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
129 ADVANCE-FN.  This is a utility macro used by CREATE-SCANNER-AUX."
130   (subst
131    advance-fn '(advance-fn-definition)
132    '(lambda (string start end)
133      (block scan
134        ;; initialize a couple of special variables used by the
135        ;; matchers (see file specials.lisp)
136        (let* ((*string* string)
137               (*start-pos* start)
138               (*end-pos* end)
139               ;; we will search forward for END-STRING if this value
140               ;; isn't at least as big as POS (see ADVANCE-FN), so it
141               ;; is safe to start to the left of *START-POS*; note
142               ;; that this value will _never_ be decremented - this
143               ;; is crucial to the scanning process
144               (*end-string-pos* (1- *start-pos*))
145               ;; the next five will shadow the variables defined by
146               ;; DEFPARAMETER; at this point, we don't know if we'll
147               ;; actually use them, though
148               (*repeat-counters* *repeat-counters*)
149               (*last-pos-stores* *last-pos-stores*)
150               (*reg-starts* *reg-starts*)
151               (*regs-maybe-start* *regs-maybe-start*)
152               (*reg-ends* *reg-ends*)
153               ;; we might be able to optimize the scanning process by
154               ;; (virtually) shifting *START-POS* to the right
155               (scan-start-pos *start-pos*)
156               (starts-with-str (if start-string-test
157                                  (str starts-with)
158                                  nil))
159               ;; we don't need to try further than MAX-END-POS
160               (max-end-pos (- *end-pos* min-len)))
161          (declare (fixnum scan-start-pos)
162                   (function match-fn))
163          ;; definition of ADVANCE-FN will be inserted here by macrology
164          (labels ((advance-fn-definition))
165            (declare (inline advance-fn))
166            (when (plusp rep-num)
167              ;; we have at least one REPETITION which needs to count
168              ;; the number of repetitions
169              (setq *repeat-counters* (make-array rep-num
170                                                  :initial-element 0
171                                                  :element-type 'fixnum)))
172            (when (plusp zero-length-num)
173              ;; we have at least one REPETITION which needs to watch
174              ;; out for zero-length repetitions
175              (setq *last-pos-stores* (make-array zero-length-num
176                                                  :initial-element nil)))
177            (when (plusp reg-num)
178              ;; we have registers in our regular expression
179              (setq *reg-starts* (make-array reg-num :initial-element nil)
180                    *regs-maybe-start* (make-array reg-num :initial-element nil)
181                    *reg-ends* (make-array reg-num :initial-element nil)))
182            (when end-anchored-p
183              ;; the regular expression has a constant end string which
184              ;; is anchored at the very end of the target string
185              ;; (perhaps modulo a #\Newline)
186              (let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
187                (declare (fixnum end-test-pos)
188                         (function end-string-test))
189                (unless (setq *end-string-pos* (funcall end-string-test
190                                                        end-test-pos))
191                  (when (and (= 1 (the fixnum end-anchored-p))
192                             (> *end-pos* scan-start-pos)
193                             (char= #\Newline (schar *string* (1- *end-pos*))))
194                    ;; if we didn't find an end string candidate from
195                    ;; END-TEST-POS and if a #\Newline at the end is
196                    ;; allowed we try it again from one position to the
197                    ;; left
198                    (setq *end-string-pos* (funcall end-string-test
199                                                    (1- end-test-pos))))))
200              (unless (and *end-string-pos*
201                           (<= *start-pos* *end-string-pos*))
202                ;; no end string candidate found, so give up
203                (return-from scan nil))
204              (when end-string-offset
205                ;; if the offset of the constant end string from the
206                ;; left of the regular expression is known we can start
207                ;; scanning further to the right; this is similar to
208                ;; what we might do in ADVANCE-FN
209                (setq scan-start-pos (max scan-start-pos
210                                          (- (the fixnum *end-string-pos*)
211                                             (the fixnum end-string-offset))))))
212              (cond
213                (start-anchored-p
214                  ;; we're anchored at the start of the target string,
215                  ;; so no need to try again after first failure
216                  (when (or (/= *start-pos* scan-start-pos)
217                            (< max-end-pos *start-pos*))
218                    ;; if END-STRING-OFFSET has proven that we don't
219                    ;; need to bother to scan from *START-POS* or if the
220                    ;; minimal length of the regular expression is
221                    ;; longer than the target string we give up
222                    (return-from scan nil))
223                  (when starts-with-str
224                    (locally
225                      (declare (fixnum starts-with-len))
226                      (cond ((and (case-insensitive-p starts-with)
227                                  (not (*string*-equal starts-with-str
228                                                       *start-pos*
229                                                       (+ *start-pos*
230                                                          starts-with-len)
231                                                       0 starts-with-len)))
232                              ;; the regular expression has a
233                              ;; case-insensitive constant start string
234                              ;; and we didn't find it
235                              (return-from scan nil))
236                            ((and (not (case-insensitive-p starts-with))
237                                  (not (*string*= starts-with-str
238                                             *start-pos*
239                                             (+ *start-pos* starts-with-len)
240                                             0 starts-with-len)))
241                              ;; the regular expression has a
242                              ;; case-sensitive constant start string
243                              ;; and we didn't find it
244                              (return-from scan nil))
245                            (t nil))))
246                  (when (and end-string-test
247                             (not end-anchored-p))
248                    ;; the regular expression has a constant end string
249                    ;; which isn't anchored so we didn't check for it
250                    ;; already
251                    (block end-string-loop
252                      ;; we temporarily use *END-STRING-POS* as our
253                      ;; starting position to look for end string
254                      ;; candidates
255                      (setq *end-string-pos* *start-pos*)
256                      (loop
257                        (unless (setq *end-string-pos*
258                                        (funcall (the function end-string-test)
259                                                 *end-string-pos*))
260                          ;; no end string candidate found, so give up
261                          (return-from scan nil))
262                        (unless end-string-offset
263                          ;; end string doesn't have an offset so we
264                          ;; can start scanning now
265                          (return-from end-string-loop))
266                        (let ((maybe-start-pos (- (the fixnum *end-string-pos*)
267                                                  (the fixnum end-string-offset))))
268                          (cond ((= maybe-start-pos *start-pos*)
269                                  ;; offset of end string into regular
270                                  ;; expression matches start anchor -
271                                  ;; fine...
272                                  (return-from end-string-loop))
273                                ((and (< maybe-start-pos *start-pos*)
274                                      (< (+ *end-string-pos* end-string-len) *end-pos*))
275                                  ;; no match but maybe we find another
276                                  ;; one to the right - try again
277                                  (incf *end-string-pos*))
278                                (t
279                                  ;; otherwise give up
280                                  (return-from scan nil)))))))
281                  ;; if we got here we scan exactly once
282                  (let ((next-pos (funcall match-fn *start-pos*)))
283                    (when next-pos
284                      (values (if next-pos *start-pos* nil)
285                              next-pos
286                              *reg-starts*
287                              *reg-ends*))))
288                (t
289                  (loop for pos = (if starts-with-everything
290                                    ;; don't jump to the next
291                                    ;; #\Newline on the first
292                                    ;; iteration
293                                    scan-start-pos
294                                    (advance-fn scan-start-pos))
295                            then (advance-fn pos)
296                        ;; give up if the regular expression can't fit
297                        ;; into the rest of the target string
298                        while (and pos
299                                   (<= (the fixnum pos) max-end-pos))
300                        do (let ((next-pos (funcall match-fn pos)))
301                             (when next-pos
302                               (return-from scan (values pos
303                                                         next-pos
304                                                         *reg-starts*
305                                                         *reg-ends*)))
306                             ;; not yet found, increment POS
307                             #-cormanlisp (incf (the fixnum pos))
308                             #+cormanlisp (incf pos)))))))))
309     :test #'equalp))
310
311 (defun create-scanner-aux (match-fn
312                            min-len
313                            start-anchored-p
314                            starts-with
315                            start-string-test
316                            end-anchored-p
317                            end-string-test
318                            end-string-len
319                            end-string-offset
320                            rep-num
321                            zero-length-num
322                            reg-num)
323   "Auxiliary function to create and return a scanner \(which is
324 actually a closure).  Used by CREATE-SCANNER."
325   (declare #.*standard-optimize-settings*)
326   (declare (fixnum min-len zero-length-num rep-num reg-num))
327   (let ((starts-with-len (if (typep starts-with 'str)
328                            (len starts-with)))
329         (starts-with-everything (typep starts-with 'everything)))
330     (cond
331       ;; this COND statement dispatches on the different versions we
332       ;; have for ADVANCE-FN and creates different closures for each;
333       ;; note that you see only the bodies of ADVANCE-FN below - the
334       ;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
335       ;; could have done this with closures instead of macrology but
336       ;; would have consed a lot more)
337       ((and start-string-test end-string-test end-string-offset)
338         ;; we know that the regular expression has constant start and
339         ;; end strings and we know the end string's offset (from the
340         ;; left)
341         (insert-advance-fn
342           (advance-fn (pos)
343             (declare (fixnum end-string-offset starts-with-len)
344                      (function start-string-test end-string-test))
345             (loop
346               (unless (setq pos (funcall start-string-test pos))
347                 ;; give up completely if we can't find a start string
348                 ;; candidate
349                 (return-from scan nil))
350               (locally
351                 ;; from here we know that POS is a FIXNUM
352                 (declare (fixnum pos))
353                 (when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
354                   ;; if we already found an end string candidate the
355                   ;; position of which matches the start string
356                   ;; candidate we're done
357                   (return-from advance-fn pos))
358                 (let ((try-pos (+ pos starts-with-len)))
359                   ;; otherwise try (again) to find an end string
360                   ;; candidate which starts behind the start string
361                   ;; candidate
362                   (loop
363                     (unless (setq *end-string-pos*
364                                     (funcall end-string-test try-pos))
365                       ;; no end string candidate found, so give up
366                       (return-from scan nil))
367                     ;; NEW-POS is where we should start scanning
368                     ;; according to the end string candidate
369                     (let ((new-pos (- (the fixnum *end-string-pos*)
370                                       end-string-offset)))
371                       (declare (fixnum new-pos *end-string-pos*))
372                       (cond ((= new-pos pos)
373                               ;; if POS and NEW-POS are equal then the
374                               ;; two candidates agree so we're fine
375                               (return-from advance-fn pos))
376                             ((> new-pos pos)
377                               ;; if NEW-POS is further to the right we
378                               ;; advance POS and try again, i.e. we go
379                               ;; back to the start of the outer LOOP
380                               (setq pos new-pos)
381                               ;; this means "return from inner LOOP"
382                               (return))
383                             (t
384                               ;; otherwise NEW-POS is smaller than POS,
385                               ;; so we have to redo the inner LOOP to
386                               ;; find another end string candidate
387                               ;; further to the right
388                               (setq try-pos (1+ *end-string-pos*))))))))))))
389       ((and starts-with-everything end-string-test end-string-offset)
390         ;; we know that the regular expression starts with ".*" (which
391         ;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
392         ;; with a constant end string and we know the end string's
393         ;; offset (from the left)
394         (insert-advance-fn
395           (advance-fn (pos)
396             (declare (fixnum end-string-offset)
397                      (function end-string-test))
398             (loop
399               (unless (setq pos (newline-skipper pos))
400                 ;; if we can't find a #\Newline we give up immediately
401                 (return-from scan nil))
402               (locally
403                 ;; from here we know that POS is a FIXNUM
404                 (declare (fixnum pos))
405                 (when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
406                   ;; if we already found an end string candidate the
407                   ;; position of which matches the place behind the
408                   ;; #\Newline we're done
409                   (return-from advance-fn pos))
410                 (let ((try-pos pos))
411                   ;; otherwise try (again) to find an end string
412                   ;; candidate which starts behind the #\Newline
413                   (loop
414                     (unless (setq *end-string-pos*
415                                     (funcall end-string-test try-pos))
416                       ;; no end string candidate found, so we give up
417                       (return-from scan nil))
418                     ;; NEW-POS is where we should start scanning
419                     ;; according to the end string candidate
420                     (let ((new-pos (- (the fixnum *end-string-pos*)
421                                       end-string-offset)))
422                       (declare (fixnum new-pos *end-string-pos*))
423                       (cond ((= new-pos pos)
424                               ;; if POS and NEW-POS are equal then the
425                               ;; the end string candidate agrees with
426                               ;; the #\Newline so we're fine
427                               (return-from advance-fn pos))
428                             ((> new-pos pos)
429                               ;; if NEW-POS is further to the right we
430                               ;; advance POS and try again, i.e. we go
431                               ;; back to the start of the outer LOOP
432                               (setq pos new-pos)
433                               ;; this means "return from inner LOOP"
434                               (return))
435                             (t
436                               ;; otherwise NEW-POS is smaller than POS,
437                               ;; so we have to redo the inner LOOP to
438                               ;; find another end string candidate
439                               ;; further to the right
440                               (setq try-pos (1+ *end-string-pos*))))))))))))
441       ((and start-string-test end-string-test)
442         ;; we know that the regular expression has constant start and
443         ;; end strings; similar to the first case but we only need to
444         ;; check for the end string, it doesn't provide enough
445         ;; information to advance POS
446         (insert-advance-fn
447           (advance-fn (pos)
448             (declare (function start-string-test end-string-test))
449             (unless (setq pos (funcall start-string-test pos))
450               (return-from scan nil))
451             (if (<= (the fixnum pos)
452                     (the fixnum *end-string-pos*))
453               (return-from advance-fn pos))
454             (unless (setq *end-string-pos* (funcall end-string-test pos))
455               (return-from scan nil))
456             pos)))
457       ((and starts-with-everything end-string-test)
458         ;; we know that the regular expression starts with ".*" (which
459         ;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
460         ;; with a constant end string; similar to the second case but we
461         ;; only need to check for the end string, it doesn't provide
462         ;; enough information to advance POS
463         (insert-advance-fn
464           (advance-fn (pos)
465             (declare (function end-string-test))
466             (unless (setq pos (newline-skipper pos))
467               (return-from scan nil))
468             (if (<= (the fixnum pos)
469                     (the fixnum *end-string-pos*))
470               (return-from advance-fn pos))
471             (unless (setq *end-string-pos* (funcall end-string-test pos))
472               (return-from scan nil))
473             pos)))
474       (start-string-test
475         ;; just check for constant start string candidate
476         (insert-advance-fn
477           (advance-fn (pos)
478             (declare (function start-string-test))
479             (unless (setq pos (funcall start-string-test pos))
480               (return-from scan nil))
481             pos)))
482       (starts-with-everything
483         ;; just advance POS with NEWLINE-SKIPPER
484         (insert-advance-fn
485           (advance-fn (pos)
486             (unless (setq pos (newline-skipper pos))
487               (return-from scan nil))
488             pos)))
489       (end-string-test
490         ;; just check for the next end string candidate if POS has
491         ;; advanced beyond the last one
492         (insert-advance-fn
493           (advance-fn (pos)
494             (declare (function end-string-test))
495             (if (<= (the fixnum pos)
496                     (the fixnum *end-string-pos*))
497               (return-from advance-fn pos))
498             (unless (setq *end-string-pos* (funcall end-string-test pos))
499               (return-from scan nil))
500             pos)))
501       (t
502         ;; not enough optimization information about the regular
503         ;; expression to optimize so we just return POS
504         (insert-advance-fn
505           (advance-fn (pos)
506             pos))))))
Note: See TracBrowser for help on using the browser.