root/trunk/thirdparty/cl-ppcre/api.lisp

Revision 3581, 61.0 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.84 2008/07/06 18:12:04 edi Exp $
3
4 ;;; The external API for creating and using scanners.
5
6 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
7
8 ;;; Redistribution and use in source and binary forms, with or without
9 ;;; modification, are permitted provided that the following conditions
10 ;;; are met:
11
12 ;;;   * Redistributions of source code must retain the above copyright
13 ;;;     notice, this list of conditions and the following disclaimer.
14
15 ;;;   * Redistributions in binary form must reproduce the above
16 ;;;     copyright notice, this list of conditions and the following
17 ;;;     disclaimer in the documentation and/or other materials
18 ;;;     provided with the distribution.
19
20 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
21 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
24 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
26 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32 (in-package :cl-ppcre)
33
34 (defgeneric create-scanner (regex &key case-insensitive-mode
35                                        multi-line-mode
36                                        single-line-mode
37                                        extended-mode
38                                        destructive)
39   (:documentation "Accepts a regular expression - either as a
40 parse-tree or as a string - and returns a scan closure which will scan
41 strings for this regular expression and a list mapping registers to
42 their names \(NIL stands for unnamed ones).  The \"mode\" keyboard
43 arguments are equivalent to the imsx modifiers in Perl.  If
44 DESTRUCTIVE is not NIL, the function is allowed to destructively
45 modify its first argument \(but only if it's a parse tree)."))
46
47 #-:use-acl-regexp2-engine
48 (defmethod create-scanner ((regex-string string) &key case-insensitive-mode
49                                                       multi-line-mode
50                                                       single-line-mode
51                                                       extended-mode
52                                                       destructive)
53   (declare #.*standard-optimize-settings*)
54   (declare (ignore destructive))
55   ;; parse the string into a parse-tree and then call CREATE-SCANNER
56   ;; again
57   (let* ((*extended-mode-p* extended-mode)
58          (quoted-regex-string (if *allow-quoting*
59                                 (quote-sections (clean-comments regex-string extended-mode))
60                                 regex-string))
61          (*syntax-error-string* (copy-seq quoted-regex-string)))
62     ;; wrap the result with :GROUP to avoid infinite loops for
63     ;; constant strings
64     (create-scanner (cons :group (list (parse-string quoted-regex-string)))
65                     :case-insensitive-mode case-insensitive-mode
66                     :multi-line-mode multi-line-mode
67                     :single-line-mode single-line-mode
68                     :destructive t)))
69
70 #-:use-acl-regexp2-engine
71 (defmethod create-scanner ((scanner function) &key case-insensitive-mode
72                                                    multi-line-mode
73                                                    single-line-mode
74                                                    extended-mode
75                                                    destructive)
76   (declare #.*standard-optimize-settings*)
77   (declare (ignore destructive))
78   (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
79     (signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
80   scanner)
81
82 #-:use-acl-regexp2-engine
83 (defmethod create-scanner ((parse-tree t) &key case-insensitive-mode
84                                                multi-line-mode
85                                                single-line-mode
86                                                extended-mode
87                                                destructive)
88   (declare #.*standard-optimize-settings*)
89   (when extended-mode
90     (signal-invocation-error "Extended mode doesn't make sense in parse trees."))
91   ;; convert parse-tree into internal representation REGEX and at the
92   ;; same time compute the number of registers and the constant string
93   ;; (or anchor) the regex starts with (if any)
94   (unless destructive
95     (setq parse-tree (copy-tree parse-tree)))
96   (let (flags)
97     (if single-line-mode
98       (push :single-line-mode-p flags))
99     (if multi-line-mode
100       (push :multi-line-mode-p flags))
101     (if case-insensitive-mode
102       (push :case-insensitive-p flags))
103     (when flags
104       (setq parse-tree (list :group (cons :flags flags) parse-tree))))
105   (let ((*syntax-error-string* nil))
106     (multiple-value-bind (regex reg-num starts-with reg-names)
107         (convert parse-tree)
108       ;; simplify REGEX by flattening nested SEQ and ALTERNATION
109       ;; constructs and gathering STR objects
110       (let ((regex (gather-strings (flatten regex))))
111         ;; set the MIN-REST slots of the REPETITION objects
112         (compute-min-rest regex 0)
113         ;; set the OFFSET slots of the STR objects
114         (compute-offsets regex 0)
115         (let* (end-string-offset
116                end-anchored-p
117                ;; compute the constant string the regex ends with (if
118                ;; any) and at the same time set the special variables
119                ;; END-STRING-OFFSET and END-ANCHORED-P
120                (end-string (end-string regex))
121                ;; if we found a non-zero-length end-string we create an
122                ;; efficient search function for it
123                (end-string-test (and end-string
124                                      (plusp (len end-string))
125                                      (if (= 1 (len end-string))
126                                        (create-char-searcher
127                                         (schar (str end-string) 0)
128                                         (case-insensitive-p end-string))
129                                        (create-bmh-matcher
130                                         (str end-string)
131                                         (case-insensitive-p end-string)))))
132                ;; initialize the counters for CREATE-MATCHER-AUX
133                (*rep-num* 0)
134                (*zero-length-num* 0)
135                ;; create the actual matcher function (which does all the
136                ;; work of matching the regular expression) corresponding
137                ;; to REGEX and at the same time set the special
138                ;; variables *REP-NUM* and *ZERO-LENGTH-NUM*
139                (match-fn (create-matcher-aux regex #'identity))
140                ;; if the regex starts with a string we create an
141                ;; efficient search function for it
142                (start-string-test (and (typep starts-with 'str)
143                                        (plusp (len starts-with))
144                                        (if (= 1 (len starts-with))
145                                          (create-char-searcher
146                                           (schar (str starts-with) 0)
147                                           (case-insensitive-p starts-with))
148                                          (create-bmh-matcher
149                                           (str starts-with)
150                                           (case-insensitive-p starts-with))))))
151           (declare (special end-string-offset end-anchored-p end-string))
152           ;; now create the scanner and return it
153           (values (create-scanner-aux match-fn
154                                       (regex-min-length regex)
155                                       (or (start-anchored-p regex)
156                                           ;; a dot in single-line-mode also
157                                           ;; implicitly anchors the regex at
158                                           ;; the start, i.e. if we can't match
159                                           ;; from the first position we won't
160                                           ;; match at all
161                                           (and (typep starts-with 'everything)
162                                                (single-line-p starts-with)))
163                                       starts-with
164                                       start-string-test
165                                       ;; only mark regex as end-anchored if we
166                                       ;; found a non-zero-length string before
167                                       ;; the anchor
168                                       (and end-string-test end-anchored-p)
169                                       end-string-test
170                                       (if end-string-test
171                                           (len end-string)
172                                           nil)
173                                       end-string-offset
174                                       *rep-num*
175                                       *zero-length-num*
176                                       reg-num)
177                   reg-names))))))
178
179 #+:use-acl-regexp2-engine
180 (declaim (inline create-scanner))
181 #+:use-acl-regexp2-engine
182 (defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode
183                                                                      multi-line-mode
184                                                                      single-line-mode
185                                                                      extended-mode
186                                                                      destructive)
187   (declare #.*standard-optimize-settings*)
188   (declare (ignore destructive))
189   (when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode)
190     (signal-invocation-error "You can't use the keyword arguments to modify an existing scanner."))
191   scanner)
192
193 #+:use-acl-regexp2-engine
194 (defmethod create-scanner ((parse-tree t) &key case-insensitive-mode
195                                                multi-line-mode
196                                                single-line-mode
197                                                extended-mode
198                                                destructive)
199   (declare #.*standard-optimize-settings*)
200   (declare (ignore destructive))
201   (excl:compile-re parse-tree
202                    :case-fold case-insensitive-mode
203                    :ignore-whitespace extended-mode
204                    :multiple-lines multi-line-mode
205                    :single-line single-line-mode
206                    :return :index))
207
208 (defgeneric scan (regex target-string &key start end real-start-pos)
209   (:documentation "Searches TARGET-STRING from START to END and tries
210 to match REGEX.  On success returns four values - the start of the
211 match, the end of the match, and two arrays denoting the beginnings
212 and ends of register matches.  On failure returns NIL.  REGEX can be a
213 string which will be parsed according to Perl syntax, a parse tree, or
214 a pre-compiled scanner created by CREATE-SCANNER.  TARGET-STRING will
215 be coerced to a simple string if it isn't one already.  The
216 REAL-START-POS parameter should be ignored - it exists only for
217 internal purposes."))
218
219 #-:use-acl-regexp2-engine
220 (defmethod scan ((regex-string string) target-string
221                                        &key (start 0)
222                                             (end (length target-string))
223                                             ((:real-start-pos *real-start-pos*) nil))
224   (declare #.*standard-optimize-settings*)
225   ;; note that the scanners are optimized for simple strings so we
226   ;; have to coerce TARGET-STRING into one if it isn't already
227   (funcall (create-scanner regex-string)
228            (maybe-coerce-to-simple-string target-string)
229            start end))
230
231 #-:use-acl-regexp2-engine
232 (defmethod scan ((scanner function) target-string
233                                     &key (start 0)
234                                          (end (length target-string))
235                                          ((:real-start-pos *real-start-pos*) nil))
236   (declare #.*standard-optimize-settings*)
237   (funcall scanner
238            (maybe-coerce-to-simple-string target-string)
239            start end))
240
241 #-:use-acl-regexp2-engine
242 (defmethod scan ((parse-tree t) target-string
243                                 &key (start 0)
244                                      (end (length target-string))
245                                      ((:real-start-pos *real-start-pos*) nil))
246   (declare #.*standard-optimize-settings*)
247   (funcall (create-scanner parse-tree)
248            (maybe-coerce-to-simple-string target-string)
249            start end))
250
251 #+:use-acl-regexp2-engine
252 (declaim (inline scan))
253 #+:use-acl-regexp2-engine
254 (defmethod scan ((parse-tree t) target-string
255                                 &key (start 0)
256                                      (end (length target-string))
257                                      ((:real-start-pos *real-start-pos*) nil))
258   (declare #.*standard-optimize-settings*)
259   (when (< end start)
260     (return-from scan nil))
261   (let ((results (multiple-value-list (excl:match-re parse-tree target-string
262                                                      :start start
263                                                      :end end
264                                                      :return :index))))
265     (declare (dynamic-extent results))
266     (cond ((null (first results)) nil)
267           (t (let* ((no-of-regs (- (length results) 2))
268                     (reg-starts (make-array no-of-regs
269                                             :element-type '(or null fixnum)))
270                     (reg-ends (make-array no-of-regs
271                                           :element-type '(or null fixnum)))
272                     (match (second results)))
273                (loop for (start . end) in (cddr results)
274                      for i from 0
275                      do (setf (aref reg-starts i) start
276                               (aref reg-ends i) end))
277                (values (car match) (cdr match) reg-starts reg-ends))))))
278
279 #-:cormanlisp
280 (define-compiler-macro scan (&whole form &environment env regex target-string &rest rest)
281   "Make sure that constant forms are compiled into scanners at compile time."
282   (cond ((constantp regex env)
283           `(scan (load-time-value (create-scanner ,regex))
284                  ,target-string ,@rest))
285         (t form)))
286
287 (defun scan-to-strings (regex target-string &key (start 0)
288                                                  (end (length target-string))
289                                                  sharedp)
290   "Like SCAN but returns substrings of TARGET-STRING instead of
291 positions, i.e. this function returns two values on success: the whole
292 match as a string plus an array of substrings (or NILs) corresponding
293 to the matched registers.  If SHAREDP is true, the substrings may
294 share structure with TARGET-STRING."
295   (declare #.*standard-optimize-settings*)
296   (multiple-value-bind (match-start match-end reg-starts reg-ends)
297       (scan regex target-string :start start :end end)
298     (unless match-start
299       (return-from scan-to-strings nil))
300     (let ((substr-fn (if sharedp #'nsubseq #'subseq)))
301       (values (funcall substr-fn
302                        target-string match-start match-end)
303               (map 'vector
304                    (lambda (reg-start reg-end)
305                      (if reg-start
306                        (funcall substr-fn
307                                 target-string reg-start reg-end)
308                        nil))
309                    reg-starts
310                    reg-ends)))))
311
312 #-:cormanlisp
313 (define-compiler-macro scan-to-strings
314     (&whole form &environment env regex target-string &rest rest)
315   "Make sure that constant forms are compiled into scanners at compile time."
316   (cond ((constantp regex env)
317           `(scan-to-strings (load-time-value (create-scanner ,regex))
318                             ,target-string ,@rest))
319         (t form)))
320
321 (defmacro register-groups-bind (var-list (regex target-string
322                                                 &key start end sharedp)
323                                 &body body)
324   "Executes BODY with the variables in VAR-LIST bound to the
325 corresponding register groups after TARGET-STRING has been matched
326 against REGEX, i.e. each variable is either bound to a string or to
327 NIL.  If there is no match, BODY is _not_ executed. For each element
328 of VAR-LIST which is NIL there's no binding to the corresponding
329 register group.  The number of variables in VAR-LIST must not be
330 greater than the number of register groups.  If SHAREDP is true, the
331 substrings may share structure with TARGET-STRING."
332   (with-rebinding (target-string)
333     (with-unique-names (match-start match-end reg-starts reg-ends
334                                     start-index substr-fn)
335       `(multiple-value-bind (,match-start ,match-end ,reg-starts ,reg-ends)
336             (scan ,regex ,target-string :start (or ,start 0)
337                                         :end (or ,end (length ,target-string)))
338           (declare (ignore ,match-end))
339           (when ,match-start           
340             (let* ,(cons
341                     `(,substr-fn (if ,sharedp
342                                    #'nsubseq
343                                    #'subseq))
344                     (loop for (function var) in (normalize-var-list var-list)
345                           for counter from 0
346                           when var
347                             collect `(,var (let ((,start-index
348                                                    (aref ,reg-starts ,counter)))
349                                              (if ,start-index
350                                                (funcall ,function
351                                                         (funcall ,substr-fn
352                                                                  ,target-string
353                                                                  ,start-index
354                                                                  (aref ,reg-ends ,counter)))
355                                                nil)))))
356               ,@body))))))
357
358 (defmacro do-scans ((match-start match-end reg-starts reg-ends regex
359                                  target-string
360                                  &optional result-form
361                                  &key start end)
362                     &body body
363                     &environment env)
364   "Iterates over TARGET-STRING and tries to match REGEX as often as
365 possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and
366 REG-ENDS bound to the four return values of each match in turn.  After
367 the last match, returns RESULT-FORM if provided or NIL otherwise. An
368 implicit block named NIL surrounds DO-SCANS; RETURN may be used to
369 terminate the loop immediately.  If REGEX matches an empty string the
370 scan is continued one position behind this match. BODY may start with
371 declarations."
372   (with-rebinding (target-string)
373     (with-unique-names (%start %end %regex scanner)
374       (declare (ignorable %regex scanner))
375       ;; the NIL BLOCK to enable exits via (RETURN ...)
376       `(block nil
377          (let* ((,%start (or ,start 0))
378                 (,%end (or ,end (length ,target-string)))
379                 ,@(unless (constantp regex env)
380                     ;; leave constant regular expressions as they are -
381                     ;; SCAN's compiler macro will take care of them;
382                     ;; otherwise create a scanner unless the regex is
383                     ;; already a function (otherwise SCAN will do this
384                     ;; on each iteration)
385                     `((,%regex ,regex)
386                       (,scanner (typecase ,%regex
387                                   (function ,%regex)
388                                   (t (create-scanner ,%regex)))))))
389            ;; coerce TARGET-STRING to a simple string unless it is one
390            ;; already (otherwise SCAN will do this on each iteration)
391            (setq ,target-string
392                  (maybe-coerce-to-simple-string ,target-string))
393            (loop
394             ;; invoke SCAN and bind the returned values to the
395             ;; provided variables
396             (multiple-value-bind
397                 (,match-start ,match-end ,reg-starts ,reg-ends)
398                 (scan ,(cond ((constantp regex env) regex)
399                              (t scanner))
400                       ,target-string :start ,%start :end ,%end
401                       :real-start-pos (or ,start 0))
402               ;; declare the variables to be IGNORABLE to prevent the
403               ;; compiler from issuing warnings
404               (declare
405                (ignorable ,match-start ,match-end ,reg-starts ,reg-ends))
406               (unless ,match-start
407                 ;; stop iteration on first failure
408                 (return ,result-form))
409               ;; execute BODY (wrapped in LOCALLY so it can start with
410               ;; declarations)
411               (locally
412                 ,@body)
413               ;; advance by one position if we had a zero-length match
414               (setq ,%start (if (= ,match-start ,match-end)
415                               (1+ ,match-end)
416                               ,match-end)))))))))
417
418 (defmacro do-matches ((match-start match-end regex
419                                    target-string
420                                    &optional result-form
421                                    &key start end)
422                       &body body)
423   "Iterates over TARGET-STRING and tries to match REGEX as often as
424 possible evaluating BODY with MATCH-START and MATCH-END bound to the
425 start/end positions of each match in turn.  After the last match,
426 returns RESULT-FORM if provided or NIL otherwise.  An implicit block
427 named NIL surrounds DO-MATCHES; RETURN may be used to terminate the
428 loop immediately.  If REGEX matches an empty string the scan is
429 continued one position behind this match.  BODY may start with
430 declarations."
431   ;; this is a simplified form of DO-SCANS - we just provide two dummy
432   ;; vars and ignore them
433   (with-unique-names (reg-starts reg-ends)
434     `(do-scans (,match-start ,match-end
435                 ,reg-starts ,reg-ends
436                 ,regex ,target-string
437                 ,result-form
438                 :start ,start :end ,end)
439       ,@body)))
440
441 (defmacro do-matches-as-strings ((match-var regex
442                                             target-string
443                                             &optional result-form
444                                             &key start end sharedp)
445                                  &body body)
446   "Iterates over TARGET-STRING and tries to match REGEX as often as
447 possible evaluating BODY with MATCH-VAR bound to the substring of
448 TARGET-STRING corresponding to each match in turn.  After the last
449 match, returns RESULT-FORM if provided or NIL otherwise.  An implicit
450 block named NIL surrounds DO-MATCHES-AS-STRINGS; RETURN may be used to
451 terminate the loop immediately.  If REGEX matches an empty string the
452 scan is continued one position behind this match.  If SHAREDP is true,
453 the substrings may share structure with TARGET-STRING.  BODY may start
454 with declarations."
455   (with-rebinding (target-string)
456     (with-unique-names (match-start match-end substr-fn)
457       `(let ((,substr-fn (if ,sharedp #'nsubseq #'subseq)))
458         ;; simple use DO-MATCHES to extract the substrings
459         (do-matches (,match-start ,match-end ,regex ,target-string
460                      ,result-form :start ,start :end ,end)
461           (let ((,match-var
462                   (funcall ,substr-fn
463                            ,target-string ,match-start ,match-end)))
464             ,@body))))))
465
466 (defmacro do-register-groups (var-list (regex target-string
467                                               &optional result-form
468                                               &key start end sharedp)
469                                        &body body)
470   "Iterates over TARGET-STRING and tries to match REGEX as often as
471 possible evaluating BODY with the variables in VAR-LIST bound to the
472 corresponding register groups for each match in turn, i.e. each
473 variable is either bound to a string or to NIL.  For each element of
474 VAR-LIST which is NIL there's no binding to the corresponding register
475 group. The number of variables in VAR-LIST must not be greater than
476 the number of register groups.  After the last match, returns
477 RESULT-FORM if provided or NIL otherwise.  An implicit block named NIL
478 surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop
479 immediately. If REGEX matches an empty string the scan is continued
480 one position behind this match.  If SHAREDP is true, the substrings
481 may share structure with TARGET-STRING.  BODY may start with
482 declarations."
483   (with-rebinding (target-string)
484     (with-unique-names (substr-fn match-start match-end
485                                   reg-starts reg-ends start-index)
486       `(let ((,substr-fn (if ,sharedp
487                           #'nsubseq
488                           #'subseq)))
489         (do-scans (,match-start ,match-end ,reg-starts ,reg-ends
490                                 ,regex ,target-string
491                                 ,result-form :start ,start :end ,end)
492           (let ,(loop for (function var) in (normalize-var-list var-list)
493                       for counter from 0
494                       when var
495                         collect `(,var (let ((,start-index
496                                                (aref ,reg-starts ,counter)))
497                                          (if ,start-index
498                                            (funcall ,function
499                                                     (funcall ,substr-fn
500                                                              ,target-string
501                                                              ,start-index
502                                                              (aref ,reg-ends ,counter)))
503                                            nil))))
504             ,@body))))))
505
506 (defun all-matches (regex target-string
507                           &key (start 0)
508                                (end (length target-string)))
509   "Returns a list containing the start and end positions of all
510 matches of REGEX against TARGET-STRING, i.e. if there are N matches
511 the list contains (* 2 N) elements.  If REGEX matches an empty string
512 the scan is continued one position behind this match."
513   (declare #.*standard-optimize-settings*)
514   (let (result-list)
515     (do-matches (match-start match-end
516                  regex target-string
517                  (nreverse result-list)
518                  :start start :end end)
519       (push match-start result-list)
520       (push match-end result-list))))
521
522 #-:cormanlisp
523 (define-compiler-macro all-matches (&whole form &environment env regex &rest rest)
524    "Make sure that constant forms are compiled into scanners at
525 compile time."
526    (cond ((constantp regex env)
527            `(all-matches (load-time-value (create-scanner ,regex))
528                          ,@rest))
529          (t form)))
530
531 (defun all-matches-as-strings (regex target-string
532                                      &key (start 0)
533                                           (end (length target-string))
534                                           sharedp)
535   "Returns a list containing all substrings of TARGET-STRING which
536 match REGEX. If REGEX matches an empty string the scan is continued
537 one position behind this match. If SHAREDP is true, the substrings may
538 share structure with TARGET-STRING."
539   (declare #.*standard-optimize-settings*)
540   (let (result-list)
541     (do-matches-as-strings (match regex target-string (nreverse result-list)
542                                   :start start :end end :sharedp sharedp)
543       (push match result-list))))
544
545 #-:cormanlisp
546 (define-compiler-macro all-matches-as-strings (&whole form &environment env regex &rest rest)
547    "Make sure that constant forms are compiled into scanners at
548 compile time."
549    (cond ((constantp regex env)
550            `(all-matches-as-strings
551              (load-time-value (create-scanner ,regex))
552              ,@rest))
553          (t form)))
554
555 (defun split (regex target-string
556                     &key (start 0)
557                          (end (length target-string))
558                          limit
559                          with-registers-p
560                          omit-unmatched-p
561                          sharedp)
562   "Matches REGEX against TARGET-STRING as often as possible and
563 returns a list of the substrings between the matches.  If
564 WITH-REGISTERS-P is true, substrings corresponding to matched
565 registers are inserted into the list as well.  If OMIT-UNMATCHED-P is
566 true, unmatched registers will simply be left out, otherwise they will
567 show up as NIL.  LIMIT limits the number of elements returned -
568 registers aren't counted.  If LIMIT is NIL \(or 0 which is
569 equivalent), trailing empty strings are removed from the result list.
570 If REGEX matches an empty string the scan is continued one position
571 behind this match.  If SHAREDP is true, the substrings may share
572 structure with TARGET-STRING."
573   (declare #.*standard-optimize-settings*)
574   ;; initialize list of positions POS-LIST to extract substrings with
575   ;; START so that the start of the next match will mark the end of
576   ;; the first substring
577   (let ((pos-list (list start))
578         (counter 0))
579     ;; how would Larry Wall do it?
580     (when (eql limit 0)
581       (setq limit nil))
582     (do-scans (match-start match-end
583                reg-starts reg-ends
584                regex target-string nil
585                :start start :end end)
586       (unless (and (= match-start match-end)
587                    (= match-start (car pos-list)))
588         ;; push start of match on list unless this would be an empty
589         ;; string adjacent to the last element pushed onto the list
590         (when (and limit
591                    (>= (incf counter) limit))
592           (return))
593         (push match-start pos-list)
594         (when with-registers-p
595           ;; optionally insert matched registers
596           (loop for reg-start across reg-starts
597                 for reg-end across reg-ends
598                 if reg-start
599                   ;; but only if they've matched
600                   do (push reg-start pos-list)
601                      (push reg-end pos-list)
602                 else unless omit-unmatched-p
603                   ;; or if we're allowed to insert NIL instead
604                   do (push nil pos-list)
605                      (push nil pos-list)))
606         ;; now end of match
607         (push match-end pos-list)))
608     ;; end of whole string
609     (push end pos-list)
610     ;; now collect substrings
611     (nreverse
612      (loop with substr-fn = (if sharedp #'nsubseq #'subseq)
613            with string-seen = nil
614            for (this-end this-start) on pos-list by #'cddr
615            ;; skip empty strings from end of list
616            if (or limit
617                   (setq string-seen
618                           (or string-seen
619                               (and this-start
620                                    (> this-end this-start)))))
621            collect (if this-start
622                      (funcall substr-fn
623                               target-string this-start this-end)
624                      nil)))))
625
626 #-:cormanlisp
627 (define-compiler-macro split (&whole form &environment env regex target-string &rest rest)
628   "Make sure that constant forms are compiled into scanners at compile time."
629   (cond ((constantp regex env)
630           `(split (load-time-value (create-scanner ,regex))
631                   ,target-string ,@rest))
632         (t form)))
633
634 (defun string-case-modifier (str from to start end)
635   (declare #.*standard-optimize-settings*)
636   (declare (fixnum from to start end))
637   "Checks whether all words in STR between FROM and TO are upcased,
638 downcased or capitalized and returns a function which applies a
639 corresponding case modification to strings.  Returns #'IDENTITY
640 otherwise, especially if words in the target area extend beyond FROM
641 or TO.  STR is supposed to be bounded by START and END.  It is assumed
642 that \(<= START FROM TO END)."
643   (case
644       (if (or (<= to from)
645               (and (< start from)
646                    (alphanumericp (char str (1- from)))
647                    (alphanumericp (char str from)))
648               (and (< to end)
649                    (alphanumericp (char str to))
650                    (alphanumericp (char str (1- to)))))
651         ;; if it's a zero-length string or if words extend beyond FROM
652         ;; or TO we return NIL, i.e. #'IDENTITY
653         nil
654         ;; otherwise we loop through STR from FROM to TO
655         (loop with last-char-both-case
656               with current-result
657               for index of-type fixnum from from below to
658               for chr = (char str index)
659               do (cond ((not #-:cormanlisp (both-case-p chr)
660                              #+:cormanlisp (or (upper-case-p chr)
661                                                (lower-case-p chr)))
662                          ;; this character doesn't have a case so we
663                          ;; consider it as a word boundary (note that
664                          ;; this differs from how \b works in Perl)
665                          (setq last-char-both-case nil))
666                        ((upper-case-p chr)
667                          ;; an uppercase character
668                          (setq current-result
669                                  (if last-char-both-case
670                                    ;; not the first character in a
671                                    (case current-result
672                                      ((:undecided) :upcase)
673                                      ((:downcase :capitalize) (return nil))
674                                      ((:upcase) current-result))
675                                    (case current-result
676                                      ((nil) :undecided)
677                                      ((:downcase) (return nil))
678                                      ((:capitalize :upcase) current-result)))
679                                last-char-both-case t))
680                        (t
681                          ;; a lowercase character
682                          (setq current-result
683                                  (case current-result
684                                    ((nil) :downcase)
685                                    ((:undecided) :capitalize)
686                                    ((:downcase) current-result)
687                                    ((:capitalize) (if last-char-both-case
688                                                     current-result
689                                                     (return nil)))
690                                    ((:upcase) (return nil)))
691                                last-char-both-case t)))
692               finally (return current-result)))
693     ((nil) #'identity)
694     ((:undecided :upcase) #'string-upcase)
695     ((:downcase) #'string-downcase)
696     ((:capitalize) #'string-capitalize)))
697
698 ;; first create a scanner to identify the special parts of the
699 ;; replacement string (eat your own dog food...)
700
701 (defgeneric build-replacement-template (replacement-string)
702   (declare #.*standard-optimize-settings*)
703   (:documentation "Converts a replacement string for REGEX-REPLACE or
704 REGEX-REPLACE-ALL into a replacement template which is an
705 S-expression."))
706
707 #-:cormanlisp
708 (let* ((*use-bmh-matchers* nil)
709        (reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')")))
710   (defmethod build-replacement-template ((replacement-string string))
711     (declare #.*standard-optimize-settings*)
712     (let ((from 0)
713           ;; COLLECTOR will hold the (reversed) template
714           (collector '()))
715       ;; scan through all special parts of the replacement string
716       (do-matches (match-start match-end reg-scanner replacement-string)
717         (when (< from match-start)
718           ;; strings between matches are copied verbatim
719           (push (subseq replacement-string from match-start) collector))
720         ;; PARSE-START is true if the pattern matched a number which
721         ;; refers to a register
722         (let* ((parse-start (position-if #'digit-char-p
723                                          replacement-string
724                                          :start match-start
725                                          :end match-end))
726                (token (if parse-start
727                         (1- (parse-integer replacement-string
728                                            :start parse-start
729                                            :junk-allowed t))
730                         ;; if we didn't match a number we convert the
731                         ;; character to a symbol
732                         (case (char replacement-string (1+ match-start))
733                           ((#\&) :match)
734                           ((#\`) :before-match)
735                           ((#\') :after-match)
736                           ((#\\) :backslash)))))
737           (when (and (numberp token) (< token 0))
738             ;; make sure we don't accept something like "\\0"
739             (signal-invocation-error "Illegal substring ~S in replacement string."
740                                      (subseq replacement-string match-start match-end)))
741           (push token collector))
742         ;; remember where the match ended
743         (setq from match-end))
744       (when (< from (length replacement-string))
745         ;; push the rest of the replacement string onto the list
746         (push (subseq replacement-string from) collector))
747       (nreverse collector))))
748
749 #-:cormanlisp
750 (defmethod build-replacement-template ((replacement-function function))
751   (declare #.*standard-optimize-settings*)
752   (list replacement-function))
753
754 #-:cormanlisp
755 (defmethod build-replacement-template ((replacement-function-symbol symbol))
756   (declare #.*standard-optimize-settings*)
757   (list replacement-function-symbol))
758        
759 #-:cormanlisp
760 (defmethod build-replacement-template ((replacement-list list))
761   (declare #.*standard-optimize-settings*)
762   replacement-list)
763
764 ;;; Corman Lisp's methods can't be closures... :(
765 #+:cormanlisp
766 (let* ((*use-bmh-matchers* nil)
767        (reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')")))
768   (defun build-replacement-template (replacement)
769     (declare #.*standard-optimize-settings*)
770     (typecase replacement
771       (string
772         (let ((from 0)
773               ;; COLLECTOR will hold the (reversed) template
774               (collector '()))
775           ;; scan through all special parts of the replacement string
776           (do-matches (match-start match-end reg-scanner replacement)
777             (when (< from match-start)
778               ;; strings between matches are copied verbatim
779               (push (subseq replacement from match-start) collector))
780             ;; PARSE-START is true if the pattern matched a number which
781             ;; refers to a register
782             (let* ((parse-start (position-if #'digit-char-p
783                                              replacement
784                                              :start match-start
785                                              :end match-end))
786                    (token (if parse-start
787                             (1- (parse-integer replacement
788                                                :start parse-start
789                                                :junk-allowed t))
790                             ;; if we didn't match a number we convert the
791                             ;; character to a symbol
792                             (case (char replacement (1+ match-start))
793                               ((#\&) :match)
794                               ((#\`) :before-match)
795                               ((#\') :after-match)
796                               ((#\\) :backslash)))))
797               (when (and (numberp token) (< token 0))
798                 ;; make sure we don't accept something like "\\0"
799                 (signal-invocation-error "Illegal substring ~S in replacement string."
800                                          (subseq replacement match-start match-end)))
801               (push token collector))
802             ;; remember where the match ended
803             (setq from match-end))
804           (when (< from (length replacement))
805             ;; push the rest of the replacement string onto the list
806             (push (nsubseq replacement from) collector))
807           (nreverse collector)))
808       (list
809         replacement)
810       (t
811         (list replacement)))))
812        
813 (defun build-replacement (replacement-template
814                           target-string
815                           start end
816                           match-start match-end
817                           reg-starts reg-ends
818                           simple-calls
819                           element-type)
820   (declare #.*standard-optimize-settings*)
821   "Accepts a replacement template and the current values from the
822 matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the
823 corresponding string."
824   ;; the upper exclusive bound of the register numbers in the regular
825   ;; expression
826   (let ((reg-bound (if reg-starts
827                      (array-dimension reg-starts 0)
828                      0)))
829     (with-output-to-string (s nil :element-type element-type)
830       (loop for token in replacement-template
831             do (typecase token
832                  (string
833                    ;; transfer string parts verbatim
834                    (write-string token s))
835                  (integer
836                    ;; replace numbers with the corresponding registers
837                    (when (>= token reg-bound)
838                      ;; but only if the register was referenced in the
839                      ;; regular expression
840                      (signal-invocation-error "Reference to non-existent register ~A in replacement string."
841                                               (1+ token)))
842                    (when (svref reg-starts token)
843                      ;; and only if it matched, i.e. no match results
844                      ;; in an empty string
845                      (write-string target-string s
846                                    :start (svref reg-starts token)
847                                    :end (svref reg-ends token))))
848                  (function
849