| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/api.lisp,v 1.84 2008/07/06 18:12:04 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; The external API for creating and using scanners. |
|---|
| 5 |
|
|---|
| 6 |
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 7 |
|
|---|
| 8 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 9 |
;;; modification, are permitted provided that the following conditions |
|---|
| 10 |
;;; are met: |
|---|
| 11 |
|
|---|
| 12 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 13 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 14 |
|
|---|
| 15 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 16 |
;;; copyright notice, this list of conditions and the following |
|---|
| 17 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 18 |
;;; provided with the distribution. |
|---|
| 19 |
|
|---|
| 20 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 21 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 22 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 23 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 24 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 25 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 26 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 27 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 28 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 29 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 30 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 31 |
|
|---|
| 32 |
(in-package :cl-ppcre) |
|---|
| 33 |
|
|---|
| 34 |
(defgeneric create-scanner (regex &key case-insensitive-mode |
|---|
| 35 |
multi-line-mode |
|---|
| 36 |
single-line-mode |
|---|
| 37 |
extended-mode |
|---|
| 38 |
destructive) |
|---|
| 39 |
(:documentation "Accepts a regular expression - either as a |
|---|
| 40 |
parse-tree or as a string - and returns a scan closure which will scan |
|---|
| 41 |
strings for this regular expression and a list mapping registers to |
|---|
| 42 |
their names \(NIL stands for unnamed ones). The \"mode\" keyboard |
|---|
| 43 |
arguments are equivalent to the imsx modifiers in Perl. If |
|---|
| 44 |
DESTRUCTIVE is not NIL, the function is allowed to destructively |
|---|
| 45 |
modify its first argument \(but only if it's a parse tree).")) |
|---|
| 46 |
|
|---|
| 47 |
#-:use-acl-regexp2-engine |
|---|
| 48 |
(defmethod create-scanner ((regex-string string) &key case-insensitive-mode |
|---|
| 49 |
multi-line-mode |
|---|
| 50 |
single-line-mode |
|---|
| 51 |
extended-mode |
|---|
| 52 |
destructive) |
|---|
| 53 |
(declare #.*standard-optimize-settings*) |
|---|
| 54 |
(declare (ignore destructive)) |
|---|
| 55 |
;; parse the string into a parse-tree and then call CREATE-SCANNER |
|---|
| 56 |
;; again |
|---|
| 57 |
(let* ((*extended-mode-p* extended-mode) |
|---|
| 58 |
(quoted-regex-string (if *allow-quoting* |
|---|
| 59 |
(quote-sections (clean-comments regex-string extended-mode)) |
|---|
| 60 |
regex-string)) |
|---|
| 61 |
(*syntax-error-string* (copy-seq quoted-regex-string))) |
|---|
| 62 |
;; wrap the result with :GROUP to avoid infinite loops for |
|---|
| 63 |
;; constant strings |
|---|
| 64 |
(create-scanner (cons :group (list (parse-string quoted-regex-string))) |
|---|
| 65 |
:case-insensitive-mode case-insensitive-mode |
|---|
| 66 |
:multi-line-mode multi-line-mode |
|---|
| 67 |
:single-line-mode single-line-mode |
|---|
| 68 |
:destructive t))) |
|---|
| 69 |
|
|---|
| 70 |
#-:use-acl-regexp2-engine |
|---|
| 71 |
(defmethod create-scanner ((scanner function) &key case-insensitive-mode |
|---|
| 72 |
multi-line-mode |
|---|
| 73 |
single-line-mode |
|---|
| 74 |
extended-mode |
|---|
| 75 |
destructive) |
|---|
| 76 |
(declare #.*standard-optimize-settings*) |
|---|
| 77 |
(declare (ignore destructive)) |
|---|
| 78 |
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode) |
|---|
| 79 |
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner.")) |
|---|
| 80 |
scanner) |
|---|
| 81 |
|
|---|
| 82 |
#-:use-acl-regexp2-engine |
|---|
| 83 |
(defmethod create-scanner ((parse-tree t) &key case-insensitive-mode |
|---|
| 84 |
multi-line-mode |
|---|
| 85 |
single-line-mode |
|---|
| 86 |
extended-mode |
|---|
| 87 |
destructive) |
|---|
| 88 |
(declare #.*standard-optimize-settings*) |
|---|
| 89 |
(when extended-mode |
|---|
| 90 |
(signal-invocation-error "Extended mode doesn't make sense in parse trees.")) |
|---|
| 91 |
;; convert parse-tree into internal representation REGEX and at the |
|---|
| 92 |
;; same time compute the number of registers and the constant string |
|---|
| 93 |
;; (or anchor) the regex starts with (if any) |
|---|
| 94 |
(unless destructive |
|---|
| 95 |
(setq parse-tree (copy-tree parse-tree))) |
|---|
| 96 |
(let (flags) |
|---|
| 97 |
(if single-line-mode |
|---|
| 98 |
(push :single-line-mode-p flags)) |
|---|
| 99 |
(if multi-line-mode |
|---|
| 100 |
(push :multi-line-mode-p flags)) |
|---|
| 101 |
(if case-insensitive-mode |
|---|
| 102 |
(push :case-insensitive-p flags)) |
|---|
| 103 |
(when flags |
|---|
| 104 |
(setq parse-tree (list :group (cons :flags flags) parse-tree)))) |
|---|
| 105 |
(let ((*syntax-error-string* nil)) |
|---|
| 106 |
(multiple-value-bind (regex reg-num starts-with reg-names) |
|---|
| 107 |
(convert parse-tree) |
|---|
| 108 |
;; simplify REGEX by flattening nested SEQ and ALTERNATION |
|---|
| 109 |
;; constructs and gathering STR objects |
|---|
| 110 |
(let ((regex (gather-strings (flatten regex)))) |
|---|
| 111 |
;; set the MIN-REST slots of the REPETITION objects |
|---|
| 112 |
(compute-min-rest regex 0) |
|---|
| 113 |
;; set the OFFSET slots of the STR objects |
|---|
| 114 |
(compute-offsets regex 0) |
|---|
| 115 |
(let* (end-string-offset |
|---|
| 116 |
end-anchored-p |
|---|
| 117 |
;; compute the constant string the regex ends with (if |
|---|
| 118 |
;; any) and at the same time set the special variables |
|---|
| 119 |
;; END-STRING-OFFSET and END-ANCHORED-P |
|---|
| 120 |
(end-string (end-string regex)) |
|---|
| 121 |
;; if we found a non-zero-length end-string we create an |
|---|
| 122 |
;; efficient search function for it |
|---|
| 123 |
(end-string-test (and end-string |
|---|
| 124 |
(plusp (len end-string)) |
|---|
| 125 |
(if (= 1 (len end-string)) |
|---|
| 126 |
(create-char-searcher |
|---|
| 127 |
(schar (str end-string) 0) |
|---|
| 128 |
(case-insensitive-p end-string)) |
|---|
| 129 |
(create-bmh-matcher |
|---|
| 130 |
(str end-string) |
|---|
| 131 |
(case-insensitive-p end-string))))) |
|---|
| 132 |
;; initialize the counters for CREATE-MATCHER-AUX |
|---|
| 133 |
(*rep-num* 0) |
|---|
| 134 |
(*zero-length-num* 0) |
|---|
| 135 |
;; create the actual matcher function (which does all the |
|---|
| 136 |
;; work of matching the regular expression) corresponding |
|---|
| 137 |
;; to REGEX and at the same time set the special |
|---|
| 138 |
;; variables *REP-NUM* and *ZERO-LENGTH-NUM* |
|---|
| 139 |
(match-fn (create-matcher-aux regex #'identity)) |
|---|
| 140 |
;; if the regex starts with a string we create an |
|---|
| 141 |
;; efficient search function for it |
|---|
| 142 |
(start-string-test (and (typep starts-with 'str) |
|---|
| 143 |
(plusp (len starts-with)) |
|---|
| 144 |
(if (= 1 (len starts-with)) |
|---|
| 145 |
(create-char-searcher |
|---|
| 146 |
(schar (str starts-with) 0) |
|---|
| 147 |
(case-insensitive-p starts-with)) |
|---|
| 148 |
(create-bmh-matcher |
|---|
| 149 |
(str starts-with) |
|---|
| 150 |
(case-insensitive-p starts-with)))))) |
|---|
| 151 |
(declare (special end-string-offset end-anchored-p end-string)) |
|---|
| 152 |
;; now create the scanner and return it |
|---|
| 153 |
(values (create-scanner-aux match-fn |
|---|
| 154 |
(regex-min-length regex) |
|---|
| 155 |
(or (start-anchored-p regex) |
|---|
| 156 |
;; a dot in single-line-mode also |
|---|
| 157 |
;; implicitly anchors the regex at |
|---|
| 158 |
;; the start, i.e. if we can't match |
|---|
| 159 |
;; from the first position we won't |
|---|
| 160 |
;; match at all |
|---|
| 161 |
(and (typep starts-with 'everything) |
|---|
| 162 |
(single-line-p starts-with))) |
|---|
| 163 |
starts-with |
|---|
| 164 |
start-string-test |
|---|
| 165 |
;; only mark regex as end-anchored if we |
|---|
| 166 |
;; found a non-zero-length string before |
|---|
| 167 |
;; the anchor |
|---|
| 168 |
(and end-string-test end-anchored-p) |
|---|
| 169 |
end-string-test |
|---|
| 170 |
(if end-string-test |
|---|
| 171 |
(len end-string) |
|---|
| 172 |
nil) |
|---|
| 173 |
end-string-offset |
|---|
| 174 |
*rep-num* |
|---|
| 175 |
*zero-length-num* |
|---|
| 176 |
reg-num) |
|---|
| 177 |
reg-names)))))) |
|---|
| 178 |
|
|---|
| 179 |
#+:use-acl-regexp2-engine |
|---|
| 180 |
(declaim (inline create-scanner)) |
|---|
| 181 |
#+:use-acl-regexp2-engine |
|---|
| 182 |
(defmethod create-scanner ((scanner regexp::regular-expression) &key case-insensitive-mode |
|---|
| 183 |
multi-line-mode |
|---|
| 184 |
single-line-mode |
|---|
| 185 |
extended-mode |
|---|
| 186 |
destructive) |
|---|
| 187 |
(declare #.*standard-optimize-settings*) |
|---|
| 188 |
(declare (ignore destructive)) |
|---|
| 189 |
(when (or case-insensitive-mode multi-line-mode single-line-mode extended-mode) |
|---|
| 190 |
(signal-invocation-error "You can't use the keyword arguments to modify an existing scanner.")) |
|---|
| 191 |
scanner) |
|---|
| 192 |
|
|---|
| 193 |
#+:use-acl-regexp2-engine |
|---|
| 194 |
(defmethod create-scanner ((parse-tree t) &key case-insensitive-mode |
|---|
| 195 |
multi-line-mode |
|---|
| 196 |
single-line-mode |
|---|
| 197 |
extended-mode |
|---|
| 198 |
destructive) |
|---|
| 199 |
(declare #.*standard-optimize-settings*) |
|---|
| 200 |
(declare (ignore destructive)) |
|---|
| 201 |
(excl:compile-re parse-tree |
|---|
| 202 |
:case-fold case-insensitive-mode |
|---|
| 203 |
:ignore-whitespace extended-mode |
|---|
| 204 |
:multiple-lines multi-line-mode |
|---|
| 205 |
:single-line single-line-mode |
|---|
| 206 |
:return :index)) |
|---|
| 207 |
|
|---|
| 208 |
(defgeneric scan (regex target-string &key start end real-start-pos) |
|---|
| 209 |
(:documentation "Searches TARGET-STRING from START to END and tries |
|---|
| 210 |
to match REGEX. On success returns four values - the start of the |
|---|
| 211 |
match, the end of the match, and two arrays denoting the beginnings |
|---|
| 212 |
and ends of register matches. On failure returns NIL. REGEX can be a |
|---|
| 213 |
string which will be parsed according to Perl syntax, a parse tree, or |
|---|
| 214 |
a pre-compiled scanner created by CREATE-SCANNER. TARGET-STRING will |
|---|
| 215 |
be coerced to a simple string if it isn't one already. The |
|---|
| 216 |
REAL-START-POS parameter should be ignored - it exists only for |
|---|
| 217 |
internal purposes.")) |
|---|
| 218 |
|
|---|
| 219 |
#-:use-acl-regexp2-engine |
|---|
| 220 |
(defmethod scan ((regex-string string) target-string |
|---|
| 221 |
&key (start 0) |
|---|
| 222 |
(end (length target-string)) |
|---|
| 223 |
((:real-start-pos *real-start-pos*) nil)) |
|---|
| 224 |
(declare #.*standard-optimize-settings*) |
|---|
| 225 |
;; note that the scanners are optimized for simple strings so we |
|---|
| 226 |
;; have to coerce TARGET-STRING into one if it isn't already |
|---|
| 227 |
(funcall (create-scanner regex-string) |
|---|
| 228 |
(maybe-coerce-to-simple-string target-string) |
|---|
| 229 |
start end)) |
|---|
| 230 |
|
|---|
| 231 |
#-:use-acl-regexp2-engine |
|---|
| 232 |
(defmethod scan ((scanner function) target-string |
|---|
| 233 |
&key (start 0) |
|---|
| 234 |
(end (length target-string)) |
|---|
| 235 |
((:real-start-pos *real-start-pos*) nil)) |
|---|
| 236 |
(declare #.*standard-optimize-settings*) |
|---|
| 237 |
(funcall scanner |
|---|
| 238 |
(maybe-coerce-to-simple-string target-string) |
|---|
| 239 |
start end)) |
|---|
| 240 |
|
|---|
| 241 |
#-:use-acl-regexp2-engine |
|---|
| 242 |
(defmethod scan ((parse-tree t) target-string |
|---|
| 243 |
&key (start 0) |
|---|
| 244 |
(end (length target-string)) |
|---|
| 245 |
((:real-start-pos *real-start-pos*) nil)) |
|---|
| 246 |
(declare #.*standard-optimize-settings*) |
|---|
| 247 |
(funcall (create-scanner parse-tree) |
|---|
| 248 |
(maybe-coerce-to-simple-string target-string) |
|---|
| 249 |
start end)) |
|---|
| 250 |
|
|---|
| 251 |
#+:use-acl-regexp2-engine |
|---|
| 252 |
(declaim (inline scan)) |
|---|
| 253 |
#+:use-acl-regexp2-engine |
|---|
| 254 |
(defmethod scan ((parse-tree t) target-string |
|---|
| 255 |
&key (start 0) |
|---|
| 256 |
(end (length target-string)) |
|---|
| 257 |
((:real-start-pos *real-start-pos*) nil)) |
|---|
| 258 |
(declare #.*standard-optimize-settings*) |
|---|
| 259 |
(when (< end start) |
|---|
| 260 |
(return-from scan nil)) |
|---|
| 261 |
(let ((results (multiple-value-list (excl:match-re parse-tree target-string |
|---|
| 262 |
:start start |
|---|
| 263 |
:end end |
|---|
| 264 |
:return :index)))) |
|---|
| 265 |
(declare (dynamic-extent results)) |
|---|
| 266 |
(cond ((null (first results)) nil) |
|---|
| 267 |
(t (let* ((no-of-regs (- (length results) 2)) |
|---|
| 268 |
(reg-starts (make-array no-of-regs |
|---|
| 269 |
:element-type '(or null fixnum))) |
|---|
| 270 |
(reg-ends (make-array no-of-regs |
|---|
| 271 |
:element-type '(or null fixnum))) |
|---|
| 272 |
(match (second results))) |
|---|
| 273 |
(loop for (start . end) in (cddr results) |
|---|
| 274 |
for i from 0 |
|---|
| 275 |
do (setf (aref reg-starts i) start |
|---|
| 276 |
(aref reg-ends i) end)) |
|---|
| 277 |
(values (car match) (cdr match) reg-starts reg-ends)))))) |
|---|
| 278 |
|
|---|
| 279 |
#-:cormanlisp |
|---|
| 280 |
(define-compiler-macro scan (&whole form &environment env regex target-string &rest rest) |
|---|
| 281 |
"Make sure that constant forms are compiled into scanners at compile time." |
|---|
| 282 |
(cond ((constantp regex env) |
|---|
| 283 |
`(scan (load-time-value (create-scanner ,regex)) |
|---|
| 284 |
,target-string ,@rest)) |
|---|
| 285 |
(t form))) |
|---|
| 286 |
|
|---|
| 287 |
(defun scan-to-strings (regex target-string &key (start 0) |
|---|
| 288 |
(end (length target-string)) |
|---|
| 289 |
sharedp) |
|---|
| 290 |
"Like SCAN but returns substrings of TARGET-STRING instead of |
|---|
| 291 |
positions, i.e. this function returns two values on success: the whole |
|---|
| 292 |
match as a string plus an array of substrings (or NILs) corresponding |
|---|
| 293 |
to the matched registers. If SHAREDP is true, the substrings may |
|---|
| 294 |
share structure with TARGET-STRING." |
|---|
| 295 |
(declare #.*standard-optimize-settings*) |
|---|
| 296 |
(multiple-value-bind (match-start match-end reg-starts reg-ends) |
|---|
| 297 |
(scan regex target-string :start start :end end) |
|---|
| 298 |
(unless match-start |
|---|
| 299 |
(return-from scan-to-strings nil)) |
|---|
| 300 |
(let ((substr-fn (if sharedp #'nsubseq #'subseq))) |
|---|
| 301 |
(values (funcall substr-fn |
|---|
| 302 |
target-string match-start match-end) |
|---|
| 303 |
(map 'vector |
|---|
| 304 |
(lambda (reg-start reg-end) |
|---|
| 305 |
(if reg-start |
|---|
| 306 |
(funcall substr-fn |
|---|
| 307 |
target-string reg-start reg-end) |
|---|
| 308 |
nil)) |
|---|
| 309 |
reg-starts |
|---|
| 310 |
reg-ends))))) |
|---|
| 311 |
|
|---|
| 312 |
#-:cormanlisp |
|---|
| 313 |
(define-compiler-macro scan-to-strings |
|---|
| 314 |
(&whole form &environment env regex target-string &rest rest) |
|---|
| 315 |
"Make sure that constant forms are compiled into scanners at compile time." |
|---|
| 316 |
(cond ((constantp regex env) |
|---|
| 317 |
`(scan-to-strings (load-time-value (create-scanner ,regex)) |
|---|
| 318 |
,target-string ,@rest)) |
|---|
| 319 |
(t form))) |
|---|
| 320 |
|
|---|
| 321 |
(defmacro register-groups-bind (var-list (regex target-string |
|---|
| 322 |
&key start end sharedp) |
|---|
| 323 |
&body body) |
|---|
| 324 |
"Executes BODY with the variables in VAR-LIST bound to the |
|---|
| 325 |
corresponding register groups after TARGET-STRING has been matched |
|---|
| 326 |
against REGEX, i.e. each variable is either bound to a string or to |
|---|
| 327 |
NIL. If there is no match, BODY is _not_ executed. For each element |
|---|
| 328 |
of VAR-LIST which is NIL there's no binding to the corresponding |
|---|
| 329 |
register group. The number of variables in VAR-LIST must not be |
|---|
| 330 |
greater than the number of register groups. If SHAREDP is true, the |
|---|
| 331 |
substrings may share structure with TARGET-STRING." |
|---|
| 332 |
(with-rebinding (target-string) |
|---|
| 333 |
(with-unique-names (match-start match-end reg-starts reg-ends |
|---|
| 334 |
start-index substr-fn) |
|---|
| 335 |
`(multiple-value-bind (,match-start ,match-end ,reg-starts ,reg-ends) |
|---|
| 336 |
(scan ,regex ,target-string :start (or ,start 0) |
|---|
| 337 |
:end (or ,end (length ,target-string))) |
|---|
| 338 |
(declare (ignore ,match-end)) |
|---|
| 339 |
(when ,match-start |
|---|
| 340 |
(let* ,(cons |
|---|
| 341 |
`(,substr-fn (if ,sharedp |
|---|
| 342 |
#'nsubseq |
|---|
| 343 |
#'subseq)) |
|---|
| 344 |
(loop for (function var) in (normalize-var-list var-list) |
|---|
| 345 |
for counter from 0 |
|---|
| 346 |
when var |
|---|
| 347 |
collect `(,var (let ((,start-index |
|---|
| 348 |
(aref ,reg-starts ,counter))) |
|---|
| 349 |
(if ,start-index |
|---|
| 350 |
(funcall ,function |
|---|
| 351 |
(funcall ,substr-fn |
|---|
| 352 |
,target-string |
|---|
| 353 |
,start-index |
|---|
| 354 |
(aref ,reg-ends ,counter))) |
|---|
| 355 |
nil))))) |
|---|
| 356 |
,@body)))))) |
|---|
| 357 |
|
|---|
| 358 |
(defmacro do-scans ((match-start match-end reg-starts reg-ends regex |
|---|
| 359 |
target-string |
|---|
| 360 |
&optional result-form |
|---|
| 361 |
&key start end) |
|---|
| 362 |
&body body |
|---|
| 363 |
&environment env) |
|---|
| 364 |
"Iterates over TARGET-STRING and tries to match REGEX as often as |
|---|
| 365 |
possible evaluating BODY with MATCH-START, MATCH-END, REG-STARTS, and |
|---|
| 366 |
REG-ENDS bound to the four return values of each match in turn. After |
|---|
| 367 |
the last match, returns RESULT-FORM if provided or NIL otherwise. An |
|---|
| 368 |
implicit block named NIL surrounds DO-SCANS; RETURN may be used to |
|---|
| 369 |
terminate the loop immediately. If REGEX matches an empty string the |
|---|
| 370 |
scan is continued one position behind this match. BODY may start with |
|---|
| 371 |
declarations." |
|---|
| 372 |
(with-rebinding (target-string) |
|---|
| 373 |
(with-unique-names (%start %end %regex scanner) |
|---|
| 374 |
(declare (ignorable %regex scanner)) |
|---|
| 375 |
;; the NIL BLOCK to enable exits via (RETURN ...) |
|---|
| 376 |
`(block nil |
|---|
| 377 |
(let* ((,%start (or ,start 0)) |
|---|
| 378 |
(,%end (or ,end (length ,target-string))) |
|---|
| 379 |
,@(unless (constantp regex env) |
|---|
| 380 |
;; leave constant regular expressions as they are - |
|---|
| 381 |
;; SCAN's compiler macro will take care of them; |
|---|
| 382 |
;; otherwise create a scanner unless the regex is |
|---|
| 383 |
;; already a function (otherwise SCAN will do this |
|---|
| 384 |
;; on each iteration) |
|---|
| 385 |
`((,%regex ,regex) |
|---|
| 386 |
(,scanner (typecase ,%regex |
|---|
| 387 |
(function ,%regex) |
|---|
| 388 |
(t (create-scanner ,%regex))))))) |
|---|
| 389 |
;; coerce TARGET-STRING to a simple string unless it is one |
|---|
| 390 |
;; already (otherwise SCAN will do this on each iteration) |
|---|
| 391 |
(setq ,target-string |
|---|
| 392 |
(maybe-coerce-to-simple-string ,target-string)) |
|---|
| 393 |
(loop |
|---|
| 394 |
;; invoke SCAN and bind the returned values to the |
|---|
| 395 |
;; provided variables |
|---|
| 396 |
(multiple-value-bind |
|---|
| 397 |
(,match-start ,match-end ,reg-starts ,reg-ends) |
|---|
| 398 |
(scan ,(cond ((constantp regex env) regex) |
|---|
| 399 |
(t scanner)) |
|---|
| 400 |
,target-string :start ,%start :end ,%end |
|---|
| 401 |
:real-start-pos (or ,start 0)) |
|---|
| 402 |
;; declare the variables to be IGNORABLE to prevent the |
|---|
| 403 |
;; compiler from issuing warnings |
|---|
| 404 |
(declare |
|---|
| 405 |
(ignorable ,match-start ,match-end ,reg-starts ,reg-ends)) |
|---|
| 406 |
(unless ,match-start |
|---|
| 407 |
;; stop iteration on first failure |
|---|
| 408 |
(return ,result-form)) |
|---|
| 409 |
;; execute BODY (wrapped in LOCALLY so it can start with |
|---|
| 410 |
;; declarations) |
|---|
| 411 |
(locally |
|---|
| 412 |
,@body) |
|---|
| 413 |
;; advance by one position if we had a zero-length match |
|---|
| 414 |
(setq ,%start (if (= ,match-start ,match-end) |
|---|
| 415 |
(1+ ,match-end) |
|---|
| 416 |
,match-end))))))))) |
|---|
| 417 |
|
|---|
| 418 |
(defmacro do-matches ((match-start match-end regex |
|---|
| 419 |
target-string |
|---|
| 420 |
&optional result-form |
|---|
| 421 |
&key start end) |
|---|
| 422 |
&body body) |
|---|
| 423 |
"Iterates over TARGET-STRING and tries to match REGEX as often as |
|---|
| 424 |
possible evaluating BODY with MATCH-START and MATCH-END bound to the |
|---|
| 425 |
start/end positions of each match in turn. After the last match, |
|---|
| 426 |
returns RESULT-FORM if provided or NIL otherwise. An implicit block |
|---|
| 427 |
named NIL surrounds DO-MATCHES; RETURN may be used to terminate the |
|---|
| 428 |
loop immediately. If REGEX matches an empty string the scan is |
|---|
| 429 |
continued one position behind this match. BODY may start with |
|---|
| 430 |
declarations." |
|---|
| 431 |
;; this is a simplified form of DO-SCANS - we just provide two dummy |
|---|
| 432 |
;; vars and ignore them |
|---|
| 433 |
(with-unique-names (reg-starts reg-ends) |
|---|
| 434 |
`(do-scans (,match-start ,match-end |
|---|
| 435 |
,reg-starts ,reg-ends |
|---|
| 436 |
,regex ,target-string |
|---|
| 437 |
,result-form |
|---|
| 438 |
:start ,start :end ,end) |
|---|
| 439 |
,@body))) |
|---|
| 440 |
|
|---|
| 441 |
(defmacro do-matches-as-strings ((match-var regex |
|---|
| 442 |
target-string |
|---|
| 443 |
&optional result-form |
|---|
| 444 |
&key start end sharedp) |
|---|
| 445 |
&body body) |
|---|
| 446 |
"Iterates over TARGET-STRING and tries to match REGEX as often as |
|---|
| 447 |
possible evaluating BODY with MATCH-VAR bound to the substring of |
|---|
| 448 |
TARGET-STRING corresponding to each match in turn. After the last |
|---|
| 449 |
match, returns RESULT-FORM if provided or NIL otherwise. An implicit |
|---|
| 450 |
block named NIL surrounds DO-MATCHES-AS-STRINGS; RETURN may be used to |
|---|
| 451 |
terminate the loop immediately. If REGEX matches an empty string the |
|---|
| 452 |
scan is continued one position behind this match. If SHAREDP is true, |
|---|
| 453 |
the substrings may share structure with TARGET-STRING. BODY may start |
|---|
| 454 |
with declarations." |
|---|
| 455 |
(with-rebinding (target-string) |
|---|
| 456 |
(with-unique-names (match-start match-end substr-fn) |
|---|
| 457 |
`(let ((,substr-fn (if ,sharedp #'nsubseq #'subseq))) |
|---|
| 458 |
;; simple use DO-MATCHES to extract the substrings |
|---|
| 459 |
(do-matches (,match-start ,match-end ,regex ,target-string |
|---|
| 460 |
,result-form :start ,start :end ,end) |
|---|
| 461 |
(let ((,match-var |
|---|
| 462 |
(funcall ,substr-fn |
|---|
| 463 |
,target-string ,match-start ,match-end))) |
|---|
| 464 |
,@body)))))) |
|---|
| 465 |
|
|---|
| 466 |
(defmacro do-register-groups (var-list (regex target-string |
|---|
| 467 |
&optional result-form |
|---|
| 468 |
&key start end sharedp) |
|---|
| 469 |
&body body) |
|---|
| 470 |
"Iterates over TARGET-STRING and tries to match REGEX as often as |
|---|
| 471 |
possible evaluating BODY with the variables in VAR-LIST bound to the |
|---|
| 472 |
corresponding register groups for each match in turn, i.e. each |
|---|
| 473 |
variable is either bound to a string or to NIL. For each element of |
|---|
| 474 |
VAR-LIST which is NIL there's no binding to the corresponding register |
|---|
| 475 |
group. The number of variables in VAR-LIST must not be greater than |
|---|
| 476 |
the number of register groups. After the last match, returns |
|---|
| 477 |
RESULT-FORM if provided or NIL otherwise. An implicit block named NIL |
|---|
| 478 |
surrounds DO-REGISTER-GROUPS; RETURN may be used to terminate the loop |
|---|
| 479 |
immediately. If REGEX matches an empty string the scan is continued |
|---|
| 480 |
one position behind this match. If SHAREDP is true, the substrings |
|---|
| 481 |
may share structure with TARGET-STRING. BODY may start with |
|---|
| 482 |
declarations." |
|---|
| 483 |
(with-rebinding (target-string) |
|---|
| 484 |
(with-unique-names (substr-fn match-start match-end |
|---|
| 485 |
reg-starts reg-ends start-index) |
|---|
| 486 |
`(let ((,substr-fn (if ,sharedp |
|---|
| 487 |
#'nsubseq |
|---|
| 488 |
#'subseq))) |
|---|
| 489 |
(do-scans (,match-start ,match-end ,reg-starts ,reg-ends |
|---|
| 490 |
,regex ,target-string |
|---|
| 491 |
,result-form :start ,start :end ,end) |
|---|
| 492 |
(let ,(loop for (function var) in (normalize-var-list var-list) |
|---|
| 493 |
for counter from 0 |
|---|
| 494 |
when var |
|---|
| 495 |
collect `(,var (let ((,start-index |
|---|
| 496 |
(aref ,reg-starts ,counter))) |
|---|
| 497 |
(if ,start-index |
|---|
| 498 |
(funcall ,function |
|---|
| 499 |
(funcall ,substr-fn |
|---|
| 500 |
,target-string |
|---|
| 501 |
,start-index |
|---|
| 502 |
(aref ,reg-ends ,counter))) |
|---|
| 503 |
nil)))) |
|---|
| 504 |
,@body)))))) |
|---|
| 505 |
|
|---|
| 506 |
(defun all-matches (regex target-string |
|---|
| 507 |
&key (start 0) |
|---|
| 508 |
(end (length target-string))) |
|---|
| 509 |
"Returns a list containing the start and end positions of all |
|---|
| 510 |
matches of REGEX against TARGET-STRING, i.e. if there are N matches |
|---|
| 511 |
the list contains (* 2 N) elements. If REGEX matches an empty string |
|---|
| 512 |
the scan is continued one position behind this match." |
|---|
| 513 |
(declare #.*standard-optimize-settings*) |
|---|
| 514 |
(let (result-list) |
|---|
| 515 |
(do-matches (match-start match-end |
|---|
| 516 |
regex target-string |
|---|
| 517 |
(nreverse result-list) |
|---|
| 518 |
:start start :end end) |
|---|
| 519 |
(push match-start result-list) |
|---|
| 520 |
(push match-end result-list)))) |
|---|
| 521 |
|
|---|
| 522 |
#-:cormanlisp |
|---|
| 523 |
(define-compiler-macro all-matches (&whole form &environment env regex &rest rest) |
|---|
| 524 |
"Make sure that constant forms are compiled into scanners at |
|---|
| 525 |
compile time." |
|---|
| 526 |
(cond ((constantp regex env) |
|---|
| 527 |
`(all-matches (load-time-value (create-scanner ,regex)) |
|---|
| 528 |
,@rest)) |
|---|
| 529 |
(t form))) |
|---|
| 530 |
|
|---|
| 531 |
(defun all-matches-as-strings (regex target-string |
|---|
| 532 |
&key (start 0) |
|---|
| 533 |
(end (length target-string)) |
|---|
| 534 |
sharedp) |
|---|
| 535 |
"Returns a list containing all substrings of TARGET-STRING which |
|---|
| 536 |
match REGEX. If REGEX matches an empty string the scan is continued |
|---|
| 537 |
one position behind this match. If SHAREDP is true, the substrings may |
|---|
| 538 |
share structure with TARGET-STRING." |
|---|
| 539 |
(declare #.*standard-optimize-settings*) |
|---|
| 540 |
(let (result-list) |
|---|
| 541 |
(do-matches-as-strings (match regex target-string (nreverse result-list) |
|---|
| 542 |
:start start :end end :sharedp sharedp) |
|---|
| 543 |
(push match result-list)))) |
|---|
| 544 |
|
|---|
| 545 |
#-:cormanlisp |
|---|
| 546 |
(define-compiler-macro all-matches-as-strings (&whole form &environment env regex &rest rest) |
|---|
| 547 |
"Make sure that constant forms are compiled into scanners at |
|---|
| 548 |
compile time." |
|---|
| 549 |
(cond ((constantp regex env) |
|---|
| 550 |
`(all-matches-as-strings |
|---|
| 551 |
(load-time-value (create-scanner ,regex)) |
|---|
| 552 |
,@rest)) |
|---|
| 553 |
(t form))) |
|---|
| 554 |
|
|---|
| 555 |
(defun split (regex target-string |
|---|
| 556 |
&key (start 0) |
|---|
| 557 |
(end (length target-string)) |
|---|
| 558 |
limit |
|---|
| 559 |
with-registers-p |
|---|
| 560 |
omit-unmatched-p |
|---|
| 561 |
sharedp) |
|---|
| 562 |
"Matches REGEX against TARGET-STRING as often as possible and |
|---|
| 563 |
returns a list of the substrings between the matches. If |
|---|
| 564 |
WITH-REGISTERS-P is true, substrings corresponding to matched |
|---|
| 565 |
registers are inserted into the list as well. If OMIT-UNMATCHED-P is |
|---|
| 566 |
true, unmatched registers will simply be left out, otherwise they will |
|---|
| 567 |
show up as NIL. LIMIT limits the number of elements returned - |
|---|
| 568 |
registers aren't counted. If LIMIT is NIL \(or 0 which is |
|---|
| 569 |
equivalent), trailing empty strings are removed from the result list. |
|---|
| 570 |
If REGEX matches an empty string the scan is continued one position |
|---|
| 571 |
behind this match. If SHAREDP is true, the substrings may share |
|---|
| 572 |
structure with TARGET-STRING." |
|---|
| 573 |
(declare #.*standard-optimize-settings*) |
|---|
| 574 |
;; initialize list of positions POS-LIST to extract substrings with |
|---|
| 575 |
;; START so that the start of the next match will mark the end of |
|---|
| 576 |
;; the first substring |
|---|
| 577 |
(let ((pos-list (list start)) |
|---|
| 578 |
(counter 0)) |
|---|
| 579 |
;; how would Larry Wall do it? |
|---|
| 580 |
(when (eql limit 0) |
|---|
| 581 |
(setq limit nil)) |
|---|
| 582 |
(do-scans (match-start match-end |
|---|
| 583 |
reg-starts reg-ends |
|---|
| 584 |
regex target-string nil |
|---|
| 585 |
:start start :end end) |
|---|
| 586 |
(unless (and (= match-start match-end) |
|---|
| 587 |
(= match-start (car pos-list))) |
|---|
| 588 |
;; push start of match on list unless this would be an empty |
|---|
| 589 |
;; string adjacent to the last element pushed onto the list |
|---|
| 590 |
(when (and limit |
|---|
| 591 |
(>= (incf counter) limit)) |
|---|
| 592 |
(return)) |
|---|
| 593 |
(push match-start pos-list) |
|---|
| 594 |
(when with-registers-p |
|---|
| 595 |
;; optionally insert matched registers |
|---|
| 596 |
(loop for reg-start across reg-starts |
|---|
| 597 |
for reg-end across reg-ends |
|---|
| 598 |
if reg-start |
|---|
| 599 |
;; but only if they've matched |
|---|
| 600 |
do (push reg-start pos-list) |
|---|
| 601 |
(push reg-end pos-list) |
|---|
| 602 |
else unless omit-unmatched-p |
|---|
| 603 |
;; or if we're allowed to insert NIL instead |
|---|
| 604 |
do (push nil pos-list) |
|---|
| 605 |
(push nil pos-list))) |
|---|
| 606 |
;; now end of match |
|---|
| 607 |
(push match-end pos-list))) |
|---|
| 608 |
;; end of whole string |
|---|
| 609 |
(push end pos-list) |
|---|
| 610 |
;; now collect substrings |
|---|
| 611 |
(nreverse |
|---|
| 612 |
(loop with substr-fn = (if sharedp #'nsubseq #'subseq) |
|---|
| 613 |
with string-seen = nil |
|---|
| 614 |
for (this-end this-start) on pos-list by #'cddr |
|---|
| 615 |
;; skip empty strings from end of list |
|---|
| 616 |
if (or limit |
|---|
| 617 |
(setq string-seen |
|---|
| 618 |
(or string-seen |
|---|
| 619 |
(and this-start |
|---|
| 620 |
(> this-end this-start))))) |
|---|
| 621 |
collect (if this-start |
|---|
| 622 |
(funcall substr-fn |
|---|
| 623 |
target-string this-start this-end) |
|---|
| 624 |
nil))))) |
|---|
| 625 |
|
|---|
| 626 |
#-:cormanlisp |
|---|
| 627 |
(define-compiler-macro split (&whole form &environment env regex target-string &rest rest) |
|---|
| 628 |
"Make sure that constant forms are compiled into scanners at compile time." |
|---|
| 629 |
(cond ((constantp regex env) |
|---|
| 630 |
`(split (load-time-value (create-scanner ,regex)) |
|---|
| 631 |
,target-string ,@rest)) |
|---|
| 632 |
(t form))) |
|---|
| 633 |
|
|---|
| 634 |
(defun string-case-modifier (str from to start end) |
|---|
| 635 |
(declare #.*standard-optimize-settings*) |
|---|
| 636 |
(declare (fixnum from to start end)) |
|---|
| 637 |
"Checks whether all words in STR between FROM and TO are upcased, |
|---|
| 638 |
downcased or capitalized and returns a function which applies a |
|---|
| 639 |
corresponding case modification to strings. Returns #'IDENTITY |
|---|
| 640 |
otherwise, especially if words in the target area extend beyond FROM |
|---|
| 641 |
or TO. STR is supposed to be bounded by START and END. It is assumed |
|---|
| 642 |
that \(<= START FROM TO END)." |
|---|
| 643 |
(case |
|---|
| 644 |
(if (or (<= to from) |
|---|
| 645 |
(and (< start from) |
|---|
| 646 |
(alphanumericp (char str (1- from))) |
|---|
| 647 |
(alphanumericp (char str from))) |
|---|
| 648 |
(and (< to end) |
|---|
| 649 |
(alphanumericp (char str to)) |
|---|
| 650 |
(alphanumericp (char str (1- to))))) |
|---|
| 651 |
;; if it's a zero-length string or if words extend beyond FROM |
|---|
| 652 |
;; or TO we return NIL, i.e. #'IDENTITY |
|---|
| 653 |
nil |
|---|
| 654 |
;; otherwise we loop through STR from FROM to TO |
|---|
| 655 |
(loop with last-char-both-case |
|---|
| 656 |
with current-result |
|---|
| 657 |
for index of-type fixnum from from below to |
|---|
| 658 |
for chr = (char str index) |
|---|
| 659 |
do (cond ((not #-:cormanlisp (both-case-p chr) |
|---|
| 660 |
#+:cormanlisp (or (upper-case-p chr) |
|---|
| 661 |
(lower-case-p chr))) |
|---|
| 662 |
;; this character doesn't have a case so we |
|---|
| 663 |
;; consider it as a word boundary (note that |
|---|
| 664 |
;; this differs from how \b works in Perl) |
|---|
| 665 |
(setq last-char-both-case nil)) |
|---|
| 666 |
((upper-case-p chr) |
|---|
| 667 |
;; an uppercase character |
|---|
| 668 |
(setq current-result |
|---|
| 669 |
(if last-char-both-case |
|---|
| 670 |
;; not the first character in a |
|---|
| 671 |
(case current-result |
|---|
| 672 |
((:undecided) :upcase) |
|---|
| 673 |
((:downcase :capitalize) (return nil)) |
|---|
| 674 |
((:upcase) current-result)) |
|---|
| 675 |
(case current-result |
|---|
| 676 |
((nil) :undecided) |
|---|
| 677 |
((:downcase) (return nil)) |
|---|
| 678 |
((:capitalize :upcase) current-result))) |
|---|
| 679 |
last-char-both-case t)) |
|---|
| 680 |
(t |
|---|
| 681 |
;; a lowercase character |
|---|
| 682 |
(setq current-result |
|---|
| 683 |
(case current-result |
|---|
| 684 |
((nil) :downcase) |
|---|
| 685 |
((:undecided) :capitalize) |
|---|
| 686 |
((:downcase) current-result) |
|---|
| 687 |
((:capitalize) (if last-char-both-case |
|---|
| 688 |
current-result |
|---|
| 689 |
(return nil))) |
|---|
| 690 |
((:upcase) (return nil))) |
|---|
| 691 |
last-char-both-case t))) |
|---|
| 692 |
finally (return current-result))) |
|---|
| 693 |
((nil) #'identity) |
|---|
| 694 |
((:undecided :upcase) #'string-upcase) |
|---|
| 695 |
((:downcase) #'string-downcase) |
|---|
| 696 |
((:capitalize) #'string-capitalize))) |
|---|
| 697 |
|
|---|
| 698 |
;; first create a scanner to identify the special parts of the |
|---|
| 699 |
;; replacement string (eat your own dog food...) |
|---|
| 700 |
|
|---|
| 701 |
(defgeneric build-replacement-template (replacement-string) |
|---|
| 702 |
(declare #.*standard-optimize-settings*) |
|---|
| 703 |
(:documentation "Converts a replacement string for REGEX-REPLACE or |
|---|
| 704 |
REGEX-REPLACE-ALL into a replacement template which is an |
|---|
| 705 |
S-expression.")) |
|---|
| 706 |
|
|---|
| 707 |
#-:cormanlisp |
|---|
| 708 |
(let* ((*use-bmh-matchers* nil) |
|---|
| 709 |
(reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')"))) |
|---|
| 710 |
(defmethod build-replacement-template ((replacement-string string)) |
|---|
| 711 |
(declare #.*standard-optimize-settings*) |
|---|
| 712 |
(let ((from 0) |
|---|
| 713 |
;; COLLECTOR will hold the (reversed) template |
|---|
| 714 |
(collector '())) |
|---|
| 715 |
;; scan through all special parts of the replacement string |
|---|
| 716 |
(do-matches (match-start match-end reg-scanner replacement-string) |
|---|
| 717 |
(when (< from match-start) |
|---|
| 718 |
;; strings between matches are copied verbatim |
|---|
| 719 |
(push (subseq replacement-string from match-start) collector)) |
|---|
| 720 |
;; PARSE-START is true if the pattern matched a number which |
|---|
| 721 |
;; refers to a register |
|---|
| 722 |
(let* ((parse-start (position-if #'digit-char-p |
|---|
| 723 |
replacement-string |
|---|
| 724 |
:start match-start |
|---|
| 725 |
:end match-end)) |
|---|
| 726 |
(token (if parse-start |
|---|
| 727 |
(1- (parse-integer replacement-string |
|---|
| 728 |
:start parse-start |
|---|
| 729 |
:junk-allowed t)) |
|---|
| 730 |
;; if we didn't match a number we convert the |
|---|
| 731 |
;; character to a symbol |
|---|
| 732 |
(case (char replacement-string (1+ match-start)) |
|---|
| 733 |
((#\&) :match) |
|---|
| 734 |
((#\`) :before-match) |
|---|
| 735 |
((#\') :after-match) |
|---|
| 736 |
((#\\) :backslash))))) |
|---|
| 737 |
(when (and (numberp token) (< token 0)) |
|---|
| 738 |
;; make sure we don't accept something like "\\0" |
|---|
| 739 |
(signal-invocation-error "Illegal substring ~S in replacement string." |
|---|
| 740 |
(subseq replacement-string match-start match-end))) |
|---|
| 741 |
(push token collector)) |
|---|
| 742 |
;; remember where the match ended |
|---|
| 743 |
(setq from match-end)) |
|---|
| 744 |
(when (< from (length replacement-string)) |
|---|
| 745 |
;; push the rest of the replacement string onto the list |
|---|
| 746 |
(push (subseq replacement-string from) collector)) |
|---|
| 747 |
(nreverse collector)))) |
|---|
| 748 |
|
|---|
| 749 |
#-:cormanlisp |
|---|
| 750 |
(defmethod build-replacement-template ((replacement-function function)) |
|---|
| 751 |
(declare #.*standard-optimize-settings*) |
|---|
| 752 |
(list replacement-function)) |
|---|
| 753 |
|
|---|
| 754 |
#-:cormanlisp |
|---|
| 755 |
(defmethod build-replacement-template ((replacement-function-symbol symbol)) |
|---|
| 756 |
(declare #.*standard-optimize-settings*) |
|---|
| 757 |
(list replacement-function-symbol)) |
|---|
| 758 |
|
|---|
| 759 |
#-:cormanlisp |
|---|
| 760 |
(defmethod build-replacement-template ((replacement-list list)) |
|---|
| 761 |
(declare #.*standard-optimize-settings*) |
|---|
| 762 |
replacement-list) |
|---|
| 763 |
|
|---|
| 764 |
;;; Corman Lisp's methods can't be closures... :( |
|---|
| 765 |
#+:cormanlisp |
|---|
| 766 |
(let* ((*use-bmh-matchers* nil) |
|---|
| 767 |
(reg-scanner (create-scanner "\\\\(?:\\\\|\\{\\d+\\}|\\d+|&|`|')"))) |
|---|
| 768 |
(defun build-replacement-template (replacement) |
|---|
| 769 |
(declare #.*standard-optimize-settings*) |
|---|
| 770 |
(typecase replacement |
|---|
| 771 |
(string |
|---|
| 772 |
(let ((from 0) |
|---|
| 773 |
;; COLLECTOR will hold the (reversed) template |
|---|
| 774 |
(collector '())) |
|---|
| 775 |
;; scan through all special parts of the replacement string |
|---|
| 776 |
(do-matches (match-start match-end reg-scanner replacement) |
|---|
| 777 |
(when (< from match-start) |
|---|
| 778 |
;; strings between matches are copied verbatim |
|---|
| 779 |
(push (subseq replacement from match-start) collector)) |
|---|
| 780 |
;; PARSE-START is true if the pattern matched a number which |
|---|
| 781 |
;; refers to a register |
|---|
| 782 |
(let* ((parse-start (position-if #'digit-char-p |
|---|
| 783 |
replacement |
|---|
| 784 |
:start match-start |
|---|
| 785 |
:end match-end)) |
|---|
| 786 |
(token (if parse-start |
|---|
| 787 |
(1- (parse-integer replacement |
|---|
| 788 |
:start parse-start |
|---|
| 789 |
:junk-allowed t)) |
|---|
| 790 |
;; if we didn't match a number we convert the |
|---|
| 791 |
;; character to a symbol |
|---|
| 792 |
(case (char replacement (1+ match-start)) |
|---|
| 793 |
((#\&) :match) |
|---|
| 794 |
((#\`) :before-match) |
|---|
| 795 |
((#\') :after-match) |
|---|
| 796 |
((#\\) :backslash))))) |
|---|
| 797 |
(when (and (numberp token) (< token 0)) |
|---|
| 798 |
;; make sure we don't accept something like "\\0" |
|---|
| 799 |
(signal-invocation-error "Illegal substring ~S in replacement string." |
|---|
| 800 |
(subseq replacement match-start match-end))) |
|---|
| 801 |
(push token collector)) |
|---|
| 802 |
;; remember where the match ended |
|---|
| 803 |
(setq from match-end)) |
|---|
| 804 |
(when (< from (length replacement)) |
|---|
| 805 |
;; push the rest of the replacement string onto the list |
|---|
| 806 |
(push (nsubseq replacement from) collector)) |
|---|
| 807 |
(nreverse collector))) |
|---|
| 808 |
(list |
|---|
| 809 |
replacement) |
|---|
| 810 |
(t |
|---|
| 811 |
(list replacement))))) |
|---|
| 812 |
|
|---|
| 813 |
(defun build-replacement (replacement-template |
|---|
| 814 |
target-string |
|---|
| 815 |
start end |
|---|
| 816 |
match-start match-end |
|---|
| 817 |
reg-starts reg-ends |
|---|
| 818 |
simple-calls |
|---|
| 819 |
element-type) |
|---|
| 820 |
(declare #.*standard-optimize-settings*) |
|---|
| 821 |
"Accepts a replacement template and the current values from the |
|---|
| 822 |
matching process in REGEX-REPLACE or REGEX-REPLACE-ALL and returns the |
|---|
| 823 |
corresponding string." |
|---|
| 824 |
;; the upper exclusive bound of the register numbers in the regular |
|---|
| 825 |
;; expression |
|---|
| 826 |
(let ((reg-bound (if reg-starts |
|---|
| 827 |
(array-dimension reg-starts 0) |
|---|
| 828 |
0))) |
|---|
| 829 |
(with-output-to-string (s nil :element-type element-type) |
|---|
| 830 |
(loop for token in replacement-template |
|---|
| 831 |
do (typecase token |
|---|
| 832 |
(string |
|---|
| 833 |
;; transfer string parts verbatim |
|---|
| 834 |
(write-string token s)) |
|---|
| 835 |
(integer |
|---|
| 836 |
;; replace numbers with the corresponding registers |
|---|
| 837 |
(when (>= token reg-bound) |
|---|
| 838 |
;; but only if the register was referenced in the |
|---|
| 839 |
;; regular expression |
|---|
| 840 |
(signal-invocation-error "Reference to non-existent register ~A in replacement string." |
|---|
| 841 |
(1+ token))) |
|---|
| 842 |
(when (svref reg-starts token) |
|---|
| 843 |
;; and only if it matched, i.e. no match results |
|---|
| 844 |
;; in an empty string |
|---|
| 845 |
(write-string target-string s |
|---|
| 846 |
:start (svref reg-starts token) |
|---|
| 847 |
:end (svref reg-ends token)))) |
|---|
| 848 |
(function |
|---|
| 849 |
|
|---|