| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.30 2008/07/06 18:12:05 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; The parser will - with the help of the lexer - parse a regex |
|---|
| 5 |
;;; string and convert it into a "parse tree" (see docs for details |
|---|
| 6 |
;;; about the syntax of these trees). Note that the lexer might |
|---|
| 7 |
;;; return illegal parse trees. It is assumed that the conversion |
|---|
| 8 |
;;; process later on will track them down. |
|---|
| 9 |
|
|---|
| 10 |
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 11 |
|
|---|
| 12 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 13 |
;;; modification, are permitted provided that the following conditions |
|---|
| 14 |
;;; are met: |
|---|
| 15 |
|
|---|
| 16 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 17 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 18 |
|
|---|
| 19 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 20 |
;;; copyright notice, this list of conditions and the following |
|---|
| 21 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 22 |
;;; provided with the distribution. |
|---|
| 23 |
|
|---|
| 24 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 25 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 26 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 27 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 28 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 29 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 30 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 31 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 32 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 33 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 34 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 35 |
|
|---|
| 36 |
(in-package :cl-ppcre) |
|---|
| 37 |
|
|---|
| 38 |
(defun group (lexer) |
|---|
| 39 |
"Parses and consumes a <group>. |
|---|
| 40 |
The productions are: <group> -> \"\(\"<regex>\")\" |
|---|
| 41 |
\"\(?:\"<regex>\")\" |
|---|
| 42 |
\"\(?>\"<regex>\")\" |
|---|
| 43 |
\"\(?<flags>:\"<regex>\")\" |
|---|
| 44 |
\"\(?=\"<regex>\")\" |
|---|
| 45 |
\"\(?!\"<regex>\")\" |
|---|
| 46 |
\"\(?<=\"<regex>\")\" |
|---|
| 47 |
\"\(?<!\"<regex>\")\" |
|---|
| 48 |
\"\(?\(\"<num>\")\"<regex>\")\" |
|---|
| 49 |
\"\(?\(\"<regex>\")\"<regex>\")\" |
|---|
| 50 |
\"\(?<name>\"<regex>\")\" \(when *ALLOW-NAMED-REGISTERS* is T) |
|---|
| 51 |
<legal-token> |
|---|
| 52 |
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS. |
|---|
| 53 |
Will return <parse-tree> or \(<grouping-type> <parse-tree>) where |
|---|
| 54 |
<grouping-type> is one of six keywords - see source for details." |
|---|
| 55 |
(declare #.*standard-optimize-settings*) |
|---|
| 56 |
(multiple-value-bind (open-token flags) |
|---|
| 57 |
(get-token lexer) |
|---|
| 58 |
(cond ((eq open-token :open-paren-paren) |
|---|
| 59 |
;; special case for conditional regular expressions; note |
|---|
| 60 |
;; that at this point we accept a couple of illegal |
|---|
| 61 |
;; combinations which'll be sorted out later by the |
|---|
| 62 |
;; converter |
|---|
| 63 |
(let* ((open-paren-pos (car (lexer-last-pos lexer))) |
|---|
| 64 |
;; check if what follows "(?(" is a number |
|---|
| 65 |
(number (try-number lexer :no-whitespace-p t)) |
|---|
| 66 |
;; make changes to extended-mode-p local |
|---|
| 67 |
(*extended-mode-p* *extended-mode-p*)) |
|---|
| 68 |
(declare (fixnum open-paren-pos)) |
|---|
| 69 |
(cond (number |
|---|
| 70 |
;; condition is a number (i.e. refers to a |
|---|
| 71 |
;; back-reference) |
|---|
| 72 |
(let* ((inner-close-token (get-token lexer)) |
|---|
| 73 |
(reg-expr (reg-expr lexer)) |
|---|
| 74 |
(close-token (get-token lexer))) |
|---|
| 75 |
(unless (eq inner-close-token :close-paren) |
|---|
| 76 |
(signal-syntax-error* (+ open-paren-pos 2) |
|---|
| 77 |
"Opening paren has no matching closing paren.")) |
|---|
| 78 |
(unless (eq close-token :close-paren) |
|---|
| 79 |
(signal-syntax-error* open-paren-pos |
|---|
| 80 |
"Opening paren has no matching closing paren.")) |
|---|
| 81 |
(list :branch number reg-expr))) |
|---|
| 82 |
(t |
|---|
| 83 |
;; condition must be a full regex (actually a |
|---|
| 84 |
;; look-behind or look-ahead); and here comes a |
|---|
| 85 |
;; terrible kludge: instead of being cleanly |
|---|
| 86 |
;; separated from the lexer, the parser pushes |
|---|
| 87 |
;; back the lexer by one position, thereby |
|---|
| 88 |
;; landing in the middle of the 'token' "(?(" - |
|---|
| 89 |
;; yuck!! |
|---|
| 90 |
(decf (lexer-pos lexer)) |
|---|
| 91 |
(let* ((inner-reg-expr (group lexer)) |
|---|
| 92 |
(reg-expr (reg-expr lexer)) |
|---|
| 93 |
(close-token (get-token lexer))) |
|---|
| 94 |
(unless (eq close-token :close-paren) |
|---|
| 95 |
(signal-syntax-error* open-paren-pos |
|---|
| 96 |
"Opening paren has no matching closing paren.")) |
|---|
| 97 |
(list :branch inner-reg-expr reg-expr)))))) |
|---|
| 98 |
((member open-token '(:open-paren |
|---|
| 99 |
:open-paren-colon |
|---|
| 100 |
:open-paren-greater |
|---|
| 101 |
:open-paren-equal |
|---|
| 102 |
:open-paren-exclamation |
|---|
| 103 |
:open-paren-less-equal |
|---|
| 104 |
:open-paren-less-exclamation |
|---|
| 105 |
:open-paren-less-letter) |
|---|
| 106 |
:test #'eq) |
|---|
| 107 |
;; make changes to extended-mode-p local |
|---|
| 108 |
(let ((*extended-mode-p* *extended-mode-p*)) |
|---|
| 109 |
;; we saw one of the six token representing opening |
|---|
| 110 |
;; parentheses |
|---|
| 111 |
(let* ((open-paren-pos (car (lexer-last-pos lexer))) |
|---|
| 112 |
(register-name (when (eq open-token :open-paren-less-letter) |
|---|
| 113 |
(parse-register-name-aux lexer))) |
|---|
| 114 |
(reg-expr (reg-expr lexer)) |
|---|
| 115 |
(close-token (get-token lexer))) |
|---|
| 116 |
(when (or (eq open-token :open-paren) |
|---|
| 117 |
(eq open-token :open-paren-less-letter)) |
|---|
| 118 |
;; if this is the "("<regex>")" or "(?"<name>""<regex>")" production we have to |
|---|
| 119 |
;; increment the register counter of the lexer |
|---|
| 120 |
(incf (lexer-reg lexer))) |
|---|
| 121 |
(unless (eq close-token :close-paren) |
|---|
| 122 |
;; the token following <regex> must be the closing |
|---|
| 123 |
;; parenthesis or this is a syntax error |
|---|
| 124 |
(signal-syntax-error* open-paren-pos |
|---|
| 125 |
"Opening paren has no matching closing paren.")) |
|---|
| 126 |
(if flags |
|---|
| 127 |
;; if the lexer has returned a list of flags this must |
|---|
| 128 |
;; have been the "(?:"<regex>")" production |
|---|
| 129 |
(cons :group (nconc flags (list reg-expr))) |
|---|
| 130 |
(if (eq open-token :open-paren-less-letter) |
|---|
| 131 |
(list :named-register |
|---|
| 132 |
;; every string was reversed, so we have to |
|---|
| 133 |
;; reverse it back to get the name |
|---|
| 134 |
(nreverse register-name) |
|---|
| 135 |
reg-expr) |
|---|
| 136 |
(list (case open-token |
|---|
| 137 |
((:open-paren) |
|---|
| 138 |
:register) |
|---|
| 139 |
((:open-paren-colon) |
|---|
| 140 |
:group) |
|---|
| 141 |
((:open-paren-greater) |
|---|
| 142 |
:standalone) |
|---|
| 143 |
((:open-paren-equal) |
|---|
| 144 |
:positive-lookahead) |
|---|
| 145 |
((:open-paren-exclamation) |
|---|
| 146 |
:negative-lookahead) |
|---|
| 147 |
((:open-paren-less-equal) |
|---|
| 148 |
:positive-lookbehind) |
|---|
| 149 |
((:open-paren-less-exclamation) |
|---|
| 150 |
:negative-lookbehind)) |
|---|
| 151 |
reg-expr)))))) |
|---|
| 152 |
(t |
|---|
| 153 |
;; this is the <legal-token> production; <legal-token> is |
|---|
| 154 |
;; any token which passes START-OF-SUBEXPR-P (otherwise |
|---|
| 155 |
;; parsing had already stopped in the SEQ method) |
|---|
| 156 |
open-token)))) |
|---|
| 157 |
|
|---|
| 158 |
(defun greedy-quant (lexer) |
|---|
| 159 |
"Parses and consumes a <greedy-quant>. |
|---|
| 160 |
The productions are: <greedy-quant> -> <group> | <group><quantifier> |
|---|
| 161 |
where <quantifier> is parsed by the lexer function GET-QUANTIFIER. |
|---|
| 162 |
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)." |
|---|
| 163 |
(declare #.*standard-optimize-settings*) |
|---|
| 164 |
(let* ((group (group lexer)) |
|---|
| 165 |
(token (get-quantifier lexer))) |
|---|
| 166 |
(if token |
|---|
| 167 |
;; if GET-QUANTIFIER returned a non-NIL value it's the |
|---|
| 168 |
;; two-element list (<min> <max>) |
|---|
| 169 |
(list :greedy-repetition (first token) (second token) group) |
|---|
| 170 |
group))) |
|---|
| 171 |
|
|---|
| 172 |
(defun quant (lexer) |
|---|
| 173 |
"Parses and consumes a <quant>. |
|---|
| 174 |
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\". |
|---|
| 175 |
Will return the <parse-tree> returned by GREEDY-QUANT and optionally |
|---|
| 176 |
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION." |
|---|
| 177 |
(declare #.*standard-optimize-settings*) |
|---|
| 178 |
(let* ((greedy-quant (greedy-quant lexer)) |
|---|
| 179 |
(pos (lexer-pos lexer)) |
|---|
| 180 |
(next-char (next-char lexer))) |
|---|
| 181 |
(when next-char |
|---|
| 182 |
(if (char= next-char #\?) |
|---|
| 183 |
(setf (car greedy-quant) :non-greedy-repetition) |
|---|
| 184 |
(setf (lexer-pos lexer) pos))) |
|---|
| 185 |
greedy-quant)) |
|---|
| 186 |
|
|---|
| 187 |
(defun seq (lexer) |
|---|
| 188 |
"Parses and consumes a <seq>. |
|---|
| 189 |
The productions are: <seq> -> <quant> | <quant><seq>. |
|---|
| 190 |
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)." |
|---|
| 191 |
(declare #.*standard-optimize-settings*) |
|---|
| 192 |
(flet ((make-array-from-two-chars (char1 char2) |
|---|
| 193 |
(let ((string (make-array 2 |
|---|
| 194 |
:element-type 'character |
|---|
| 195 |
:fill-pointer t |
|---|
| 196 |
:adjustable t))) |
|---|
| 197 |
(setf (aref string 0) char1) |
|---|
| 198 |
(setf (aref string 1) char2) |
|---|
| 199 |
string))) |
|---|
| 200 |
;; Note that we're calling START-OF-SUBEXPR-P before we actually try |
|---|
| 201 |
;; to parse a <seq> or <quant> in order to catch empty regular |
|---|
| 202 |
;; expressions |
|---|
| 203 |
(if (start-of-subexpr-p lexer) |
|---|
| 204 |
(let ((quant (quant lexer))) |
|---|
| 205 |
(if (start-of-subexpr-p lexer) |
|---|
| 206 |
(let* ((seq (seq lexer)) |
|---|
| 207 |
(quant-is-char-p (characterp quant)) |
|---|
| 208 |
(seq-is-sequence-p (and (consp seq) |
|---|
| 209 |
(eq (first seq) :sequence)))) |
|---|
| 210 |
(cond ((and quant-is-char-p |
|---|
| 211 |
(characterp seq)) |
|---|
| 212 |
(make-array-from-two-chars seq quant)) |
|---|
| 213 |
((and quant-is-char-p |
|---|
| 214 |
(stringp seq)) |
|---|
| 215 |
(vector-push-extend quant seq) |
|---|
| 216 |
seq) |
|---|
| 217 |
((and quant-is-char-p |
|---|
| 218 |
seq-is-sequence-p |
|---|
| 219 |
(characterp (second seq))) |
|---|
| 220 |
(cond ((cddr seq) |
|---|
| 221 |
(setf (cdr seq) |
|---|
| 222 |
(cons |
|---|
| 223 |
(make-array-from-two-chars (second seq) |
|---|
| 224 |
quant) |
|---|
| 225 |
(cddr seq))) |
|---|
| 226 |
seq) |
|---|
| 227 |
(t (make-array-from-two-chars (second seq) quant)))) |
|---|
| 228 |
((and quant-is-char-p |
|---|
| 229 |
seq-is-sequence-p |
|---|
| 230 |
(stringp (second seq))) |
|---|
| 231 |
(cond ((cddr seq) |
|---|
| 232 |
(setf (cdr seq) |
|---|
| 233 |
(cons |
|---|
| 234 |
(progn |
|---|
| 235 |
(vector-push-extend quant (second seq)) |
|---|
| 236 |
(second seq)) |
|---|
| 237 |
(cddr seq))) |
|---|
| 238 |
seq) |
|---|
| 239 |
(t |
|---|
| 240 |
(vector-push-extend quant (second seq)) |
|---|
| 241 |
(second seq)))) |
|---|
| 242 |
(seq-is-sequence-p |
|---|
| 243 |
;; if <seq> is also a :SEQUENCE parse tree we merge |
|---|
| 244 |
;; both lists into one to avoid unnecessary consing |
|---|
| 245 |
(setf (cdr seq) |
|---|
| 246 |
(cons quant (cdr seq))) |
|---|
| 247 |
seq) |
|---|
| 248 |
(t (list :sequence quant seq)))) |
|---|
| 249 |
quant)) |
|---|
| 250 |
:void))) |
|---|
| 251 |
|
|---|
| 252 |
(defun reg-expr (lexer) |
|---|
| 253 |
"Parses and consumes a <regex>, a complete regular expression. |
|---|
| 254 |
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>. |
|---|
| 255 |
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)." |
|---|
| 256 |
(declare #.*standard-optimize-settings*) |
|---|
| 257 |
(let ((pos (lexer-pos lexer))) |
|---|
| 258 |
(case (next-char lexer) |
|---|
| 259 |
((nil) |
|---|
| 260 |
;; if we didn't get any token we return :VOID which stands for |
|---|
| 261 |
;; "empty regular expression" |
|---|
| 262 |
:void) |
|---|
| 263 |
((#\|) |
|---|
| 264 |
;; now check whether the expression started with a vertical |
|---|
| 265 |
;; bar, i.e. <seq> - the left alternation - is empty |
|---|
| 266 |
(list :alternation :void (reg-expr lexer))) |
|---|
| 267 |
(otherwise |
|---|
| 268 |
;; otherwise un-read the character we just saw and parse a |
|---|
| 269 |
;; <seq> plus the character following it |
|---|
| 270 |
(setf (lexer-pos lexer) pos) |
|---|
| 271 |
(let* ((seq (seq lexer)) |
|---|
| 272 |
(pos (lexer-pos lexer))) |
|---|
| 273 |
(case (next-char lexer) |
|---|
| 274 |
((nil) |
|---|
| 275 |
;; no further character, just a <seq> |
|---|
| 276 |
seq) |
|---|
| 277 |
((#\|) |
|---|
| 278 |
;; if the character was a vertical bar, this is an |
|---|
| 279 |
;; alternation and we have the second production |
|---|
| 280 |
(let ((reg-expr (reg-expr lexer))) |
|---|
| 281 |
(cond ((and (consp reg-expr) |
|---|
| 282 |
(eq (first reg-expr) :alternation)) |
|---|
| 283 |
;; again we try to merge as above in SEQ |
|---|
| 284 |
(setf (cdr reg-expr) |
|---|
| 285 |
(cons seq (cdr reg-expr))) |
|---|
| 286 |
reg-expr) |
|---|
| 287 |
(t (list :alternation seq reg-expr))))) |
|---|
| 288 |
(otherwise |
|---|
| 289 |
;; a character which is not a vertical bar - this is |
|---|
| 290 |
;; either a syntax error or we're inside of a group and |
|---|
| 291 |
;; the next character is a closing parenthesis; so we |
|---|
| 292 |
;; just un-read the character and let another function |
|---|
| 293 |
;; take care of it |
|---|
| 294 |
(setf (lexer-pos lexer) pos) |
|---|
| 295 |
seq))))))) |
|---|
| 296 |
|
|---|
| 297 |
(defun reverse-strings (parse-tree) |
|---|
| 298 |
"Recursively walks through PARSE-TREE and destructively reverses all |
|---|
| 299 |
strings in it." |
|---|
| 300 |
(declare #.*standard-optimize-settings*) |
|---|
| 301 |
(cond ((stringp parse-tree) |
|---|
| 302 |
(nreverse parse-tree)) |
|---|
| 303 |
((consp parse-tree) |
|---|
| 304 |
(loop for parse-tree-rest on parse-tree |
|---|
| 305 |
while parse-tree-rest |
|---|
| 306 |
do (setf (car parse-tree-rest) |
|---|
| 307 |
(reverse-strings (car parse-tree-rest)))) |
|---|
| 308 |
parse-tree) |
|---|
| 309 |
(t parse-tree))) |
|---|
| 310 |
|
|---|
| 311 |
(defun parse-string (string) |
|---|
| 312 |
"Translate the regex string STRING into a parse tree." |
|---|
| 313 |
(declare #.*standard-optimize-settings*) |
|---|
| 314 |
(let* ((lexer (make-lexer string)) |
|---|
| 315 |
(parse-tree (reverse-strings (reg-expr lexer)))) |
|---|
| 316 |
;; check whether we've consumed the whole regex string |
|---|
| 317 |
(if (end-of-string-p lexer) |
|---|
| 318 |
parse-tree |
|---|
| 319 |
(signal-syntax-error* (lexer-pos lexer) "Expected end of string.")))) |
|---|