root/trunk/thirdparty/cl-ppcre/parser.lisp

Revision 3581, 15.3 kB (checked in by edi, 6 months ago)

Update to current dev version

Line 
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.30 2008/07/06 18:12:05 edi Exp $
3
4 ;;; The parser will - with the help of the lexer - parse a regex
5 ;;; string and convert it into a "parse tree" (see docs for details
6 ;;; about the syntax of these trees).  Note that the lexer might
7 ;;; return illegal parse trees.  It is assumed that the conversion
8 ;;; process later on will track them down.
9
10 ;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved.
11
12 ;;; Redistribution and use in source and binary forms, with or without
13 ;;; modification, are permitted provided that the following conditions
14 ;;; are met:
15
16 ;;;   * Redistributions of source code must retain the above copyright
17 ;;;     notice, this list of conditions and the following disclaimer.
18
19 ;;;   * Redistributions in binary form must reproduce the above
20 ;;;     copyright notice, this list of conditions and the following
21 ;;;     disclaimer in the documentation and/or other materials
22 ;;;     provided with the distribution.
23
24 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
25 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
28 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
30 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
32 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
36 (in-package :cl-ppcre)
37
38 (defun group (lexer)
39   "Parses and consumes a <group>.
40 The productions are: <group> -> \"\(\"<regex>\")\"
41                                 \"\(?:\"<regex>\")\"
42                                 \"\(?>\"<regex>\")\"
43                                 \"\(?<flags>:\"<regex>\")\"
44                                 \"\(?=\"<regex>\")\"
45                                 \"\(?!\"<regex>\")\"
46                                 \"\(?<=\"<regex>\")\"
47                                 \"\(?<!\"<regex>\")\"
48                                 \"\(?\(\"<num>\")\"<regex>\")\"
49                                 \"\(?\(\"<regex>\")\"<regex>\")\"
50                                 \"\(?<name>\"<regex>\")\" \(when *ALLOW-NAMED-REGISTERS* is T)
51                                 <legal-token>
52 where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
53 Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
54 <grouping-type> is one of six keywords - see source for details."
55   (declare #.*standard-optimize-settings*)
56   (multiple-value-bind (open-token flags)
57       (get-token lexer)
58     (cond ((eq open-token :open-paren-paren)
59             ;; special case for conditional regular expressions; note
60             ;; that at this point we accept a couple of illegal
61             ;; combinations which'll be sorted out later by the
62             ;; converter
63             (let* ((open-paren-pos (car (lexer-last-pos lexer)))
64                    ;; check if what follows "(?(" is a number
65                    (number (try-number lexer :no-whitespace-p t))
66                    ;; make changes to extended-mode-p local
67                    (*extended-mode-p* *extended-mode-p*))
68               (declare (fixnum open-paren-pos))
69               (cond (number
70                       ;; condition is a number (i.e. refers to a
71                       ;; back-reference)
72                       (let* ((inner-close-token (get-token lexer))
73                              (reg-expr (reg-expr lexer))
74                              (close-token (get-token lexer)))
75                         (unless (eq inner-close-token :close-paren)
76                           (signal-syntax-error* (+ open-paren-pos 2)
77                                                 "Opening paren has no matching closing paren."))
78                         (unless (eq close-token :close-paren)
79                           (signal-syntax-error* open-paren-pos
80                                                 "Opening paren has no matching closing paren."))
81                         (list :branch number reg-expr)))
82                     (t
83                       ;; condition must be a full regex (actually a
84                       ;; look-behind or look-ahead); and here comes a
85                       ;; terrible kludge: instead of being cleanly
86                       ;; separated from the lexer, the parser pushes
87                       ;; back the lexer by one position, thereby
88                       ;; landing in the middle of the 'token' "(?(" -
89                       ;; yuck!!
90                       (decf (lexer-pos lexer))
91                       (let* ((inner-reg-expr (group lexer))
92                              (reg-expr (reg-expr lexer))
93                              (close-token (get-token lexer)))
94                         (unless (eq close-token :close-paren)
95                           (signal-syntax-error* open-paren-pos
96                                                 "Opening paren has no matching closing paren."))
97                         (list :branch inner-reg-expr reg-expr))))))
98           ((member open-token '(:open-paren
99                                 :open-paren-colon
100                                 :open-paren-greater
101                                 :open-paren-equal
102                                 :open-paren-exclamation
103                                 :open-paren-less-equal
104                                 :open-paren-less-exclamation
105                                 :open-paren-less-letter)
106                    :test #'eq)
107             ;; make changes to extended-mode-p local
108             (let ((*extended-mode-p* *extended-mode-p*))
109               ;; we saw one of the six token representing opening
110               ;; parentheses
111               (let* ((open-paren-pos (car (lexer-last-pos lexer)))
112                      (register-name (when (eq open-token :open-paren-less-letter)
113                                       (parse-register-name-aux lexer)))
114                      (reg-expr (reg-expr lexer))
115                      (close-token (get-token lexer)))
116                 (when (or (eq open-token :open-paren)
117                           (eq open-token :open-paren-less-letter))
118                   ;; if this is the "("<regex>")" or "(?"<name>""<regex>")" production we have to
119                   ;; increment the register counter of the lexer
120                   (incf (lexer-reg lexer)))
121                 (unless (eq close-token :close-paren)
122                   ;; the token following <regex> must be the closing
123                   ;; parenthesis or this is a syntax error
124                   (signal-syntax-error* open-paren-pos
125                                         "Opening paren has no matching closing paren."))
126                 (if flags
127                   ;; if the lexer has returned a list of flags this must
128                   ;; have been the "(?:"<regex>")" production
129                   (cons :group (nconc flags (list reg-expr)))
130                   (if (eq open-token :open-paren-less-letter)
131                       (list :named-register
132                             ;; every string was reversed, so we have to
133                             ;; reverse it back to get the name
134                             (nreverse register-name)
135                             reg-expr)
136                       (list (case open-token
137                               ((:open-paren)
138                                :register)
139                               ((:open-paren-colon)
140                                :group)
141                               ((:open-paren-greater)
142                                :standalone)
143                               ((:open-paren-equal)
144                                :positive-lookahead)
145                               ((:open-paren-exclamation)
146                                :negative-lookahead)
147                               ((:open-paren-less-equal)
148                                :positive-lookbehind)
149                               ((:open-paren-less-exclamation)
150                                :negative-lookbehind))
151                             reg-expr))))))
152           (t
153            ;; this is the <legal-token> production; <legal-token> is
154            ;; any token which passes START-OF-SUBEXPR-P (otherwise
155            ;; parsing had already stopped in the SEQ method)
156            open-token))))
157
158 (defun greedy-quant (lexer)
159   "Parses and consumes a <greedy-quant>.
160 The productions are: <greedy-quant> -> <group> | <group><quantifier>
161 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
162 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
163   (declare #.*standard-optimize-settings*)
164   (let* ((group (group lexer))
165          (token (get-quantifier lexer)))
166     (if token
167       ;; if GET-QUANTIFIER returned a non-NIL value it's the
168       ;; two-element list (<min> <max>)
169       (list :greedy-repetition (first token) (second token) group)
170       group)))
171
172 (defun quant (lexer)
173   "Parses and consumes a <quant>.
174 The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
175 Will return the <parse-tree> returned by GREEDY-QUANT and optionally
176 change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
177   (declare #.*standard-optimize-settings*)
178   (let* ((greedy-quant (greedy-quant lexer))
179          (pos (lexer-pos lexer))
180          (next-char (next-char lexer)))
181     (when next-char
182       (if (char= next-char #\?)
183         (setf (car greedy-quant) :non-greedy-repetition)
184         (setf (lexer-pos lexer) pos)))
185     greedy-quant))
186
187 (defun seq (lexer)
188   "Parses and consumes a <seq>.
189 The productions are: <seq> -> <quant> | <quant><seq>.
190 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
191   (declare #.*standard-optimize-settings*)
192   (flet ((make-array-from-two-chars (char1 char2)
193            (let ((string (make-array 2
194                                      :element-type 'character
195                                      :fill-pointer t
196                                      :adjustable t)))
197              (setf (aref string 0) char1)
198              (setf (aref string 1) char2)
199              string)))
200     ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
201     ;; to parse a <seq> or <quant> in order to catch empty regular
202     ;; expressions
203     (if (start-of-subexpr-p lexer)
204       (let ((quant (quant lexer)))
205         (if (start-of-subexpr-p lexer)
206           (let* ((seq (seq lexer))
207                  (quant-is-char-p (characterp quant))
208                  (seq-is-sequence-p (and (consp seq)
209                                          (eq (first seq) :sequence))))
210             (cond ((and quant-is-char-p
211                         (characterp seq))
212                     (make-array-from-two-chars seq quant))
213                   ((and quant-is-char-p
214                         (stringp seq))
215                     (vector-push-extend quant seq)
216                     seq)
217                   ((and quant-is-char-p
218                         seq-is-sequence-p
219                         (characterp (second seq)))
220                     (cond ((cddr seq)
221                             (setf (cdr seq)
222                                     (cons
223                                      (make-array-from-two-chars (second seq)
224                                                                 quant)
225                                      (cddr seq)))
226                             seq)
227                           (t (make-array-from-two-chars (second seq) quant))))
228                   ((and quant-is-char-p
229                         seq-is-sequence-p
230                         (stringp (second seq)))
231                     (cond ((cddr seq)
232                             (setf (cdr seq)
233                                     (cons
234                                      (progn
235                                        (vector-push-extend quant (second seq))
236                                        (second seq))
237                                      (cddr seq)))
238                             seq)
239                           (t
240                             (vector-push-extend quant (second seq))
241                             (second seq))))
242                   (seq-is-sequence-p
243                     ;; if <seq> is also a :SEQUENCE parse tree we merge
244                     ;; both lists into one to avoid unnecessary consing
245                     (setf (cdr seq)
246                             (cons quant (cdr seq)))
247                     seq)
248                   (t (list :sequence quant seq))))
249           quant))
250       :void)))
251  
252 (defun reg-expr (lexer)
253   "Parses and consumes a <regex>, a complete regular expression.
254 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
255 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
256   (declare #.*standard-optimize-settings*)
257   (let ((pos (lexer-pos lexer)))
258     (case (next-char lexer)
259       ((nil)
260         ;; if we didn't get any token we return :VOID which stands for
261         ;; "empty regular expression"
262         :void)
263       ((#\|)
264         ;; now check whether the expression started with a vertical
265         ;; bar, i.e. <seq> - the left alternation - is empty
266         (list :alternation :void (reg-expr lexer)))
267       (otherwise
268         ;; otherwise un-read the character we just saw and parse a
269         ;; <seq> plus the character following it
270         (setf (lexer-pos lexer) pos)
271         (let* ((seq (seq lexer))
272                (pos (lexer-pos lexer)))
273           (case (next-char lexer)
274             ((nil)
275               ;; no further character, just a <seq>
276               seq)
277             ((#\|)
278               ;; if the character was a vertical bar, this is an
279               ;; alternation and we have the second production
280               (let ((reg-expr (reg-expr lexer)))
281                 (cond ((and (consp reg-expr)
282                             (eq (first reg-expr) :alternation))
283                         ;; again we try to merge as above in SEQ
284                         (setf (cdr reg-expr)
285                                 (cons seq (cdr reg-expr)))
286                         reg-expr)
287                       (t (list :alternation seq reg-expr)))))
288             (otherwise
289               ;; a character which is not a vertical bar - this is
290               ;; either a syntax error or we're inside of a group and
291               ;; the next character is a closing parenthesis; so we
292               ;; just un-read the character and let another function
293               ;; take care of it
294               (setf (lexer-pos lexer) pos)
295               seq)))))))
296
297 (defun reverse-strings (parse-tree)
298   "Recursively walks through PARSE-TREE and destructively reverses all
299 strings in it."
300   (declare #.*standard-optimize-settings*)
301   (cond ((stringp parse-tree)
302           (nreverse parse-tree))
303         ((consp parse-tree)
304           (loop for parse-tree-rest on parse-tree
305                 while parse-tree-rest
306                 do (setf (car parse-tree-rest)
307                            (reverse-strings (car parse-tree-rest))))
308           parse-tree)
309         (t parse-tree)))
310
311 (defun parse-string (string)
312   "Translate the regex string STRING into a parse tree."
313   (declare #.*standard-optimize-settings*)
314   (let* ((lexer (make-lexer string))
315          (parse-tree (reverse-strings (reg-expr lexer))))
316     ;; check whether we've consumed the whole regex string
317     (if (end-of-string-p lexer)
318       parse-tree
319       (signal-syntax-error* (lexer-pos lexer) "Expected end of string."))))
Note: See TracBrowser for help on using the browser.