| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.42 2008/07/22 22:38:05 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; This file defines the REGEX class. REGEX objects are used to |
|---|
| 5 |
;;; represent the (transformed) parse trees internally |
|---|
| 6 |
|
|---|
| 7 |
;;; Copyright (c) 2002-2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 8 |
|
|---|
| 9 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 10 |
;;; modification, are permitted provided that the following conditions |
|---|
| 11 |
;;; are met: |
|---|
| 12 |
|
|---|
| 13 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 14 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 15 |
|
|---|
| 16 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 17 |
;;; copyright notice, this list of conditions and the following |
|---|
| 18 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 19 |
;;; provided with the distribution. |
|---|
| 20 |
|
|---|
| 21 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 22 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 23 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 24 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 25 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 26 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 27 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 28 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 29 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 30 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 31 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 32 |
|
|---|
| 33 |
(in-package :cl-ppcre) |
|---|
| 34 |
|
|---|
| 35 |
(defclass regex () |
|---|
| 36 |
() |
|---|
| 37 |
(:documentation "The REGEX base class. All other classes inherit |
|---|
| 38 |
from this one.")) |
|---|
| 39 |
|
|---|
| 40 |
(defclass seq (regex) |
|---|
| 41 |
((elements :initarg :elements |
|---|
| 42 |
:accessor elements |
|---|
| 43 |
:type cons |
|---|
| 44 |
:documentation "A list of REGEX objects.")) |
|---|
| 45 |
(:documentation "SEQ objects represents sequences of regexes. |
|---|
| 46 |
\(Like \"ab\" is the sequence of \"a\" and \"b\".)")) |
|---|
| 47 |
|
|---|
| 48 |
(defclass alternation (regex) |
|---|
| 49 |
((choices :initarg :choices |
|---|
| 50 |
:accessor choices |
|---|
| 51 |
:type cons |
|---|
| 52 |
:documentation "A list of REGEX objects")) |
|---|
| 53 |
(:documentation "ALTERNATION objects represent alternations of |
|---|
| 54 |
regexes. \(Like \"a|b\" ist the alternation of \"a\" or \"b\".)")) |
|---|
| 55 |
|
|---|
| 56 |
(defclass lookahead (regex) |
|---|
| 57 |
((regex :initarg :regex |
|---|
| 58 |
:accessor regex |
|---|
| 59 |
:documentation "The REGEX object we're checking.") |
|---|
| 60 |
(positivep :initarg :positivep |
|---|
| 61 |
:reader positivep |
|---|
| 62 |
:documentation "Whether this assertion is positive.")) |
|---|
| 63 |
(:documentation "LOOKAHEAD objects represent look-ahead assertions.")) |
|---|
| 64 |
|
|---|
| 65 |
(defclass lookbehind (regex) |
|---|
| 66 |
((regex :initarg :regex |
|---|
| 67 |
:accessor regex |
|---|
| 68 |
:documentation "The REGEX object we're checking.") |
|---|
| 69 |
(positivep :initarg :positivep |
|---|
| 70 |
:reader positivep |
|---|
| 71 |
:documentation "Whether this assertion is positive.") |
|---|
| 72 |
(len :initarg :len |
|---|
| 73 |
:accessor len |
|---|
| 74 |
:type fixnum |
|---|
| 75 |
:documentation "The \(fixed) length of the enclosed regex.")) |
|---|
| 76 |
(:documentation "LOOKBEHIND objects represent look-behind assertions.")) |
|---|
| 77 |
|
|---|
| 78 |
(defclass repetition (regex) |
|---|
| 79 |
((regex :initarg :regex |
|---|
| 80 |
:accessor regex |
|---|
| 81 |
:documentation "The REGEX that's repeated.") |
|---|
| 82 |
(greedyp :initarg :greedyp |
|---|
| 83 |
:reader greedyp |
|---|
| 84 |
:documentation "Whether the repetition is greedy.") |
|---|
| 85 |
(minimum :initarg :minimum |
|---|
| 86 |
:accessor minimum |
|---|
| 87 |
:type fixnum |
|---|
| 88 |
:documentation "The minimal number of repetitions.") |
|---|
| 89 |
(maximum :initarg :maximum |
|---|
| 90 |
:accessor maximum |
|---|
| 91 |
:documentation "The maximal number of repetitions. |
|---|
| 92 |
Can be NIL for unbounded.") |
|---|
| 93 |
(min-len :initarg :min-len |
|---|
| 94 |
:reader min-len |
|---|
| 95 |
:documentation "The minimal length of the enclosed regex.") |
|---|
| 96 |
(len :initarg :len |
|---|
| 97 |
:reader len |
|---|
| 98 |
:documentation "The length of the enclosed regex. NIL if |
|---|
| 99 |
unknown.") |
|---|
| 100 |
(min-rest :initform 0 |
|---|
| 101 |
:accessor min-rest |
|---|
| 102 |
:type fixnum |
|---|
| 103 |
:documentation "The minimal number of characters which |
|---|
| 104 |
must appear after this repetition.") |
|---|
| 105 |
(contains-register-p :initarg :contains-register-p |
|---|
| 106 |
:reader contains-register-p |
|---|
| 107 |
:documentation "Whether the regex contains a |
|---|
| 108 |
register.")) |
|---|
| 109 |
(:documentation "REPETITION objects represent repetitions of regexes.")) |
|---|
| 110 |
|
|---|
| 111 |
(defclass register (regex) |
|---|
| 112 |
((regex :initarg :regex |
|---|
| 113 |
:accessor regex |
|---|
| 114 |
:documentation "The inner regex.") |
|---|
| 115 |
(num :initarg :num |
|---|
| 116 |
:reader num |
|---|
| 117 |
:type fixnum |
|---|
| 118 |
:documentation "The number of this register, starting from 0. |
|---|
| 119 |
This is the index into *REGS-START* and *REGS-END*.") |
|---|
| 120 |
(name :initarg :name |
|---|
| 121 |
:reader name |
|---|
| 122 |
:documentation "Name of this register or NIL.")) |
|---|
| 123 |
(:documentation "REGISTER objects represent register groups.")) |
|---|
| 124 |
|
|---|
| 125 |
(defclass standalone (regex) |
|---|
| 126 |
((regex :initarg :regex |
|---|
| 127 |
:accessor regex |
|---|
| 128 |
:documentation "The inner regex.")) |
|---|
| 129 |
(:documentation "A standalone regular expression.")) |
|---|
| 130 |
|
|---|
| 131 |
(defclass back-reference (regex) |
|---|
| 132 |
((num :initarg :num |
|---|
| 133 |
:accessor num |
|---|
| 134 |
:type fixnum |
|---|
| 135 |
:documentation "The number of the register this |
|---|
| 136 |
reference refers to.") |
|---|
| 137 |
(name :initarg :name |
|---|
| 138 |
:accessor name |
|---|
| 139 |
:documentation "The name of the register this |
|---|
| 140 |
reference refers to or NIL.") |
|---|
| 141 |
(case-insensitive-p :initarg :case-insensitive-p |
|---|
| 142 |
:reader case-insensitive-p |
|---|
| 143 |
:documentation "Whether we check |
|---|
| 144 |
case-insensitively.")) |
|---|
| 145 |
(:documentation "BACK-REFERENCE objects represent backreferences.")) |
|---|
| 146 |
|
|---|
| 147 |
(defclass char-class (regex) |
|---|
| 148 |
((test-function :initarg :test-function |
|---|
| 149 |
:reader test-function |
|---|
| 150 |
:type (or function symbol nil) |
|---|
| 151 |
:documentation "A unary function \(accepting a |
|---|
| 152 |
character) which stands in for the character class and does the work |
|---|
| 153 |
of checking whether a character belongs to the class.")) |
|---|
| 154 |
(:documentation "CHAR-CLASS objects represent character classes.")) |
|---|
| 155 |
|
|---|
| 156 |
(defclass str (regex) |
|---|
| 157 |
((str :initarg :str |
|---|
| 158 |
:accessor str |
|---|
| 159 |
:type string |
|---|
| 160 |
:documentation "The actual string.") |
|---|
| 161 |
(len :initform 0 |
|---|
| 162 |
:accessor len |
|---|
| 163 |
:type fixnum |
|---|
| 164 |
:documentation "The length of the string.") |
|---|
| 165 |
(case-insensitive-p :initarg :case-insensitive-p |
|---|
| 166 |
:reader case-insensitive-p |
|---|
| 167 |
:documentation "If we match case-insensitively.") |
|---|
| 168 |
(offset :initform nil |
|---|
| 169 |
:accessor offset |
|---|
| 170 |
:documentation "Offset from the left of the whole |
|---|
| 171 |
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind |
|---|
| 172 |
a variable-length regex.") |
|---|
| 173 |
(skip :initform nil |
|---|
| 174 |
:initarg :skip |
|---|
| 175 |
:accessor skip |
|---|
| 176 |
:documentation "If we can avoid testing for this |
|---|
| 177 |
string because the SCAN function has done this already.") |
|---|
| 178 |
(start-of-end-string-p :initform nil |
|---|
| 179 |
:accessor start-of-end-string-p |
|---|
| 180 |
:documentation "If this is the unique |
|---|
| 181 |
STR which starts END-STRING (a slot of MATCHER).")) |
|---|
| 182 |
(:documentation "STR objects represent string.")) |
|---|
| 183 |
|
|---|
| 184 |
(defclass anchor (regex) |
|---|
| 185 |
((startp :initarg :startp |
|---|
| 186 |
:reader startp |
|---|
| 187 |
:documentation "Whether this is a \"start anchor\".") |
|---|
| 188 |
(multi-line-p :initarg :multi-line-p |
|---|
| 189 |
:initform nil |
|---|
| 190 |
:reader multi-line-p |
|---|
| 191 |
:documentation "Whether we're in multi-line mode, |
|---|
| 192 |
i.e. whether each #\\Newline is surrounded by anchors.") |
|---|
| 193 |
(no-newline-p :initarg :no-newline-p |
|---|
| 194 |
:initform nil |
|---|
| 195 |
:reader no-newline-p |
|---|
| 196 |
:documentation "Whether we ignore #\\Newline at the end.")) |
|---|
| 197 |
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\".")) |
|---|
| 198 |
|
|---|
| 199 |
(defclass everything (regex) |
|---|
| 200 |
((single-line-p :initarg :single-line-p |
|---|
| 201 |
:reader single-line-p |
|---|
| 202 |
:documentation "Whether we're in single-line mode, |
|---|
| 203 |
i.e. whether we also match #\\Newline.")) |
|---|
| 204 |
(:documentation "EVERYTHING objects represent regexes matching |
|---|
| 205 |
\"everything\", i.e. dots.")) |
|---|
| 206 |
|
|---|
| 207 |
(defclass word-boundary (regex) |
|---|
| 208 |
((negatedp :initarg :negatedp |
|---|
| 209 |
:reader negatedp |
|---|
| 210 |
:documentation "Whether we mean the opposite, |
|---|
| 211 |
i.e. no word-boundary.")) |
|---|
| 212 |
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions.")) |
|---|
| 213 |
|
|---|
| 214 |
(defclass branch (regex) |
|---|
| 215 |
((test :initarg :test |
|---|
| 216 |
:accessor test |
|---|
| 217 |
:documentation "The test of this branch, one of |
|---|
| 218 |
LOOKAHEAD, LOOKBEHIND, or a number.") |
|---|
| 219 |
(then-regex :initarg :then-regex |
|---|
| 220 |
:accessor then-regex |
|---|
| 221 |
:documentation "The regex that's to be matched if the |
|---|
| 222 |
test succeeds.") |
|---|
| 223 |
(else-regex :initarg :else-regex |
|---|
| 224 |
:initform (make-instance 'void) |
|---|
| 225 |
:accessor else-regex |
|---|
| 226 |
:documentation "The regex that's to be matched if the |
|---|
| 227 |
test fails.")) |
|---|
| 228 |
(:documentation "BRANCH objects represent Perl's conditional regular |
|---|
| 229 |
expressions.")) |
|---|
| 230 |
|
|---|
| 231 |
(defclass filter (regex) |
|---|
| 232 |
((fn :initarg :fn |
|---|
| 233 |
:accessor fn |
|---|
| 234 |
:type (or function symbol) |
|---|
| 235 |
:documentation "The user-defined function.") |
|---|
| 236 |
(len :initarg :len |
|---|
| 237 |
:reader len |
|---|
| 238 |
:documentation "The fixed length of this filter or NIL.")) |
|---|
| 239 |
(:documentation "FILTER objects represent arbitrary functions |
|---|
| 240 |
defined by the user.")) |
|---|
| 241 |
|
|---|
| 242 |
(defclass void (regex) |
|---|
| 243 |
() |
|---|
| 244 |
(:documentation "VOID objects represent empty regular expressions.")) |
|---|
| 245 |
|
|---|
| 246 |
(defmethod initialize-instance :after ((str str) &rest init-args) |
|---|
| 247 |
(declare #.*standard-optimize-settings*) |
|---|
| 248 |
(declare (ignore init-args)) |
|---|
| 249 |
"Automatically computes the length of a STR after initialization." |
|---|
| 250 |
(let ((str-slot (slot-value str 'str))) |
|---|
| 251 |
(unless (typep str-slot 'simple-string) |
|---|
| 252 |
(setf (slot-value str 'str) (coerce str-slot 'simple-string)))) |
|---|
| 253 |
(setf (len str) (length (str str)))) |
|---|
| 254 |
|
|---|