| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/charset.lisp,v 1.9 2008/07/23 00:47:58 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; A specialized set implementation for characters by Nikodemus Siivola. |
|---|
| 5 |
|
|---|
| 6 |
;;; Copyright (c) 2008, Nikodemus Siivola. All rights reserved. |
|---|
| 7 |
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 8 |
|
|---|
| 9 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 10 |
;;; modification, are permitted provided that the following conditions |
|---|
| 11 |
;;; are met: |
|---|
| 12 |
|
|---|
| 13 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 14 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 15 |
|
|---|
| 16 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 17 |
;;; copyright notice, this list of conditions and the following |
|---|
| 18 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 19 |
;;; provided with the distribution. |
|---|
| 20 |
|
|---|
| 21 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 22 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 23 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 24 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 25 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 26 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 27 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 28 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 29 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 30 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 31 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 32 |
|
|---|
| 33 |
(in-package :cl-ppcre) |
|---|
| 34 |
|
|---|
| 35 |
(defconstant +probe-depth+ 3 |
|---|
| 36 |
"Maximum number of collisions \(for any element) we accept before we |
|---|
| 37 |
allocate more storage. This is now fixed, but could be made to vary |
|---|
| 38 |
depending on the size of the storage vector \(e.g. in the range of |
|---|
| 39 |
1-4). Larger probe-depths mean more collisions are tolerated before |
|---|
| 40 |
the table grows, but increase the constant factor.") |
|---|
| 41 |
|
|---|
| 42 |
(defun make-char-vector (size) |
|---|
| 43 |
"Returns a vector of size SIZE to hold characters. All elements are |
|---|
| 44 |
initialized to #\Null except for the first one which is initialized to |
|---|
| 45 |
#\?." |
|---|
| 46 |
(declare #.*standard-optimize-settings*) |
|---|
| 47 |
(declare (type (integer 2 #.(1- array-total-size-limit)) size)) |
|---|
| 48 |
;; since #\Null always hashes to 0, store something else there |
|---|
| 49 |
;; initially, and #\Null everywhere else |
|---|
| 50 |
(let ((result (make-array size |
|---|
| 51 |
:element-type #-:lispworks 'character #+:lispworks 'lw:simple-char |
|---|
| 52 |
:initial-element (code-char 0)))) |
|---|
| 53 |
(setf (char result 0) #\?) |
|---|
| 54 |
result)) |
|---|
| 55 |
|
|---|
| 56 |
(defstruct (charset (:constructor make-charset ())) |
|---|
| 57 |
;; this is set to 0 when we stop hashing and just use a CHAR-CODE |
|---|
| 58 |
;; indexed vector |
|---|
| 59 |
(depth +probe-depth+ :type fixnum) |
|---|
| 60 |
;; the number of characters in this set |
|---|
| 61 |
(count 0 :type fixnum) |
|---|
| 62 |
;; the storage vector |
|---|
| 63 |
(vector (make-char-vector 12) :type (simple-array character (*)))) |
|---|
| 64 |
|
|---|
| 65 |
;; seems to be necessary for some Lisps like ClozureCL |
|---|
| 66 |
(defmethod make-load-form ((set charset) &optional environment) |
|---|
| 67 |
(make-load-form-saving-slots set :environment environment)) |
|---|
| 68 |
|
|---|
| 69 |
(declaim (inline mix)) |
|---|
| 70 |
(defun mix (code hash) |
|---|
| 71 |
"Given a character code CODE and a hash code HASH, computes and |
|---|
| 72 |
returns the \"next\" hash code. See comments below." |
|---|
| 73 |
(declare #.*standard-optimize-settings*) |
|---|
| 74 |
;; mixing the CHAR-CODE back in at each step makes sure that if two |
|---|
| 75 |
;; characters collide (their hashes end up pointing in the same |
|---|
| 76 |
;; storage vector index) on one round, they should (hopefully!) not |
|---|
| 77 |
;; collide on the next |
|---|
| 78 |
(sxhash (logand most-positive-fixnum (+ code hash)))) |
|---|
| 79 |
|
|---|
| 80 |
(declaim (inline compute-index)) |
|---|
| 81 |
(defun compute-index (hash vector) |
|---|
| 82 |
"Computes and returns the index into the vector VECTOR corresponding |
|---|
| 83 |
to the hash code HASH." |
|---|
| 84 |
(declare #.*standard-optimize-settings*) |
|---|
| 85 |
(1+ (mod hash (1- (length vector))))) |
|---|
| 86 |
|
|---|
| 87 |
(defun in-charset-p (char set) |
|---|
| 88 |
"Checks whether the character CHAR is in the charset SET." |
|---|
| 89 |
(declare #.*standard-optimize-settings*) |
|---|
| 90 |
(declare (character char) (charset set)) |
|---|
| 91 |
(let ((vector (charset-vector set)) |
|---|
| 92 |
(depth (charset-depth set)) |
|---|
| 93 |
(code (char-code char))) |
|---|
| 94 |
(declare (fixnum depth)) |
|---|
| 95 |
;; as long as the set remains reasonably small, we use non-linear |
|---|
| 96 |
;; hashing - the first hash of any character is its CHAR-CODE, and |
|---|
| 97 |
;; subsequent hashes are computed by MIX above |
|---|
| 98 |
(cond ((or |
|---|
| 99 |
;; depth 0 is special - each char maps only to its code, |
|---|
| 100 |
;; nothing else |
|---|
| 101 |
(zerop depth) |
|---|
| 102 |
;; index 0 is special - only #\Null maps to it, no matter |
|---|
| 103 |
;; what the depth is |
|---|
| 104 |
(zerop code)) |
|---|
| 105 |
(eq char (char vector code))) |
|---|
| 106 |
(t |
|---|
| 107 |
;; otherwise hash starts out as the character code, but |
|---|
| 108 |
;; maps to indexes 1-N |
|---|
| 109 |
(let ((hash code)) |
|---|
| 110 |
(tagbody |
|---|
| 111 |
:retry |
|---|
| 112 |
(let* ((index (compute-index hash vector)) |
|---|
| 113 |
(x (char vector index))) |
|---|
| 114 |
(cond ((eq x (code-char 0)) |
|---|
| 115 |
;; empty, no need to probe further |
|---|
| 116 |
(return-from in-charset-p nil)) |
|---|
| 117 |
((eq x char) |
|---|
| 118 |
;; got it |
|---|
| 119 |
(return-from in-charset-p t)) |
|---|
| 120 |
((zerop (decf depth)) |
|---|
| 121 |
;; max probe depth reached, nothing found |
|---|
| 122 |
(return-from in-charset-p nil)) |
|---|
| 123 |
(t |
|---|
| 124 |
;; nothing yet, try next place |
|---|
| 125 |
(setf hash (mix code hash)) |
|---|
| 126 |
(go :retry)))))))))) |
|---|
| 127 |
|
|---|
| 128 |
(defun add-to-charset (char set) |
|---|
| 129 |
"Adds the character CHAR to the charset SET, extending SET if |
|---|
| 130 |
necessary. Returns CHAR." |
|---|
| 131 |
(declare #.*standard-optimize-settings*) |
|---|
| 132 |
(or (%add-to-charset char set t) |
|---|
| 133 |
(%add-to-charset/expand char set) |
|---|
| 134 |
(error "Oops, this should not happen...")) |
|---|
| 135 |
char) |
|---|
| 136 |
|
|---|
| 137 |
(defun %add-to-charset (char set count) |
|---|
| 138 |
"Tries to add the character CHAR to the charset SET without |
|---|
| 139 |
extending it. Returns NIL if this fails. Counts CHAR as new |
|---|
| 140 |
if COUNT is true and it is added to SET." |
|---|
| 141 |
(declare #.*standard-optimize-settings*) |
|---|
| 142 |
(declare (character char) (charset set)) |
|---|
| 143 |
(let ((vector (charset-vector set)) |
|---|
| 144 |
(depth (charset-depth set)) |
|---|
| 145 |
(code (char-code char))) |
|---|
| 146 |
(declare (fixnum depth)) |
|---|
| 147 |
;; see comments in IN-CHARSET-P for algorithm |
|---|
| 148 |
(cond ((or (zerop depth) (zerop code)) |
|---|
| 149 |
(unless (eq char (char vector code)) |
|---|
| 150 |
(setf (char vector code) char) |
|---|
| 151 |
(when count |
|---|
| 152 |
(incf (charset-count set)))) |
|---|
| 153 |
char) |
|---|
| 154 |
(t |
|---|
| 155 |
(let ((hash code)) |
|---|
| 156 |
(tagbody |
|---|
| 157 |
:retry |
|---|
| 158 |
(let* ((index (compute-index hash vector)) |
|---|
| 159 |
(x (char vector index))) |
|---|
| 160 |
(cond ((eq x (code-char 0)) |
|---|
| 161 |
(setf (char vector index) char) |
|---|
| 162 |
(when count |
|---|
| 163 |
(incf (charset-count set))) |
|---|
| 164 |
(return-from %add-to-charset char)) |
|---|
| 165 |
((eq x char) |
|---|
| 166 |
(return-from %add-to-charset char)) |
|---|
| 167 |
((zerop (decf depth)) |
|---|
| 168 |
;; need to expand the table |
|---|
| 169 |
(return-from %add-to-charset nil)) |
|---|
| 170 |
(t |
|---|
| 171 |
(setf hash (mix code hash)) |
|---|
| 172 |
(go :retry)))))))))) |
|---|
| 173 |
|
|---|
| 174 |
(defun %add-to-charset/expand (char set) |
|---|
| 175 |
"Extends the charset SET and then adds the character CHAR to it." |
|---|
| 176 |
(declare #.*standard-optimize-settings*) |
|---|
| 177 |
(declare (character char) (charset set)) |
|---|
| 178 |
(let* ((old-vector (charset-vector set)) |
|---|
| 179 |
(new-size (* 2 (length old-vector)))) |
|---|
| 180 |
(tagbody |
|---|
| 181 |
:retry |
|---|
| 182 |
;; when the table grows large (currently over 1/3 of |
|---|
| 183 |
;; CHAR-CODE-LIMIT), we dispense with hashing and just allocate a |
|---|
| 184 |
;; storage vector with space for all characters, so that each |
|---|
| 185 |
;; character always uses only the CHAR-CODE |
|---|
| 186 |
(multiple-value-bind (new-depth new-vector) |
|---|
| 187 |
(if (>= new-size #.(truncate char-code-limit 3)) |
|---|
| 188 |
(values 0 (make-char-vector char-code-limit)) |
|---|
| 189 |
(values +probe-depth+ (make-char-vector new-size))) |
|---|
| 190 |
(setf (charset-depth set) new-depth |
|---|
| 191 |
(charset-vector set) new-vector) |
|---|
| 192 |
(flet ((try-add (x) |
|---|
| 193 |
;; don't count - old characters are already accounted |
|---|
| 194 |
;; for, and might count the new one multiple times as |
|---|
| 195 |
;; well |
|---|
| 196 |
(unless (%add-to-charset x set nil) |
|---|
| 197 |
(assert (not (zerop new-depth))) |
|---|
| 198 |
(setf new-size (* 2 new-size)) |
|---|
| 199 |
(go :retry)))) |
|---|
| 200 |
(try-add char) |
|---|
| 201 |
(dotimes (i (length old-vector)) |
|---|
| 202 |
(let ((x (char old-vector i))) |
|---|
| 203 |
(if (eq x (code-char 0)) |
|---|
| 204 |
(when (zerop i) |
|---|
| 205 |
(try-add x)) |
|---|
| 206 |
(unless (zerop i) |
|---|
| 207 |
(try-add x)))))))) |
|---|
| 208 |
;; added and expanded, /now/ count the new character. |
|---|
| 209 |
(incf (charset-count set)) |
|---|
| 210 |
t)) |
|---|
| 211 |
|
|---|
| 212 |
(defun map-charset (function charset) |
|---|
| 213 |
"Calls FUNCTION with all characters in SET. Returns NIL." |
|---|
| 214 |
(declare #.*standard-optimize-settings*) |
|---|
| 215 |
(declare (function function)) |
|---|
| 216 |
(let* ((n (charset-count charset)) |
|---|
| 217 |
(vector (charset-vector charset)) |
|---|
| 218 |
(size (length vector))) |
|---|
| 219 |
;; see comments in IN-CHARSET-P for algorithm |
|---|
| 220 |
(when (eq (code-char 0) (char vector 0)) |
|---|
| 221 |
(funcall function (code-char 0)) |
|---|
| 222 |
(decf n)) |
|---|
| 223 |
(loop for i from 1 below size |
|---|
| 224 |
for char = (char vector i) |
|---|
| 225 |
unless (eq (code-char 0) char) do |
|---|
| 226 |
(funcall function char) |
|---|
| 227 |
;; this early termination test should be worth it when |
|---|
| 228 |
;; mapping across depth 0 charsets. |
|---|
| 229 |
(when (zerop (decf n)) |
|---|
| 230 |
(return-from map-charset nil)))) |
|---|
| 231 |
nil) |
|---|
| 232 |
|
|---|
| 233 |
(defun create-charset-from-test-function (test-function start end) |
|---|
| 234 |
"Creates and returns a charset representing all characters with |
|---|
| 235 |
character codes between START and END which satisfy TEST-FUNCTION." |
|---|
| 236 |
(declare #.*standard-optimize-settings*) |
|---|
| 237 |
(loop with charset = (make-charset) |
|---|
| 238 |
for code from start below end |
|---|
| 239 |
for char = (code-char code) |
|---|
| 240 |
when (and char (funcall test-function char)) |
|---|
| 241 |
do (add-to-charset char charset) |
|---|
| 242 |
finally (return charset))) |
|---|