| 1 |
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- |
|---|
| 2 |
;;; $Header: /usr/local/cvsrep/cl-ppcre/charmap.lisp,v 1.18 2008/07/22 23:54:59 edi Exp $ |
|---|
| 3 |
|
|---|
| 4 |
;;; An optimized representation of sets of characters. |
|---|
| 5 |
|
|---|
| 6 |
;;; Copyright (c) 2008, Dr. Edmund Weitz. All rights reserved. |
|---|
| 7 |
|
|---|
| 8 |
;;; Redistribution and use in source and binary forms, with or without |
|---|
| 9 |
;;; modification, are permitted provided that the following conditions |
|---|
| 10 |
;;; are met: |
|---|
| 11 |
|
|---|
| 12 |
;;; * Redistributions of source code must retain the above copyright |
|---|
| 13 |
;;; notice, this list of conditions and the following disclaimer. |
|---|
| 14 |
|
|---|
| 15 |
;;; * Redistributions in binary form must reproduce the above |
|---|
| 16 |
;;; copyright notice, this list of conditions and the following |
|---|
| 17 |
;;; disclaimer in the documentation and/or other materials |
|---|
| 18 |
;;; provided with the distribution. |
|---|
| 19 |
|
|---|
| 20 |
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED |
|---|
| 21 |
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|---|
| 22 |
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 23 |
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
|---|
| 24 |
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|---|
| 25 |
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
|---|
| 26 |
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 27 |
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
|---|
| 28 |
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
|---|
| 29 |
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|---|
| 30 |
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|---|
| 31 |
|
|---|
| 32 |
(in-package :cl-ppcre) |
|---|
| 33 |
|
|---|
| 34 |
(defstruct (charmap (:constructor make-charmap%)) |
|---|
| 35 |
;; a bit vector mapping char codes to "booleans" (1 for set members, |
|---|
| 36 |
;; 0 for others) |
|---|
| 37 |
(vector #*0 :type simple-bit-vector) |
|---|
| 38 |
;; the smallest character code of all characters in the set |
|---|
| 39 |
(start 0 :type fixnum) |
|---|
| 40 |
;; the upper (exclusive) bound of all character codes in the set |
|---|
| 41 |
(end 0 :type fixnum) |
|---|
| 42 |
;; the number of characters in the set, or NIL if this is unknown |
|---|
| 43 |
(count nil :type (or fixnum null)) |
|---|
| 44 |
;; whether the charmap actually represents the complement of the set |
|---|
| 45 |
(complementp nil :type boolean)) |
|---|
| 46 |
|
|---|
| 47 |
;; seems to be necessary for some Lisps like ClozureCL |
|---|
| 48 |
(defmethod make-load-form ((map charmap) &optional environment) |
|---|
| 49 |
(make-load-form-saving-slots map :environment environment)) |
|---|
| 50 |
|
|---|
| 51 |
(declaim (inline in-charmap-p)) |
|---|
| 52 |
(defun in-charmap-p (char charmap) |
|---|
| 53 |
"Tests whether the character CHAR belongs to the set represented by CHARMAP." |
|---|
| 54 |
(declare #.*standard-optimize-settings*) |
|---|
| 55 |
(declare (character char) (charmap charmap)) |
|---|
| 56 |
(let* ((char-code (char-code char)) |
|---|
| 57 |
(char-in-vector-p |
|---|
| 58 |
(let ((charmap-start (charmap-start charmap))) |
|---|
| 59 |
(declare (fixnum charmap-start)) |
|---|
| 60 |
(and (<= charmap-start char-code) |
|---|
| 61 |
(< char-code (the fixnum (charmap-end charmap))) |
|---|
| 62 |
(= 1 (sbit (the simple-bit-vector (charmap-vector charmap)) |
|---|
| 63 |
(- char-code charmap-start))))))) |
|---|
| 64 |
(cond ((charmap-complementp charmap) (not char-in-vector-p)) |
|---|
| 65 |
(t char-in-vector-p)))) |
|---|
| 66 |
|
|---|
| 67 |
(defun charmap-contents (charmap) |
|---|
| 68 |
"Returns a list of all characters belonging to a character map. |
|---|
| 69 |
Only works for non-complement charmaps." |
|---|
| 70 |
(declare #.*standard-optimize-settings*) |
|---|
| 71 |
(declare (charmap charmap)) |
|---|
| 72 |
(and (not (charmap-complementp charmap)) |
|---|
| 73 |
(loop for code of-type fixnum from (charmap-start charmap) to (charmap-end charmap) |
|---|
| 74 |
for i across (the simple-bit-vector (charmap-vector charmap)) |
|---|
| 75 |
when (= i 1) |
|---|
| 76 |
collect (code-char code)))) |
|---|
| 77 |
|
|---|
| 78 |
(defun make-charmap (start end test-function &optional complementp) |
|---|
| 79 |
"Creates and returns a charmap representing all characters with |
|---|
| 80 |
character codes in the interval [start end) that satisfy |
|---|
| 81 |
TEST-FUNCTION. The COMPLEMENTP slot of the charmap is set to the |
|---|
| 82 |
value of the optional argument, but this argument doesn't have an |
|---|
| 83 |
effect on how TEST-FUNCTION is used." |
|---|
| 84 |
(declare #.*standard-optimize-settings*) |
|---|
| 85 |
(declare (fixnum start end)) |
|---|
| 86 |
(let ((vector (make-array (- end start) :element-type 'bit)) |
|---|
| 87 |
(count 0)) |
|---|
| 88 |
(declare (fixnum count)) |
|---|
| 89 |
(loop for code from start below end |
|---|
| 90 |
for char = (code-char code) |
|---|
| 91 |
for index from 0 |
|---|
| 92 |
when char do |
|---|
| 93 |
(incf count) |
|---|
| 94 |
(setf (sbit vector index) (if (funcall test-function char) 1 0))) |
|---|
| 95 |
(make-charmap% :vector vector |
|---|
| 96 |
:start start |
|---|
| 97 |
:end end |
|---|
| 98 |
;; we don't know for sure if COMPLEMENTP is true as |
|---|
| 99 |
;; there isn't a necessary a character for each |
|---|
| 100 |
;; integer below *REGEX-CHAR-CODE-LIMIT* |
|---|
| 101 |
:count (and (not complementp) count) |
|---|
| 102 |
;; make sure it's boolean |
|---|
| 103 |
:complementp (not (not complementp))))) |
|---|
| 104 |
|
|---|
| 105 |
(defun create-charmap-from-test-function (test-function start end) |
|---|
| 106 |
"Creates and returns a charmap representing all characters with |
|---|
| 107 |
character codes between START and END which satisfy TEST-FUNCTION. |
|---|
| 108 |
Tries to find the smallest interval which is necessary to represent |
|---|
| 109 |
the character set and uses the complement representation if that |
|---|
| 110 |
helps." |
|---|
| 111 |
(declare #.*standard-optimize-settings*) |
|---|
| 112 |
(let (start-in end-in start-out end-out) |
|---|
| 113 |
;; determine the smallest intervals containing the set and its |
|---|
| 114 |
;; complement, [start-in, end-in) and [start-out, end-out) - first |
|---|
| 115 |
;; the lower bound |
|---|
| 116 |
(loop for code from start below end |
|---|
| 117 |
for char = (code-char code) |
|---|
| 118 |
until (and start-in start-out) |
|---|
| 119 |
when (and char |
|---|
| 120 |
(not start-in) |
|---|
| 121 |
(funcall test-function char)) |
|---|
| 122 |
do (setq start-in code) |
|---|
| 123 |
when (and char |
|---|
| 124 |
(not start-out) |
|---|
| 125 |
(not (funcall test-function char))) |
|---|
| 126 |
do (setq start-out code)) |
|---|
| 127 |
(unless start-in |
|---|
| 128 |
;; no character satisfied the test, so return a "pseudo" charmap |
|---|
| 129 |
;; where IN-CHARMAP-P is always false |
|---|
| 130 |
(return-from create-charmap-from-test-function |
|---|
| 131 |
(make-charmap% :count 0))) |
|---|
| 132 |
(unless start-out |
|---|
| 133 |
;; no character failed the test, so return a "pseudo" charmap |
|---|
| 134 |
;; where IN-CHARMAP-P is always true |
|---|
| 135 |
(return-from create-charmap-from-test-function |
|---|
| 136 |
(make-charmap% :complementp t))) |
|---|
| 137 |
;; now determine upper bound |
|---|
| 138 |
(loop for code from (1- end) downto start |
|---|
| 139 |
for char = (code-char code) |
|---|
| 140 |
until (and end-in end-out) |
|---|
| 141 |
when (and char |
|---|
| 142 |
(not end-in) |
|---|
| 143 |
(funcall test-function char)) |
|---|
| 144 |
do (setq end-in (1+ code)) |
|---|
| 145 |
when (and char |
|---|
| 146 |
(not end-out) |
|---|
| 147 |
(not (funcall test-function char))) |
|---|
| 148 |
do (setq end-out (1+ code))) |
|---|
| 149 |
;; use the smaller interval |
|---|
| 150 |
(cond ((<= (- end-in start-in) (- end-out start-out)) |
|---|
| 151 |
(make-charmap start-in end-in test-function)) |
|---|
| 152 |
(t (make-charmap start-out end-out (complement* test-function) t))))) |
|---|