root/trunk/projects/bos/web/utf-8.lisp

Revision 3671, 3.6 kB (checked in by ksprotte, 4 months ago)

again whitespace cleanup + removed tabs

Line 
1 (in-package :bos.web)
2
3 ;; this code is heavily inspired from trivial-utf-8
4 ;; it only has one API function, which was not provided
5 ;; exactly as we need it by trivial-utf-8
6
7 ;; API
8 ;; utf-8-string-to-bytes STRING
9
10 (eval-when (:compile-toplevel :load-toplevel :execute)
11   (defparameter *optimize*
12     '(optimize (speed 3) (safety 0) (space 0) (debug 1)
13       (compilation-speed 0))))
14
15 (define-condition utf-8-decoding-error (simple-error)
16   ((message :initarg :message)
17    (byte :initarg :byte :initform nil))
18   (:report (lambda (err stream)
19              (format stream (slot-value err 'message)
20                      (slot-value err 'byte)))))
21
22 (declaim (inline utf-8-group-size))
23 (defun utf-8-group-size (byte)
24   "Determine the amount of bytes that are part of the character
25 starting with a given byte."
26   (declare (type fixnum byte)
27            #.*optimize*)
28   (cond ((zerop (logand byte #b10000000)) 1)
29         ((= (logand byte #b11100000) #b11000000) 2)
30         ((= (logand byte #b11110000) #b11100000) 3)
31         ((= (logand byte #b11111000) #b11110000) 4)
32         (t (error 'utf-8-decoding-error :byte byte
33                   :message "Invalid byte at start of character: 0x~X"))))
34
35 (defun utf-8-string-length (string)
36   "Calculate the length of the string encoded by the given bytes."
37   (declare (type simple-string string)
38            #.*optimize*)
39   (loop :with i = 0
40      :with string-length = 0
41      :with array-length = (length string)
42      :while (< i array-length)
43      :do (progn
44            (incf (the fixnum string-length) 1)
45            (incf i (utf-8-group-size (char-code (char string i)))))
46      :finally (return string-length)))
47
48 (defun get-utf-8-character (string group-size &optional (start 0))
49   "Given an array of bytes and the amount of bytes to use,
50 extract the character starting at the given start position."
51   (declare (type simple-string string)
52            (type fixnum group-size start)
53            #.*optimize*)
54   (labels ((next-byte ()
55              (prog1 (char-code (char string start))
56                (incf start)))
57            (six-bits (byte)
58              (unless (= (logand byte #b11000000) #b10000000)
59                (error 'utf-8-decoding-error :byte byte
60                       :message "Invalid byte 0x~X inside a character."))
61              (ldb (byte 6 0) byte)))
62     (case group-size
63       (1 (next-byte))
64       (2 (logior (ash (ldb (byte 5 0) (next-byte)) 6)
65                  (six-bits (next-byte))))
66       (3 (logior (ash (ldb (byte 4 0) (next-byte)) 12)
67                  (ash (six-bits (next-byte)) 6)
68                  (six-bits (next-byte))))
69       (4 (logior (ash (ldb (byte 3 0) (next-byte)) 18)
70                  (ash (six-bits (next-byte)) 12)
71                  (ash (six-bits (next-byte)) 6)
72                  (six-bits (next-byte)))))))
73
74 (defun utf-8-string-to-bytes (string)
75   (declare #.*optimize*)
76   (loop
77      with buffer = (make-array (utf-8-string-length string)
78                                :element-type '(unsigned-byte 16))
79      with string-position = 0
80      with buffer-position = 0
81      with string-length = (length string)
82      while (< string-position string-length)
83      do (let* ((byte (char-code (char string string-position)))
84                (current-group (utf-8-group-size byte)))
85           (when (> (+ current-group string-position) string-length)
86             (error 'utf-8-decoding-error
87                    :message "Unfinished character at end of byte array."))
88           (setf (aref buffer buffer-position)
89                 (get-utf-8-character string current-group string-position))
90           (incf buffer-position 1)
91           (incf string-position current-group))
92      finally (return buffer)))
Note: See TracBrowser for help on using the browser.