| 1 |
(in-package :bos.web) |
|---|
| 2 |
|
|---|
| 3 |
;; this code is heavily inspired from trivial-utf-8 |
|---|
| 4 |
;; it only has one API function, which was not provided |
|---|
| 5 |
;; exactly as we need it by trivial-utf-8 |
|---|
| 6 |
|
|---|
| 7 |
;; API |
|---|
| 8 |
;; utf-8-string-to-bytes STRING |
|---|
| 9 |
|
|---|
| 10 |
(eval-when (:compile-toplevel :load-toplevel :execute) |
|---|
| 11 |
(defparameter *optimize* |
|---|
| 12 |
'(optimize (speed 3) (safety 0) (space 0) (debug 1) |
|---|
| 13 |
(compilation-speed 0)))) |
|---|
| 14 |
|
|---|
| 15 |
(define-condition utf-8-decoding-error (simple-error) |
|---|
| 16 |
((message :initarg :message) |
|---|
| 17 |
(byte :initarg :byte :initform nil)) |
|---|
| 18 |
(:report (lambda (err stream) |
|---|
| 19 |
(format stream (slot-value err 'message) |
|---|
| 20 |
(slot-value err 'byte))))) |
|---|
| 21 |
|
|---|
| 22 |
(declaim (inline utf-8-group-size)) |
|---|
| 23 |
(defun utf-8-group-size (byte) |
|---|
| 24 |
"Determine the amount of bytes that are part of the character |
|---|
| 25 |
starting with a given byte." |
|---|
| 26 |
(declare (type fixnum byte) |
|---|
| 27 |
#.*optimize*) |
|---|
| 28 |
(cond ((zerop (logand byte #b10000000)) 1) |
|---|
| 29 |
((= (logand byte #b11100000) #b11000000) 2) |
|---|
| 30 |
((= (logand byte #b11110000) #b11100000) 3) |
|---|
| 31 |
((= (logand byte #b11111000) #b11110000) 4) |
|---|
| 32 |
(t (error 'utf-8-decoding-error :byte byte |
|---|
| 33 |
:message "Invalid byte at start of character: 0x~X")))) |
|---|
| 34 |
|
|---|
| 35 |
(defun utf-8-string-length (string) |
|---|
| 36 |
"Calculate the length of the string encoded by the given bytes." |
|---|
| 37 |
(declare (type simple-string string) |
|---|
| 38 |
#.*optimize*) |
|---|
| 39 |
(loop :with i = 0 |
|---|
| 40 |
:with string-length = 0 |
|---|
| 41 |
:with array-length = (length string) |
|---|
| 42 |
:while (< i array-length) |
|---|
| 43 |
:do (progn |
|---|
| 44 |
(incf (the fixnum string-length) 1) |
|---|
| 45 |
(incf i (utf-8-group-size (char-code (char string i))))) |
|---|
| 46 |
:finally (return string-length))) |
|---|
| 47 |
|
|---|
| 48 |
(defun get-utf-8-character (string group-size &optional (start 0)) |
|---|
| 49 |
"Given an array of bytes and the amount of bytes to use, |
|---|
| 50 |
extract the character starting at the given start position." |
|---|
| 51 |
(declare (type simple-string string) |
|---|
| 52 |
(type fixnum group-size start) |
|---|
| 53 |
#.*optimize*) |
|---|
| 54 |
(labels ((next-byte () |
|---|
| 55 |
(prog1 (char-code (char string start)) |
|---|
| 56 |
(incf start))) |
|---|
| 57 |
(six-bits (byte) |
|---|
| 58 |
(unless (= (logand byte #b11000000) #b10000000) |
|---|
| 59 |
(error 'utf-8-decoding-error :byte byte |
|---|
| 60 |
:message "Invalid byte 0x~X inside a character.")) |
|---|
| 61 |
(ldb (byte 6 0) byte))) |
|---|
| 62 |
(case group-size |
|---|
| 63 |
(1 (next-byte)) |
|---|
| 64 |
(2 (logior (ash (ldb (byte 5 0) (next-byte)) 6) |
|---|
| 65 |
(six-bits (next-byte)))) |
|---|
| 66 |
(3 (logior (ash (ldb (byte 4 0) (next-byte)) 12) |
|---|
| 67 |
(ash (six-bits (next-byte)) 6) |
|---|
| 68 |
(six-bits (next-byte)))) |
|---|
| 69 |
(4 (logior (ash (ldb (byte 3 0) (next-byte)) 18) |
|---|
| 70 |
(ash (six-bits (next-byte)) 12) |
|---|
| 71 |
(ash (six-bits (next-byte)) 6) |
|---|
| 72 |
(six-bits (next-byte))))))) |
|---|
| 73 |
|
|---|
| 74 |
(defun utf-8-string-to-bytes (string) |
|---|
| 75 |
(declare #.*optimize*) |
|---|
| 76 |
(loop |
|---|
| 77 |
with buffer = (make-array (utf-8-string-length string) |
|---|
| 78 |
:element-type '(unsigned-byte 16)) |
|---|
| 79 |
with string-position = 0 |
|---|
| 80 |
with buffer-position = 0 |
|---|
| 81 |
with string-length = (length string) |
|---|
| 82 |
while (< string-position string-length) |
|---|
| 83 |
do (let* ((byte (char-code (char string string-position))) |
|---|
| 84 |
(current-group (utf-8-group-size byte))) |
|---|
| 85 |
(when (> (+ current-group string-position) string-length) |
|---|
| 86 |
(error 'utf-8-decoding-error |
|---|
| 87 |
:message "Unfinished character at end of byte array.")) |
|---|
| 88 |
(setf (aref buffer buffer-position) |
|---|
| 89 |
(get-utf-8-character string current-group string-position)) |
|---|
| 90 |
(incf buffer-position 1) |
|---|
| 91 |
(incf string-position current-group)) |
|---|
| 92 |
finally (return buffer))) |
|---|