;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-INTERPOL; Base: 10 -*- ;;; $Header: /usr/local/cvsrep/cl-interpol/read.lisp,v 1.31 2008/07/23 15:13:08 edi Exp $ ;;; Copyright (c) 2003-2008, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package :cl-interpol) (defun read-while (predicate &key max) "Reads characters from *STREAM* while PREDICATE returns a true value for each character. Returns at most MAX characters if MAX is true." (when (eql max 0) (return-from read-while "")) (let ((collector (make-collector))) (loop for count of-type fixnum from 1 for c = (peek-char*) while (and (or (not max) (<= count max)) c (funcall predicate c)) do (vector-push-extend (read-char*) collector) finally (return collector)))) (declaim (inline get-number)) (defun get-number (&key (radix 10) max) "Reads and consumes the number *STREAM* is currently looking at and returns it. Returns NIL if no number could be identified. RADIX is used as in PARSE-INTEGER. If MAX is not NIL we'll read at most the next MAX characters." (parse-integer (read-while (lambda (c) (digit-char-p c radix)) :max max) :radix radix :junk-allowed t)) (defun resolve-unicode-name (name) "Tries to return a character which was encoded as \\N." (or (character-named name) (gethash (canonicalize-name name) *unicode-aliases*))) (defun get-char-from-unicode-name () "Parses and returns a named character after \"\\N\" has already been read. This function reads from *STREAM*." (let ((next-char (read-char*))) (unless (char= next-char #\{) (signal-reader-error "Expected { after \\N")) (let ((name (read-while (lambda (c) (and (char/= c #\}) (char/= c *term-char*)))))) (let ((next-char (read-char*))) (unless (char= next-char #\}) (signal-reader-error "Expected } after Unicode character name"))) (or (resolve-unicode-name name) (signal-reader-error "Could not find character with name '~A'" name))))) (defun unescape-char (regex-mode) "Convert the characters(s) on *STREAM* following a backslash into a character which is returned. This function is to be called when the backslash has already been consumed." (let ((chr (read-char*))) ;; certain escape sequences are left as is when in regex mode (when (or (and (eq regex-mode :in-char-class) (find chr "pPwWsSdD" :test #'char=)) (and (eq regex-mode t) (find chr "kpPwWsSdDbBAZz" :test #'char=))) (return-from unescape-char (concatenate 'string "\\" (string chr)))) (let ((result (case chr ((#\N) ;; named Unicode chars (get-char-from-unicode-name)) ((#\c) ;; \cx means control-x (when (char= (peek-char*) *term-char*) (signal-reader-error "String ended after \\c")) (code-char (logxor #x40 (char-code (char-upcase (read-char*)))))) ((#\x) (cond ((char= (peek-char*) #\{) ;; "wide" hex char, i.e. hexadecimal number is ;; enclosed in curly brackets (read-char*) (prog1 (let ((code (or (get-number :radix 16) ;; allow for empty string 0))) (or (and (< code char-code-limit) (code-char code)) (signal-reader-error "No character for char-code #x~X" code))) (unless (char= (peek-char*) #\}) (signal-reader-error "Expected } after hex code")) (read-char*))) (t ;; \x should be followed by a hexadecimal char ;; code, two digits or less; note that it is ;; OK if \x is followed by zero digits (make-char-from-code (get-number :radix 16 :max 2))))) ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (cond ((and (eq regex-mode t) (char/= chr #\0)) ;; leave as is if we're in regex mode (and not ;; within in a character class) (concatenate 'string "\\" (string chr))) ((or (char= chr #\8) (char= chr #\9)) ;; outside of regex mode "\8" is "8" (in regex ;; mode it is read like "\08"...) chr) (t (unread-char chr *stream*) ;; now \x should be followed by an octal char ;; code, three digits or less (make-char-from-code (get-number :radix 8 :max 3))))) ;; the following five character names are ;; 'semi-standard' according to the CLHS but I'm not ;; aware of any implementation that doesn't implement ;; them ((#\t) #\Tab) ((#\n) #\Newline) ((#\r) #\Return) ((#\f) #\Page) ((#\b) #\Backspace) ((#\a) (code-char 7)) ; ASCII bell ((#\e) (code-char 27)) ; ASCII escape (otherwise ;; all other characters aren't affected by a backslash chr)))) (cond ((and (characterp result) ;; some characters must be 'protected' from CL-PPCRE (or (and (eq regex-mode :in-char-class) (find result "\\^[]-" :test #'char=)) (and (eq regex-mode t) (find result "\\^[]-.$|()*+?" :test #'char=)))) (concatenate 'string "\\" (string result))) (t result))))) (declaim (inline normal-name-char-p) (inline never-name-char-p)) (defun normal-name-char-p (c) (and c (or (alphanumericp c) (member c '(#\_ #\- #\+ #\*))))) (defun never-name-char-p (c) (or (not c) (get-macro-character c) (member c '(#\$ #\@)))) (defvar quell-warnings-form #+sbcl '(declare (optimize (sb-ext:inhibit-warnings 3))) #-sbcl nil "A declaration form to quiet warnings about unbound variables within a lexical environment.") (defun read-longest-name () (coerce (loop until (never-name-char-p (peek-char nil *stream* nil nil t)) collect (read-char*)) 'string)) (defun read-optional-delimited () "Read the stuff following an optional delimiter, returning a form that tries to deal correctly with lexical variables." (flet ((try-pos (name i form) (let ((ostr (gensym))) `(handler-case (with-output-to-string (,ostr) (princ ,(read-from-string (subseq name 0 i)) ,ostr) (princ ,(subseq name i) ,ostr) ,ostr) (unbound-variable () ,form))))) (loop with name = (read-longest-name) with form = `(error ,(format nil "Interpolation error in ~s~%" name)) with ostr = (gensym) for i = (position-if-not #'normal-name-char-p name) then (position-if-not #'normal-name-char-p name :start (1+ i)) unless i return `(let () ,quell-warnings-form (handler-case (with-output-to-string (,ostr) (princ ,(read-from-string name) ,ostr) ,ostr) (unbound-variable () ,form))) if (> i 0) do (setq form (try-pos name i form)) if (< i (length name)) do (setq form (try-pos name (1+ i) form))))) (declaim (inline read-form)) (defun read-form () "Reads and returns one or more Lisp forms from *STREAM* if the character we're looking at is a valid inner delimiter. Otherwise returns NIL." (let* ((start-delimiter (peek-char*)) (end-delimiter (get-end-delimiter start-delimiter *inner-delimiters*))) (cond ((null end-delimiter) (if *optional-delimiters-p* (read-optional-delimited) nil)) (t `(progn ,@(progn (read-char*) (let ((*readtable* (copy-readtable*))) ;; temporarily change the readtable (set-syntax-from-char end-delimiter #\)) (read-delimited-list end-delimiter *stream* t)))))))) (defun interpol-reader (*stream* char arg) "The actual reader function for the 'sub-character' #\?." (declare (ignore arg char)) (let ((*start-char* (read-char*)) ;; REGEX-MODE is true if we're in regular expression mode; it ;; can have one of the values :START-OF-CHAR-CLASS, ;; :START-OF-NEGATED-CHAR-CLASS, or :IN-CHAR-CLASS if we're ;; inside of a character class or just about to start one - ;; otherwise the value is T regex-mode ;; EXTENDED-MODE is true if we're in extended regular ;; expression mode extended-mode) (when (char-equal *start-char* #\r) (setq regex-mode t *start-char* (read-char*))) (when (char-equal *start-char* #\x) (setq extended-mode t *start-char* (read-char*))) (when (and (not regex-mode) (find *start-char* *regex-delimiters* :test #'char=)) (setq regex-mode t)) (unless regex-mode (setq extended-mode nil)) (let ((*term-char* (get-end-delimiter *start-char* *outer-delimiters* :errorp t)) (*pair-level* 0) (*inner-delimiters* (if regex-mode (intersection *inner-delimiters* '((#\{ . #\})) :test #'equal) *inner-delimiters*)) *saw-backslash* *readtable-copy*) (prog1 (inner-reader regex-mode extended-mode nil nil) ;; consume the closing outer delimiter (read-char*))))) (defun inner-reader (regex-mode extended-mode quote-mode case-mode) "Helper function for INTERPOL-READER which does all the work. May call itself recursively." ;; REGEX-MODE and EXTENDED-MODE as described above; QUOTE-MODE is ;; true if we're inside a \Q scope; CASE-MODE is true if we're ;; inside a \L or \U scope (let* ((string-stream (gensym)) ;; the string stream ;; we use for WITH-OUTPUT-TO-STRING ;; if this is not a constant string (collector (make-collector)) ;; we collect ;; characters into this ;; extentable string result ;; a list of all characters, strings, and forms ;; so far (in reverse order while withing the loop) handle-next-char) (block main-loop ;; we need this name so we can leave the LOOP below (flet ((compute-result () ;; local function used to leave the loop and compute ;; the final RESULT (setq result (nreverse (if (plusp (length collector)) ;; add COLLECTOR if it's not empty (cons collector result) result))) (return-from main-loop)) (parse-with-case-mode (action-name) ;; local function used to read while in a \U or \L scope (let ((string-to-modify ;; read until \E, \L, \U, or end of string (inner-reader regex-mode extended-mode regex-mode t))) (if (stringp string-to-modify) ;; modify directly if constant string (funcall action-name string-to-modify) ;; otherwise create a form to do that at run time `(write-string (,action-name ,string-to-modify) ,string-stream))))) (loop (let ((next-char (read-char*))) (when regex-mode ;; when in regex mode make sure where we are with ;; respect to character classes (setq regex-mode (case next-char ((#\[) (ecase regex-mode ((:start-of-char-class :start-of-negated-char-class :in-char-class) :in-char-class) ((t) :start-of-char-class))) ((#\^) (ecase regex-mode ((:start-of-char-class) :start-of-negated-char-class) ((:start-of-negated-char-class :in-char-class) :in-char-class) ((t) t))) ((#\]) (ecase regex-mode ((:start-of-char-class :start-of-negated-char-class) :in-char-class) ((:in-char-class t) t))) (otherwise (ecase regex-mode ((:start-of-char-class :start-of-negated-char-class :in-char-class) :in-char-class) ((t) t)))))) (when (and (char= next-char *start-char*) (char/= *start-char* *term-char*)) ;; if we see, say, #\( and our closing delimiter is #\) ;; we increment *PAIR-LEVEL* so the parentheses can next ;; without ending the string (incf *pair-level*)) (let ((interpolation (cond ((and (char= next-char *term-char*) (plusp *pair-level*)) ;; although this is the outer closing ;; delimiter we don't stop parsing because ;; we're insided a nested pair of ;; bracketing characters (decf *pair-level*) *term-char*) ((char= next-char *term-char*) ;; now we really stop - but we don't ;; consume the closing delimiter because ;; we may need it again to end another ;; scope (unread-char next-char *stream*) (compute-result)) (t (case next-char ((#\L) (cond ((not *saw-backslash*) ;; a normal #\L, no 'pending' ;; backslash #\L) (case-mode ;; a backslashed #\L which ;; we've seen before but we ;; still have to close at ;; least one \Q/\L/\E scope (unread-char #\L *stream*) (compute-result)) (t ;; all scopes are closed, now ;; read and downcase 'till \E ;; or somesuch (setq *saw-backslash* nil) (parse-with-case-mode 'string-downcase)))) ((#\U) ;; see comments for #\L above (cond ((not *saw-backslash*) #\U) (case-mode (unread-char #\U *stream*) (compute-result)) (t (setq *saw-backslash* nil) (parse-with-case-mode 'string-upcase)))) ((#\Space #\Tab #\Linefeed #\Return #\Page) (cond ((and extended-mode (not (eq regex-mode :in-char-class))) ;; in extended mode (if not in ;; a character class) ;; whitespace is removed "") (t next-char))) ((#\() (cond ((and (eq regex-mode t) (null quote-mode) (char/= *term-char* #\?) (eql (peek-char*) #\?)) ;; this could start an ;; embedded comment in regex ;; mode (and we're /not/ ;; inside of a \Q scope or a ;; character class) (read-char*) (cond ((and (char/= *term-char* #\#) (eql (peek-char*) #\#)) ;; yes, it's a ;; comment, so consume ;; characters 'till #\) (read-while (lambda (char) (and (char/= char #\)) (char/= char *term-char*)))) (cond ((char= (read-char*) *term-char*) (signal-reader-error "Incomplete regex comment starting with '(#'")) ((not (digit-char-p (peek-char*) 16)) "") ;; special case ;; if next ;; character ;; could ;; potentially ;; continue an ;; octal or ;; hexadecimal ;; representation (t "(?:)"))) ;; no, wasn't a comment (t "(?"))) (t #\())) ((#\#) (cond ((and (eq regex-mode t) extended-mode (null quote-mode)) ;; we're in extended regex ;; mode and not inside of a \Q ;; scope or a character class, ;; so this is a comment and we ;; consume it 'till #\Newline ;; or *TERM-CHAR* (read-while (lambda (char) (and (char/= char #\Newline) (char/= char *term-char*)))) (when (char= (peek-char*) #\Newline) (read-char*)) (cond ((not (digit-char-p (peek-char*) 16)) "") ;; special case, see above (t "(?:)"))) (t #\#))) ((#\\) (case (peek-char*) ((#\Q) ;; \Q - start a new quote scope (read-char*) (let ((string-to-quote (inner-reader regex-mode extended-mode t case-mode))) (if (stringp string-to-quote) ;; if we got a constant string ;; we modify it directly (quote-meta-chars string-to-quote) ;; otherwise we expand into code `(write-string (quote-meta-chars ,string-to-quote) ,string-stream)))) ((#\L) ;; \L - start a new case-modifying ;; scope (cond (case-mode ;; if we're already in ;; this mode we have to ;; end all previous scopes ;; first - we set ;; *SAW-BACKSLASH* to T so ;; the #\L is read until ;; all scopes are finished (setq *saw-backslash* t) (compute-result)) (t ;; all scopes are closed, now ;; read and downcase 'till \E ;; or somesuch (setq *saw-backslash* nil) (read-char*) (parse-with-case-mode 'string-downcase)))) ((#\U) ;; see comments for #\L above (cond (case-mode (setq *saw-backslash* t) (compute-result)) (t (setq *saw-backslash* nil) (read-char*) (parse-with-case-mode 'string-upcase)))) ((#\E) ;; \E - ends exactly one scope (read-char*) (if (or quote-mode case-mode) (compute-result) "")) ((#\l) ;; \l - downcase next character (read-char*) ;; remember that we have to do this (setq handle-next-char :downcase) nil) ((#\u) ;; \u - upcase next character (read-char*) ;; remember that we have to do this (setq handle-next-char :upcase) nil) (otherwise ;; otherwise this is a ;; backslash-escaped character (unescape-char regex-mode)))) ((#\$) ;; #\$ - might be an interpolation (let ((form (read-form))) (cond ((null form) ;; no, just dollar sign #\$) (handle-next-char ;; yes, and we have to ;; modify the first ;; character (prog1 (let ((string (gensym))) `(let ((,string (format nil "~A" ,form))) (when (plusp (length ,string)) (setf (char ,string 0) (,(if (eq handle-next-char :downcase) 'char-downcase 'char-upcase) (char ,string 0)))) (write-string ,string ,string-stream))) (setq handle-next-char nil))) (t ;; no modification, just ;; insert a form to PRINC ;; this interpolation `(princ ,form ,string-stream))))) ((#\@) ;; #\Q - might be an interpolation (let ((form (read-form)) (element (gensym)) (first (gensym))) (cond ((null form) ;; no, just at-sign #\@) (handle-next-char ;; yes, and we have to ;; modify the first ;; character (prog1 (let ((string (gensym))) `(loop for ,first = t then nil for ,element in ,form unless ,first do (princ *list-delimiter* ,string-stream) if ,first do (let ((,string (format nil "~A" ,element))) (when (plusp (length ,string)) (setf (char ,string 0) (,(if (eq handle-next-char :downcase) 'char-downcase 'char-upcase) (char ,string 0)))) (write-string ,string ,string-stream)) else do (princ ,element ,string-stream))) (setq handle-next-char nil))) (t ;; no modification, just ;; insert a form to PRINC ;; this interpolated list ;; (including the list ;; delimiters inbetween) `(loop for ,first = t then nil for ,element in ,form unless ,first do (princ *list-delimiter* ,string-stream) do (princ ,element ,string-stream)))))) ;; just a 'normal' character (otherwise next-char)))))) (when interpolation ;; INTERPOLATION is NIL if we just saw #\l or #\u (when (and handle-next-char (consp interpolation) (eq (first interpolation) 'write-string)) ;; if we have to upcase or downcase the following ;; character and we just collected a form (from a ;; \Q/\L/\U scope) we have to insert code for the ;; modification (setf (second interpolation) (let ((string (gensym))) `(let ((,string ,(second interpolation))) (when (plusp (length ,string)) (setf (char ,string 0) (,(if (eq handle-next-char :downcase) 'char-downcase 'char-upcase) (char ,string 0)))) ,string))) (setq handle-next-char nil)) (cond ((characterp interpolation) ;; add one character to COLLECTOR and handle ;; it according to HANDLE-NEXT-CHAR (vector-push-extend (case handle-next-char ((:downcase) (setq handle-next-char nil) (char-downcase interpolation)) ((:upcase) (setq handle-next-char nil) (char-upcase interpolation)) (otherwise interpolation)) collector)) ((stringp interpolation) ;; add a string to COLLECTOR and handle its ;; first character according to ;; HANDLE-NEXT-CHAR (loop for char across interpolation do (vector-push-extend (case handle-next-char ((:downcase) (setq handle-next-char nil) (char-downcase char)) ((:upcase) (setq handle-next-char nil) (char-upcase char)) (otherwise char)) collector))) ((plusp (length collector)) ;; add code (to be executed at runtime) but ;; make sure to empty COLLECTOR first (push collector result) (push interpolation result) ;; reset collector (setf collector (make-collector))) (t ;; same but COLLECTOR is empty (push interpolation result))))))))) (if (every #'stringp result) ;; if all elements of RESULT are strings we can return a ;; constant string (string-list-to-string result) ;; otherwise we have to wrap the PRINCs emitted above into a ;; WITH-OUTPUT-TO-STRING form `(with-output-to-string (,string-stream) ,@(loop for interpolation in result if (stringp interpolation) collect `(write-string ,interpolation ,string-stream) else collect interpolation))))) (defun %enable-interpol-syntax () "Internal function used to enable reader syntax and store current readtable on stack." (push *readtable* *previous-readtables*) (setq *readtable* (copy-readtable)) (set-dispatch-macro-character #\# #\? #'interpol-reader) (values)) (defun %disable-interpol-syntax () "Internal function used to restore previous readtable." (if *previous-readtables* (setq *readtable* (pop *previous-readtables*)) (setq *readtable* (copy-readtable nil))) (values)) (defmacro enable-interpol-syntax () "Enable CL-INTERPOL reader syntax." `(eval-when (:compile-toplevel :load-toplevel :execute) (%enable-interpol-syntax))) (defmacro disable-interpol-syntax () "Restore readtable which was active before last call to ENABLE-INTERPOL-SYNTAX. If there was no such call, the standard readtable is used." `(eval-when (:compile-toplevel :load-toplevel :execute) (%disable-interpol-syntax)))