root/trunk/thirdparty/cl-pdf/pdf-parser.lisp

Revision 4148, 20.5 kB (checked in by hans, 2 weeks ago)

Fix some small bugs.

Line 
1 ;;; cl-pdf copyright 2002-2005 Marc Battyani see license.txt for the details
2 ;;; You can reach me at marc.battyani@fractalconcept.com or marc@battyani.net
3 ;;; The homepage of cl-pdf is here: http://www.fractalconcept.com/asp/html/cl-pdf.html
4
5 ;; Thanks to Arthur Lemmens for the recursive descent parser :)
6 ;; (It's much nicer than my original LALR parser ;-)
7 ;; See the example at the end for usage.
8
9 (in-package #:pdf)
10
11 (defvar *pdf-input-stream* nil)
12 (defvar *indirect-objects* nil)
13
14 (defun find-indirect-object (obj-number gen-number)
15   (let ((object.pos (gethash (cons obj-number gen-number) *indirect-objects*)))
16     (if object.pos
17         (values (car object.pos) (cdr object.pos))
18         (make-indirect-object obj-number gen-number 0))))
19
20 (defun make-indirect-object (obj-number gen-number position)
21   (let ((object (or (car (gethash (cons obj-number gen-number) *indirect-objects*))
22                     (make-instance 'indirect-object
23                                    :obj-number obj-number
24                                    :gen-number gen-number
25                                    :content :unread
26                                    :no-link t))))
27     (setf (gethash (cons obj-number gen-number) *indirect-objects*) (cons object position))
28     object))
29
30 (defun read-indirect-object (obj-number gen-number)
31   (multiple-value-bind (object position) (find-indirect-object obj-number gen-number)
32     (when (eq (content object) :unread)
33       (setf (content object) (read-indirect-object-content position)))
34     object))
35
36 (defun load-indirect-object (object)
37   (when (eq (content object) :unread)
38     (read-indirect-object (obj-number object) (gen-number object)))
39   object)
40
41 (defun delete-indirect-object (object)
42   (remhash (cons (obj-number object) (gen-number object)) *indirect-objects*))
43
44 (defun load-all-indirect-objects ()
45   (loop for object.pos being the hash-value of *indirect-objects*
46         do (load-indirect-object (car object.pos))))
47
48 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49 ;;; Parser
50 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51
52 (define-condition pdf-parse-error ()
53   ((stream :initarg :stream :reader pdf-parse-error-stream)
54    (message :initarg :message :reader pdf-parse-error-message))
55   (:report (lambda (condition stream)
56              (format stream "~&Error at position ~D while reading PDF document.~%~A~%~A~%"                     (file-position (pdf-parse-error-stream condition))                     (pdf-parse-error-message condition)
57                      (pdf-parse-error-stream condition)))))
58
59 (defun unexpected-character-error (char)
60   (cerror "Ignore the character and continue."
61           'pdf-parse-error
62           :stream *pdf-input-stream*
63           :message (format nil "Unexpected character ~S at ~A." char
64                            (file-position *pdf-input-stream*))))
65
66 (defvar +white-char+ (coerce '(#\Space #\Newline #\Return #\Tab #\Null #\Page) 'string))
67
68 (defun white-char-p (c)
69   (find c +white-char+))
70
71 (defun octal-digit-char-p (char)
72   (find char "01234567"))
73
74 (defun hex-digit-char-p (char)
75   (find char "0123456789ABCDEF"))
76
77 (defun numeric-char-p (char)
78   (find char "0123456789-."))
79
80 (defun name-char-p (char)
81   (and (<= #x21 (char-code char) #x7E)
82        (not (find char "%()<>[]{}/#"))))
83
84 ;;; Skipping characters
85
86 (defun eat-char (expected-char)
87   (let ((char (read-char *pdf-input-stream*)))
88     (unless (char= char expected-char)
89       (cerror "Ignore the character and continue."
90               'pdf-parse-error
91               :stream *pdf-input-stream*
92               :message (format nil "Unexpected character ~S (expected ~S) at ~A."
93                                char expected-char (file-position *pdf-input-stream*))))))
94
95 (defun eat-chars (chars)
96   (loop for char across chars
97         do (eat-char char)))
98
99 (defun eat-keyword (keyword)
100   (skip-whitespace t)
101   (eat-chars keyword)
102   (skip-whitespace t))
103
104 (defun skip-whitespace (eof-error-p &key (skip-comments t))
105   (loop
106    (let ((char (read-char *pdf-input-stream* eof-error-p)))
107      (cond ((null char)
108             (return :eof))
109            ((and skip-comments (eql char #\%))
110             (loop for c = (read-char *pdf-input-stream* nil)
111                   until (or (null c) (eq c #\Newline))))
112            ((not (white-char-p char))
113             (unread-char char *pdf-input-stream*)
114             (return char))))))
115
116 ;;; PDF Basic objects (see Chapter 4)
117
118 (defun read-object (&optional (eof-error-p t))
119   "Returns one of the following PDF objects: boolean (:true or :false),
120 number (Lisp number), string (Lisp string), name (Lisp symbol in the PDF
121 package), array (Lisp vector), dictionary (Lisp property list), stream
122 (Lisp pdf-stream) or null (Lisp NIL). When EOF-ERRORP is nil, it returns
123 :eof for the end of the stream (otherwise it signals an error)."
124   (skip-whitespace eof-error-p)
125   (let ((char (peek-char nil *pdf-input-stream* eof-error-p)))
126     (cond ((numeric-char-p char)
127            (read-number))
128           ((eql char #\()
129            (eat-char #\()
130            (read-pdf-string))
131           ((eql char #\/)
132            (eat-char #\/)
133            (read-name ))
134           ((eql char #\[)
135            (eat-char #\[)
136            (read-array))
137           ((eql char #\<)
138            (eat-char #\<)
139            (let ((next-char (peek-char nil *pdf-input-stream*)))
140              (if (char= next-char #\<)
141                  (progn (eat-char #\<)
142                    (read-dictionary-or-stream))
143                (read-hex-string))))
144           ((eql char #\t)
145            (eat-chars "true")
146            "true")
147           ((eql char #\f)
148            (eat-chars "false")
149            "false")
150           ((eql char #\n)
151            (eat-chars "null")
152            nil)
153           ((eql char #\e)
154            ;; this is probably an empty indirect object.  WRITE-OBJECT
155            ;; can write them, so we should be able to read them too.
156            nil)
157           (t (unexpected-character-error char)))))
158
159 (defun read-number ()
160   (read-from-string
161    (with-output-to-string (s)
162      (loop for char = (read-char *pdf-input-stream* nil nil)
163            until (or (null char) (not (numeric-char-p char)))
164            do (write-char char s)
165            finally (when char
166                      (unread-char char *pdf-input-stream*))))))
167
168 ;;; Strings
169
170 (defun read-pdf-string ()
171   ;; TODO: Deal with encodings.
172   (with-output-to-string (s)
173     (write-char #\( s)
174     (loop for prev-char = #\( then char
175           for char = (read-char *pdf-input-stream*)
176           until (and (char= char #\))(not (char= prev-char #\\)))
177           do (write-char char s))
178     (write-char #\) s)))
179
180 (defun read-hex-string ()
181   (with-output-to-string (s)
182     (write-char #\< s)
183     (loop for char = (read-char *pdf-input-stream*)
184        until (char= char #\>)
185        do (write-char char s))
186     (write-char #\> s)))
187
188 (defun parse-hex (digit-1 digit-2)
189   (flet ((parse-digit (digit)
190            (- (char-code digit)
191               (if (char<= #\0 digit #\9) #.(char-code #\0) #.(- (char-code #\A) 10)))))
192     (+ (* 16 (parse-digit (char-upcase digit-1)))
193        (parse-digit (char-upcase digit-2)))))
194
195 ;;; Names
196
197 (defun read-name ()
198   (with-output-to-string (s)
199     (write-char #\/ s)
200     (loop (let ((char (read-char *pdf-input-stream* nil nil)))
201             (cond ((null char) (return))
202                   ((eql char #\#)
203                    (let ((digit-1 (read-char *pdf-input-stream*))
204                          (digit-2 (read-char *pdf-input-stream*)))
205                      (unless (and (hex-digit-char-p digit-1)
206                                   (hex-digit-char-p digit-2))
207                        (error 'pdf-parse-error
208                               :stream *pdf-input-stream*
209                               :message "Illegal hexadecimal escape sequence in PDF name."))
210                      (write-char (code-char (parse-hex digit-1 digit-2))
211                                  s)))
212                   ((name-char-p char)
213                    (write-char char s))
214                   (t (unread-char char *pdf-input-stream*)
215                      (return)))))))
216
217 ;;; Arrays
218
219 (defun read-array ()
220   ;; #\[ has already been read
221   (let ((stack '()))
222     (loop (skip-whitespace t)
223           (let ((char (peek-char nil *pdf-input-stream*)))
224             (case char
225               ( #\]
226                 (eat-char #\])
227                 (return))
228               ( #\R
229                 (eat-char #\R)
230                 ;; Reduce last two numbers to indirect-reference
231                 (let ((generation-number (pop stack))
232                       (object-number (pop stack)))
233                   (assert (integerp generation-number))
234                   (assert (integerp object-number))
235                   (push (find-indirect-object object-number generation-number) stack)))
236               (otherwise
237                (push (read-object) stack)))))
238     (make-array (length stack)
239                 :initial-contents (nreverse stack))))
240
241 ;;; Dictionaries and streams
242
243 (defun read-dictionary-properties ()
244   (let ((plist '()))
245     (loop (skip-whitespace t)
246           (let ((char (peek-char nil *pdf-input-stream*)))
247             (case char
248               ( #\>
249                 (eat-chars ">>")
250                 (return))
251               ( #\R
252                 (eat-char #\R)
253                 ;; Reduce last two numbers to indirect-object
254                 (let ((generation-number (pop plist))
255                       (object-number (pop plist)))
256                   (assert (integerp generation-number))
257                   (assert (integerp object-number))
258                   (push (find-indirect-object object-number generation-number) plist)))
259               (otherwise
260                (push (read-object t)
261                      plist)))))
262     (loop for (k v . rest) on (nreverse plist) by #'cddr
263           collect (cons k v))))
264
265 (defun read-dictionary ()
266   (make-instance 'dictionary :dict-values (read-dictionary-properties)))
267
268 (defun read-dictionary-or-stream ()
269   (let ((properties (read-dictionary-properties)))
270     ;; Check if dictionary is followed by a stream
271     (skip-whitespace nil)
272     (let ((char (peek-char nil *pdf-input-stream* nil nil)))
273       (cond ((eql char #\s)
274              ;; Don't use EAT-KEYWORD here, because it may eat too many newlines!
275              (eat-chars "stream")
276              (read-line *pdf-input-stream*)
277              (read-pdf-stream properties))
278             (t (make-instance 'dictionary :dict-values properties))))))
279
280 (defun read-pdf-stream (properties)
281   (let ((length (cdr (assoc "/Length" properties :test #'string=))))
282     (when (typep length 'indirect-object)
283       (let ((saved-filepos (file-position *pdf-input-stream*)))
284         (setq length (content (load-indirect-object length)))
285         (file-position *pdf-input-stream* saved-filepos)))
286     (assert (integerp length))
287     (let ((content (loop repeat (ceiling length array-total-size-limit)
288                       for buffer-size = (min length array-total-size-limit)
289                       ;; while (plusp buffer-size)
290                       collect (let* ((buffer (make-string buffer-size))
291                                      (bytes-read (read-sequence buffer *pdf-input-stream*)))
292                                 (when (< bytes-read buffer-size)
293                                   (error 'pdf-parse-error
294                                          :stream *pdf-input-stream*
295                                          :message "Unexpected end of PDF-stream."))
296                                 buffer)
297                       do (decf length buffer-size))))
298       (eat-keyword "endstream")
299       (make-instance 'pdf-stream :dict-values properties :content content :no-compression t))))
300
301 ;;; Integers
302
303 (defun read-integer ()
304   (let ((object (read-object)))
305     (unless (integerp object)
306       (error 'pdf-parse-error
307              :stream *pdf-input-stream*
308              :message "Integer expected."))
309     object))
310
311 ;;; Indirect objects
312
313 (defun read-indirect-object-content (file-position)
314   (file-position *pdf-input-stream* file-position)
315   (let* ((object-number (read-integer))
316          (generation-number (read-integer)))
317     (eat-keyword "obj")
318     (let ((object (read-object)))
319       ;; Some producers forget the "endobj" at the end of a stream
320       ;; Let's try to be tolerant.
321       (skip-whitespace t)
322       (when (char= #\e (peek-char nil *pdf-input-stream*))
323         (eat-keyword "endobj"))
324       object)))
325
326 ;;; xref
327
328 (defun read-xref-and-trailer (position)
329   (let (first-trailer)
330     (loop
331        (read-cross-reference-subsections position)
332        (let* ((trailer (read-trailer)))
333          (unless first-trailer (setf first-trailer trailer))
334          (let ((prev-position (get-dict-value trailer "/Prev")))
335            (if prev-position
336                (setq position prev-position)
337                (return first-trailer)))))))
338
339 (defun read-cross-reference-subsections (position)
340   (file-position *pdf-input-stream* position)
341   (eat-keyword "xref")
342   (loop
343    ;; the eat-keyword skips whitespace the first time, but there can
344    ;; be more whitespace the second time around.
345      (skip-whitespace t)
346      (let ((char (peek-char nil *pdf-input-stream*)))
347        (cond ((char= char #\t) (return))
348              (t (read-cross-reference-subsection))))))
349
350 (defun read-cross-reference-subsection ()
351   (let ((first-entry (read-integer))
352         (nr-entries (read-integer)))
353     (loop repeat nr-entries
354           for number from first-entry
355           do (read-cross-reference-entry number))))
356
357 (defun read-cross-reference-entry (number)
358   (let ((position (read-integer)))
359     (skip-whitespace nil)
360     (let ((gen (read-integer)))
361       (skip-whitespace nil)
362       (let ((type (read-char *pdf-input-stream*)))
363         (skip-whitespace nil)
364         (if (char= type #\n)
365             (make-indirect-object number gen position))))))
366
367 (defun read-trailer ()
368   (eat-keyword "trailer")
369   (skip-whitespace t)
370   (eat-chars "<<")
371   (read-dictionary))
372
373 (defconstant +xref-search-size+ 1024
374   "Read this many bytes at end of file to look for 'startxref'")
375
376 (defun find-cross-reference-start ()
377   (let ((file-length (file-length *pdf-input-stream*))
378         (buffer (make-string +xref-search-size+)))
379     (file-position *pdf-input-stream* (- file-length +xref-search-size+))
380     (read-sequence buffer *pdf-input-stream*)
381     (let ((position (search "startxref" buffer)))
382       (unless position
383         (error 'pdf-parse-error :stream *pdf-input-stream*
384                :message "Can't find start address of cross-reference table."))
385       (parse-integer buffer :start (+ position 10) :junk-allowed t))))
386
387 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
388 ;;; Higher level Stuff
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390
391 (defun renumber-all-indirect-objects ()
392   "Gives all indirect objects a consecutive numbering.
393 Returns the first unused object-number."
394   (let ((objects (make-array (hash-table-count *indirect-objects*)
395                              :fill-pointer 0 :adjustable t
396                              :initial-element nil)))
397     (setf (objects *document*) objects)
398     (setf (last-object-number *document*) (hash-table-count *indirect-objects*))
399     (loop for object.pos being the hash-value of *indirect-objects*
400           for object = (car object.pos)
401           for number from 1 do
402          (setf (obj-number object) number
403                (gen-number object) 0)
404          (vector-push-extend object objects))))
405
406 (defun read-pdf-file (file)
407   (let ((*indirect-objects* (make-hash-table :test #'equal))
408         (*document* (make-instance 'document :empty t)))
409     (with-open-file (*pdf-input-stream* file
410                      :direction :input
411                      :external-format +external-format+)
412       (read-pdf))
413     *document*))
414
415 (defun collect-pages (page pages-vector root-page-node)
416   (if (string= (get-dict-value (content page) "/Type") "/Pages")
417       (progn
418         (loop for kid-page across (get-dict-value (content page) "/Kids")
419               do (collect-pages kid-page pages-vector root-page-node))
420         (unless (eq page root-page-node) (delete-indirect-object page)))
421       (progn
422         (unless (eq page root-page-node)
423           (change-dict-value (content page) "/Parent" root-page-node))
424         (vector-push-extend page pages-vector))))
425
426 (defun read-pdf ()
427   (let* ((trailer (read-xref-and-trailer (find-cross-reference-start)))
428          (%visited-object% (make-hash-table :test #'eql)))
429     (setf (catalog *document*) (get-dict-value trailer "/Root")
430           (docinfo *document*) (get-dict-value trailer "/Info"))
431     (load-all-indirect-objects)
432     (let* ((root-page-node (change-class (get-dict-value (content (catalog *document*)) "/Pages") 'page-node))
433            (pages-vector (make-array 1 :fill-pointer 0 :adjustable t)))
434       (collect-pages root-page-node pages-vector root-page-node)
435       (setf (pages root-page-node) pages-vector)
436       (change-dict-value (content root-page-node) "/Count" #'(lambda () (length (pages root-page-node))))
437       (change-dict-value (content root-page-node) "/Kids" (pages root-page-node))
438       (renumber-all-indirect-objects)
439       (setf (root-page *document*) root-page-node))))
440
441 (defvar *original-content* nil)
442 (defvar *current-content* nil)
443
444 (defun insert-original-page-content ()
445   ;; save graphics state
446   (write-line " q" *page-stream*)
447   (vector-push-extend (make-instance 'indirect-object :content
448                                      (make-instance 'pdf-stream :content
449                                                     (get-output-stream-string *page-stream*)))
450                       *current-content*)
451   (if (vectorp *original-content*)
452       (loop for content across *original-content* do
453            (vector-push-extend content *current-content*))
454       (vector-push-extend *original-content* *current-content*))
455   (setf *page-stream* (make-string-output-stream))
456   ;; restore graphics state
457   (write-line " Q" *page-stream*))
458
459 (export 'insert-original-page-content)
460
461 (defun ensure-dictionary (obj)
462   (if (typep obj 'indirect-object)
463       (content obj)
464       obj))
465
466 (defun open-page (page-num)
467   (let* ((page (aref (pages *root-page*) page-num))
468          (dict (content page))
469          (resources (ensure-dictionary (get-dict-value dict "/Resources"))))
470     (let ((fonts (ensure-dictionary (get-dict-value resources "/Font")))
471           (xobjects (ensure-dictionary (get-dict-value resources "/XObject")))
472           (content-stream (make-instance 'pdf-stream)))
473       (setf *original-content* (get-dict-value dict "/Contents"))
474       (setf *current-content* (make-array 10 :fill-pointer 0 :adjustable t ))
475       (change-class page 'page)
476       (unless resources
477         (setf resources (make-instance 'dictionary)))
478       (change-dict-value dict "/Resources" resources)
479       (unless fonts
480         (setf fonts (make-instance 'dictionary)))
481       (change-dict-value resources "/Font" fonts)
482       (unless xobjects
483         (setf xobjects (make-instance 'dictionary)))
484       (change-dict-value resources "/XObject" xobjects)
485       (setf (bounds page)(get-dict-value dict "/MediaBox")
486             (resources page) resources
487             (font-objects page) fonts
488             (xobjects page) xobjects
489             (content-stream page) content-stream)
490       (change-dict-value dict "/Contents" *current-content*))
491     page))
492
493 (defmacro with-existing-document ((file &key (creator "") author title subject keywords) &body body)
494   `(let* ((*document* (read-pdf-file ,file))
495           (*root-page* (root-page *document*))
496           (*outlines-stack* (list (outline-root *document*)))
497           (*page* nil)
498           (*page-number* 0))
499      (add-doc-info *document* :creator ,creator :author ,author
500                    :title ,title :subject ,subject :keywords ,keywords)
501     ,@body))
502
503 (export 'with-existing-document)
504
505 (defmacro with-existing-page ((page-number) &body body)
506   `(let* ((*original-content* nil)
507           (*current-content* nil)
508           (*page* (open-page ,page-number)))
509      (setf (content (content-stream *page*))
510            (let ((*page-stream* (make-string-output-stream)))
511              ,@body
512              (vector-push-extend
513               (make-instance 'indirect-object :content
514                              (make-instance 'pdf-stream :content
515                                             (get-output-stream-string pdf::*page-stream*)))
516               *current-content*)))))
517
518 (export 'with-existing-page)
519
520 #|
521
522 (pdf:with-existing-document (#P"/tmp/MS-32.pdf")
523   (pdf:with-existing-page (0)
524     (let ((helvetica (pdf:get-font "Helvetica")))
525       (pdf:in-text-mode
526        (pdf:set-font helvetica 45.0)
527        (pdf:move-text 95 700)
528        (pdf:draw-text "cl-pdf-parser example"))
529       (pdf:with-saved-state
530           (pdf:translate 200 60)
531         (pdf:scale 0.7 0.7)
532         (pdf:rotate 15)
533         (pdf:insert-original-page-content))
534       (pdf:translate 230 400)
535       (loop repeat 170
536          for i = 0.67 then (* i 1.03)
537          do (pdf:in-text-mode
538              (pdf:set-font helvetica i)
539              (pdf:set-rgb-fill (/ (random 255) 255.0)(/ (random 255) 255.0)(/ (random 255) 255.0))
540              (pdf:move-text (* i 3) 0)
541              (pdf:draw-text "cl-pdf"))
542            (pdf:rotate 13))))
543   (pdf:with-existing-page (1)
544     (let ((helvetica (pdf:get-font "Helvetica")))
545       (pdf:insert-original-page-content)
546       (pdf:in-text-mode
547        (pdf:set-font helvetica 60.0)
548        (pdf:move-text 250 250)
549        (pdf:rotate 30)
550        (pdf:set-rgb-fill 1.0 0.0 0.0)
551        (pdf:draw-text "cl-pdf-parser example"))))
552   (pdf:with-page ()       ;add a new page
553     (let ((helvetica (pdf:get-font "Helvetica")))
554       (pdf:in-text-mode
555        (pdf:set-font helvetica 36.0)
556        (pdf:move-text 100 800)
557        (pdf:draw-text "cl-pdf: Example 1"))
558       (pdf:translate 230 500)
559       (loop repeat 150
560          for i = 0.67 then (* i 1.045)
561          do (pdf:in-text-mode
562              (pdf:set-font helvetica i)
563              (pdf:set-rgb-fill (/ (random 255) 255.0)(/ (random 255) 255.0)(/ (random 255) 255.0))
564              (pdf:move-text (* i 3) 0)
565              (pdf:draw-text "cl-typesetting"))
566            (pdf:rotate 13))))
567   (pdf:write-document #P"/tmp/t.pdf"))
568
569 |#
Note: See TracBrowser for help on using the browser.