;;; pict.scm --- A simple picture language for Guile ;;; Copyright © 2019 Ricardo Wurmus ;;; ;;; This library is free software; you can redistribute it and/or ;;; modify it under the terms of the GNU Lesser General Public License ;;; as published by the Free Software Foundation; either version 3 of ;;; the License, or (at your option) any later version. ;;; ;;; This library is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Lesser General Public License for more details. ;;; ;;; You should have received a copy of the GNU Lesser General Public ;;; License along with this library. If not, see ;;; . ;;; Commentary: ;;; ;;; This is an adaptation of Guile's xml->sxml from (sxml simple), as ;;; released in version 2.2.4. The difference is in the handling of ;;; namespaces, which is important for round trips XML->SXML->XML ;;; again. For more information see Guile bug 20339 ;;; (define-module (pict sxml) #:use-module (sxml simple) #:use-module (sxml ssax input-parse) #:use-module (sxml ssax) #:use-module (sxml transform) #:use-module (ice-9 match) #:use-module (srfi srfi-1) #:use-module (srfi srfi-13) #:export (xml->sxml)) (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key (namespaces '()) (declare-namespaces? #t) (trim-whitespace? #f) (entities '()) (default-entity-handler #f) (doctype-handler #f)) "Use SSAX to parse an XML document into SXML. Takes one optional argument, @var{string-or-port}, which defaults to the current input port." ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix ;; that the user wants on elements of a given namespace in the ;; resulting SXML, regardless of the abbreviated namespaces defined in ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true, ;; these namespaces are treated as if they were declared in the DTD. ;; ENTITIES: alist of SYMBOL -> STRING. ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)). ;; A DOC-PREFIX of #f indicates that it comes from the user. ;; Otherwise, prefixes are symbols. (define (munge-namespaces namespaces) (map (lambda (el) (match el ((prefix . uri-string) (cons* (and declare-namespaces? prefix) prefix (ssax:uri-string->symbol uri-string))))) namespaces)) (define (user-namespaces) (munge-namespaces namespaces)) (define (user-entities) (if (and default-entity-handler (not (assq '*DEFAULT* entities))) (acons '*DEFAULT* default-entity-handler entities) entities)) (define (name->sxml name namespaces) (match name ((prefix . local-part) (let ((abbrev (and=> (find (match-lambda ((abbrev uri . rest) (and (eq? uri prefix) abbrev))) namespaces) first))) (match abbrev ('*DEFAULT* local-part) (_ (symbol-append abbrev (string->symbol ":") local-part))))) (_ name))) (define (doctype-continuation seed) (lambda* (#:key (entities '()) (namespaces '())) (values #f (append entities (user-entities)) (append (munge-namespaces namespaces) (user-namespaces)) seed))) ;; The SEED in this parser is the SXML: initialized to '() at each new ;; level by the fdown handlers; built in reverse by the fhere parsers; ;; and reverse-collected by the fup handlers. (define parser (ssax:make-parser NEW-LEVEL-SEED ; fdown (lambda (elem-gi attributes namespaces expected-content seed) '()) FINISH-ELEMENT ; fup (lambda (elem-gi attributes namespaces parent-seed seed) (let ((seed (if trim-whitespace? ((@@ (sxml simple) ssax:reverse-collect-str-drop-ws) seed) ((@@ (sxml simple) ssax:reverse-collect-str) seed))) (attrs (append ;; Namespace declarations (filter-map (match-lambda (('*DEFAULT* . _) #f) ((abbrev uri . _) (list (symbol-append 'xmlns: abbrev) (symbol->string uri)))) namespaces) (attlist-fold (lambda (attr accum) (cons (list (name->sxml (car attr) namespaces) (cdr attr)) accum)) '() attributes)))) (acons (name->sxml elem-gi namespaces) (if (null? attrs) seed (cons (cons '@ attrs) seed)) parent-seed))) CHAR-DATA-HANDLER ; fhere (lambda (string1 string2 seed) (if (string-null? string2) (cons string1 seed) (cons* string2 string1 seed))) DOCTYPE ;; -> ELEMS ENTITIES NAMESPACES SEED ;; ;; ELEMS is for validation and currently unused. ;; ;; ENTITIES is an alist of parsed entities (symbol -> string). ;; ;; NAMESPACES is as above. ;; ;; SEED builds up the content. (lambda (port docname systemid internal-subset? seed) (call-with-values (lambda () (cond (doctype-handler (doctype-handler docname systemid (and internal-subset? ((@@ (sxml simple) read-internal-doctype-as-string) port)))) (else (when internal-subset? (ssax:skip-internal-dtd port)) (values)))) (doctype-continuation seed))) UNDECL-ROOT ;; This is like the DOCTYPE handler, but for documents that do not ;; have a entry. (lambda (elem-gi seed) (call-with-values (lambda () (if doctype-handler (doctype-handler #f #f #f) (values))) (doctype-continuation seed))) PI ((*DEFAULT* . (lambda (port pi-tag seed) (cons (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) seed)))))) (let* ((port (if (string? string-or-port) (open-input-string string-or-port) string-or-port)) (elements (reverse (parser port '())))) `(*TOP* ,@elements)))