diff options
author | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-09-11 21:15:32 +0200 |
---|---|---|
committer | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-09-11 21:15:32 +0200 |
commit | 11bcce56c7a6b57f86b7086461f8f71c78c910ff (patch) | |
tree | 4c0b3c771537a336fc660226aaae5a28e9de1ab9 | |
parent | c75be2215da7dcc993ca8450bafcbab609132871 (diff) |
sax framework in xml
-rw-r--r-- | modules/language/python/module/os.scm | 10 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax.py | 108 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/__init__.py | 107 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/_exceptions.py | 135 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/expatreader.py | 447 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/handler.py | 346 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/saxutils.py | 371 | ||||
-rw-r--r-- | modules/language/python/module/xml/sax/xmlreader.py | 383 |
8 files changed, 1907 insertions, 0 deletions
diff --git a/modules/language/python/module/os.scm b/modules/language/python/module/os.scm index 7823633..64c93b2 100644 --- a/modules/language/python/module/os.scm +++ b/modules/language/python/module/os.scm @@ -230,6 +230,16 @@ (raise (ValueError "cant stringify k in env[x]"))))))) (if r r (raise IndexError))))) + (define __contains__ + (lambda (self k) + (let ((r ((@ (guile) getenv) + (catch #t + (lambda () + (pystring k)) + (lambda x + #f))))) + r))) + (define __setitem__ (lambda (self k v) (call-with-values diff --git a/modules/language/python/module/xml/sax.py b/modules/language/python/module/xml/sax.py new file mode 100644 index 0000000..e070bde --- /dev/null +++ b/modules/language/python/module/xml/sax.py @@ -0,0 +1,108 @@ +module(xml,sax) +"""Simple API for XML (SAX) implementation for Python. + +This module provides an implementation of the SAX 2 interface; +information about the Java version of the interface can be found at +http://www.megginson.com/SAX/. The Python version of the interface is +documented at <...>. + +This package contains the following modules: + +handler -- Base classes and constants which define the SAX 2 API for + the 'client-side' of SAX for Python. + +saxutils -- Implementation of the convenience classes commonly used to + work with SAX. + +xmlreader -- Base classes and constants which define the SAX 2 API for + the parsers used with SAX for Python. + +expatreader -- Driver that allows use of the Expat parser with SAX. +""" + +from xml.sax.xmlreader import InputSource +from xml.sax.handler import ContentHandler, ErrorHandler +from xml.sax._exceptions import SAXException, SAXNotRecognizedException, \ + SAXParseException, SAXNotSupportedException, \ + SAXReaderNotAvailable + + +def parse(source, handler, errorHandler=ErrorHandler()): + parser = make_parser() + parser.setContentHandler(handler) + parser.setErrorHandler(errorHandler) + parser.parse(source) + +def parseString(string, handler, errorHandler=ErrorHandler()): + import io + if errorHandler is None: + errorHandler = ErrorHandler() + parser = make_parser() + parser.setContentHandler(handler) + parser.setErrorHandler(errorHandler) + + inpsrc = InputSource() + if isinstance(string, str): + inpsrc.setCharacterStream(io.StringIO(string)) + else: + inpsrc.setByteStream(io.BytesIO(string)) + parser.parse(inpsrc) + +# this is the parser list used by the make_parser function if no +# alternatives are given as parameters to the function + +default_parser_list = ["xml.sax.expatreader"] + +# tell modulefinder that importing sax potentially imports expatreader +_false = 0 +if _false: + import xml.sax.expatreader + +import os, sys +if "PY_SAX_PARSER" in os.environ: + default_parser_list = os.environ["PY_SAX_PARSER"].split(",") +del os + +_key = "python.xml.sax.parser" +if sys.platform[:4] == "java" and sys.registry.containsKey(_key): + default_parser_list = sys.registry.getProperty(_key).split(",") + + +def make_parser(parser_list = []): + """Creates and returns a SAX parser. + + Creates the first parser it is able to instantiate of the ones + given in the list created by doing parser_list + + default_parser_list. The lists must contain the names of Python + modules containing both a SAX parser and a create_parser function.""" + + for parser_name in parser_list + default_parser_list: + try: + return _create_parser(parser_name) + except ImportError as e: + import sys + if parser_name in sys.modules: + # The parser module was found, but importing it + # failed unexpectedly, pass this exception through + raise + except SAXReaderNotAvailable: + # The parser module detected that it won't work properly, + # so try the next one + pass + + raise SAXReaderNotAvailable("No parsers found", None) + +# --- Internal utility methods used by make_parser + +if sys.platform[ : 4] == "java": + def _create_parser(parser_name): + from org.python.core import imp + drv_module = imp.importName(parser_name, 0, globals()) + return drv_module.create_parser() + +else: + def _create_parser(parser_name): + drv_module = __import__(parser_name,{},{},['create_parser']) + return drv_module.create_parser() + +del sys diff --git a/modules/language/python/module/xml/sax/__init__.py b/modules/language/python/module/xml/sax/__init__.py new file mode 100644 index 0000000..ef67ae6 --- /dev/null +++ b/modules/language/python/module/xml/sax/__init__.py @@ -0,0 +1,107 @@ +"""Simple API for XML (SAX) implementation for Python. + +This module provides an implementation of the SAX 2 interface; +information about the Java version of the interface can be found at +http://www.megginson.com/SAX/. The Python version of the interface is +documented at <...>. + +This package contains the following modules: + +handler -- Base classes and constants which define the SAX 2 API for + the 'client-side' of SAX for Python. + +saxutils -- Implementation of the convenience classes commonly used to + work with SAX. + +xmlreader -- Base classes and constants which define the SAX 2 API for + the parsers used with SAX for Python. + +expatreader -- Driver that allows use of the Expat parser with SAX. +""" + +from .xmlreader import InputSource +from .handler import ContentHandler, ErrorHandler +from ._exceptions import SAXException, SAXNotRecognizedException, \ + SAXParseException, SAXNotSupportedException, \ + SAXReaderNotAvailable + + +def parse(source, handler, errorHandler=ErrorHandler()): + parser = make_parser() + parser.setContentHandler(handler) + parser.setErrorHandler(errorHandler) + parser.parse(source) + +def parseString(string, handler, errorHandler=ErrorHandler()): + import io + if errorHandler is None: + errorHandler = ErrorHandler() + parser = make_parser() + parser.setContentHandler(handler) + parser.setErrorHandler(errorHandler) + + inpsrc = InputSource() + if isinstance(string, str): + inpsrc.setCharacterStream(io.StringIO(string)) + else: + inpsrc.setByteStream(io.BytesIO(string)) + parser.parse(inpsrc) + +# this is the parser list used by the make_parser function if no +# alternatives are given as parameters to the function + +default_parser_list = ["xml.sax.expatreader"] + +# tell modulefinder that importing sax potentially imports expatreader +_false = 0 +if _false: + import xml.sax.expatreader + +import os, sys +if "PY_SAX_PARSER" in os.environ: + default_parser_list = os.environ["PY_SAX_PARSER"].split(",") +del os + +_key = "python.xml.sax.parser" +if sys.platform[:4] == "java" and sys.registry.containsKey(_key): + default_parser_list = sys.registry.getProperty(_key).split(",") + + +def make_parser(parser_list = []): + """Creates and returns a SAX parser. + + Creates the first parser it is able to instantiate of the ones + given in the list created by doing parser_list + + default_parser_list. The lists must contain the names of Python + modules containing both a SAX parser and a create_parser function.""" + + for parser_name in parser_list + default_parser_list: + try: + return _create_parser(parser_name) + except ImportError as e: + import sys + if parser_name in sys.modules: + # The parser module was found, but importing it + # failed unexpectedly, pass this exception through + raise + except SAXReaderNotAvailable: + # The parser module detected that it won't work properly, + # so try the next one + pass + + raise SAXReaderNotAvailable("No parsers found", None) + +# --- Internal utility methods used by make_parser + +if sys.platform[ : 4] == "java": + def _create_parser(parser_name): + from org.python.core import imp + drv_module = imp.importName(parser_name, 0, globals()) + return drv_module.create_parser() + +else: + def _create_parser(parser_name): + drv_module = __import__(parser_name,{},{},['create_parser']) + return drv_module.create_parser() + +del sys diff --git a/modules/language/python/module/xml/sax/_exceptions.py b/modules/language/python/module/xml/sax/_exceptions.py new file mode 100644 index 0000000..bdcc1ea --- /dev/null +++ b/modules/language/python/module/xml/sax/_exceptions.py @@ -0,0 +1,135 @@ +module(xml,sax,_exceptions) + +"""Different kinds of SAX Exceptions""" +import sys +if sys.platform[:4] == "java": + from java.lang import Exception +del sys + +__all__ = ['SAXException','SAXParseException','SAXNotSupportedException' + ,'SAXNotRecognizedException','SAXReaderNotAvailable'] +# ===== SAXEXCEPTION ===== + +class SAXException(Exception): + """Encapsulate an XML error or warning. This class can contain + basic error or warning information from either the XML parser or + the application: you can subclass it to provide additional + functionality, or to add localization. Note that although you will + receive a SAXException as the argument to the handlers in the + ErrorHandler interface, you are not actually required to raise + the exception; instead, you can simply read the information in + it.""" + + def __init__(self, msg, exception=None): + """Creates an exception. The message is required, but the exception + is optional.""" + self._msg = msg + self._exception = exception + Exception.__init__(self, msg) + + def getMessage(self): + "Return a message for this exception." + return self._msg + + def getException(self): + "Return the embedded exception, or None if there was none." + return self._exception + + def __str__(self): + "Create a string representation of the exception." + return self._msg + + def __getitem__(self, ix): + """Avoids weird error messages if someone does exception[ix] by + mistake, since Exception has __getitem__ defined.""" + raise AttributeError("__getitem__") + + +# ===== SAXPARSEEXCEPTION ===== + +class SAXParseException(SAXException): + """Encapsulate an XML parse error or warning. + + This exception will include information for locating the error in + the original XML document. Note that although the application will + receive a SAXParseException as the argument to the handlers in the + ErrorHandler interface, the application is not actually required + to raise the exception; instead, it can simply read the + information in it and take a different action. + + Since this exception is a subclass of SAXException, it inherits + the ability to wrap another exception.""" + + def __init__(self, msg, exception, locator): + "Creates the exception. The exception parameter is allowed to be None." + SAXException.__init__(self, msg, exception) + self._locator = locator + + # We need to cache this stuff at construction time. + # If this exception is raised, the objects through which we must + # traverse to get this information may be deleted by the time + # it gets caught. + self._systemId = self._locator.getSystemId() + self._colnum = self._locator.getColumnNumber() + self._linenum = self._locator.getLineNumber() + + def getColumnNumber(self): + """The column number of the end of the text where the exception + occurred.""" + return self._colnum + + def getLineNumber(self): + "The line number of the end of the text where the exception occurred." + return self._linenum + + def getPublicId(self): + "Get the public identifier of the entity where the exception occurred." + return self._locator.getPublicId() + + def getSystemId(self): + "Get the system identifier of the entity where the exception occurred." + return self._systemId + + def __str__(self): + "Create a string representation of the exception." + sysid = self.getSystemId() + if sysid is None: + sysid = "<unknown>" + linenum = self.getLineNumber() + if linenum is None: + linenum = "?" + colnum = self.getColumnNumber() + if colnum is None: + colnum = "?" + return "%s:%s:%s: %s" % (sysid, linenum, colnum, self._msg) + + +# ===== SAXNOTRECOGNIZEDEXCEPTION ===== + +class SAXNotRecognizedException(SAXException): + """Exception class for an unrecognized identifier. + + An XMLReader will raise this exception when it is confronted with an + unrecognized feature or property. SAX applications and extensions may + use this class for similar purposes.""" + + +# ===== SAXNOTSUPPORTEDEXCEPTION ===== + +class SAXNotSupportedException(SAXException): + """Exception class for an unsupported operation. + + An XMLReader will raise this exception when a service it cannot + perform is requested (specifically setting a state or value). SAX + applications and extensions may use this class for similar + purposes.""" + +# ===== SAXNOTSUPPORTEDEXCEPTION ===== + +class SAXReaderNotAvailable(SAXNotSupportedException): + """Exception class for a missing driver. + + An XMLReader module (driver) should raise this exception when it + is first imported, e.g. when a support module cannot be imported. + It also may be raised during parsing, e.g. if executing an external + program is not permitted.""" diff --git a/modules/language/python/module/xml/sax/expatreader.py b/modules/language/python/module/xml/sax/expatreader.py new file mode 100644 index 0000000..c6da384 --- /dev/null +++ b/modules/language/python/module/xml/sax/expatreader.py @@ -0,0 +1,447 @@ +module(xml,sax,expatreader) +""" +SAX driver for the pyexpat C module. This driver works with +pyexpat.__version__ == '2.22'. +""" + +version = "0.20" + +from xml.sax._exceptions import * +from xml.sax.handler import feature_validation, feature_namespaces +from xml.sax.handler import feature_namespace_prefixes +from xml.sax.handler import feature_external_ges, feature_external_pes +from xml.sax.handler import feature_string_interning +from xml.sax.handler import property_xml_string, property_interning_dict + +# xml.parsers.expat does not raise ImportError in Jython +import sys +if sys.platform[:4] == "java": + raise SAXReaderNotAvailable("expat not available in Java", None) +del sys + +try: + from xml.parsers import expat +except ImportError: + raise SAXReaderNotAvailable("expat not supported", None) +else: + if not hasattr(expat, "ParserCreate"): + raise SAXReaderNotAvailable("expat not supported", None) +from xml.sax import xmlreader, saxutils, handler + +AttributesImpl = xmlreader.AttributesImpl +AttributesNSImpl = xmlreader.AttributesNSImpl + +# If we're using a sufficiently recent version of Python, we can use +# weak references to avoid cycles between the parser and content +# handler, otherwise we'll just have to pretend. +try: + import _weakref +except ImportError: + def _mkproxy(o): + return o +else: + import weakref + _mkproxy = weakref.proxy + del weakref, _weakref + +class _ClosedParser: + pass + +# --- ExpatLocator + +class ExpatLocator(xmlreader.Locator): + """Locator for use with the ExpatParser class. + + This uses a weak reference to the parser object to avoid creating + a circular reference between the parser and the content handler. + """ + def __init__(self, parser): + self._ref = _mkproxy(parser) + + def getColumnNumber(self): + parser = self._ref + if parser._parser is None: + return None + return parser._parser.ErrorColumnNumber + + def getLineNumber(self): + parser = self._ref + if parser._parser is None: + return 1 + return parser._parser.ErrorLineNumber + + def getPublicId(self): + parser = self._ref + if parser is None: + return None + return parser._source.getPublicId() + + def getSystemId(self): + parser = self._ref + if parser is None: + return None + return parser._source.getSystemId() + + +# --- ExpatParser + +class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator): + """SAX driver for the pyexpat C module.""" + + def __init__(self, namespaceHandling=0, bufsize=2**16-20): + xmlreader.IncrementalParser.__init__(self, bufsize) + self._source = xmlreader.InputSource() + self._parser = None + self._namespaces = namespaceHandling + self._lex_handler_prop = None + self._parsing = 0 + self._entity_stack = [] + self._external_ges = 1 + self._interning = None + + # XMLReader methods + + def parse(self, source): + "Parse an XML document from a URL or an InputSource." + source = saxutils.prepare_input_source(source) + + self._source = source + try: + self.reset() + self._cont_handler.setDocumentLocator(ExpatLocator(self)) + xmlreader.IncrementalParser.parse(self, source) + except: + # bpo-30264: Close the source on error to not leak resources: + # xml.sax.parse() doesn't give access to the underlying parser + # to the caller + self._close_source() + raise + + def prepareParser(self, source): + if source.getSystemId() is not None: + self._parser.SetBase(source.getSystemId()) + + # Redefined setContentHandler to allow changing handlers during parsing + + def setContentHandler(self, handler): + xmlreader.IncrementalParser.setContentHandler(self, handler) + if self._parsing: + self._reset_cont_handler() + + def getFeature(self, name): + if name == feature_namespaces: + return self._namespaces + elif name == feature_string_interning: + return self._interning is not None + elif name in (feature_validation, feature_external_pes, + feature_namespace_prefixes): + return 0 + elif name == feature_external_ges: + return self._external_ges + raise SAXNotRecognizedException("Feature '%s' not recognized" % name) + + def setFeature(self, name, state): + if self._parsing: + raise SAXNotSupportedException("Cannot set features while parsing") + + if name == feature_namespaces: + self._namespaces = state + elif name == feature_external_ges: + self._external_ges = state + elif name == feature_string_interning: + if state: + if self._interning is None: + self._interning = {} + else: + self._interning = None + elif name == feature_validation: + if state: + raise SAXNotSupportedException( + "expat does not support validation") + elif name == feature_external_pes: + if state: + raise SAXNotSupportedException( + "expat does not read external parameter entities") + elif name == feature_namespace_prefixes: + if state: + raise SAXNotSupportedException( + "expat does not report namespace prefixes") + else: + raise SAXNotRecognizedException( + "Feature '%s' not recognized" % name) + + def getProperty(self, name): + if name == handler.property_lexical_handler: + return self._lex_handler_prop + elif name == property_interning_dict: + return self._interning + elif name == property_xml_string: + if self._parser: + if hasattr(self._parser, "GetInputContext"): + return self._parser.GetInputContext() + else: + raise SAXNotRecognizedException( + "This version of expat does not support getting" + " the XML string") + else: + raise SAXNotSupportedException( + "XML string cannot be returned when not parsing") + raise SAXNotRecognizedException("Property '%s' not recognized" % name) + + def setProperty(self, name, value): + if name == handler.property_lexical_handler: + self._lex_handler_prop = value + if self._parsing: + self._reset_lex_handler_prop() + elif name == property_interning_dict: + self._interning = value + elif name == property_xml_string: + raise SAXNotSupportedException("Property '%s' cannot be set" % + name) + else: + raise SAXNotRecognizedException("Property '%s' not recognized" % + name) + + # IncrementalParser methods + + def feed(self, data, isFinal = 0): + if not self._parsing: + self.reset() + self._parsing = 1 + self._cont_handler.startDocument() + + try: + # The isFinal parameter is internal to the expat reader. + # If it is set to true, expat will check validity of the entire + # document. When feeding chunks, they are not normally final - + # except when invoked from close. + self._parser.Parse(data, isFinal) + except expat.error as e: + exc = SAXParseException(expat.ErrorString(e.code), e, self) + # FIXME: when to invoke error()? + self._err_handler.fatalError(exc) + + def _close_source(self): + source = self._source + try: + file = source.getCharacterStream() + if file is not None: + file.close() + finally: + file = source.getByteStream() + if file is not None: + file.close() + + def close(self): + if (self._entity_stack or self._parser is None or + isinstance(self._parser, _ClosedParser)): + # If we are completing an external entity, do nothing here + return + try: + self.feed("", isFinal = 1) + self._cont_handler.endDocument() + self._parsing = 0 + # break cycle created by expat handlers pointing to our methods + self._parser = None + finally: + self._parsing = 0 + if self._parser is not None: + # Keep ErrorColumnNumber and ErrorLineNumber after closing. + parser = _ClosedParser() + parser.ErrorColumnNumber = self._parser.ErrorColumnNumber + parser.ErrorLineNumber = self._parser.ErrorLineNumber + self._parser = parser + self._close_source() + + def _reset_cont_handler(self): + self._parser.ProcessingInstructionHandler = \ + self._cont_handler.processingInstruction + self._parser.CharacterDataHandler = self._cont_handler.characters + + def _reset_lex_handler_prop(self): + lex = self._lex_handler_prop + parser = self._parser + if lex is None: + parser.CommentHandler = None + parser.StartCdataSectionHandler = None + parser.EndCdataSectionHandler = None + parser.StartDoctypeDeclHandler = None + parser.EndDoctypeDeclHandler = None + else: + parser.CommentHandler = lex.comment + parser.StartCdataSectionHandler = lex.startCDATA + parser.EndCdataSectionHandler = lex.endCDATA + parser.StartDoctypeDeclHandler = self.start_doctype_decl + parser.EndDoctypeDeclHandler = lex.endDTD + + def reset(self): + if self._namespaces: + self._parser = expat.ParserCreate(self._source.getEncoding(), " ", + intern=self._interning) + self._parser.namespace_prefixes = 1 + self._parser.StartElementHandler = self.start_element_ns + self._parser.EndElementHandler = self.end_element_ns + else: + self._parser = expat.ParserCreate(self._source.getEncoding(), + intern = self._interning) + self._parser.StartElementHandler = self.start_element + self._parser.EndElementHandler = self.end_element + + self._reset_cont_handler() + self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl + self._parser.NotationDeclHandler = self.notation_decl + self._parser.StartNamespaceDeclHandler = self.start_namespace_decl + self._parser.EndNamespaceDeclHandler = self.end_namespace_decl + + self._decl_handler_prop = None + if self._lex_handler_prop: + self._reset_lex_handler_prop() +# self._parser.DefaultHandler = +# self._parser.DefaultHandlerExpand = +# self._parser.NotStandaloneHandler = + self._parser.ExternalEntityRefHandler = self.external_entity_ref + try: + self._parser.SkippedEntityHandler = self.skipped_entity_handler + except AttributeError: + # This pyexpat does not support SkippedEntity + pass + self._parser.SetParamEntityParsing( + expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE) + + self._parsing = 0 + self._entity_stack = [] + + # Locator methods + + def getColumnNumber(self): + if self._parser is None: + return None + return self._parser.ErrorColumnNumber + + def getLineNumber(self): + if self._parser is None: + return 1 + return self._parser.ErrorLineNumber + + def getPublicId(self): + return self._source.getPublicId() + + def getSystemId(self): + return self._source.getSystemId() + + # event handlers + def start_element(self, name, attrs): + self._cont_handler.startElement(name, AttributesImpl(attrs)) + + def end_element(self, name): + self._cont_handler.endElement(name) + + def start_element_ns(self, name, attrs): + pair = name.split() + if len(pair) == 1: + # no namespace + pair = (None, name) + elif len(pair) == 3: + pair = pair[0], pair[1] + else: + # default namespace + pair = tuple(pair) + + newattrs = {} + qnames = {} + for (aname, value) in attrs.items(): + parts = aname.split() + length = len(parts) + if length == 1: + # no namespace + qname = aname + apair = (None, aname) + elif length == 3: + qname = "%s:%s" % (parts[2], parts[1]) + apair = parts[0], parts[1] + else: + # default namespace + qname = parts[1] + apair = tuple(parts) + + newattrs[apair] = value + qnames[apair] = qname + + self._cont_handler.startElementNS(pair, None, + AttributesNSImpl(newattrs, qnames)) + + def end_element_ns(self, name): + pair = name.split() + if len(pair) == 1: + pair = (None, name) + elif len(pair) == 3: + pair = pair[0], pair[1] + else: + pair = tuple(pair) + + self._cont_handler.endElementNS(pair, None) + + # this is not used (call directly to ContentHandler) + def processing_instruction(self, target, data): + self._cont_handler.processingInstruction(target, data) + + # this is not used (call directly to ContentHandler) + def character_data(self, data): + self._cont_handler.characters(data) + + def start_namespace_decl(self, prefix, uri): + self._cont_handler.startPrefixMapping(prefix, uri) + + def end_namespace_decl(self, prefix): + self._cont_handler.endPrefixMapping(prefix) + + def start_doctype_decl(self, name, sysid, pubid, has_internal_subset): + self._lex_handler_prop.startDTD(name, pubid, sysid) + + def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name): + self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name) + + def notation_decl(self, name, base, sysid, pubid): + self._dtd_handler.notationDecl(name, pubid, sysid) + + def external_entity_ref(self, context, base, sysid, pubid): + if not self._external_ges: + return 1 + + source = self._ent_handler.resolveEntity(pubid, sysid) + source = saxutils.prepare_input_source(source, + self._source.getSystemId() or + "") + + self._entity_stack.append((self._parser, self._source)) + self._parser = self._parser.ExternalEntityParserCreate(context) + self._source = source + + try: + xmlreader.IncrementalParser.parse(self, source) + except: + return 0 # FIXME: save error info here? + + (self._parser, self._source) = self._entity_stack[-1] + del self._entity_stack[-1] + return 1 + + def skipped_entity_handler(self, name, is_pe): + if is_pe: + # The SAX spec requires to report skipped PEs with a '%' + name = '%'+name + self._cont_handler.skippedEntity(name) + +# --- + +def create_parser(*args, **kwargs): + return ExpatParser(*args, **kwargs) + +# --- + +if __name__ == "__main__": + import xml.sax.saxutils + p = create_parser() + p.setContentHandler(xml.sax.saxutils.XMLGenerator()) + p.setErrorHandler(xml.sax.ErrorHandler()) + p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml") diff --git a/modules/language/python/module/xml/sax/handler.py b/modules/language/python/module/xml/sax/handler.py new file mode 100644 index 0000000..c245411 --- /dev/null +++ b/modules/language/python/module/xml/sax/handler.py @@ -0,0 +1,346 @@ +module(xml,sax,handler) + +""" +This module contains the core classes of version 2.0 of SAX for Python. +This file provides only default classes with absolutely minimum +functionality, from which drivers and applications can be subclassed. + +Many of these classes are empty and are included only as documentation +of the interfaces. + +$Id$ +""" + +__all__=['ErrorHandler','ContentHandler','DTDHandler'] + +version = '2.0beta' + +#============================================================================ +# +# HANDLER INTERFACES +# +#============================================================================ + +# ===== ERRORHANDLER ===== + +class ErrorHandler: + """Basic interface for SAX error handlers. + + If you create an object that implements this interface, then + register the object with your XMLReader, the parser will call the + methods in your object to report all warnings and errors. There + are three levels of errors available: warnings, (possibly) + recoverable errors, and unrecoverable errors. All methods take a + SAXParseException as the only parameter.""" + + def error(self, exception): + "Handle a recoverable error." + raise exception + + def fatalError(self, exception): + "Handle a non-recoverable error." + raise exception + + def warning(self, exception): + "Handle a warning." + print(exception) + + +# ===== CONTENTHANDLER ===== + +class ContentHandler: + """Interface for receiving logical document content events. + + This is the main callback interface in SAX, and the one most + important to applications. The order of events in this interface + mirrors the order of the information in the document.""" + + def __init__(self): + self._locator = None + + def setDocumentLocator(self, locator): + """Called by the parser to give the application a locator for + locating the origin of document events. + + SAX parsers are strongly encouraged (though not absolutely + required) to supply a locator: if it does so, it must supply + the locator to the application by invoking this method before + invoking any of the other methods in the DocumentHandler + interface. + + The locator allows the application to determine the end + position of any document-related event, even if the parser is + not reporting an error. Typically, the application will use + this information for reporting its own errors (such as + character content that does not match an application's + business rules). The information returned by the locator is + probably not sufficient for use with a search engine. + + Note that the locator will return correct information only + during the invocation of the events in this interface. The + application should not attempt to use it at any other time.""" + self._locator = locator + + def startDocument(self): + """Receive notification of the beginning of a document. + + The SAX parser will invoke this method only once, before any + other methods in this interface or in DTDHandler (except for + setDocumentLocator).""" + + def endDocument(self): + """Receive notification of the end of a document. + + The SAX parser will invoke this method only once, and it will + be the last method invoked during the parse. The parser shall + not invoke this method until it has either abandoned parsing + (because of an unrecoverable error) or reached the end of + input.""" + + def startPrefixMapping(self, prefix, uri): + """Begin the scope of a prefix-URI Namespace mapping. + + The information from this event is not necessary for normal + Namespace processing: the SAX XML reader will automatically + replace prefixes for element and attribute names when the + http://xml.org/sax/features/namespaces feature is true (the + default). + + There are cases, however, when applications need to use + prefixes in character data or in attribute values, where they + cannot safely be expanded automatically; the + start/endPrefixMapping event supplies the information to the + application to expand prefixes in those contexts itself, if + necessary. + + Note that start/endPrefixMapping events are not guaranteed to + be properly nested relative to each-other: all + startPrefixMapping events will occur before the corresponding + startElement event, and all endPrefixMapping events will occur + after the corresponding endElement event, but their order is + not guaranteed.""" + + def endPrefixMapping(self, prefix): + """End the scope of a prefix-URI mapping. + + See startPrefixMapping for details. This event will always + occur after the corresponding endElement event, but the order + of endPrefixMapping events is not otherwise guaranteed.""" + + def startElement(self, name, attrs): + """Signals the start of an element in non-namespace mode. + + The name parameter contains the raw XML 1.0 name of the + element type as a string and the attrs parameter holds an + instance of the Attributes class containing the attributes of + the element.""" + + def endElement(self, name): + """Signals the end of an element in non-namespace mode. + + The name parameter contains the name of the element type, just + as with the startElement event.""" + + def startElementNS(self, name, qname, attrs): + """Signals the start of an element in namespace mode. + + The name parameter contains the name of the element type as a + (uri, localname) tuple, the qname parameter the raw XML 1.0 + name used in the source document, and the attrs parameter + holds an instance of the Attributes class containing the + attributes of the element. + + The uri part of the name tuple is None for elements which have + no namespace.""" + + def endElementNS(self, name, qname): + """Signals the end of an element in namespace mode. + + The name parameter contains the name of the element type, just + as with the startElementNS event.""" + + def characters(self, content): + """Receive notification of character data. + + The Parser will call this method to report each chunk of + character data. SAX parsers may return all contiguous + character data in a single chunk, or they may split it into + several chunks; however, all of the characters in any single + event must come from the same external entity so that the + Locator provides useful information.""" + + def ignorableWhitespace(self, whitespace): + """Receive notification of ignorable whitespace in element content. + + Validating Parsers must use this method to report each chunk + of ignorable whitespace (see the W3C XML 1.0 recommendation, + section 2.10): non-validating parsers may also use this method + if they are capable of parsing and using content models. + + SAX parsers may return all contiguous whitespace in a single + chunk, or they may split it into several chunks; however, all + of the characters in any single event must come from the same + external entity, so that the Locator provides useful + information.""" + + def processingInstruction(self, target, data): + """Receive notification of a processing instruction. + + The Parser will invoke this method once for each processing + instruction found: note that processing instructions may occur + before or after the main document element. + + A SAX parser should never report an XML declaration (XML 1.0, + section 2.8) or a text declaration (XML 1.0, section 4.3.1) + using this method.""" + + def skippedEntity(self, name): + """Receive notification of a skipped entity. + + The Parser will invoke this method once for each entity + skipped. Non-validating processors may skip entities if they + have not seen the declarations (because, for example, the + entity was declared in an external DTD subset). All processors + may skip external entities, depending on the values of the + http://xml.org/sax/features/external-general-entities and the + http://xml.org/sax/features/external-parameter-entities + properties.""" + + +# ===== DTDHandler ===== + +class DTDHandler: + """Handle DTD events. + + This interface specifies only those DTD events required for basic + parsing (unparsed entities and attributes).""" + + def notationDecl(self, name, publicId, systemId): + "Handle a notation declaration event." + + def unparsedEntityDecl(self, name, publicId, systemId, ndata): + "Handle an unparsed entity declaration event." + + +# ===== ENTITYRESOLVER ===== + +class EntityResolver: + """Basic interface for resolving entities. If you create an object + implementing this interface, then register the object with your + Parser, the parser will call the method in your object to + resolve all external entities. Note that DefaultHandler implements + this interface with the default behaviour.""" + + def resolveEntity(self, publicId, systemId): + """Resolve the system identifier of an entity and return either + the system identifier to read from as a string, or an InputSource + to read from.""" + return systemId + + +#============================================================================ +# +# CORE FEATURES +# +#============================================================================ + +feature_namespaces = "http://xml.org/sax/features/namespaces" +# true: Perform Namespace processing (default). +# false: Optionally do not perform Namespace processing +# (implies namespace-prefixes). +# access: (parsing) read-only; (not parsing) read/write + +feature_namespace_prefixes = "http://xml.org/sax/features/namespace-prefixes" +# true: Report the original prefixed names and attributes used for Namespace +# declarations. +# false: Do not report attributes used for Namespace declarations, and +# optionally do not report original prefixed names (default). +# access: (parsing) read-only; (not parsing) read/write + +feature_string_interning = "http://xml.org/sax/features/string-interning" +# true: All element names, prefixes, attribute names, Namespace URIs, and +# local names are interned using the built-in intern function. +# false: Names are not necessarily interned, although they may be (default). +# access: (parsing) read-only; (not parsing) read/write + +feature_validation = "http://xml.org/sax/features/validation" +# true: Report all validation errors (implies external-general-entities and +# external-parameter-entities). +# false: Do not report validation errors. +# access: (parsing) read-only; (not parsing) read/write + +feature_external_ges = "http://xml.org/sax/features/external-general-entities" +# true: Include all external general (text) entities. +# false: Do not include external general entities. +# access: (parsing) read-only; (not parsing) read/write + +feature_external_pes = "http://xml.org/sax/features/external-parameter-entities" +# true: Include all external parameter entities, including the external +# DTD subset. +# false: Do not include any external parameter entities, even the external +# DTD subset. +# access: (parsing) read-only; (not parsing) read/write + +all_features = [feature_namespaces, + feature_namespace_prefixes, + feature_string_interning, + feature_validation, + feature_external_ges, + feature_external_pes] + + +#============================================================================ +# +# CORE PROPERTIES +# +#============================================================================ + +property_lexical_handler = "http://xml.org/sax/properties/lexical-handler" +# data type: xml.sax.sax2lib.LexicalHandler +# description: An optional extension handler for lexical events like comments. +# access: read/write + +property_declaration_handler = "http://xml.org/sax/properties/declaration-handler" +# data type: xml.sax.sax2lib.DeclHandler +# description: An optional extension handler for DTD-related events other +# than notations and unparsed entities. +# access: read/write + +property_dom_node = "http://xml.org/sax/properties/dom-node" +# data type: org.w3c.dom.Node +# description: When parsing, the current DOM node being visited if this is +# a DOM iterator; when not parsing, the root DOM node for +# iteration. +# access: (parsing) read-only; (not parsing) read/write + +property_xml_string = "http://xml.org/sax/properties/xml-string" +# data type: String +# description: The literal string of characters that was the source for +# the current event. +# access: read-only + +property_encoding = "http://www.python.org/sax/properties/encoding" +# data type: String +# description: The name of the encoding to assume for input data. +# access: write: set the encoding, e.g. established by a higher-level +# protocol. May change during parsing (e.g. after +# processing a META tag) +# read: return the current encoding (possibly established through +# auto-detection. +# initial value: UTF-8 +# + +property_interning_dict = "http://www.python.org/sax/properties/interning-dict" +# data type: Dictionary +# description: The dictionary used to intern common strings in the document +# access: write: Request that the parser uses a specific dictionary, to +# allow interning across different documents +# read: return the current interning dictionary, or None +# + +all_properties = [property_lexical_handler, + property_dom_node, + property_declaration_handler, + property_xml_string, + property_encoding, + property_interning_dict] diff --git a/modules/language/python/module/xml/sax/saxutils.py b/modules/language/python/module/xml/sax/saxutils.py new file mode 100644 index 0000000..0899f91 --- /dev/null +++ b/modules/language/python/module/xml/sax/saxutils.py @@ -0,0 +1,371 @@ +module(xml,sax,saxutils) +"""\ +A library of useful helper classes to the SAX classes, for the +convenience of application and driver writers. +""" + +import os +import urllib.parse +import urllib.request +import io +import codecs +import xml.sax.handler as handler +import xml.sax.xmlreader as xmlreader + +def __dict_replace(s, d): + """Replace substrings of a string using a dictionary.""" + for key, value in d.items(): + s = s.replace(key, value) + return s + +def escape(data, entities={}): + """Escape &, <, and > in a string of data. + + You can escape other strings of data by passing a dictionary as + the optional entities parameter. The keys and values must all be + strings; each key will be replaced with its corresponding value. + """ + + # must do ampersand first + data = data.replace("&", "&") + data = data.replace(">", ">") + data = data.replace("<", "<") + if entities: + data = __dict_replace(data, entities) + return data + +def unescape(data, entities={}): + """Unescape &, <, and > in a string of data. + + You can unescape other strings of data by passing a dictionary as + the optional entities parameter. The keys and values must all be + strings; each key will be replaced with its corresponding value. + """ + data = data.replace("<", "<") + data = data.replace(">", ">") + if entities: + data = __dict_replace(data, entities) + # must do ampersand last + return data.replace("&", "&") + +def quoteattr(data, entities={}): + """Escape and quote an attribute value. + + Escape &, <, and > in a string of data, then quote it for use as + an attribute value. The \" character will be escaped as well, if + necessary. + + You can escape other strings of data by passing a dictionary as + the optional entities parameter. The keys and values must all be + strings; each key will be replaced with its corresponding value. + """ + entities = entities.copy() + entities.update({'\n': ' ', '\r': ' ', '\t':'	'}) + data = escape(data, entities) + if '"' in data: + if "'" in data: + data = '"%s"' % data.replace('"', """) + else: + data = "'%s'" % data + else: + data = '"%s"' % data + return data + + +def _gettextwriter(out, encoding): + if out is None: + import sys + return sys.stdout + + if isinstance(out, io.TextIOBase): + # use a text writer as is + return out + + if isinstance(out, (codecs.StreamWriter, codecs.StreamReaderWriter)): + # use a codecs stream writer as is + return out + + # wrap a binary writer with TextIOWrapper + if isinstance(out, io.RawIOBase): + # Keep the original file open when the TextIOWrapper is + # destroyed + class _wrapper: + __class__ = out.__class__ + def __getattr__(self, name): + return getattr(out, name) + buffer = _wrapper() + buffer.close = lambda: None + else: + # This is to handle passed objects that aren't in the + # IOBase hierarchy, but just have a write method + buffer = io.BufferedIOBase() + buffer.writable = lambda: True + buffer.write = out.write + try: + # TextIOWrapper uses this methods to determine + # if BOM (for UTF-16, etc) should be added + buffer.seekable = out.seekable + buffer.tell = out.tell + except AttributeError: + pass + return io.TextIOWrapper(buffer, encoding=encoding, + errors='xmlcharrefreplace', + newline='\n', + write_through=True) + +class XMLGenerator(handler.ContentHandler): + + def __init__(self, out=None, encoding="iso-8859-1", short_empty_elements=False): + handler.ContentHandler.__init__(self) + out = _gettextwriter(out, encoding) + self._write = out.write + self._flush = out.flush + self._ns_contexts = [{}] # contains uri -> prefix dicts + self._current_context = self._ns_contexts[-1] + self._undeclared_ns_maps = [] + self._encoding = encoding + self._short_empty_elements = short_empty_elements + self._pending_start_element = False + + def _qname(self, name): + """Builds a qualified name from a (ns_url, localname) pair""" + if name[0]: + # Per http://www.w3.org/XML/1998/namespace, The 'xml' prefix is + # bound by definition to http://www.w3.org/XML/1998/namespace. It + # does not need to be declared and will not usually be found in + # self._current_context. + if 'http://www.w3.org/XML/1998/namespace' == name[0]: + return 'xml:' + name[1] + # The name is in a non-empty namespace + prefix = self._current_context[name[0]] + if prefix: + # If it is not the default namespace, prepend the prefix + return prefix + ":" + name[1] + # Return the unqualified name + return name[1] + + def _finish_pending_start_element(self,endElement=False): + if self._pending_start_element: + self._write('>') + self._pending_start_element = False + + # ContentHandler methods + + def startDocument(self): + self._write('<?xml version="1.0" encoding="%s"?>\n' % + self._encoding) + + def endDocument(self): + self._flush() + + def startPrefixMapping(self, prefix, uri): + self._ns_contexts.append(self._current_context.copy()) + self._current_context[uri] = prefix + self._undeclared_ns_maps.append((prefix, uri)) + + def endPrefixMapping(self, prefix): + self._current_context = self._ns_contexts[-1] + del self._ns_contexts[-1] + + def startElement(self, name, attrs): + self._finish_pending_start_element() + self._write('<' + name) + for (name, value) in attrs.items(): + self._write(' %s=%s' % (name, quoteattr(value))) + if self._short_empty_elements: + self._pending_start_element = True + else: + self._write(">") + + def endElement(self, name): + if self._pending_start_element: + self._write('/>') + self._pending_start_element = False + else: + self._write('</%s>' % name) + + def startElementNS(self, name, qname, attrs): + self._finish_pending_start_element() + self._write('<' + self._qname(name)) + + for prefix, uri in self._undeclared_ns_maps: + if prefix: + self._write(' xmlns:%s="%s"' % (prefix, uri)) + else: + self._write(' xmlns="%s"' % uri) + self._undeclared_ns_maps = [] + + for (name, value) in attrs.items(): + self._write(' %s=%s' % (self._qname(name), quoteattr(value))) + if self._short_empty_elements: + self._pending_start_element = True + else: + self._write(">") + + def endElementNS(self, name, qname): + if self._pending_start_element: + self._write('/>') + self._pending_start_element = False + else: + self._write('</%s>' % self._qname(name)) + + def characters(self, content): + if content: + self._finish_pending_start_element() + if not isinstance(content, str): + content = str(content, self._encoding) + self._write(escape(content)) + + def ignorableWhitespace(self, content): + if content: + self._finish_pending_start_element() + if not isinstance(content, str): + content = str(content, self._encoding) + self._write(content) + + def processingInstruction(self, target, data): + self._finish_pending_start_element() + self._write('<?%s %s?>' % (target, data)) + + +class XMLFilterBase(xmlreader.XMLReader): + """This class is designed to sit between an XMLReader and the + client application's event handlers. By default, it does nothing + but pass requests up to the reader and events on to the handlers + unmodified, but subclasses can override specific methods to modify + the event stream or the configuration requests as they pass + through.""" + + def __init__(self, parent = None): + xmlreader.XMLReader.__init__(self) + self._parent = parent + + # ErrorHandler methods + + def error(self, exception): + self._err_handler.error(exception) + + def fatalError(self, exception): + self._err_handler.fatalError(exception) + + def warning(self, exception): + self._err_handler.warning(exception) + + # ContentHandler methods + + def setDocumentLocator(self, locator): + self._cont_handler.setDocumentLocator(locator) + + def startDocument(self): + self._cont_handler.startDocument() + + def endDocument(self): + self._cont_handler.endDocument() + + def startPrefixMapping(self, prefix, uri): + self._cont_handler.startPrefixMapping(prefix, uri) + + def endPrefixMapping(self, prefix): + self._cont_handler.endPrefixMapping(prefix) + + def startElement(self, name, attrs): + self._cont_handler.startElement(name, attrs) + + def endElement(self, name): + self._cont_handler.endElement(name) + + def startElementNS(self, name, qname, attrs): + self._cont_handler.startElementNS(name, qname, attrs) + + def endElementNS(self, name, qname): + self._cont_handler.endElementNS(name, qname) + + def characters(self, content): + self._cont_handler.characters(content) + + def ignorableWhitespace(self, chars): + self._cont_handler.ignorableWhitespace(chars) + + def processingInstruction(self, target, data): + self._cont_handler.processingInstruction(target, data) + + def skippedEntity(self, name): + self._cont_handler.skippedEntity(name) + + # DTDHandler methods + + def notationDecl(self, name, publicId, systemId): + self._dtd_handler.notationDecl(name, publicId, systemId) + + def unparsedEntityDecl(self, name, publicId, systemId, ndata): + self._dtd_handler.unparsedEntityDecl(name, publicId, systemId, ndata) + + # EntityResolver methods + + def resolveEntity(self, publicId, systemId): + return self._ent_handler.resolveEntity(publicId, systemId) + + # XMLReader methods + + def parse(self, source): + self._parent.setContentHandler(self) + self._parent.setErrorHandler(self) + self._parent.setEntityResolver(self) + self._parent.setDTDHandler(self) + self._parent.parse(source) + + def setLocale(self, locale): + self._parent.setLocale(locale) + + def getFeature(self, name): + return self._parent.getFeature(name) + + def setFeature(self, name, state): + self._parent.setFeature(name, state) + + def getProperty(self, name): + return self._parent.getProperty(name) + + def setProperty(self, name, value): + self._parent.setProperty(name, value) + + # XMLFilter methods + + def getParent(self): + return self._parent + + def setParent(self, parent): + self._parent = parent + +# --- Utility functions + +def prepare_input_source(source, base=""): + """This function takes an InputSource and an optional base URL and + returns a fully resolved InputSource object ready for reading.""" + + if isinstance(source, str): + source = xmlreader.InputSource(source) + elif hasattr(source, "read"): + f = source + source = xmlreader.InputSource() + if isinstance(f.read(0), str): + source.setCharacterStream(f) + else: + source.setByteStream(f) + if hasattr(f, "name") and isinstance(f.name, str): + source.setSystemId(f.name) + + if source.getCharacterStream() is None and source.getByteStream() is None: + sysid = source.getSystemId() + basehead = os.path.dirname(os.path.normpath(base)) + sysidfilename = os.path.join(basehead, sysid) + if os.path.isfile(sysidfilename): + source.setSystemId(sysidfilename) + f = open(sysidfilename, "rb") + else: + source.setSystemId(urllib.parse.urljoin(base, sysid)) + f = urllib.request.urlopen(source.getSystemId()) + + source.setByteStream(f) + + return source diff --git a/modules/language/python/module/xml/sax/xmlreader.py b/modules/language/python/module/xml/sax/xmlreader.py new file mode 100644 index 0000000..4c24044 --- /dev/null +++ b/modules/language/python/module/xml/sax/xmlreader.py @@ -0,0 +1,383 @@ +module(xml,sax,xmlreader) + +"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers +should be based on this code. """ + +import xml.sax.handler as handler + +from xml.sax._exceptions import SAXNotSupportedException, SAXNotRecognizedException + +__all__=['InputSource','XMLReader','IncrementalParser','Locator' + ,'AttributesImpl','AttributesNSImpl'] +# ===== XMLREADER ===== + +class XMLReader: + """Interface for reading an XML document using callbacks. + + XMLReader is the interface that an XML parser's SAX2 driver must + implement. This interface allows an application to set and query + features and properties in the parser, to register event handlers + for document processing, and to initiate a document parse. + + All SAX interfaces are assumed to be synchronous: the parse + methods must not return until parsing is complete, and readers + must wait for an event-handler callback to return before reporting + the next event.""" + + def __init__(self): + self._cont_handler = handler.ContentHandler() + self._dtd_handler = handler.DTDHandler() + self._ent_handler = handler.EntityResolver() + self._err_handler = handler.ErrorHandler() + + def parse(self, source): + "Parse an XML document from a system identifier or an InputSource." + raise NotImplementedError("This method must be implemented!") + + def getContentHandler(self): + "Returns the current ContentHandler." + return self._cont_handler + + def setContentHandler(self, handler): + "Registers a new object to receive document content events." + self._cont_handler = handler + + def getDTDHandler(self): + "Returns the current DTD handler." + return self._dtd_handler + + def setDTDHandler(self, handler): + "Register an object to receive basic DTD-related events." + self._dtd_handler = handler + + def getEntityResolver(self): + "Returns the current EntityResolver." + return self._ent_handler + + def setEntityResolver(self, resolver): + "Register an object to resolve external entities." + self._ent_handler = resolver + + def getErrorHandler(self): + "Returns the current ErrorHandler." + return self._err_handler + + def setErrorHandler(self, handler): + "Register an object to receive error-message events." + self._err_handler = handler + + def setLocale(self, locale): + """Allow an application to set the locale for errors and warnings. + + SAX parsers are not required to provide localization for errors + and warnings; if they cannot support the requested locale, + however, they must raise a SAX exception. Applications may + request a locale change in the middle of a parse.""" + raise SAXNotSupportedException("Locale support not implemented") + + def getFeature(self, name): + "Looks up and returns the state of a SAX2 feature." + raise SAXNotRecognizedException("Feature '%s' not recognized" % name) + + def setFeature(self, name, state): + "Sets the state of a SAX2 feature." + raise SAXNotRecognizedException("Feature '%s' not recognized" % name) + + def getProperty(self, name): + "Looks up and returns the value of a SAX2 property." + raise SAXNotRecognizedException("Property '%s' not recognized" % name) + + def setProperty(self, name, value): + "Sets the value of a SAX2 property." + raise SAXNotRecognizedException("Property '%s' not recognized" % name) + +class IncrementalParser(XMLReader): + """This interface adds three extra methods to the XMLReader + interface that allow XML parsers to support incremental + parsing. Support for this interface is optional, since not all + underlying XML parsers support this functionality. + + When the parser is instantiated it is ready to begin accepting + data from the feed method immediately. After parsing has been + finished with a call to close the reset method must be called to + make the parser ready to accept new data, either from feed or + using the parse method. + + Note that these methods must _not_ be called during parsing, that + is, after parse has been called and before it returns. + + By default, the class also implements the parse method of the XMLReader + interface using the feed, close and reset methods of the + IncrementalParser interface as a convenience to SAX 2.0 driver + writers.""" + + def __init__(self, bufsize=2**16): + self._bufsize = bufsize + XMLReader.__init__(self) + + def parse(self, source): + import xml.sax.saxutils as saxutils + source = saxutils.prepare_input_source(source) + + self.prepareParser(source) + file = source.getCharacterStream() + if file is None: + file = source.getByteStream() + buffer = file.read(self._bufsize) + while buffer: + self.feed(buffer) + buffer = file.read(self._bufsize) + self.close() + + def feed(self, data): + """This method gives the raw XML data in the data parameter to + the parser and makes it parse the data, emitting the + corresponding events. It is allowed for XML constructs to be + split across several calls to feed. + + feed may raise SAXException.""" + raise NotImplementedError("This method must be implemented!") + + def prepareParser(self, source): + """This method is called by the parse implementation to allow + the SAX 2.0 driver to prepare itself for parsing.""" + raise NotImplementedError("prepareParser must be overridden!") + + def close(self): + """This method is called when the entire XML document has been + passed to the parser through the feed method, to notify the + parser that there are no more data. This allows the parser to + do the final checks on the document and empty the internal + data buffer. + + The parser will not be ready to parse another document until + the reset method has been called. + + close may raise SAXException.""" + raise NotImplementedError("This method must be implemented!") + + def reset(self): + """This method is called after close has been called to reset + the parser so that it is ready to parse new documents. The + results of calling parse or feed after close without calling + reset are undefined.""" + raise NotImplementedError("This method must be implemented!") + +# ===== LOCATOR ===== + +class Locator: + """Interface for associating a SAX event with a document + location. A locator object will return valid results only during + calls to DocumentHandler methods; at any other time, the + results are unpredictable.""" + + def getColumnNumber(self): + "Return the column number where the current event ends." + return -1 + + def getLineNumber(self): + "Return the line number where the current event ends." + return -1 + + def getPublicId(self): + "Return the public identifier for the current event." + return None + + def getSystemId(self): + "Return the system identifier for the current event." + return None + +# ===== INPUTSOURCE ===== + +class InputSource: + """Encapsulation of the information needed by the XMLReader to + read entities. + + This class may include information about the public identifier, + system identifier, byte stream (possibly with character encoding + information) and/or the character stream of an entity. + + Applications will create objects of this class for use in the + XMLReader.parse method and for returning from + EntityResolver.resolveEntity. + + An InputSource belongs to the application, the XMLReader is not + allowed to modify InputSource objects passed to it from the + application, although it may make copies and modify those.""" + + def __init__(self, system_id = None): + self.__system_id = system_id + self.__public_id = None + self.__encoding = None + self.__bytefile = None + self.__charfile = None + + def setPublicId(self, public_id): + "Sets the public identifier of this InputSource." + self.__public_id = public_id + + def getPublicId(self): + "Returns the public identifier of this InputSource." + return self.__public_id + + def setSystemId(self, system_id): + "Sets the system identifier of this InputSource." + self.__system_id = system_id + + def getSystemId(self): + "Returns the system identifier of this InputSource." + return self.__system_id + + def setEncoding(self, encoding): + """Sets the character encoding of this InputSource. + + The encoding must be a string acceptable for an XML encoding + declaration (see section 4.3.3 of the XML recommendation). + + The encoding attribute of the InputSource is ignored if the + InputSource also contains a character stream.""" + self.__encoding = encoding + + def getEncoding(self): + "Get the character encoding of this InputSource." + return self.__encoding + + def setByteStream(self, bytefile): + """Set the byte stream (a Python file-like object which does + not perform byte-to-character conversion) for this input + source. + + The SAX parser will ignore this if there is also a character + stream specified, but it will use a byte stream in preference + to opening a URI connection itself. + + If the application knows the character encoding of the byte + stream, it should set it with the setEncoding method.""" + self.__bytefile = bytefile + + def getByteStream(self): + """Get the byte stream for this input source. + + The getEncoding method will return the character encoding for + this byte stream, or None if unknown.""" + return self.__bytefile + + def setCharacterStream(self, charfile): + """Set the character stream for this input source. (The stream + must be a Python 2.0 Unicode-wrapped file-like that performs + conversion to Unicode strings.) + + If there is a character stream specified, the SAX parser will + ignore any byte stream and will not attempt to open a URI + connection to the system identifier.""" + self.__charfile = charfile + + def getCharacterStream(self): + "Get the character stream for this input source." + return self.__charfile + +# ===== ATTRIBUTESIMPL ===== + +class AttributesImpl: + + def __init__(self, attrs): + """Non-NS-aware implementation. + + attrs should be of the form {name : value}.""" + self._attrs = attrs + + def getLength(self): + return len(self._attrs) + + def getType(self, name): + return "CDATA" + + def getValue(self, name): + return self._attrs[name] + + def getValueByQName(self, name): + return self._attrs[name] + + def getNameByQName(self, name): + if name not in self._attrs: + raise KeyError(name) + return name + + def getQNameByName(self, name): + if name not in self._attrs: + raise KeyError(name) + return name + + def getNames(self): + return list(self._attrs.keys()) + + def getQNames(self): + return list(self._attrs.keys()) + + def __len__(self): + return len(self._attrs) + + def __getitem__(self, name): + return self._attrs[name] + + def keys(self): + return list(self._attrs.keys()) + + def __contains__(self, name): + return name in self._attrs + + def get(self, name, alternative=None): + return self._attrs.get(name, alternative) + + def copy(self): + return self.__class__(self._attrs) + + def items(self): + return list(self._attrs.items()) + + def values(self): + return list(self._attrs.values()) + +# ===== ATTRIBUTESNSIMPL ===== + +class AttributesNSImpl(AttributesImpl): + + def __init__(self, attrs, qnames): + """NS-aware implementation. + + attrs should be of the form {(ns_uri, lname): value, ...}. + qnames of the form {(ns_uri, lname): qname, ...}.""" + self._attrs = attrs + self._qnames = qnames + + def getValueByQName(self, name): + for (nsname, qname) in self._qnames.items(): + if qname == name: + return self._attrs[nsname] + + raise KeyError(name) + + def getNameByQName(self, name): + for (nsname, qname) in self._qnames.items(): + if qname == name: + return nsname + + raise KeyError(name) + + def getQNameByName(self, name): + return self._qnames[name] + + def getQNames(self): + return list(self._qnames.values()) + + def copy(self): + return self.__class__(self._attrs, self._qnames) + + +def _test(): + XMLReader() + IncrementalParser() + Locator() + +if __name__ == "__main__": + _test() |