diff options
author | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-09-06 23:40:42 +0200 |
---|---|---|
committer | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-09-06 23:40:42 +0200 |
commit | 8733038b4de44a8cfb669420621597457eeef88a (patch) | |
tree | 9f7345b4b407ea0919320e17955b058cb2f2a5bc | |
parent | b950c0d70a1d2c95da8d60aca7af02bc50542a2b (diff) |
email policy works mm
30 files changed, 12018 insertions, 80 deletions
diff --git a/modules/language/python/compile.scm b/modules/language/python/compile.scm index d4b1011..503a1d8 100644 --- a/modules/language/python/compile.scm +++ b/modules/language/python/compile.scm @@ -544,9 +544,6 @@ (mkfast ;; General ((__init__) (O 'py-init)) - ((__getattr__) (O 'ref)) - ((__setattr__) (O 'set)) - ((__delattr__) (O 'del)) ((__ne__) (O 'ne)) ((__eq__) (O 'equal?)) ((__repr__) (O 'repr)) @@ -1221,9 +1218,7 @@ (arglist->pkw (clean parents)) `(,(G 'cons) (,(G 'quote) ()) (,(G 'quote) ()))) ,doc - ,(cons - (list 'define '__doc__ doc) - (map (lambda (x) `(define ,x ,(gw-persson x vo))) ls)) + ,(map (lambda (x) `(define ,x ,(gw-persson x vo))) ls) ,cd))))))))) (#:verb ((_ x) x)) @@ -1964,7 +1959,7 @@ (define (exp vs x) - (match (pr x) + (match x ((e) (exp vs e)) ((tag . l) @@ -2039,13 +2034,14 @@ (doc (cdr e.doc))) `(begin - (,(G 'set!) __doc__ ,doc) ,@start (,(G 'define) ,fnm (,(G 'make-hash-table))) ,@(map (lambda (s) (if (member s (fluid-ref ignore)) `(,cvalues) - `(,(C 'var) ,(cons '__doc__ s)))) globs) + `(,(C 'var) ,s))) + (cons '__doc__ globs)) + (,(G 'set!) __doc__ ,doc) ,@e (,(C 'export-all))))) @@ -2056,17 +2052,17 @@ (if (pair? start) (set! x (cdr x))) - + (let* ((globs (get-globals x)) (res (gensym "res")) (e (map (g globs exp) x))) `(,(G 'begin) - ,@start - ,@(map (lambda (s) - (if (member s (fluid-ref ignore)) - `(,cvalues) - `(,(C 'var) ,s))) globs) - (,(C 'with-exit) ,@e)))))) + ,@start + ,@(map (lambda (s) + (if (member s (fluid-ref ignore)) + `(,cvalues) + `(,(C 'var) ,s))) globs) + (,(C 'with-exit) ,@e)))))) diff --git a/modules/language/python/exceptions.scm b/modules/language/python/exceptions.scm index 0a3689f..87dd3c5 100644 --- a/modules/language/python/exceptions.scm +++ b/modules/language/python/exceptions.scm @@ -1,24 +1,27 @@ (define-module (language python exceptions) #:use-module (oop pf-objects) #:use-module (oop goops) - #:export (StopIteration GeneratorExit RuntimeError + #:export (StopIteration GeneratorExit RuntimeError TabError Exception ValueError TypeError IndexError KeyError AttributeError ArgumentError SyntaxError SystemException OSError ProcessLookupError PermissionError None NotImplemented NotImplementedError - RunTimeError AssertionError ImportError + AssertionError ImportError ModuleNotFoundError BlockingIOError InterruptedError BaseException - ZeroDivisionError ArithmeticError + ZeroDivisionError OverflowError RecursionError Warning DeprecationWarning BytesWarning - ResourceWarning UserWarning + ResourceWarning UserWarning UnicodeTranslateError UnicodeDecodeError LookupError IndentationError KeyboardInterrupt MemoryError NameError EOFError UnicodeError UnicodeEncodeError FileExistsError FileNotFoundError IsADirectoryError - EnvironmentError)) + EnvironmentError ConnectionError NotADirectoryError + ConnectionResetError ChildProcessError TimeOutError + BrokenPipeError ConnectionAbortedError + ConnectionRefusedError ArithmeticError)) (define-syntax-rule (aif it p x y) (let ((it p)) (if it x y))) @@ -71,46 +74,67 @@ (define StopIteration 'StopIteration) (define GeneratorExit 'GeneratorExit) (define-er EnvironmentError 'EnvironmentError) -(define-er UnicodeEncodeError 'UnicodeEncodeError) -(define-er FileExistsError 'FileExistsError) -(define-er FileNotFoundError 'FileNotFoundError) -(define-er IsADirectoryError 'IsADirectoryError) -(define-er UnicodeError 'UnicodeError) + (define-er EOFError 'EOFError) (define-er MemoryError 'MemoryError) (define-er NameError 'NameError) -(define-er UnicodeDecodeError 'UnicodeDecodeError) + +(define-er ValueError 'ValueError) +(define-python-class UnicodeError (ValueError)) +(define-python-class UnicodeDecodeError (UnicodeError)) +(define-python-class UnicodeEncodeError (UnicodeError)) +(define-python-class UnicodeTranslateError (UnicodeError)) + (define-er LookupError 'LookupError) -(define-er IndentationError 'IndentationError) -(define-er OverflowError 'OverflowError) + (define-python-class IndexError (LookupError)) + (define-python-class KeyError (LookupError)) + +(define-er ArithmeticError 'OverflowError) + (define-python-class OverflowError (ArithmeticError)) + (define-python-class ZeroDivisionError (ArithmeticError)) + + (define-er KeyboardInterrupt 'KeyboardInterrupt) -(define-er RecursionError 'RecursionError) -(define-er ArithmeticError 'ArithmeticError) (define-er BaseException 'BaseException) -(define-er ZeroDivisionError 'ZeroDivisionError) (define-er SystemException 'SystemException) (define-er RuntimeError 'RuntimeError) -(define-er IndexError 'IndexError) + (define-python-class NotImplementedError (RuntimeError)) + (define-python-class RecursionError (RuntimeError)) + + (define-er ArgumentError 'IndexError) -(define-er ValueError 'ValueError) + +(define-er OSError 'OSError) + (define-python-class BlockingIOError (OSError)) + (define-python-class ChildProcessError (OSError)) + (define-python-class ConnectionError (OSError)) + (define-python-class BrokenPipeError (ConnectionError)) + (define-python-class ConnectionAbortedError (ConnectionError)) + (define-python-class ConnectionRefusedError (ConnectionError)) + (define-python-class ConnectionResetError (ConnectionError)) + (define-python-class FileExistsError (OSError)) + (define-python-class FileNotFoundError (OSError)) + (define-python-class InterruptedError (OSError)) + (define-python-class IsADirectoryError (OSError)) + (define-python-class NotADirectoryError (OSError)) + (define-python-class PermissionError (OSError)) + (define-python-class ProcessLookupError (OSError)) + (define-python-class TimeOutError (OSError)) (define None 'None) -(define-er KeyError 'KeyError) + (define-er TypeError 'TypeError) (define-er AttributeError 'AttributeError) (define-er SyntaxError 'SyntaxError) -(define-er OSError 'OSError) -(define-er ProcessLookupError 'ProcessLookupError) -(define-er PermissionError 'PermissionError) -(define-er NotImplementedError 'NotImplementedError) + (define-python-class IndentationError (SyntaxError)) + (define-python-class TabError (IndentationError)) + (define-er RunTimeError 'RunTimeError) (define AssertionError 'AssertionError) (define-er ImportError 'ImportError) (define-er ModuleNotFoundError (ImportError) 'ModuleNotFoundError) -(define-er BlockingIOError 'BlockingIOError) -(define-er InterruptedError 'OSError) (define NotImplemented (list 'NotImplemented)) diff --git a/modules/language/python/module/_markupbase.py b/modules/language/python/module/_markupbase.py new file mode 100644 index 0000000..1c955be --- /dev/null +++ b/modules/language/python/module/_markupbase.py @@ -0,0 +1,397 @@ +module(_markupbase) + +"""Shared support for scanning document type declarations in HTML and XHTML. + +This module is used as a foundation for the html.parser module. It has no +documented public API and should not be used directly. + +""" + +import re + +_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match +_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match +_commentclose = re.compile(r'--\s*>') +_markedsectionclose = re.compile(r']\s*]\s*>') + +# An analysis of the MS-Word extensions is available at +# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf + +_msmarkedsectionclose = re.compile(r']\s*>') + +del re + + +class ParserBase: + """Parser base class which provides some common support methods used + by the SGML/HTML and XHTML parsers.""" + + def __init__(self): + if self.__class__ is ParserBase: + raise RuntimeError( + "_markupbase.ParserBase must be subclassed") + + def error(self, message): + raise NotImplementedError( + "subclasses of ParserBase must override error()") + + def reset(self): + self.lineno = 1 + self.offset = 0 + + def getpos(self): + """Return current line number and offset.""" + return self.lineno, self.offset + + # Internal -- update line number and offset. This should be + # called for each piece of data exactly once, in order -- in other + # words the concatenation of all the input strings to this + # function should be exactly the entire input. + def updatepos(self, i, j): + if i >= j: + return j + rawdata = self.rawdata + nlines = rawdata.count("\n", i, j) + if nlines: + self.lineno = self.lineno + nlines + pos = rawdata.rindex("\n", i, j) # Should not fail + self.offset = j-(pos+1) + else: + self.offset = self.offset + j-i + return j + + _decl_otherchars = '' + + # Internal -- parse declaration (for use by subclasses). + def parse_declaration(self, i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + # ISO 8879:1986, however, has more complex + # declaration syntax for elements in <!...>, including: + # --comment-- + # [marked section] + # name in the following list: ENTITY, DOCTYPE, ELEMENT, + # ATTLIST, NOTATION, SHORTREF, USEMAP, + # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM + rawdata = self.rawdata + j = i + 2 + assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" + if rawdata[j:j+1] == ">": + # the empty comment <!> + return j + 1 + if rawdata[j:j+1] in ("-", ""): + # Start of comment followed by buffer boundary, + # or just a buffer boundary. + return -1 + # A simple, practical version could look like: ((name|stringlit) S*) + '>' + n = len(rawdata) + if rawdata[j:j+2] == '--': #comment + # Locate --.*-- as the body of the comment + return self.parse_comment(i) + elif rawdata[j] == '[': #marked section + # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section + # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA + # Note that this is extended by Microsoft Office "Save as Web" function + # to include [if...] and [endif]. + return self.parse_marked_section(i) + else: #all other declaration elements + decltype, j = self._scan_name(j, i) + if j < 0: + return j + if decltype == "doctype": + self._decl_otherchars = '' + while j < n: + c = rawdata[j] + if c == ">": + # end of declaration syntax + data = rawdata[i+2:j] + if decltype == "doctype": + self.handle_decl(data) + else: + # According to the HTML5 specs sections "8.2.4.44 Bogus + # comment state" and "8.2.4.45 Markup declaration open + # state", a comment token should be emitted. + # Calling unknown_decl provides more flexibility though. + self.unknown_decl(data) + return j + 1 + if c in "\"'": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 # incomplete + j = m.end() + elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + name, j = self._scan_name(j, i) + elif c in self._decl_otherchars: + j = j + 1 + elif c == "[": + # this could be handled in a separate doctype parser + if decltype == "doctype": + j = self._parse_doctype_subset(j + 1, i) + elif decltype in {"attlist", "linktype", "link", "element"}: + # must tolerate []'d groups in a content model in an element declaration + # also in data attribute specifications of attlist declaration + # also link type declaration subsets in linktype declarations + # also link attribute specification lists in link declarations + self.error("unsupported '[' char in %s declaration" % decltype) + else: + self.error("unexpected '[' char in declaration") + else: + self.error( + "unexpected %r char in declaration" % rawdata[j]) + if j < 0: + return j + return -1 # incomplete + + # Internal -- parse a marked section + # Override this to handle MS-word extension syntax <![if word]>content<![endif]> + def parse_marked_section(self, i, report=1): + rawdata= self.rawdata + assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" + sectName, j = self._scan_name( i+3, i ) + if j < 0: + return j + if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}: + # look for standard ]]> ending + match= _markedsectionclose.search(rawdata, i+3) + elif sectName in {"if", "else", "endif"}: + # look for MS Office ]> ending + match= _msmarkedsectionclose.search(rawdata, i+3) + else: + self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) + if not match: + return -1 + if report: + j = match.start(0) + self.unknown_decl(rawdata[i+3: j]) + return match.end(0) + + # Internal -- parse comment, return length or -1 if not terminated + def parse_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+4] != '<!--': + self.error('unexpected call to parse_comment()') + match = _commentclose.search(rawdata, i+4) + if not match: + return -1 + if report: + j = match.start(0) + self.handle_comment(rawdata[i+4: j]) + return match.end(0) + + # Internal -- scan past the internal subset in a <!DOCTYPE declaration, + # returning the index just past any whitespace following the trailing ']'. + def _parse_doctype_subset(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + j = i + while j < n: + c = rawdata[j] + if c == "<": + s = rawdata[j:j+2] + if s == "<": + # end of buffer; incomplete + return -1 + if s != "<!": + self.updatepos(declstartpos, j + 1) + self.error("unexpected char in internal subset (in %r)" % s) + if (j + 2) == n: + # end of buffer; incomplete + return -1 + if (j + 4) > n: + # end of buffer; incomplete + return -1 + if rawdata[j:j+4] == "<!--": + j = self.parse_comment(j, report=0) + if j < 0: + return j + continue + name, j = self._scan_name(j + 2, declstartpos) + if j == -1: + return -1 + if name not in {"attlist", "element", "entity", "notation"}: + self.updatepos(declstartpos, j + 2) + self.error( + "unknown declaration %r in internal subset" % name) + # handle the individual names + meth = getattr(self, "_parse_doctype_" + name) + j = meth(j, declstartpos) + if j < 0: + return j + elif c == "%": + # parameter entity reference + if (j + 1) == n: + # end of buffer; incomplete + return -1 + s, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + if rawdata[j] == ";": + j = j + 1 + elif c == "]": + j = j + 1 + while j < n and rawdata[j].isspace(): + j = j + 1 + if j < n: + if rawdata[j] == ">": + return j + self.updatepos(declstartpos, j) + self.error("unexpected char after internal subset") + else: + return -1 + elif c.isspace(): + j = j + 1 + else: + self.updatepos(declstartpos, j) + self.error("unexpected char %r in internal subset" % c) + # end of buffer reached + return -1 + + # Internal -- scan past <!ELEMENT declarations + def _parse_doctype_element(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j == -1: + return -1 + # style content model; just skip until '>' + rawdata = self.rawdata + if '>' in rawdata[j:]: + return rawdata.find(">", j) + 1 + return -1 + + # Internal -- scan past <!ATTLIST declarations + def _parse_doctype_attlist(self, i, declstartpos): + rawdata = self.rawdata + name, j = self._scan_name(i, declstartpos) + c = rawdata[j:j+1] + if c == "": + return -1 + if c == ">": + return j + 1 + while 1: + # scan a series of attribute descriptions; simplified: + # name type [value] [#constraint] + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if c == "": + return -1 + if c == "(": + # an enumerated type; look for ')' + if ")" in rawdata[j:]: + j = rawdata.find(")", j) + 1 + else: + return -1 + while rawdata[j:j+1].isspace(): + j = j + 1 + if not rawdata[j:]: + # end of buffer, incomplete + return -1 + else: + name, j = self._scan_name(j, declstartpos) + c = rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 + c = rawdata[j:j+1] + if not c: + return -1 + if c == "#": + if rawdata[j:] == "#": + # end of buffer + return -1 + name, j = self._scan_name(j + 1, declstartpos) + if j < 0: + return j + c = rawdata[j:j+1] + if not c: + return -1 + if c == '>': + # all done + return j + 1 + + # Internal -- scan past <!NOTATION declarations + def _parse_doctype_notation(self, i, declstartpos): + name, j = self._scan_name(i, declstartpos) + if j < 0: + return j + rawdata = self.rawdata + while 1: + c = rawdata[j:j+1] + if not c: + # end of buffer; incomplete + return -1 + if c == '>': + return j + 1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if not m: + return -1 + j = m.end() + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan past <!ENTITY declarations + def _parse_doctype_entity(self, i, declstartpos): + rawdata = self.rawdata + if rawdata[i:i+1] == "%": + j = i + 1 + while 1: + c = rawdata[j:j+1] + if not c: + return -1 + if c.isspace(): + j = j + 1 + else: + break + else: + j = i + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + while 1: + c = self.rawdata[j:j+1] + if not c: + return -1 + if c in "'\"": + m = _declstringlit_match(rawdata, j) + if m: + j = m.end() + else: + return -1 # incomplete + elif c == ">": + return j + 1 + else: + name, j = self._scan_name(j, declstartpos) + if j < 0: + return j + + # Internal -- scan a name token and the new position and the token, or + # return -1 if we've reached the end of the buffer. + def _scan_name(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + if i == n: + return None, -1 + m = _declname_match(rawdata, i) + if m: + s = m.group() + name = s.strip() + if (i + len(s)) == n: + return None, -1 # end of buffer + return name.lower(), m.end() + else: + self.updatepos(declstartpos, i) + self.error("expected name token at %r" + % rawdata[declstartpos:declstartpos+20]) + + # To be overridden -- handlers for unknown objects + def unknown_decl(self, data): + pass diff --git a/modules/language/python/module/binascii.scm b/modules/language/python/module/binascii.scm index 12bdcac..2fc0e62 100644 --- a/modules/language/python/module/binascii.scm +++ b/modules/language/python/module/binascii.scm @@ -386,7 +386,7 @@ (lp (+ i 2) (cons x r))) (bytes (reverse r))))))) -(define-inlinable (id x) x) +(define (id x) x) (define-syntax-rule (mkcrc crc_hqx high xor mask) (def (crc_hqx data (= value 0)) (let ((n (len data)) diff --git a/modules/language/python/module/email/_encoded_words.py b/modules/language/python/module/email/_encoded_words.py new file mode 100644 index 0000000..8c02709 --- /dev/null +++ b/modules/language/python/module/email/_encoded_words.py @@ -0,0 +1,222 @@ +module(email,_encoded_words) +""" Routines for manipulating RFC2047 encoded words. + +This is currently a package-private API, but will be considered for promotion +to a public API if there is demand. + +""" + +# An ecoded word looks like this: +# +# =?charset[*lang]?cte?encoded_string?= +# +# for more information about charset see the charset module. Here it is one +# of the preferred MIME charset names (hopefully; you never know when parsing). +# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In +# theory other letters could be used for other encodings, but in practice this +# (almost?) never happens. There could be a public API for adding entries +# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is +# Base64. The meaning of encoded_string should be obvious. 'lang' is optional +# as indicated by the brackets (they are not part of the syntax) but is almost +# never encountered in practice. +# +# The general interface for a CTE decoder is that it takes the encoded_string +# as its argument, and returns a tuple (cte_decoded_string, defects). The +# cte_decoded_string is the original binary that was encoded using the +# specified cte. 'defects' is a list of MessageDefect instances indicating any +# problems encountered during conversion. 'charset' and 'lang' are the +# corresponding strings extracted from the EW, case preserved. +# +# The general interface for a CTE encoder is that it takes a binary sequence +# as input and returns the cte_encoded_string, which is an ascii-only string. +# +# Each decoder must also supply a length function that takes the binary +# sequence as its argument and returns the length of the resulting encoded +# string. +# +# The main API functions for the module are decode, which calls the decoder +# referenced by the cte specifier, and encode, which adds the appropriate +# RFC 2047 "chrome" to the encoded string, and can optionally automatically +# select the shortest possible encoding. See their docstrings below for +# details. + +import re +import base64 +import binascii +import functools +from string import ascii_letters, digits +import email.errors as errors + +__all__ = ['decode_q', + 'encode_q', + 'decode_b', + 'encode_b', + 'len_q', + 'len_b', + 'decode', + 'encode', + ] + +# +# Quoted Printable +# + +# regex based decoder. +_q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub, + lambda m: bytes([int(m.group(1), 16)])) + +def decode_q(encoded): + encoded = encoded.replace(b'_', b' ') + return _q_byte_subber(encoded), [] + + +# dict mapping bytes to their encoded form +class _QByteMap(dict): + + safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii') + + def __missing__(self, key): + if key in self.safe: + self[key] = chr(key) + else: + self[key] = "={:02X}".format(key) + return self[key] + +_q_byte_map = _QByteMap() + +# In headers spaces are mapped to '_'. +_q_byte_map[ord(' ')] = '_' + +def encode_q(bstring): + return ''.join(_q_byte_map[x] for x in bstring) + +def len_q(bstring): + return sum(len(_q_byte_map[x]) for x in bstring) + + +# +# Base64 +# + +def decode_b(encoded): + defects = [] + pad_err = len(encoded) % 4 + if pad_err: + defects.append(errors.InvalidBase64PaddingDefect()) + padded_encoded = encoded + b'==='[:4-pad_err] + else: + padded_encoded = encoded + try: + return base64.b64decode(padded_encoded, validate=True), defects + except binascii.Error: + # Since we had correct padding, this must an invalid char error. + defects = [errors.InvalidBase64CharactersDefect()] + # The non-alphabet characters are ignored as far as padding + # goes, but we don't know how many there are. So we'll just + # try various padding lengths until something works. + for i in 0, 1, 2, 3: + try: + return base64.b64decode(encoded+b'='*i, validate=False), defects + except binascii.Error: + if i==0: + defects.append(errors.InvalidBase64PaddingDefect()) + else: + # This should never happen. + raise AssertionError("unexpected binascii.Error") + +def encode_b(bstring): + return base64.b64encode(bstring).decode('ascii') + +def len_b(bstring): + groups_of_3, leftover = divmod(len(bstring), 3) + # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in. + return groups_of_3 * 4 + (4 if leftover else 0) + + +_cte_decoders = { + 'q': decode_q, + 'b': decode_b, + } + +def decode(ew): + """Decode encoded word and return (string, charset, lang, defects) tuple. + + An RFC 2047/2243 encoded word has the form: + + =?charset*lang?cte?encoded_string?= + + where '*lang' may be omitted but the other parts may not be. + + This function expects exactly such a string (that is, it does not check the + syntax and may raise errors if the string is not well formed), and returns + the encoded_string decoded first from its Content Transfer Encoding and + then from the resulting bytes into unicode using the specified charset. If + the cte-decoded string does not successfully decode using the specified + character set, a defect is added to the defects list and the unknown octets + are replaced by the unicode 'unknown' character \\uFDFF. + + The specified charset and language are returned. The default for language, + which is rarely if ever encountered, is the empty string. + + """ + _, charset, cte, cte_string, _ = ew.split('?') + charset, _, lang = charset.partition('*') + cte = cte.lower() + # Recover the original bytes and do CTE decoding. + bstring = cte_string.encode('ascii', 'surrogateescape') + bstring, defects = _cte_decoders[cte](bstring) + # Turn the CTE decoded bytes into unicode. + try: + string = bstring.decode(charset) + except UnicodeError: + defects.append(errors.UndecodableBytesDefect("Encoded word " + "contains bytes not decodable using {} charset".format(charset))) + string = bstring.decode(charset, 'surrogateescape') + except LookupError: + string = bstring.decode('ascii', 'surrogateescape') + if charset.lower() != 'unknown-8bit': + defects.append(errors.CharsetError("Unknown charset {} " + "in encoded word; decoded as unknown bytes".format(charset))) + return string, charset, lang, defects + + +_cte_encoders = { + 'q': encode_q, + 'b': encode_b, + } + +_cte_encode_length = { + 'q': len_q, + 'b': len_b, + } + +def encode(string, charset='utf-8', encoding=None, lang=''): + """Encode string using the CTE encoding that produces the shorter result. + + Produces an RFC 2047/2243 encoded word of the form: + + =?charset*lang?cte?encoded_string?= + + where '*lang' is omitted unless the 'lang' parameter is given a value. + Optional argument charset (defaults to utf-8) specifies the charset to use + to encode the string to binary before CTE encoding it. Optional argument + 'encoding' is the cte specifier for the encoding that should be used ('q' + or 'b'); if it is None (the default) the encoding which produces the + shortest encoded sequence is used, except that 'q' is preferred if it is up + to five characters longer. Optional argument 'lang' (default '') gives the + RFC 2243 language string to specify in the encoded word. + + """ + if charset == 'unknown-8bit': + bstring = string.encode('ascii', 'surrogateescape') + else: + bstring = string.encode(charset) + if encoding is None: + qlen = _cte_encode_length['q'](bstring) + blen = _cte_encode_length['b'](bstring) + # Bias toward q. 5 is arbitrary. + encoding = 'q' if qlen - blen < 5 else 'b' + encoded = _cte_encoders[encoding](bstring) + if lang: + lang = '*' + lang + return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded) diff --git a/modules/language/python/module/email/_header_value_parser.py b/modules/language/python/module/email/_header_value_parser.py new file mode 100644 index 0000000..0f4a1cb --- /dev/null +++ b/modules/language/python/module/email/_header_value_parser.py @@ -0,0 +1,2820 @@ +module(email,_header_value_parser) + +"""Header value parser implementing various email-related RFC parsing rules. + +The parsing methods defined in this module implement various email related +parsing rules. Principal among them is RFC 5322, which is the followon +to RFC 2822 and primarily a clarification of the former. It also implements +RFC 2047 encoded word decoding. + +RFC 5322 goes to considerable trouble to maintain backward compatibility with +RFC 822 in the parse phase, while cleaning up the structure on the generation +phase. This parser supports correct RFC 5322 generation by tagging white space +as folding white space only when folding is allowed in the non-obsolete rule +sets. Actually, the parser is even more generous when accepting input than RFC +5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. +Where possible deviations from the standard are annotated on the 'defects' +attribute of tokens that deviate. + +The general structure of the parser follows RFC 5322, and uses its terminology +where there is a direct correspondence. Where the implementation requires a +somewhat different structure than that used by the formal grammar, new terms +that mimic the closest existing terms are used. Thus, it really helps to have +a copy of RFC 5322 handy when studying this code. + +Input to the parser is a string that has already been unfolded according to +RFC 5322 rules. According to the RFC this unfolding is the very first step, and +this parser leaves the unfolding step to a higher level message parser, which +will have already detected the line breaks that need unfolding while +determining the beginning and end of each header. + +The output of the parser is a TokenList object, which is a list subclass. A +TokenList is a recursive data structure. The terminal nodes of the structure +are Terminal objects, which are subclasses of str. These do not correspond +directly to terminal objects in the formal grammar, but are instead more +practical higher level combinations of true terminals. + +All TokenList and Terminal objects have a 'value' attribute, which produces the +semantically meaningful value of that part of the parse subtree. The value of +all whitespace tokens (no matter how many sub-tokens they may contain) is a +single space, as per the RFC rules. This includes 'CFWS', which is herein +included in the general class of whitespace tokens. There is one exception to +the rule that whitespace tokens are collapsed into single spaces in values: in +the value of a 'bare-quoted-string' (a quoted-string with no leading or +trailing whitespace), any whitespace that appeared between the quotation marks +is preserved in the returned value. Note that in all Terminal strings quoted +pairs are turned into their unquoted values. + +All TokenList and Terminal objects also have a string value, which attempts to +be a "canonical" representation of the RFC-compliant form of the substring that +produced the parsed subtree, including minimal use of quoted pair quoting. +Whitespace runs are not collapsed. + +Comment tokens also have a 'content' attribute providing the string found +between the parens (including any nested comments) with whitespace preserved. + +All TokenList and Terminal objects have a 'defects' attribute which is a +possibly empty list all of the defects found while creating the token. Defects +may appear on any token in the tree, and a composite list of all defects in the +subtree is available through the 'all_defects' attribute of any node. (For +Terminal notes x.defects == x.all_defects.) + +Each object in a parse tree is called a 'token', and each has a 'token_type' +attribute that gives the name from the RFC 5322 grammar that it represents. +Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that +may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters. +It is returned in place of lists of (ctext/quoted-pair) and +(qtext/quoted-pair). + +XXX: provide complete list of token types. +""" + +import re +import urllib.parse as parse # For urllib.parse.unquote +from string import hexdigits +from collections import OrderedDict +from operator import itemgetter +import email._encoded_words as _ew +import email.errors as errors +import email.utils as utils + +# +# Useful constants and functions +# + +WSP = set(' \t') +CFWS_LEADER = WSP | set('(') +SPECIALS = set(r'()<>@,:;.\"[]') +ATOM_ENDS = SPECIALS | WSP +DOT_ATOM_ENDS = ATOM_ENDS - set('.') +# '.', '"', and '(' do not end phrases in order to support obs-phrase +PHRASE_ENDS = SPECIALS - set('."(') +TSPECIALS = (SPECIALS | set('/?=')) - set('.') +TOKEN_ENDS = TSPECIALS | WSP +ASPECIALS = TSPECIALS | set("*'%") +ATTRIBUTE_ENDS = ASPECIALS | WSP +EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') + +def quote_string(value): + return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' + +# +# TokenList and its subclasses +# + +class TokenList(list): + + token_type = None + syntactic_break = True + ew_combine_allowed = True + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self.defects = [] + + def __str__(self): + return ''.join(str(x) for x in self) + + def __repr__(self): + return '{}({})'.format(self.__class__.__name__, + super().__repr__()) + + @property + def value(self): + return ''.join(x.value for x in self if x.value) + + @property + def all_defects(self): + return sum((x.all_defects for x in self), self.defects) + + def startswith_fws(self): + return self[0].startswith_fws() + + @property + def as_ew_allowed(self): + """True if all top level tokens of this part may be RFC2047 encoded.""" + return all(part.as_ew_allowed for part in self) + + @property + def comments(self): + comments = [] + for token in self: + comments.extend(token.comments) + return comments + + def fold(self, *, policy): + return _refold_parse_tree(self, policy=policy) + + def pprint(self, indent=''): + print(self.ppstr(indent=indent)) + + def ppstr(self, indent=''): + return '\n'.join(self._pp(indent=indent)) + + def _pp(self, indent=''): + yield '{}{}/{}('.format( + indent, + self.__class__.__name__, + self.token_type) + for token in self: + if not hasattr(token, '_pp'): + yield (indent + ' !! invalid element in token ' + 'list: {!r}'.format(token)) + else: + yield from token._pp(indent+' ') + if self.defects: + extra = ' Defects: {}'.format(self.defects) + else: + extra = '' + yield '{}){}'.format(indent, extra) + + +class WhiteSpaceTokenList(TokenList): + + @property + def value(self): + return ' ' + + @property + def comments(self): + return [x.content for x in self if x.token_type=='comment'] + + +class UnstructuredTokenList(TokenList): + + token_type = 'unstructured' + + +class Phrase(TokenList): + + token_type = 'phrase' + +class Word(TokenList): + + token_type = 'word' + + +class CFWSList(WhiteSpaceTokenList): + + token_type = 'cfws' + + +class Atom(TokenList): + + token_type = 'atom' + + +class Token(TokenList): + + token_type = 'token' + encode_as_ew = False + + +class EncodedWord(TokenList): + + token_type = 'encoded-word' + cte = None + charset = None + lang = None + + +class QuotedString(TokenList): + + token_type = 'quoted-string' + + @property + def content(self): + for x in self: + if x.token_type == 'bare-quoted-string': + return x.value + + @property + def quoted_value(self): + res = [] + for x in self: + if x.token_type == 'bare-quoted-string': + res.append(str(x)) + else: + res.append(x.value) + return ''.join(res) + + @property + def stripped_value(self): + for token in self: + if token.token_type == 'bare-quoted-string': + return token.value + + +class BareQuotedString(QuotedString): + + token_type = 'bare-quoted-string' + + def __str__(self): + return quote_string(''.join(str(x) for x in self)) + + @property + def value(self): + return ''.join(str(x) for x in self) + + +class Comment(WhiteSpaceTokenList): + + token_type = 'comment' + + def __str__(self): + return ''.join(sum([ + ["("], + [self.quote(x) for x in self], + [")"], + ], [])) + + def quote(self, value): + if value.token_type == 'comment': + return str(value) + return str(value).replace('\\', '\\\\').replace( + '(', r'\(').replace( + ')', r'\)') + + @property + def content(self): + return ''.join(str(x) for x in self) + + @property + def comments(self): + return [self.content] + +class AddressList(TokenList): + + token_type = 'address-list' + + @property + def addresses(self): + return [x for x in self if x.token_type=='address'] + + @property + def mailboxes(self): + return sum((x.mailboxes + for x in self if x.token_type=='address'), []) + + @property + def all_mailboxes(self): + return sum((x.all_mailboxes + for x in self if x.token_type=='address'), []) + + +class Address(TokenList): + + token_type = 'address' + + @property + def display_name(self): + if self[0].token_type == 'group': + return self[0].display_name + + @property + def mailboxes(self): + if self[0].token_type == 'mailbox': + return [self[0]] + elif self[0].token_type == 'invalid-mailbox': + return [] + return self[0].mailboxes + + @property + def all_mailboxes(self): + if self[0].token_type == 'mailbox': + return [self[0]] + elif self[0].token_type == 'invalid-mailbox': + return [self[0]] + return self[0].all_mailboxes + +class MailboxList(TokenList): + + token_type = 'mailbox-list' + + @property + def mailboxes(self): + return [x for x in self if x.token_type=='mailbox'] + + @property + def all_mailboxes(self): + return [x for x in self + if x.token_type in ('mailbox', 'invalid-mailbox')] + + +class GroupList(TokenList): + + token_type = 'group-list' + + @property + def mailboxes(self): + if not self or self[0].token_type != 'mailbox-list': + return [] + return self[0].mailboxes + + @property + def all_mailboxes(self): + if not self or self[0].token_type != 'mailbox-list': + return [] + return self[0].all_mailboxes + + +class Group(TokenList): + + token_type = "group" + + @property + def mailboxes(self): + if self[2].token_type != 'group-list': + return [] + return self[2].mailboxes + + @property + def all_mailboxes(self): + if self[2].token_type != 'group-list': + return [] + return self[2].all_mailboxes + + @property + def display_name(self): + return self[0].display_name + + +class NameAddr(TokenList): + + token_type = 'name-addr' + + @property + def display_name(self): + if len(self) == 1: + return None + return self[0].display_name + + @property + def local_part(self): + return self[-1].local_part + + @property + def domain(self): + return self[-1].domain + + @property + def route(self): + return self[-1].route + + @property + def addr_spec(self): + return self[-1].addr_spec + + +class AngleAddr(TokenList): + + token_type = 'angle-addr' + + @property + def local_part(self): + for x in self: + if x.token_type == 'addr-spec': + return x.local_part + + @property + def domain(self): + for x in self: + if x.token_type == 'addr-spec': + return x.domain + + @property + def route(self): + for x in self: + if x.token_type == 'obs-route': + return x.domains + + @property + def addr_spec(self): + for x in self: + if x.token_type == 'addr-spec': + if x.local_part: + return x.addr_spec + else: + return quote_string(x.local_part) + x.addr_spec + else: + return '<>' + + +class ObsRoute(TokenList): + + token_type = 'obs-route' + + @property + def domains(self): + return [x.domain for x in self if x.token_type == 'domain'] + + +class Mailbox(TokenList): + + token_type = 'mailbox' + + @property + def display_name(self): + if self[0].token_type == 'name-addr': + return self[0].display_name + + @property + def local_part(self): + return self[0].local_part + + @property + def domain(self): + return self[0].domain + + @property + def route(self): + if self[0].token_type == 'name-addr': + return self[0].route + + @property + def addr_spec(self): + return self[0].addr_spec + + +class InvalidMailbox(TokenList): + + token_type = 'invalid-mailbox' + + @property + def display_name(self): + return None + + local_part = domain = route = addr_spec = display_name + + +class Domain(TokenList): + + token_type = 'domain' + as_ew_allowed = False + + @property + def domain(self): + return ''.join(super().value.split()) + + +class DotAtom(TokenList): + + token_type = 'dot-atom' + + +class DotAtomText(TokenList): + + token_type = 'dot-atom-text' + as_ew_allowed = True + + +class AddrSpec(TokenList): + + token_type = 'addr-spec' + as_ew_allowed = False + + @property + def local_part(self): + return self[0].local_part + + @property + def domain(self): + if len(self) < 3: + return None + return self[-1].domain + + @property + def value(self): + if len(self) < 3: + return self[0].value + return self[0].value.rstrip()+self[1].value+self[2].value.lstrip() + + @property + def addr_spec(self): + nameset = set(self.local_part) + if len(nameset) > len(nameset-DOT_ATOM_ENDS): + lp = quote_string(self.local_part) + else: + lp = self.local_part + if self.domain is not None: + return lp + '@' + self.domain + return lp + + +class ObsLocalPart(TokenList): + + token_type = 'obs-local-part' + as_ew_allowed = False + + +class DisplayName(Phrase): + + token_type = 'display-name' + ew_combine_allowed = False + + @property + def display_name(self): + res = TokenList(self) + if res[0].token_type == 'cfws': + res.pop(0) + else: + if res[0][0].token_type == 'cfws': + res[0] = TokenList(res[0][1:]) + if res[-1].token_type == 'cfws': + res.pop() + else: + if res[-1][-1].token_type == 'cfws': + res[-1] = TokenList(res[-1][:-1]) + return res.value + + @property + def value(self): + quote = False + if self.defects: + quote = True + else: + for x in self: + if x.token_type == 'quoted-string': + quote = True + if quote: + pre = post = '' + if self[0].token_type=='cfws' or self[0][0].token_type=='cfws': + pre = ' ' + if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws': + post = ' ' + return pre+quote_string(self.display_name)+post + else: + return super().value + + +class LocalPart(TokenList): + + token_type = 'local-part' + as_ew_allowed = False + + @property + def value(self): + if self[0].token_type == "quoted-string": + return self[0].quoted_value + else: + return self[0].value + + @property + def local_part(self): + # Strip whitespace from front, back, and around dots. + res = [DOT] + last = DOT + last_is_tl = False + for tok in self[0] + [DOT]: + if tok.token_type == 'cfws': + continue + if (last_is_tl and tok.token_type == 'dot' and + last[-1].token_type == 'cfws'): + res[-1] = TokenList(last[:-1]) + is_tl = isinstance(tok, TokenList) + if (is_tl and last.token_type == 'dot' and + tok[0].token_type == 'cfws'): + res.append(TokenList(tok[1:])) + else: + res.append(tok) + last = res[-1] + last_is_tl = is_tl + res = TokenList(res[1:-1]) + return res.value + + +class DomainLiteral(TokenList): + + token_type = 'domain-literal' + as_ew_allowed = False + + @property + def domain(self): + return ''.join(super().value.split()) + + @property + def ip(self): + for x in self: + if x.token_type == 'ptext': + return x.value + + +class MIMEVersion(TokenList): + + token_type = 'mime-version' + major = None + minor = None + + +class Parameter(TokenList): + + token_type = 'parameter' + sectioned = False + extended = False + charset = 'us-ascii' + + @property + def section_number(self): + # Because the first token, the attribute (name) eats CFWS, the second + # token is always the section if there is one. + return self[1].number if self.sectioned else 0 + + @property + def param_value(self): + # This is part of the "handle quoted extended parameters" hack. + for token in self: + if token.token_type == 'value': + return token.stripped_value + if token.token_type == 'quoted-string': + for token in token: + if token.token_type == 'bare-quoted-string': + for token in token: + if token.token_type == 'value': + return token.stripped_value + return '' + + +class InvalidParameter(Parameter): + + token_type = 'invalid-parameter' + + +class Attribute(TokenList): + + token_type = 'attribute' + + @property + def stripped_value(self): + for token in self: + if token.token_type.endswith('attrtext'): + return token.value + +class Section(TokenList): + + token_type = 'section' + number = None + + +class Value(TokenList): + + token_type = 'value' + + @property + def stripped_value(self): + token = self[0] + if token.token_type == 'cfws': + token = self[1] + if token.token_type.endswith( + ('quoted-string', 'attribute', 'extended-attribute')): + return token.stripped_value + return self.value + + +class MimeParameters(TokenList): + + token_type = 'mime-parameters' + syntactic_break = False + + @property + def params(self): + # The RFC specifically states that the ordering of parameters is not + # guaranteed and may be reordered by the transport layer. So we have + # to assume the RFC 2231 pieces can come in any order. However, we + # output them in the order that we first see a given name, which gives + # us a stable __str__. + params = OrderedDict() + for token in self: + if not token.token_type.endswith('parameter'): + continue + if token[0].token_type != 'attribute': + continue + name = token[0].value.strip() + if name not in params: + params[name] = [] + params[name].append((token.section_number, token)) + for name, parts in params.items(): + parts = sorted(parts, key=itemgetter(0)) + first_param = parts[0][1] + charset = first_param.charset + # Our arbitrary error recovery is to ignore duplicate parameters, + # to use appearance order if there are duplicate rfc 2231 parts, + # and to ignore gaps. This mimics the error recovery of get_param. + if not first_param.extended and len(parts) > 1: + if parts[1][0] == 0: + parts[1][1].defects.append(errors.InvalidHeaderDefect( + 'duplicate parameter name; duplicate(s) ignored')) + parts = parts[:1] + # Else assume the *0* was missing...note that this is different + # from get_param, but we registered a defect for this earlier. + value_parts = [] + i = 0 + for section_number, param in parts: + if section_number != i: + # We could get fancier here and look for a complete + # duplicate extended parameter and ignore the second one + # seen. But we're not doing that. The old code didn't. + if not param.extended: + param.defects.append(errors.InvalidHeaderDefect( + 'duplicate parameter name; duplicate ignored')) + continue + else: + param.defects.append(errors.InvalidHeaderDefect( + "inconsistent RFC2231 parameter numbering")) + i += 1 + value = param.param_value + if param.extended: + try: + value = parse.unquote_to_bytes(value) + except UnicodeEncodeError: + # source had surrogate escaped bytes. What we do now + # is a bit of an open question. I'm not sure this is + # the best choice, but it is what the old algorithm did + value = parse.unquote(value, encoding='latin-1') + else: + try: + value = value.decode(charset, 'surrogateescape') + except LookupError: + # XXX: there should really be a custom defect for + # unknown character set to make it easy to find, + # because otherwise unknown charset is a silent + # failure. + value = value.decode('us-ascii', 'surrogateescape') + if utils._has_surrogates(value): + param.defects.append(errors.UndecodableBytesDefect()) + value_parts.append(value) + value = ''.join(value_parts) + yield name, value + + def __str__(self): + params = [] + for name, value in self.params: + if value: + params.append('{}={}'.format(name, quote_string(value))) + else: + params.append(name) + params = '; '.join(params) + return ' ' + params if params else '' + + +class ParameterizedHeaderValue(TokenList): + + # Set this false so that the value doesn't wind up on a new line even + # if it and the parameters would fit there but not on the first line. + syntactic_break = False + + @property + def params(self): + for token in reversed(self): + if token.token_type == 'mime-parameters': + return token.params + return {} + + +class ContentType(ParameterizedHeaderValue): + + token_type = 'content-type' + as_ew_allowed = False + maintype = 'text' + subtype = 'plain' + + +class ContentDisposition(ParameterizedHeaderValue): + + token_type = 'content-disposition' + as_ew_allowed = False + content_disposition = None + + +class ContentTransferEncoding(TokenList): + + token_type = 'content-transfer-encoding' + as_ew_allowed = False + cte = '7bit' + + +class HeaderLabel(TokenList): + + token_type = 'header-label' + as_ew_allowed = False + + +class Header(TokenList): + + token_type = 'header' + + +# +# Terminal classes and instances +# + +class Terminal(str): + + as_ew_allowed = True + ew_combine_allowed = True + syntactic_break = True + + def __new__(cls, value, token_type): + self = str.__newobj__(cls, value) + self.token_type = token_type + self.defects = [] + return self + + def __repr__(self): + return "{}({})".format(self.__class__.__name__, super().__repr__()) + + def pprint(self): + print(self.__class__.__name__ + '/' + self.token_type) + + @property + def all_defects(self): + return list(self.defects) + + def _pp(self, indent=''): + return ["{}{}/{}({}){}".format( + indent, + self.__class__.__name__, + self.token_type, + super().__repr__(), + '' if not self.defects else ' {}'.format(self.defects), + )] + + def pop_trailing_ws(self): + # This terminates the recursion. + return None + + @property + def comments(self): + return [] + + def __getnewargs__(self): + return(str(self), self.token_type) + + +class WhiteSpaceTerminal(Terminal): + + @property + def value(self): + return ' ' + + def startswith_fws(self): + return True + + +class ValueTerminal(Terminal): + + @property + def value(self): + return self + + def startswith_fws(self): + return False + + +class EWWhiteSpaceTerminal(WhiteSpaceTerminal): + + @property + def value(self): + return '' + + def __str__(self): + return '' + + +# XXX these need to become classes and used as instances so +# that a program can't change them in a parse tree and screw +# up other parse trees. Maybe should have tests for that, too. +DOT = ValueTerminal('.', 'dot') +ListSeparator = ValueTerminal(',', 'list-separator') +RouteComponentMarker = ValueTerminal('@', 'route-component-marker') + +# +# Parser +# + +# Parse strings according to RFC822/2047/2822/5322 rules. +# +# This is a stateless parser. Each get_XXX function accepts a string and +# returns either a Terminal or a TokenList representing the RFC object named +# by the method and a string containing the remaining unparsed characters +# from the input. Thus a parser method consumes the next syntactic construct +# of a given type and returns a token representing the construct plus the +# unparsed remainder of the input string. +# +# For example, if the first element of a structured header is a 'phrase', +# then: +# +# phrase, value = get_phrase(value) +# +# returns the complete phrase from the start of the string value, plus any +# characters left in the string after the phrase is removed. + + +_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split +_non_atom_end_matcher = re.compile(r"[^{}]+".format( + ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match +_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall +_non_token_end_matcher = re.compile(r"[^{}]+".format( + ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match +_non_attribute_end_matcher = re.compile(r"[^{}]+".format( + ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match +_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( + ''.join(EXTENDED_ATTRIBUTE_ENDS).replace( + '\\','\\\\').replace(']',r'\]'))).match + +def _validate_xtext(xtext): + """If input token contains ASCII non-printables, register a defect.""" + + non_printables = _non_printable_finder(xtext) + if non_printables: + xtext.defects.append(errors.NonPrintableDefect(non_printables)) + if utils._has_surrogates(xtext): + xtext.defects.append(errors.UndecodableBytesDefect( + "Non-ASCII characters found in header token")) + +def _get_ptext_to_endchars(value, endchars): + """Scan printables/quoted-pairs until endchars and return unquoted ptext. + + This function turns a run of qcontent, ccontent-without-comments, or + dtext-with-quoted-printables into a single string by unquoting any + quoted printables. It returns the string, the remaining value, and + a flag that is True iff there were any quoted printables decoded. + + """ + fragment, *remainder = _wsp_splitter(value, 1) + vchars = [] + escape = False + had_qp = False + for pos in range(len(fragment)): + if fragment[pos] == '\\': + if escape: + escape = False + had_qp = True + else: + escape = True + continue + if escape: + escape = False + elif fragment[pos] in endchars: + break + vchars.append(fragment[pos]) + else: + pos = pos + 1 + return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp + +def get_fws(value): + """FWS = 1*WSP + + This isn't the RFC definition. We're using fws to represent tokens where + folding can be done, but when we are parsing the *un*folding has already + been done so we don't need to watch out for CRLF. + + """ + newvalue = value.lstrip() + fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') + return fws, newvalue + +def get_encoded_word(value): + """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" + + """ + ew = EncodedWord() + if not value.startswith('=?'): + raise errors.HeaderParseError( + "expected encoded word but found {}".format(value)) + tok, *remainder = value[2:].split('?=', 1) + if tok == value[2:]: + raise errors.HeaderParseError( + "expected encoded word but found {}".format(value)) + remstr = ''.join(remainder) + if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits: + # The ? after the CTE was followed by an encoded word escape (=XX). + rest, *remainder = remstr.split('?=', 1) + tok = tok + '?=' + rest + if len(tok.split()) > 1: + ew.defects.append(errors.InvalidHeaderDefect( + "whitespace inside encoded word")) + ew.cte = value + value = ''.join(remainder) + try: + text, charset, lang, defects = _ew.decode('=?' + tok + '?=') + except ValueError: + raise errors.HeaderParseError( + "encoded word format invalid: '{}'".format(ew.cte)) + ew.charset = charset + ew.lang = lang + ew.defects.extend(defects) + while text: + if text[0] in WSP: + token, text = get_fws(text) + ew.append(token) + continue + chars, *remainder = _wsp_splitter(text, 1) + vtext = ValueTerminal(chars, 'vtext') + _validate_xtext(vtext) + ew.append(vtext) + text = ''.join(remainder) + return ew, value + +def get_unstructured(value): + """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct + obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS) + obs-utext = %d0 / obs-NO-WS-CTL / LF / CR + + obs-NO-WS-CTL is control characters except WSP/CR/LF. + + So, basically, we have printable runs, plus control characters or nulls in + the obsolete syntax, separated by whitespace. Since RFC 2047 uses the + obsolete syntax in its specification, but requires whitespace on either + side of the encoded words, I can see no reason to need to separate the + non-printable-non-whitespace from the printable runs if they occur, so we + parse this into xtext tokens separated by WSP tokens. + + Because an 'unstructured' value must by definition constitute the entire + value, this 'get' routine does not return a remaining value, only the + parsed TokenList. + + """ + # XXX: but what about bare CR and LF? They might signal the start or + # end of an encoded word. YAGNI for now, since our current parsers + # will never send us strings with bare CR or LF. + + unstructured = UnstructuredTokenList() + while value: + if value[0] in WSP: + token, value = get_fws(value) + unstructured.append(token) + continue + if value.startswith('=?'): + try: + token, value = get_encoded_word(value) + except errors.HeaderParseError: + # XXX: Need to figure out how to register defects when + # appropriate here. + pass + else: + have_ws = True + if len(unstructured) > 0: + if unstructured[-1].token_type != 'fws': + unstructured.defects.append(errors.InvalidHeaderDefect( + "missing whitespace before encoded word")) + have_ws = False + if have_ws and len(unstructured) > 1: + if unstructured[-2].token_type == 'encoded-word': + unstructured[-1] = EWWhiteSpaceTerminal( + unstructured[-1], 'fws') + unstructured.append(token) + continue + tok, *remainder = _wsp_splitter(value, 1) + vtext = ValueTerminal(tok, 'vtext') + _validate_xtext(vtext) + unstructured.append(vtext) + value = ''.join(remainder) + return unstructured + +def get_qp_ctext(value): + r"""ctext = <printable ascii except \ ( )> + + This is not the RFC ctext, since we are handling nested comments in comment + and unquoting quoted-pairs here. We allow anything except the '()' + characters, but if we find any ASCII other than the RFC defined printable + ASCII, a NonPrintableDefect is added to the token's defects list. Since + quoted pairs are converted to their unquoted values, what is returned is + a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value + is ' '. + + """ + ptext, value, _ = _get_ptext_to_endchars(value, '()') + ptext = WhiteSpaceTerminal(ptext, 'ptext') + _validate_xtext(ptext) + return ptext, value + +def get_qcontent(value): + """qcontent = qtext / quoted-pair + + We allow anything except the DQUOTE character, but if we find any ASCII + other than the RFC defined printable ASCII, a NonPrintableDefect is + added to the token's defects list. Any quoted pairs are converted to their + unquoted values, so what is returned is a 'ptext' token. In this case it + is a ValueTerminal. + + """ + ptext, value, _ = _get_ptext_to_endchars(value, '"') + ptext = ValueTerminal(ptext, 'ptext') + _validate_xtext(ptext) + return ptext, value + +def get_atext(value): + """atext = <matches _atext_matcher> + + We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to + the token's defects list if we find non-atext characters. + """ + m = _non_atom_end_matcher(value) + if not m: + raise errors.HeaderParseError( + "expected atext but found '{}'".format(value)) + atext = m.group() + value = value[len(atext):] + atext = ValueTerminal(atext, 'atext') + _validate_xtext(atext) + return atext, value + +def get_bare_quoted_string(value): + """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE + + A quoted-string without the leading or trailing white space. Its + value is the text between the quote marks, with whitespace + preserved and quoted pairs decoded. + """ + if value[0] != '"': + raise errors.HeaderParseError( + "expected '\"' but found '{}'".format(value)) + bare_quoted_string = BareQuotedString() + value = value[1:] + if value[0] == '"': + token, value = get_qcontent(value) + bare_quoted_string.append(token) + while value and value[0] != '"': + if value[0] in WSP: + token, value = get_fws(value) + elif value[:2] == '=?': + try: + token, value = get_encoded_word(value) + bare_quoted_string.defects.append(errors.InvalidHeaderDefect( + "encoded word inside quoted string")) + except errors.HeaderParseError: + token, value = get_qcontent(value) + else: + token, value = get_qcontent(value) + bare_quoted_string.append(token) + if not value: + bare_quoted_string.defects.append(errors.InvalidHeaderDefect( + "end of header inside quoted string")) + return bare_quoted_string, value + return bare_quoted_string, value[1:] + +def get_comment(value): + """comment = "(" *([FWS] ccontent) [FWS] ")" + ccontent = ctext / quoted-pair / comment + + We handle nested comments here, and quoted-pair in our qp-ctext routine. + """ + if value and value[0] != '(': + raise errors.HeaderParseError( + "expected '(' but found '{}'".format(value)) + comment = Comment() + value = value[1:] + while value and value[0] != ")": + if value[0] in WSP: + token, value = get_fws(value) + elif value[0] == '(': + token, value = get_comment(value) + else: + token, value = get_qp_ctext(value) + comment.append(token) + if not value: + comment.defects.append(errors.InvalidHeaderDefect( + "end of header inside comment")) + return comment, value + return comment, value[1:] + +def get_cfws(value): + """CFWS = (1*([FWS] comment) [FWS]) / FWS + + """ + cfws = CFWSList() + while value and value[0] in CFWS_LEADER: + if value[0] in WSP: + token, value = get_fws(value) + else: + token, value = get_comment(value) + cfws.append(token) + return cfws, value + +def get_quoted_string(value): + """quoted-string = [CFWS] <bare-quoted-string> [CFWS] + + 'bare-quoted-string' is an intermediate class defined by this + parser and not by the RFC grammar. It is the quoted string + without any attached CFWS. + """ + quoted_string = QuotedString() + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + quoted_string.append(token) + token, value = get_bare_quoted_string(value) + quoted_string.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + quoted_string.append(token) + return quoted_string, value + +def get_atom(value): + """atom = [CFWS] 1*atext [CFWS] + + An atom could be an rfc2047 encoded word. + """ + atom = Atom() + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + atom.append(token) + if value and value[0] in ATOM_ENDS: + raise errors.HeaderParseError( + "expected atom but found '{}'".format(value)) + if value.startswith('=?'): + try: + token, value = get_encoded_word(value) + except errors.HeaderParseError: + # XXX: need to figure out how to register defects when + # appropriate here. + token, value = get_atext(value) + else: + token, value = get_atext(value) + atom.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + atom.append(token) + return atom, value + +def get_dot_atom_text(value): + """ dot-text = 1*atext *("." 1*atext) + + """ + dot_atom_text = DotAtomText() + if not value or value[0] in ATOM_ENDS: + raise errors.HeaderParseError("expected atom at a start of " + "dot-atom-text but found '{}'".format(value)) + while value and value[0] not in ATOM_ENDS: + token, value = get_atext(value) + dot_atom_text.append(token) + if value and value[0] == '.': + dot_atom_text.append(DOT) + value = value[1:] + if dot_atom_text[-1] is DOT: + raise errors.HeaderParseError("expected atom at end of dot-atom-text " + "but found '{}'".format('.'+value)) + return dot_atom_text, value + +def get_dot_atom(value): + """ dot-atom = [CFWS] dot-atom-text [CFWS] + + Any place we can have a dot atom, we could instead have an rfc2047 encoded + word. + """ + dot_atom = DotAtom() + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + dot_atom.append(token) + if value.startswith('=?'): + try: + token, value = get_encoded_word(value) + except errors.HeaderParseError: + # XXX: need to figure out how to register defects when + # appropriate here. + token, value = get_dot_atom_text(value) + else: + token, value = get_dot_atom_text(value) + dot_atom.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + dot_atom.append(token) + return dot_atom, value + +def get_word(value): + """word = atom / quoted-string + + Either atom or quoted-string may start with CFWS. We have to peel off this + CFWS first to determine which type of word to parse. Afterward we splice + the leading CFWS, if any, into the parsed sub-token. + + If neither an atom or a quoted-string is found before the next special, a + HeaderParseError is raised. + + The token returned is either an Atom or a QuotedString, as appropriate. + This means the 'word' level of the formal grammar is not represented in the + parse tree; this is because having that extra layer when manipulating the + parse tree is more confusing than it is helpful. + + """ + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + else: + leader = None + if value[0]=='"': + token, value = get_quoted_string(value) + elif value[0] in SPECIALS: + raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' " + "but found '{}'".format(value)) + else: + token, value = get_atom(value) + if leader is not None: + token[:0] = [leader] + return token, value + +def get_phrase(value): + """ phrase = 1*word / obs-phrase + obs-phrase = word *(word / "." / CFWS) + + This means a phrase can be a sequence of words, periods, and CFWS in any + order as long as it starts with at least one word. If anything other than + words is detected, an ObsoleteHeaderDefect is added to the token's defect + list. We also accept a phrase that starts with CFWS followed by a dot; + this is registered as an InvalidHeaderDefect, since it is not supported by + even the obsolete grammar. + + """ + phrase = Phrase() + try: + token, value = get_word(value) + phrase.append(token) + except errors.HeaderParseError: + phrase.defects.append(errors.InvalidHeaderDefect( + "phrase does not start with word")) + while value and value[0] not in PHRASE_ENDS: + if value[0]=='.': + phrase.append(DOT) + phrase.defects.append(errors.ObsoleteHeaderDefect( + "period in 'phrase'")) + value = value[1:] + else: + try: + token, value = get_word(value) + except errors.HeaderParseError: + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + phrase.defects.append(errors.ObsoleteHeaderDefect( + "comment found without atom")) + else: + raise + phrase.append(token) + return phrase, value + +def get_local_part(value): + """ local-part = dot-atom / quoted-string / obs-local-part + + """ + local_part = LocalPart() + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + raise errors.HeaderParseError( + "expected local-part but found '{}'".format(value)) + try: + token, value = get_dot_atom(value) + except errors.HeaderParseError: + try: + token, value = get_word(value) + except errors.HeaderParseError: + if value[0] != '\\' and value[0] in PHRASE_ENDS: + raise + token = TokenList() + if leader is not None: + token[:0] = [leader] + local_part.append(token) + if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): + obs_local_part, value = get_obs_local_part(str(local_part) + value) + if obs_local_part.token_type == 'invalid-obs-local-part': + local_part.defects.append(errors.InvalidHeaderDefect( + "local-part is not dot-atom, quoted-string, or obs-local-part")) + else: + local_part.defects.append(errors.ObsoleteHeaderDefect( + "local-part is not a dot-atom (contains CFWS)")) + local_part[0] = obs_local_part + try: + local_part.value.encode('ascii') + except UnicodeEncodeError: + local_part.defects.append(errors.NonASCIILocalPartDefect( + "local-part contains non-ASCII characters)")) + return local_part, value + +def get_obs_local_part(value): + """ obs-local-part = word *("." word) + """ + obs_local_part = ObsLocalPart() + last_non_ws_was_dot = False + while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): + if value[0] == '.': + if last_non_ws_was_dot: + obs_local_part.defects.append(errors.InvalidHeaderDefect( + "invalid repeated '.'")) + obs_local_part.append(DOT) + last_non_ws_was_dot = True + value = value[1:] + continue + elif value[0]=='\\': + obs_local_part.append(ValueTerminal(value[0], + 'misplaced-special')) + value = value[1:] + obs_local_part.defects.append(errors.InvalidHeaderDefect( + "'\\' character outside of quoted-string/ccontent")) + last_non_ws_was_dot = False + continue + if obs_local_part and obs_local_part[-1].token_type != 'dot': + obs_local_part.defects.append(errors.InvalidHeaderDefect( + "missing '.' between words")) + try: + token, value = get_word(value) + last_non_ws_was_dot = False + except errors.HeaderParseError: + if value[0] not in CFWS_LEADER: + raise + token, value = get_cfws(value) + obs_local_part.append(token) + if (obs_local_part[0].token_type == 'dot' or + obs_local_part[0].token_type=='cfws' and + obs_local_part[1].token_type=='dot'): + obs_local_part.defects.append(errors.InvalidHeaderDefect( + "Invalid leading '.' in local part")) + if (obs_local_part[-1].token_type == 'dot' or + obs_local_part[-1].token_type=='cfws' and + obs_local_part[-2].token_type=='dot'): + obs_local_part.defects.append(errors.InvalidHeaderDefect( + "Invalid trailing '.' in local part")) + if obs_local_part.defects: + obs_local_part.token_type = 'invalid-obs-local-part' + return obs_local_part, value + +def get_dtext(value): + r""" dtext = <printable ascii except \ [ ]> / obs-dtext + obs-dtext = obs-NO-WS-CTL / quoted-pair + + We allow anything except the excluded characters, but if we find any + ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is + added to the token's defects list. Quoted pairs are converted to their + unquoted values, so what is returned is a ptext token, in this case a + ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is + added to the returned token's defect list. + + """ + ptext, value, had_qp = _get_ptext_to_endchars(value, '[]') + ptext = ValueTerminal(ptext, 'ptext') + if had_qp: + ptext.defects.append(errors.ObsoleteHeaderDefect( + "quoted printable found in domain-literal")) + _validate_xtext(ptext) + return ptext, value + +def _check_for_early_dl_end(value, domain_literal): + if value: + return False + domain_literal.append(errors.InvalidHeaderDefect( + "end of input inside domain-literal")) + domain_literal.append(ValueTerminal(']', 'domain-literal-end')) + return True + +def get_domain_literal(value): + """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] + + """ + domain_literal = DomainLiteral() + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + domain_literal.append(token) + if not value: + raise errors.HeaderParseError("expected domain-literal") + if value[0] != '[': + raise errors.HeaderParseError("expected '[' at start of domain-literal " + "but found '{}'".format(value)) + value = value[1:] + if _check_for_early_dl_end(value, domain_literal): + return domain_literal, value + domain_literal.append(ValueTerminal('[', 'domain-literal-start')) + if value[0] in WSP: + token, value = get_fws(value) + domain_literal.append(token) + token, value = get_dtext(value) + domain_literal.append(token) + if _check_for_early_dl_end(value, domain_literal): + return domain_literal, value + if value[0] in WSP: + token, value = get_fws(value) + domain_literal.append(token) + if _check_for_early_dl_end(value, domain_literal): + return domain_literal, value + if value[0] != ']': + raise errors.HeaderParseError("expected ']' at end of domain-literal " + "but found '{}'".format(value)) + domain_literal.append(ValueTerminal(']', 'domain-literal-end')) + value = value[1:] + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + domain_literal.append(token) + return domain_literal, value + +def get_domain(value): + """ domain = dot-atom / domain-literal / obs-domain + obs-domain = atom *("." atom)) + + """ + domain = Domain() + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + raise errors.HeaderParseError( + "expected domain but found '{}'".format(value)) + if value[0] == '[': + token, value = get_domain_literal(value) + if leader is not None: + token[:0] = [leader] + domain.append(token) + return domain, value + try: + token, value = get_dot_atom(value) + except errors.HeaderParseError: + token, value = get_atom(value) + if leader is not None: + token[:0] = [leader] + domain.append(token) + if value and value[0] == '.': + domain.defects.append(errors.ObsoleteHeaderDefect( + "domain is not a dot-atom (contains CFWS)")) + if domain[0].token_type == 'dot-atom': + domain[:] = domain[0] + while value and value[0] == '.': + domain.append(DOT) + token, value = get_atom(value[1:]) + domain.append(token) + return domain, value + +def get_addr_spec(value): + """ addr-spec = local-part "@" domain + + """ + addr_spec = AddrSpec() + token, value = get_local_part(value) + addr_spec.append(token) + if not value or value[0] != '@': + addr_spec.defects.append(errors.InvalidHeaderDefect( + "add-spec local part with no domain")) + return addr_spec, value + addr_spec.append(ValueTerminal('@', 'address-at-symbol')) + token, value = get_domain(value[1:]) + addr_spec.append(token) + return addr_spec, value + +def get_obs_route(value): + """ obs-route = obs-domain-list ":" + obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain]) + + Returns an obs-route token with the appropriate sub-tokens (that is, + there is no obs-domain-list in the parse tree). + """ + obs_route = ObsRoute() + while value and (value[0]==',' or value[0] in CFWS_LEADER): + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + obs_route.append(token) + elif value[0] == ',': + obs_route.append(ListSeparator) + value = value[1:] + if not value or value[0] != '@': + raise errors.HeaderParseError( + "expected obs-route domain but found '{}'".format(value)) + obs_route.append(RouteComponentMarker) + token, value = get_domain(value[1:]) + obs_route.append(token) + while value and value[0]==',': + obs_route.append(ListSeparator) + value = value[1:] + if not value: + break + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + obs_route.append(token) + if value[0] == '@': + obs_route.append(RouteComponentMarker) + token, value = get_domain(value[1:]) + obs_route.append(token) + if not value: + raise errors.HeaderParseError("end of header while parsing obs-route") + if value[0] != ':': + raise errors.HeaderParseError( "expected ':' marking end of " + "obs-route but found '{}'".format(value)) + obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) + return obs_route, value[1:] + +def get_angle_addr(value): + """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr + obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] + + """ + angle_addr = AngleAddr() + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + angle_addr.append(token) + if not value or value[0] != '<': + raise errors.HeaderParseError( + "expected angle-addr but found '{}'".format(value)) + angle_addr.append(ValueTerminal('<', 'angle-addr-start')) + value = value[1:] + # Although it is not legal per RFC5322, SMTP uses '<>' in certain + # circumstances. + if value[0] == '>': + angle_addr.append(ValueTerminal('>', 'angle-addr-end')) + angle_addr.defects.append(errors.InvalidHeaderDefect( + "null addr-spec in angle-addr")) + value = value[1:] + return angle_addr, value + try: + token, value = get_addr_spec(value) + except errors.HeaderParseError: + try: + token, value = get_obs_route(value) + angle_addr.defects.append(errors.ObsoleteHeaderDefect( + "obsolete route specification in angle-addr")) + except errors.HeaderParseError: + raise errors.HeaderParseError( + "expected addr-spec or obs-route but found '{}'".format(value)) + angle_addr.append(token) + token, value = get_addr_spec(value) + angle_addr.append(token) + if value and value[0] == '>': + value = value[1:] + else: + angle_addr.defects.append(errors.InvalidHeaderDefect( + "missing trailing '>' on angle-addr")) + angle_addr.append(ValueTerminal('>', 'angle-addr-end')) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + angle_addr.append(token) + return angle_addr, value + +def get_display_name(value): + """ display-name = phrase + + Because this is simply a name-rule, we don't return a display-name + token containing a phrase, but rather a display-name token with + the content of the phrase. + + """ + display_name = DisplayName() + token, value = get_phrase(value) + display_name.extend(token[:]) + display_name.defects = token.defects[:] + return display_name, value + + +def get_name_addr(value): + """ name-addr = [display-name] angle-addr + + """ + name_addr = NameAddr() + # Both the optional display name and the angle-addr can start with cfws. + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + raise errors.HeaderParseError( + "expected name-addr but found '{}'".format(leader)) + if value[0] != '<': + if value[0] in PHRASE_ENDS: + raise errors.HeaderParseError( + "expected name-addr but found '{}'".format(value)) + token, value = get_display_name(value) + if not value: + raise errors.HeaderParseError( + "expected name-addr but found '{}'".format(token)) + if leader is not None: + token[0][:0] = [leader] + leader = None + name_addr.append(token) + token, value = get_angle_addr(value) + if leader is not None: + token[:0] = [leader] + name_addr.append(token) + return name_addr, value + +def get_mailbox(value): + """ mailbox = name-addr / addr-spec + + """ + # The only way to figure out if we are dealing with a name-addr or an + # addr-spec is to try parsing each one. + mailbox = Mailbox() + try: + token, value = get_name_addr(value) + except errors.HeaderParseError: + try: + token, value = get_addr_spec(value) + except errors.HeaderParseError: + raise errors.HeaderParseError( + "expected mailbox but found '{}'".format(value)) + if any(isinstance(x, errors.InvalidHeaderDefect) + for x in token.all_defects): + mailbox.token_type = 'invalid-mailbox' + mailbox.append(token) + return mailbox, value + +def get_invalid_mailbox(value, endchars): + """ Read everything up to one of the chars in endchars. + + This is outside the formal grammar. The InvalidMailbox TokenList that is + returned acts like a Mailbox, but the data attributes are None. + + """ + invalid_mailbox = InvalidMailbox() + while value and value[0] not in endchars: + if value[0] in PHRASE_ENDS: + invalid_mailbox.append(ValueTerminal(value[0], + 'misplaced-special')) + value = value[1:] + else: + token, value = get_phrase(value) + invalid_mailbox.append(token) + return invalid_mailbox, value + +def get_mailbox_list(value): + """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list + obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) + + For this routine we go outside the formal grammar in order to improve error + handling. We recognize the end of the mailbox list only at the end of the + value or at a ';' (the group terminator). This is so that we can turn + invalid mailboxes into InvalidMailbox tokens and continue parsing any + remaining valid mailboxes. We also allow all mailbox entries to be null, + and this condition is handled appropriately at a higher level. + + """ + mailbox_list = MailboxList() + while value and value[0] != ';': + try: + token, value = get_mailbox(value) + mailbox_list.append(token) + except errors.HeaderParseError: + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value or value[0] in ',;': + mailbox_list.append(leader) + mailbox_list.defects.append(errors.ObsoleteHeaderDefect( + "empty element in mailbox-list")) + else: + token, value = get_invalid_mailbox(value, ',;') + if leader is not None: + token[:0] = [leader] + mailbox_list.append(token) + mailbox_list.defects.append(errors.InvalidHeaderDefect( + "invalid mailbox in mailbox-list")) + elif value[0] == ',': + mailbox_list.defects.append(errors.ObsoleteHeaderDefect( + "empty element in mailbox-list")) + else: + token, value = get_invalid_mailbox(value, ',;') + if leader is not None: + token[:0] = [leader] + mailbox_list.append(token) + mailbox_list.defects.append(errors.InvalidHeaderDefect( + "invalid mailbox in mailbox-list")) + if value and value[0] not in ',;': + # Crap after mailbox; treat it as an invalid mailbox. + # The mailbox info will still be available. + mailbox = mailbox_list[-1] + mailbox.token_type = 'invalid-mailbox' + token, value = get_invalid_mailbox(value, ',;') + mailbox.extend(token) + mailbox_list.defects.append(errors.InvalidHeaderDefect( + "invalid mailbox in mailbox-list")) + if value and value[0] == ',': + mailbox_list.append(ListSeparator) + value = value[1:] + return mailbox_list, value + + +def get_group_list(value): + """ group-list = mailbox-list / CFWS / obs-group-list + obs-group-list = 1*([CFWS] ",") [CFWS] + + """ + group_list = GroupList() + if not value: + group_list.defects.append(errors.InvalidHeaderDefect( + "end of header before group-list")) + return group_list, value + leader = None + if value and value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + # This should never happen in email parsing, since CFWS-only is a + # legal alternative to group-list in a group, which is the only + # place group-list appears. + group_list.defects.append(errors.InvalidHeaderDefect( + "end of header in group-list")) + group_list.append(leader) + return group_list, value + if value[0] == ';': + group_list.append(leader) + return group_list, value + token, value = get_mailbox_list(value) + if len(token.all_mailboxes)==0: + if leader is not None: + group_list.append(leader) + group_list.extend(token) + group_list.defects.append(errors.ObsoleteHeaderDefect( + "group-list with empty entries")) + return group_list, value + if leader is not None: + token[:0] = [leader] + group_list.append(token) + return group_list, value + +def get_group(value): + """ group = display-name ":" [group-list] ";" [CFWS] + + """ + group = Group() + token, value = get_display_name(value) + if not value or value[0] != ':': + raise errors.HeaderParseError("expected ':' at end of group " + "display name but found '{}'".format(value)) + group.append(token) + group.append(ValueTerminal(':', 'group-display-name-terminator')) + value = value[1:] + if value and value[0] == ';': + group.append(ValueTerminal(';', 'group-terminator')) + return group, value[1:] + token, value = get_group_list(value) + group.append(token) + if not value: + group.defects.append(errors.InvalidHeaderDefect( + "end of header in group")) + if value[0] != ';': + raise errors.HeaderParseError( + "expected ';' at end of group but found {}".format(value)) + group.append(ValueTerminal(';', 'group-terminator')) + value = value[1:] + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + group.append(token) + return group, value + +def get_address(value): + """ address = mailbox / group + + Note that counter-intuitively, an address can be either a single address or + a list of addresses (a group). This is why the returned Address object has + a 'mailboxes' attribute which treats a single address as a list of length + one. When you need to differentiate between to two cases, extract the single + element, which is either a mailbox or a group token. + + """ + # The formal grammar isn't very helpful when parsing an address. mailbox + # and group, especially when allowing for obsolete forms, start off very + # similarly. It is only when you reach one of @, <, or : that you know + # what you've got. So, we try each one in turn, starting with the more + # likely of the two. We could perhaps make this more efficient by looking + # for a phrase and then branching based on the next character, but that + # would be a premature optimization. + address = Address() + try: + token, value = get_group(value) + except errors.HeaderParseError: + try: + token, value = get_mailbox(value) + except errors.HeaderParseError: + raise errors.HeaderParseError( + "expected address but found '{}'".format(value)) + address.append(token) + return address, value + +def get_address_list(value): + """ address_list = (address *("," address)) / obs-addr-list + obs-addr-list = *([CFWS] ",") address *("," [address / CFWS]) + + We depart from the formal grammar here by continuing to parse until the end + of the input, assuming the input to be entirely composed of an + address-list. This is always true in email parsing, and allows us + to skip invalid addresses to parse additional valid ones. + + """ + address_list = AddressList() + while value: + try: + token, value = get_address(value) + address_list.append(token) + except errors.HeaderParseError as err: + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value or value[0] == ',': + address_list.append(leader) + address_list.defects.append(errors.ObsoleteHeaderDefect( + "address-list entry with no content")) + else: + token, value = get_invalid_mailbox(value, ',') + if leader is not None: + token[:0] = [leader] + address_list.append(Address([token])) + address_list.defects.append(errors.InvalidHeaderDefect( + "invalid address in address-list")) + elif value[0] == ',': + address_list.defects.append(errors.ObsoleteHeaderDefect( + "empty element in address-list")) + else: + token, value = get_invalid_mailbox(value, ',') + if leader is not None: + token[:0] = [leader] + address_list.append(Address([token])) + address_list.defects.append(errors.InvalidHeaderDefect( + "invalid address in address-list")) + if value and value[0] != ',': + # Crap after address; treat it as an invalid mailbox. + # The mailbox info will still be available. + mailbox = address_list[-1][0] + mailbox.token_type = 'invalid-mailbox' + token, value = get_invalid_mailbox(value, ',') + mailbox.extend(token) + address_list.defects.append(errors.InvalidHeaderDefect( + "invalid address in address-list")) + if value: # Must be a , at this point. + address_list.append(ValueTerminal(',', 'list-separator')) + value = value[1:] + return address_list, value + +# +# XXX: As I begin to add additional header parsers, I'm realizing we probably +# have two level of parser routines: the get_XXX methods that get a token in +# the grammar, and parse_XXX methods that parse an entire field value. So +# get_address_list above should really be a parse_ method, as probably should +# be get_unstructured. +# + +def parse_mime_version(value): + """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS] + + """ + # The [CFWS] is implicit in the RFC 2045 BNF. + # XXX: This routine is a bit verbose, should factor out a get_int method. + mime_version = MIMEVersion() + if not value: + mime_version.defects.append(errors.HeaderMissingRequiredValue( + "Missing MIME version number (eg: 1.0)")) + return mime_version + if value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mime_version.append(token) + if not value: + mime_version.defects.append(errors.HeaderMissingRequiredValue( + "Expected MIME version number but found only CFWS")) + digits = '' + while value and value[0] != '.' and value[0] not in CFWS_LEADER: + digits += value[0] + value = value[1:] + if not digits.isdigit(): + mime_version.defects.append(errors.InvalidHeaderDefect( + "Expected MIME major version number but found {!r}".format(digits))) + mime_version.append(ValueTerminal(digits, 'xtext')) + else: + mime_version.major = int(digits) + mime_version.append(ValueTerminal(digits, 'digits')) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mime_version.append(token) + if not value or value[0] != '.': + if mime_version.major is not None: + mime_version.defects.append(errors.InvalidHeaderDefect( + "Incomplete MIME version; found only major number")) + if value: + mime_version.append(ValueTerminal(value, 'xtext')) + return mime_version + mime_version.append(ValueTerminal('.', 'version-separator')) + value = value[1:] + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mime_version.append(token) + if not value: + if mime_version.major is not None: + mime_version.defects.append(errors.InvalidHeaderDefect( + "Incomplete MIME version; found only major number")) + return mime_version + digits = '' + while value and value[0] not in CFWS_LEADER: + digits += value[0] + value = value[1:] + if not digits.isdigit(): + mime_version.defects.append(errors.InvalidHeaderDefect( + "Expected MIME minor version number but found {!r}".format(digits))) + mime_version.append(ValueTerminal(digits, 'xtext')) + else: + mime_version.minor = int(digits) + mime_version.append(ValueTerminal(digits, 'digits')) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mime_version.append(token) + if value: + mime_version.defects.append(errors.InvalidHeaderDefect( + "Excess non-CFWS text after MIME version")) + mime_version.append(ValueTerminal(value, 'xtext')) + return mime_version + +def get_invalid_parameter(value): + """ Read everything up to the next ';'. + + This is outside the formal grammar. The InvalidParameter TokenList that is + returned acts like a Parameter, but the data attributes are None. + + """ + invalid_parameter = InvalidParameter() + while value and value[0] != ';': + if value[0] in PHRASE_ENDS: + invalid_parameter.append(ValueTerminal(value[0], + 'misplaced-special')) + value = value[1:] + else: + token, value = get_phrase(value) + invalid_parameter.append(token) + return invalid_parameter, value + +def get_ttext(value): + """ttext = <matches _ttext_matcher> + + We allow any non-TOKEN_ENDS in ttext, but add defects to the token's + defects list if we find non-ttext characters. We also register defects for + *any* non-printables even though the RFC doesn't exclude all of them, + because we follow the spirit of RFC 5322. + + """ + m = _non_token_end_matcher(value) + if not m: + raise errors.HeaderParseError( + "expected ttext but found '{}'".format(value)) + ttext = m.group() + value = value[len(ttext):] + ttext = ValueTerminal(ttext, 'ttext') + _validate_xtext(ttext) + return ttext, value + +def get_token(value): + """token = [CFWS] 1*ttext [CFWS] + + The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or + tspecials. We also exclude tabs even though the RFC doesn't. + + The RFC implies the CFWS but is not explicit about it in the BNF. + + """ + mtoken = Token() + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mtoken.append(token) + if value and value[0] in TOKEN_ENDS: + raise errors.HeaderParseError( + "expected token but found '{}'".format(value)) + token, value = get_ttext(value) + mtoken.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + mtoken.append(token) + return mtoken, value + +def get_attrtext(value): + """attrtext = 1*(any non-ATTRIBUTE_ENDS character) + + We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the + token's defects list if we find non-attrtext characters. We also register + defects for *any* non-printables even though the RFC doesn't exclude all of + them, because we follow the spirit of RFC 5322. + + """ + m = _non_attribute_end_matcher(value) + if not m: + raise errors.HeaderParseError( + "expected attrtext but found {!r}".format(value)) + attrtext = m.group() + value = value[len(attrtext):] + attrtext = ValueTerminal(attrtext, 'attrtext') + _validate_xtext(attrtext) + return attrtext, value + +def get_attribute(value): + """ [CFWS] 1*attrtext [CFWS] + + This version of the BNF makes the CFWS explicit, and as usual we use a + value terminal for the actual run of characters. The RFC equivalent of + attrtext is the token characters, with the subtraction of '*', "'", and '%'. + We include tab in the excluded set just as we do for token. + + """ + attribute = Attribute() + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + attribute.append(token) + if value and value[0] in ATTRIBUTE_ENDS: + raise errors.HeaderParseError( + "expected token but found '{}'".format(value)) + token, value = get_attrtext(value) + attribute.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + attribute.append(token) + return attribute, value + +def get_extended_attrtext(value): + """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%') + + This is a special parsing routine so that we get a value that + includes % escapes as a single string (which we decode as a single + string later). + + """ + m = _non_extended_attribute_end_matcher(value) + if not m: + raise errors.HeaderParseError( + "expected extended attrtext but found {!r}".format(value)) + attrtext = m.group() + value = value[len(attrtext):] + attrtext = ValueTerminal(attrtext, 'extended-attrtext') + _validate_xtext(attrtext) + return attrtext, value + +def get_extended_attribute(value): + """ [CFWS] 1*extended_attrtext [CFWS] + + This is like the non-extended version except we allow % characters, so that + we can pick up an encoded value as a single string. + + """ + # XXX: should we have an ExtendedAttribute TokenList? + attribute = Attribute() + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + attribute.append(token) + if value and value[0] in EXTENDED_ATTRIBUTE_ENDS: + raise errors.HeaderParseError( + "expected token but found '{}'".format(value)) + token, value = get_extended_attrtext(value) + attribute.append(token) + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + attribute.append(token) + return attribute, value + +def get_section(value): + """ '*' digits + + The formal BNF is more complicated because leading 0s are not allowed. We + check for that and add a defect. We also assume no CFWS is allowed between + the '*' and the digits, though the RFC is not crystal clear on that. + The caller should already have dealt with leading CFWS. + + """ + section = Section() + if not value or value[0] != '*': + raise errors.HeaderParseError("Expected section but found {}".format( + value)) + section.append(ValueTerminal('*', 'section-marker')) + value = value[1:] + if not value or not value[0].isdigit(): + raise errors.HeaderParseError("Expected section number but " + "found {}".format(value)) + digits = '' + while value and value[0].isdigit(): + digits += value[0] + value = value[1:] + if digits[0] == '0' and digits != '0': + section.defects.append(errors.InvalidHeaderError("section number" + "has an invalid leading 0")) + section.number = int(digits) + section.append(ValueTerminal(digits, 'digits')) + return section, value + + +def get_value(value): + """ quoted-string / attribute + + """ + v = Value() + if not value: + raise errors.HeaderParseError("Expected value but found end of string") + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + raise errors.HeaderParseError("Expected value but found " + "only {}".format(leader)) + if value[0] == '"': + token, value = get_quoted_string(value) + else: + token, value = get_extended_attribute(value) + if leader is not None: + token[:0] = [leader] + v.append(token) + return v, value + +def get_parameter(value): + """ attribute [section] ["*"] [CFWS] "=" value + + The CFWS is implied by the RFC but not made explicit in the BNF. This + simplified form of the BNF from the RFC is made to conform with the RFC BNF + through some extra checks. We do it this way because it makes both error + recovery and working with the resulting parse tree easier. + """ + # It is possible CFWS would also be implicitly allowed between the section + # and the 'extended-attribute' marker (the '*') , but we've never seen that + # in the wild and we will therefore ignore the possibility. + param = Parameter() + token, value = get_attribute(value) + param.append(token) + if not value or value[0] == ';': + param.defects.append(errors.InvalidHeaderDefect("Parameter contains " + "name ({}) but no value".format(token))) + return param, value + if value[0] == '*': + try: + token, value = get_section(value) + param.sectioned = True + param.append(token) + except errors.HeaderParseError: + pass + if not value: + raise errors.HeaderParseError("Incomplete parameter") + if value[0] == '*': + param.append(ValueTerminal('*', 'extended-parameter-marker')) + value = value[1:] + param.extended = True + if value[0] != '=': + raise errors.HeaderParseError("Parameter not followed by '='") + param.append(ValueTerminal('=', 'parameter-separator')) + value = value[1:] + leader = None + if value and value[0] in CFWS_LEADER: + token, value = get_cfws(value) + param.append(token) + remainder = None + appendto = param + if param.extended and value and value[0] == '"': + # Now for some serious hackery to handle the common invalid case of + # double quotes around an extended value. We also accept (with defect) + # a value marked as encoded that isn't really. + qstring, remainder = get_quoted_string(value) + inner_value = qstring.stripped_value + semi_valid = False + if param.section_number == 0: + if inner_value and inner_value[0] == "'": + semi_valid = True + else: + token, rest = get_attrtext(inner_value) + if rest and rest[0] == "'": + semi_valid = True + else: + try: + token, rest = get_extended_attrtext(inner_value) + except: + pass + else: + if not rest: + semi_valid = True + if semi_valid: + param.defects.append(errors.InvalidHeaderDefect( + "Quoted string value for extended parameter is invalid")) + param.append(qstring) + for t in qstring: + if t.token_type == 'bare-quoted-string': + t[:] = [] + appendto = t + break + value = inner_value + else: + remainder = None + param.defects.append(errors.InvalidHeaderDefect( + "Parameter marked as extended but appears to have a " + "quoted string value that is non-encoded")) + if value and value[0] == "'": + token = None + else: + token, value = get_value(value) + if not param.extended or param.section_number > 0: + if not value or value[0] != "'": + appendto.append(token) + if remainder is not None: + assert not value, value + value = remainder + return param, value + param.defects.append(errors.InvalidHeaderDefect( + "Apparent initial-extended-value but attribute " + "was not marked as extended or was not initial section")) + if not value: + # Assume the charset/lang is missing and the token is the value. + param.defects.append(errors.InvalidHeaderDefect( + "Missing required charset/lang delimiters")) + appendto.append(token) + if remainder is None: + return param, value + else: + if token is not None: + for t in token: + if t.token_type == 'extended-attrtext': + break + t.token_type == 'attrtext' + appendto.append(t) + param.charset = t.value + if value[0] != "'": + raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " + "delimiter, but found {!r}".format(value)) + appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) + value = value[1:] + if value and value[0] != "'": + token, value = get_attrtext(value) + appendto.append(token) + param.lang = token.value + if not value or value[0] != "'": + raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " + "delimiter, but found {}".format(value)) + appendto.append(ValueTerminal("'", 'RFC2231-delimiter')) + value = value[1:] + if remainder is not None: + # Treat the rest of value as bare quoted string content. + v = Value() + while value: + if value[0] in WSP: + token, value = get_fws(value) + else: + token, value = get_qcontent(value) + v.append(token) + token = v + else: + token, value = get_value(value) + appendto.append(token) + if remainder is not None: + assert not value, value + value = remainder + return param, value + +def parse_mime_parameters(value): + """ parameter *( ";" parameter ) + + That BNF is meant to indicate this routine should only be called after + finding and handling the leading ';'. There is no corresponding rule in + the formal RFC grammar, but it is more convenient for us for the set of + parameters to be treated as its own TokenList. + + This is 'parse' routine because it consumes the reminaing value, but it + would never be called to parse a full header. Instead it is called to + parse everything after the non-parameter value of a specific MIME header. + + """ + mime_parameters = MimeParameters() + while value: + try: + token, value = get_parameter(value) + mime_parameters.append(token) + except errors.HeaderParseError as err: + leader = None + if value[0] in CFWS_LEADER: + leader, value = get_cfws(value) + if not value: + mime_parameters.append(leader) + return mime_parameters + if value[0] == ';': + if leader is not None: + mime_parameters.append(leader) + mime_parameters.defects.append(errors.InvalidHeaderDefect( + "parameter entry with no content")) + else: + token, value = get_invalid_parameter(value) + if leader: + token[:0] = [leader] + mime_parameters.append(token) + mime_parameters.defects.append(errors.InvalidHeaderDefect( + "invalid parameter {!r}".format(token))) + if value and value[0] != ';': + # Junk after the otherwise valid parameter. Mark it as + # invalid, but it will have a value. + param = mime_parameters[-1] + param.token_type = 'invalid-parameter' + token, value = get_invalid_parameter(value) + param.extend(token) + mime_parameters.defects.append(errors.InvalidHeaderDefect( + "parameter with invalid trailing text {!r}".format(token))) + if value: + # Must be a ';' at this point. + mime_parameters.append(ValueTerminal(';', 'parameter-separator')) + value = value[1:] + return mime_parameters + +def _find_mime_parameters(tokenlist, value): + """Do our best to find the parameters in an invalid MIME header + + """ + while value and value[0] != ';': + if value[0] in PHRASE_ENDS: + tokenlist.append(ValueTerminal(value[0], 'misplaced-special')) + value = value[1:] + else: + token, value = get_phrase(value) + tokenlist.append(token) + if not value: + return + tokenlist.append(ValueTerminal(';', 'parameter-separator')) + tokenlist.append(parse_mime_parameters(value[1:])) + +def parse_content_type_header(value): + """ maintype "/" subtype *( ";" parameter ) + + The maintype and substype are tokens. Theoretically they could + be checked against the official IANA list + x-token, but we + don't do that. + """ + ctype = ContentType() + recover = False + if not value: + ctype.defects.append(errors.HeaderMissingRequiredValue( + "Missing content type specification")) + return ctype + try: + token, value = get_token(value) + except errors.HeaderParseError: + ctype.defects.append(errors.InvalidHeaderDefect( + "Expected content maintype but found {!r}".format(value))) + _find_mime_parameters(ctype, value) + return ctype + ctype.append(token) + # XXX: If we really want to follow the formal grammar we should make + # mantype and subtype specialized TokenLists here. Probably not worth it. + if not value or value[0] != '/': + ctype.defects.append(errors.InvalidHeaderDefect( + "Invalid content type")) + if value: + _find_mime_parameters(ctype, value) + return ctype + ctype.maintype = token.value.strip().lower() + ctype.append(ValueTerminal('/', 'content-type-separator')) + value = value[1:] + try: + token, value = get_token(value) + except errors.HeaderParseError: + ctype.defects.append(errors.InvalidHeaderDefect( + "Expected content subtype but found {!r}".format(value))) + _find_mime_parameters(ctype, value) + return ctype + ctype.append(token) + ctype.subtype = token.value.strip().lower() + if not value: + return ctype + if value[0] != ';': + ctype.defects.append(errors.InvalidHeaderDefect( + "Only parameters are valid after content type, but " + "found {!r}".format(value))) + # The RFC requires that a syntactically invalid content-type be treated + # as text/plain. Perhaps we should postel this, but we should probably + # only do that if we were checking the subtype value against IANA. + del ctype.maintype, ctype.subtype + _find_mime_parameters(ctype, value) + return ctype + ctype.append(ValueTerminal(';', 'parameter-separator')) + ctype.append(parse_mime_parameters(value[1:])) + return ctype + +def parse_content_disposition_header(value): + """ disposition-type *( ";" parameter ) + + """ + disp_header = ContentDisposition() + if not value: + disp_header.defects.append(errors.HeaderMissingRequiredValue( + "Missing content disposition")) + return disp_header + try: + token, value = get_token(value) + except errors.HeaderParseError: + disp_header.defects.append(errors.InvalidHeaderDefect( + "Expected content disposition but found {!r}".format(value))) + _find_mime_parameters(disp_header, value) + return disp_header + disp_header.append(token) + disp_header.content_disposition = token.value.strip().lower() + if not value: + return disp_header + if value[0] != ';': + disp_header.defects.append(errors.InvalidHeaderDefect( + "Only parameters are valid after content disposition, but " + "found {!r}".format(value))) + _find_mime_parameters(disp_header, value) + return disp_header + disp_header.append(ValueTerminal(';', 'parameter-separator')) + disp_header.append(parse_mime_parameters(value[1:])) + return disp_header + +def parse_content_transfer_encoding_header(value): + """ mechanism + + """ + # We should probably validate the values, since the list is fixed. + cte_header = ContentTransferEncoding() + if not value: + cte_header.defects.append(errors.HeaderMissingRequiredValue( + "Missing content transfer encoding")) + return cte_header + try: + token, value = get_token(value) + except errors.HeaderParseError: + cte_header.defects.append(errors.InvalidHeaderDefect( + "Expected content transfer encoding but found {!r}".format(value))) + else: + cte_header.append(token) + cte_header.cte = token.value.strip().lower() + if not value: + return cte_header + while value: + cte_header.defects.append(errors.InvalidHeaderDefect( + "Extra text after content transfer encoding")) + if value[0] in PHRASE_ENDS: + cte_header.append(ValueTerminal(value[0], 'misplaced-special')) + value = value[1:] + else: + token, value = get_phrase(value) + cte_header.append(token) + return cte_header + + +# +# Header folding +# +# Header folding is complex, with lots of rules and corner cases. The +# following code does its best to obey the rules and handle the corner +# cases, but you can be sure there are few bugs:) +# +# This folder generally canonicalizes as it goes, preferring the stringified +# version of each token. The tokens contain information that supports the +# folder, including which tokens can be encoded in which ways. +# +# Folded text is accumulated in a simple list of strings ('lines'), each +# one of which should be less than policy.max_line_length ('maxlen'). +# + +def _steal_trailing_WSP_if_exists(lines): + wsp = '' + if lines and lines[-1] and lines[-1][-1] in WSP: + wsp = lines[-1][-1] + lines[-1] = lines[-1][:-1] + return wsp + +def _refold_parse_tree(parse_tree, *, policy): + """Return string of contents of parse_tree folded according to RFC rules. + + """ + # max_line_length 0/None means no limit, ie: infinitely long. + maxlen = policy.max_line_length or float("+inf") + encoding = 'utf-8' if policy.utf8 else 'us-ascii' + lines = [''] + last_ew = None + wrap_as_ew_blocked = 0 + want_encoding = False + end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') + parts = list(parse_tree) + while parts: + part = parts.pop(0) + if part is end_ew_not_allowed: + wrap_as_ew_blocked -= 1 + continue + tstr = str(part) + try: + tstr.encode(encoding) + charset = encoding + except UnicodeEncodeError: + if any(isinstance(x, errors.UndecodableBytesDefect) + for x in part.all_defects): + charset = 'unknown-8bit' + else: + # If policy.utf8 is false this should really be taken from a + # 'charset' property on the policy. + charset = 'utf-8' + want_encoding = True + if part.token_type == 'mime-parameters': + # Mime parameter folding (using RFC2231) is extra special. + _fold_mime_parameters(part, lines, maxlen, encoding) + continue + if want_encoding and not wrap_as_ew_blocked: + if not part.as_ew_allowed: + want_encoding = False + last_ew = None + if part.syntactic_break: + encoded_part = part.fold(policy=policy)[:-1] # strip nl + if policy.linesep not in encoded_part: + # It fits on a single line + if len(encoded_part) > maxlen - len(lines[-1]): + # But not on this one, so start a new one. + newline = _steal_trailing_WSP_if_exists(lines) + # XXX what if encoded_part has no leading FWS? + lines.append(newline) + lines[-1] += encoded_part + continue + # Either this is not a major syntactic break, so we don't + # want it on a line by itself even if it fits, or it + # doesn't fit on a line by itself. Either way, fall through + # to unpacking the subparts and wrapping them. + if not hasattr(part, 'encode'): + # It's not a Terminal, do each piece individually. + parts = list(part) + parts + else: + # It's a terminal, wrap it as an encoded word, possibly + # combining it with previously encoded words if allowed. + last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, + part.ew_combine_allowed, charset) + want_encoding = False + continue + if len(tstr) <= maxlen - len(lines[-1]): + lines[-1] += tstr + continue + # This part is too long to fit. The RFC wants us to break at + # "major syntactic breaks", so unless we don't consider this + # to be one, check if it will fit on the next line by itself. + if (part.syntactic_break and + len(tstr) + 1 <= maxlen): + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + continue + if not hasattr(part, 'encode'): + # It's not a terminal, try folding the subparts. + newparts = list(part) + if not part.as_ew_allowed: + wrap_as_ew_blocked += 1 + newparts.append(end_ew_not_allowed) + parts = newparts + parts + continue + if part.as_ew_allowed and not wrap_as_ew_blocked: + # It doesn't need CTE encoding, but encode it anyway so we can + # wrap it. + parts.insert(0, part) + want_encoding = True + continue + # We can't figure out how to wrap, it, so give up. + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + else: + # We can't fold it onto the next line either... + lines[-1] += tstr + return policy.linesep.join(lines) + policy.linesep + +def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): + """Fold string to_encode into lines as encoded word, combining if allowed. + Return the new value for last_ew, or None if ew_combine_allowed is False. + + If there is already an encoded word in the last line of lines (indicated by + a non-None value for last_ew) and ew_combine_allowed is true, decode the + existing ew, combine it with to_encode, and re-encode. Otherwise, encode + to_encode. In either case, split to_encode as necessary so that the + encoded segments fit within maxlen. + + """ + if last_ew is not None and ew_combine_allowed: + to_encode = str( + get_unstructured(lines[-1][last_ew:] + to_encode)) + lines[-1] = lines[-1][:last_ew] + if to_encode[0] in WSP: + # We're joining this to non-encoded text, so don't encode + # the leading blank. + leading_wsp = to_encode[0] + to_encode = to_encode[1:] + if (len(lines[-1]) == maxlen): + lines.append(_steal_trailing_WSP_if_exists(lines)) + lines[-1] += leading_wsp + trailing_wsp = '' + if to_encode[-1] in WSP: + # Likewise for the trailing space. + trailing_wsp = to_encode[-1] + to_encode = to_encode[:-1] + new_last_ew = len(lines[-1]) if last_ew is None else last_ew + while to_encode: + remaining_space = maxlen - len(lines[-1]) + # The RFC2047 chrome takes up 7 characters plus the length + # of the charset name. + encode_as = 'utf-8' if charset == 'us-ascii' else charset + text_space = remaining_space - len(encode_as) - 7 + if text_space <= 0: + lines.append(' ') + # XXX We'll get an infinite loop here if maxlen is <= 7 + continue + first_part = to_encode[:text_space] + ew = _ew.encode(first_part, charset=encode_as) + excess = len(ew) - remaining_space + if excess > 0: + # encode always chooses the shortest encoding, so this + # is guaranteed to fit at this point. + first_part = first_part[:-excess] + ew = _ew.encode(first_part) + lines[-1] += ew + to_encode = to_encode[len(first_part):] + if to_encode: + lines.append(' ') + new_last_ew = len(lines[-1]) + lines[-1] += trailing_wsp + return new_last_ew if ew_combine_allowed else None + +def _fold_mime_parameters(part, lines, maxlen, encoding): + """Fold TokenList 'part' into the 'lines' list as mime parameters. + + Using the decoded list of parameters and values, format them according to + the RFC rules, including using RFC2231 encoding if the value cannot be + expressed in 'encoding' and/or the parameter+value is too long to fit + within 'maxlen'. + + """ + # Special case for RFC2231 encoding: start from decoded values and use + # RFC2231 encoding iff needed. + # + # Note that the 1 and 2s being added to the length calculations are + # accounting for the possibly-needed spaces and semicolons we'll be adding. + # + for name, value in part.params: + # XXX What if this ';' puts us over maxlen the first time through the + # loop? We should split the header value onto a newline in that case, + # but to do that we need to recognize the need earlier or reparse the + # header, so I'm going to ignore that bug for now. It'll only put us + # one character over. + if not lines[-1].rstrip().endswith(';'): + lines[-1] += ';' + charset = encoding + error_handler = 'strict' + try: + value.encode(encoding) + encoding_required = False + except UnicodeEncodeError: + encoding_required = True + if utils._has_surrogates(value): + charset = 'unknown-8bit' + error_handler = 'surrogateescape' + else: + charset = 'utf-8' + if encoding_required: + encoded_value = parse.quote( + value, safe='', errors=error_handler) + tstr = "{}*={}''{}".format(name, charset, encoded_value) + else: + tstr = '{}={}'.format(name, quote_string(value)) + if len(lines[-1]) + len(tstr) + 1 < maxlen: + lines[-1] = lines[-1] + ' ' + tstr + continue + elif len(tstr) + 2 <= maxlen: + lines.append(' ' + tstr) + continue + # We need multiple sections. We are allowed to mix encoded and + # non-encoded sections, but we aren't going to. We'll encode them all. + section = 0 + extra_chrome = charset + "''" + while value: + chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome) + if maxlen <= chrome_len + 3: + # We need room for the leading blank, the trailing semicolon, + # and at least one character of the value. If we don't + # have that, we'd be stuck, so in that case fall back to + # the RFC standard width. + maxlen = 78 + splitpoint = maxchars = maxlen - chrome_len - 2 + while True: + partial = value[:splitpoint] + encoded_value = parse.quote( + partial, safe='', errors=error_handler) + if len(encoded_value) <= maxchars: + break + splitpoint -= 1 + lines.append(" {}*{}*={}{}".format( + name, section, extra_chrome, encoded_value)) + extra_chrome = '' + section += 1 + value = value[splitpoint:] + if value: + lines[-1] += ';' diff --git a/modules/language/python/module/email/_policybase.py b/modules/language/python/module/email/_policybase.py index a8633e2..bbc8b3c 100644 --- a/modules/language/python/module/email/_policybase.py +++ b/modules/language/python/module/email/_policybase.py @@ -3,7 +3,6 @@ module(email,_policybase) Allows fine grained feature control of how the package parses and emits data. """ - import abc import email.header as header import email.charset as _charset @@ -13,6 +12,7 @@ __all__ = [ 'Policy', 'Compat32', 'compat32', + '_extend_docstrings' ] @@ -101,9 +101,9 @@ def _extend_docstrings(cls): if cls.__doc__ and cls.__doc__.startswith('+'): cls.__doc__ = _append_doc(cls.__bases__[0].__doc__, cls.__doc__) for name, attr in cls.__dict__.items(): - if attr.__doc__ and attr.__doc__.startswith('+'): + if getattr(attr, '__doc__',False) and attr.__doc__.startswith('+'): for c in (c for base in cls.__bases__ for c in base.mro()): - doc = getattr(getattr(c, name), '__doc__') + doc = getattr(getattr(c, name, False), '__doc__', False) if doc: attr.__doc__ = _append_doc(doc, attr.__doc__) break diff --git a/modules/language/python/module/email/contentmanager.py b/modules/language/python/module/email/contentmanager.py new file mode 100644 index 0000000..5eaf2ee --- /dev/null +++ b/modules/language/python/module/email/contentmanager.py @@ -0,0 +1,254 @@ +module(email,contentmanager) + +__all__=['ContentManager','raw_data_manager'] + +import binascii +import email.charset as charset +import email.message as message +import email.errors as errors +import email.quoprimime as quoprimime + +class ContentManager: + + def __init__(self): + self.get_handlers = {} + self.set_handlers = {} + + def add_get_handler(self, key, handler): + self.get_handlers[key] = handler + + def get_content(self, msg, *args, **kw): + content_type = msg.get_content_type() + if content_type in self.get_handlers: + return self.get_handlers[content_type](msg, *args, **kw) + maintype = msg.get_content_maintype() + if maintype in self.get_handlers: + return self.get_handlers[maintype](msg, *args, **kw) + if '' in self.get_handlers: + return self.get_handlers[''](msg, *args, **kw) + raise KeyError(content_type) + + def add_set_handler(self, typekey, handler): + self.set_handlers[typekey] = handler + + def set_content(self, msg, obj, *args, **kw): + if msg.get_content_maintype() == 'multipart': + # XXX: is this error a good idea or not? We can remove it later, + # but we can't add it later, so do it for now. + raise TypeError("set_content not valid on multipart") + handler = self._find_set_handler(msg, obj) + msg.clear_content() + handler(msg, obj, *args, **kw) + + def _find_set_handler(self, msg, obj): + full_path_for_error = None + for typ in type(obj).__mro__: + if typ in self.set_handlers: + return self.set_handlers[typ] + qname = typ.__qualname__ + modname = getattr(typ, '__module__', '') + full_path = '.'.join((modname, qname)) if modname else qname + if full_path_for_error is None: + full_path_for_error = full_path + if full_path in self.set_handlers: + return self.set_handlers[full_path] + if qname in self.set_handlers: + return self.set_handlers[qname] + name = typ.__name__ + if name in self.set_handlers: + return self.set_handlers[name] + if None in self.set_handlers: + return self.set_handlers[None] + raise KeyError(full_path_for_error) + + +raw_data_manager = ContentManager() + + +def get_text_content(msg, errors='replace'): + content = msg.get_payload(decode=True) + charset = msg.get_param('charset', 'ASCII') + return content.decode(charset, errors=errors) +raw_data_manager.add_get_handler('text', get_text_content) + + +def get_non_text_content(msg): + return msg.get_payload(decode=True) +for maintype in 'audio image video application'.split(): + raw_data_manager.add_get_handler(maintype, get_non_text_content) + + +def get_message_content(msg): + return msg.get_payload(0) +for subtype in 'rfc822 external-body'.split(): + raw_data_manager.add_get_handler('message/'+subtype, get_message_content) + + +def get_and_fixup_unknown_message_content(msg): + # If we don't understand a message subtype, we are supposed to treat it as + # if it were application/octet-stream, per + # tools.ietf.org/html/rfc2046#section-5.2.4. Feedparser doesn't do that, + # so do our best to fix things up. Note that it is *not* appropriate to + # model message/partial content as Message objects, so they are handled + # here as well. (How to reassemble them is out of scope for this comment :) + return bytes(msg.get_payload(0)) +raw_data_manager.add_get_handler('message', + get_and_fixup_unknown_message_content) + + +def _prepare_set(msg, maintype, subtype, headers): + msg['Content-Type'] = '/'.join((maintype, subtype)) + if headers: + if not hasattr(headers[0], 'name'): + mp = msg.policy + headers = [mp.header_factory(*mp.header_source_parse([header])) + for header in headers] + try: + for header in headers: + if header.defects: + raise header.defects[0] + msg[header.name] = header + except errors.HeaderDefect as exc: + raise ValueError("Invalid header: {}".format( + header.fold(policy=msg.policy))) from exc + + +def _finalize_set(msg, disposition, filename, cid, params): + if disposition is None and filename is not None: + disposition = 'attachment' + if disposition is not None: + msg['Content-Disposition'] = disposition + if filename is not None: + msg.set_param('filename', + filename, + header='Content-Disposition', + replace=True) + if cid is not None: + msg['Content-ID'] = cid + if params is not None: + for key, value in params.items(): + msg.set_param(key, value) + + +# XXX: This is a cleaned-up version of base64mime.body_encode (including a bug +# fix in the calculation of unencoded_bytes_per_line). It would be nice to +# drop both this and quoprimime.body_encode in favor of enhanced binascii +# routines that accepted a max_line_length parameter. +def _encode_base64(data, max_line_length): + encoded_lines = [] + unencoded_bytes_per_line = max_line_length // 4 * 3 + for i in range(0, len(data), unencoded_bytes_per_line): + thisline = data[i:i+unencoded_bytes_per_line] + encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii')) + return ''.join(encoded_lines) + + +def _encode_text(string, charset, cte, policy): + lines = string.encode(charset).splitlines() + linesep = policy.linesep.encode('ascii') + def embedded_body(lines): return linesep.join(lines) + linesep + def normal_body(lines): return b'\n'.join(lines) + b'\n' + if cte==None: + # Use heuristics to decide on the "best" encoding. + try: + return '7bit', normal_body(lines).decode('ascii') + except UnicodeDecodeError: + pass + if (policy.cte_type == '8bit' and + max(len(x) for x in lines) <= policy.max_line_length): + return '8bit', normal_body(lines).decode('ascii', 'surrogateescape') + sniff = embedded_body(lines[:10]) + sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'), + policy.max_line_length) + sniff_base64 = binascii.b2a_base64(sniff) + # This is a little unfair to qp; it includes lineseps, base64 doesn't. + if len(sniff_qp) > len(sniff_base64): + cte = 'base64' + else: + cte = 'quoted-printable' + if len(lines) <= 10: + return cte, sniff_qp + if cte == '7bit': + data = normal_body(lines).decode('ascii') + elif cte == '8bit': + data = normal_body(lines).decode('ascii', 'surrogateescape') + elif cte == 'quoted-printable': + data = quoprimime.body_encode(normal_body(lines).decode('latin-1'), + policy.max_line_length) + elif cte == 'base64': + data = _encode_base64(embedded_body(lines), policy.max_line_length) + else: + raise ValueError("Unknown content transfer encoding {}".format(cte)) + return cte, data + + +def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, 'text', subtype, headers) + cte, payload = _encode_text(string, charset, cte, msg.policy) + msg.set_payload(payload) + msg.set_param('charset', + charset.ALIASES.get(charset, charset), + replace=True) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(str, set_text_content) + + +def set_message_content(msg, message, subtype="rfc822", cte=None, + disposition=None, filename=None, cid=None, + params=None, headers=None): + if subtype == 'partial': + raise ValueError("message/partial is not supported for Message objects") + if subtype == 'rfc822': + if cte not in (None, '7bit', '8bit', 'binary'): + # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate. + raise ValueError( + "message/rfc822 parts do not support cte={}".format(cte)) + # 8bit will get coerced on serialization if policy.cte_type='7bit'. We + # may end up claiming 8bit when it isn't needed, but the only negative + # result of that should be a gateway that needs to coerce to 7bit + # having to look through the whole embedded message to discover whether + # or not it actually has to do anything. + cte = '8bit' if cte is None else cte + elif subtype == 'external-body': + if cte not in (None, '7bit'): + # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate. + raise ValueError( + "message/external-body parts do not support cte={}".format(cte)) + cte = '7bit' + elif cte is None: + # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future + # subtypes should be restricted to 7bit, so assume that. + cte = '7bit' + _prepare_set(msg, 'message', subtype, headers) + msg.set_payload([message]) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +raw_data_manager.add_set_handler(message.Message, set_message_content) + + +def set_bytes_content(msg, data, maintype, subtype, cte='base64', + disposition=None, filename=None, cid=None, + params=None, headers=None): + _prepare_set(msg, maintype, subtype, headers) + if cte == 'base64': + data = _encode_base64(data, max_line_length=msg.policy.max_line_length) + elif cte == 'quoted-printable': + # XXX: quoprimime.body_encode won't encode newline characters in data, + # so we can't use it. This means max_line_length is ignored. Another + # bug to fix later. (Note: encoders.quopri is broken on line ends.) + data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True) + data = data.decode('ascii') + elif cte == '7bit': + # Make sure it really is only ASCII. The early warning here seems + # worth the overhead...if you care write your own content manager :). + data.encode('ascii') + elif cte in ('8bit', 'binary'): + data = data.decode('ascii', 'surrogateescape') + msg.set_payload(data) + msg['Content-Transfer-Encoding'] = cte + _finalize_set(msg, disposition, filename, cid, params) +for typ in (bytes, bytearray, memoryview): + raw_data_manager.add_set_handler(typ, set_bytes_content) diff --git a/modules/language/python/module/email/feedparser.py b/modules/language/python/module/email/feedparser.py index f07448a..2f31633 100644 --- a/modules/language/python/module/email/feedparser.py +++ b/modules/language/python/module/email/feedparser.py @@ -23,7 +23,6 @@ object's .defects attribute. __all__ = ['FeedParser', 'BytesFeedParser'] import re - import email.errors as errors from email._policybase import compat32 from collections import deque diff --git a/modules/language/python/module/email/generator.py b/modules/language/python/module/email/generator.py new file mode 100644 index 0000000..2b90fe5 --- /dev/null +++ b/modules/language/python/module/email/generator.py @@ -0,0 +1,509 @@ +module(email,generator) +# Copyright (C) 2001-2010 Python Software Foundation +# Author: Barry Warsaw +# Contact: email-sig@python.org + +"""Classes to generate plain text from a message object tree.""" + +__all__ = ['Generator', 'DecodedGenerator', 'BytesGenerator'] + +import re +import sys +import time +import random + +from copy import deepcopy +from io import StringIO, BytesIO +from email.utils import _has_surrogates + +UNDERSCORE = '_' +NL = '\n' # XXX: no longer used by the code below. + +NLCRE = re.compile(r'\r\n|\r|\n') +fcre = re.compile(r'^From ', re.MULTILINE) + + + +class Generator: + """Generates output from a Message object tree. + + This basic generator writes the message to the given file object as plain + text. + """ + # + # Public interface + # + + def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, *, + policy=None): + """Create the generator for message flattening. + + outfp is the output file-like object for writing the message to. It + must have a write() method. + + Optional mangle_from_ is a flag that, when True (the default if policy + is not set), escapes From_ lines in the body of the message by putting + a `>' in front of them. + + Optional maxheaderlen specifies the longest length for a non-continued + header. When a header line is longer (in characters, with tabs + expanded to 8 spaces) than maxheaderlen, the header will split as + defined in the Header class. Set maxheaderlen to zero to disable + header wrapping. The default is 78, as recommended (but not required) + by RFC 2822. + + The policy keyword specifies a policy object that controls a number of + aspects of the generator's operation. If no policy is specified, + the policy associated with the Message object passed to the + flatten method is used. + + """ + + if mangle_from_ is None: + mangle_from_ = True if policy is None else policy.mangle_from_ + self._fp = outfp + self._mangle_from_ = mangle_from_ + self.maxheaderlen = maxheaderlen + self.policy = policy + + def write(self, s): + # Just delegate to the file object + self._fp.write(s) + + def flatten(self, msg, unixfrom=False, linesep=None): + r"""Print the message object tree rooted at msg to the output file + specified when the Generator instance was created. + + unixfrom is a flag that forces the printing of a Unix From_ delimiter + before the first object in the message tree. If the original message + has no From_ delimiter, a `standard' one is crafted. By default, this + is False to inhibit the printing of any From_ delimiter. + + Note that for subobjects, no From_ line is printed. + + linesep specifies the characters used to indicate a new line in + the output. The default value is determined by the policy specified + when the Generator instance was created or, if none was specified, + from the policy associated with the msg. + + """ + # We use the _XXX constants for operating on data that comes directly + # from the msg, and _encoded_XXX constants for operating on data that + # has already been converted (to bytes in the BytesGenerator) and + # inserted into a temporary buffer. + policy = msg.policy if self.policy is None else self.policy + if linesep is not None: + policy = policy.clone(linesep=linesep) + if self.maxheaderlen is not None: + policy = policy.clone(max_line_length=self.maxheaderlen) + self._NL = policy.linesep + self._encoded_NL = self._encode(self._NL) + self._EMPTY = '' + self._encoded_EMPTY = self._encode(self._EMPTY) + # Because we use clone (below) when we recursively process message + # subparts, and because clone uses the computed policy (not None), + # submessages will automatically get set to the computed policy when + # they are processed by this code. + old_gen_policy = self.policy + old_msg_policy = msg.policy + try: + self.policy = policy + msg.policy = policy + if unixfrom: + ufrom = msg.get_unixfrom() + if not ufrom: + ufrom = 'From nobody ' + time.ctime(time.time()) + self.write(ufrom + self._NL) + self._write(msg) + finally: + self.policy = old_gen_policy + msg.policy = old_msg_policy + + def clone(self, fp): + """Clone this generator with the exact same options.""" + return self.__class__(fp, + self._mangle_from_, + None, # Use policy setting, which we've adjusted + policy=self.policy) + + # + # Protected interface - undocumented ;/ + # + + # Note that we use 'self.write' when what we are writing is coming from + # the source, and self._fp.write when what we are writing is coming from a + # buffer (because the Bytes subclass has already had a chance to transform + # the data in its write method in that case). This is an entirely + # pragmatic split determined by experiment; we could be more general by + # always using write and having the Bytes subclass write method detect when + # it has already transformed the input; but, since this whole thing is a + # hack anyway this seems good enough. + + def _new_buffer(self): + # BytesGenerator overrides this to return BytesIO. + return StringIO() + + def _encode(self, s): + # BytesGenerator overrides this to encode strings to bytes. + return s + + def _write_lines(self, lines): + # We have to transform the line endings. + if not lines: + return + lines = NLCRE.split(lines) + for line in lines[:-1]: + self.write(line) + self.write(self._NL) + if lines[-1]: + self.write(lines[-1]) + # XXX logic tells me this else should be needed, but the tests fail + # with it and pass without it. (NLCRE.split ends with a blank element + # if and only if there was a trailing newline.) + #else: + # self.write(self._NL) + + def _write(self, msg): + # We can't write the headers yet because of the following scenario: + # say a multipart message includes the boundary string somewhere in + # its body. We'd have to calculate the new boundary /before/ we write + # the headers so that we can write the correct Content-Type: + # parameter. + # + # The way we do this, so as to make the _handle_*() methods simpler, + # is to cache any subpart writes into a buffer. The we write the + # headers and the buffer contents. That way, subpart handlers can + # Do The Right Thing, and can still modify the Content-Type: header if + # necessary. + oldfp = self._fp + try: + self._munge_cte = None + self._fp = sfp = self._new_buffer() + self._dispatch(msg) + finally: + self._fp = oldfp + munge_cte = self._munge_cte + del self._munge_cte + # If we munged the cte, copy the message again and re-fix the CTE. + if munge_cte: + msg = deepcopy(msg) + msg.replace_header('content-transfer-encoding', munge_cte[0]) + msg.replace_header('content-type', munge_cte[1]) + # Write the headers. First we see if the message object wants to + # handle that itself. If not, we'll do it generically. + meth = getattr(msg, '_write_headers', None) + if meth is None: + self._write_headers(msg) + else: + meth(self) + self._fp.write(sfp.getvalue()) + + def _dispatch(self, msg): + # Get the Content-Type: for the message, then try to dispatch to + # self._handle_<maintype>_<subtype>(). If there's no handler for the + # full MIME type, then dispatch to self._handle_<maintype>(). If + # that's missing too, then dispatch to self._writeBody(). + main = msg.get_content_maintype() + sub = msg.get_content_subtype() + specific = UNDERSCORE.join((main, sub)).replace('-', '_') + meth = getattr(self, '_handle_' + specific, None) + if meth is None: + generic = main.replace('-', '_') + meth = getattr(self, '_handle_' + generic, None) + if meth is None: + meth = self._writeBody + meth(msg) + + # + # Default handlers + # + + def _write_headers(self, msg): + for h, v in msg.raw_items(): + self.write(self.policy.fold(h, v)) + # A blank line always separates headers from body + self.write(self._NL) + + # + # Handlers for writing types and subtypes + # + + def _handle_text(self, msg): + payload = msg.get_payload() + if payload is None: + return + if not isinstance(payload, str): + raise TypeError('string payload expected: %s' % type(payload)) + if _has_surrogates(msg._payload): + charset = msg.get_param('charset') + if charset is not None: + # XXX: This copy stuff is an ugly hack to avoid modifying the + # existing message. + msg = deepcopy(msg) + del msg['content-transfer-encoding'] + msg.set_payload(payload, charset) + payload = msg.get_payload() + self._munge_cte = (msg['content-transfer-encoding'], + msg['content-type']) + if self._mangle_from_: + payload = fcre.sub('>From ', payload) + self._write_lines(payload) + + # Default body handler + _writeBody = _handle_text + + def _handle_multipart(self, msg): + # The trick here is to write out each part separately, merge them all + # together, and then make sure that the boundary we've chosen isn't + # present in the payload. + msgtexts = [] + subparts = msg.get_payload() + if subparts is None: + subparts = [] + elif isinstance(subparts, str): + # e.g. a non-strict parse of a message with no starting boundary. + self.write(subparts) + return + elif not isinstance(subparts, list): + # Scalar payload + subparts = [subparts] + for part in subparts: + s = self._new_buffer() + g = self.clone(s) + g.flatten(part, unixfrom=False, linesep=self._NL) + msgtexts.append(s.getvalue()) + # BAW: What about boundaries that are wrapped in double-quotes? + boundary = msg.get_boundary() + if not boundary: + # Create a boundary that doesn't appear in any of the + # message texts. + alltext = self._encoded_NL.join(msgtexts) + boundary = self._make_boundary(alltext) + msg.set_boundary(boundary) + # If there's a preamble, write it out, with a trailing CRLF + if msg.preamble is not None: + if self._mangle_from_: + preamble = fcre.sub('>From ', msg.preamble) + else: + preamble = msg.preamble + self._write_lines(preamble) + self.write(self._NL) + # dash-boundary transport-padding CRLF + self.write('--' + boundary + self._NL) + # body-part + if msgtexts: + self._fp.write(msgtexts.pop(0)) + # *encapsulation + # --> delimiter transport-padding + # --> CRLF body-part + for body_part in msgtexts: + # delimiter transport-padding CRLF + self.write(self._NL + '--' + boundary + self._NL) + # body-part + self._fp.write(body_part) + # close-delimiter transport-padding + self.write(self._NL + '--' + boundary + '--' + self._NL) + if msg.epilogue is not None: + if self._mangle_from_: + epilogue = fcre.sub('>From ', msg.epilogue) + else: + epilogue = msg.epilogue + self._write_lines(epilogue) + + def _handle_multipart_signed(self, msg): + # The contents of signed parts has to stay unmodified in order to keep + # the signature intact per RFC1847 2.1, so we disable header wrapping. + # RDM: This isn't enough to completely preserve the part, but it helps. + p = self.policy + self.policy = p.clone(max_line_length=0) + try: + self._handle_multipart(msg) + finally: + self.policy = p + + def _handle_message_delivery_status(self, msg): + # We can't just write the headers directly to self's file object + # because this will leave an extra newline between the last header + # block and the boundary. Sigh. + blocks = [] + for part in msg.get_payload(): + s = self._new_buffer() + g = self.clone(s) + g.flatten(part, unixfrom=False, linesep=self._NL) + text = s.getvalue() + lines = text.split(self._encoded_NL) + # Strip off the unnecessary trailing empty line + if lines and lines[-1] == self._encoded_EMPTY: + blocks.append(self._encoded_NL.join(lines[:-1])) + else: + blocks.append(text) + # Now join all the blocks with an empty line. This has the lovely + # effect of separating each block with an empty line, but not adding + # an extra one after the last one. + self._fp.write(self._encoded_NL.join(blocks)) + + def _handle_message(self, msg): + s = self._new_buffer() + g = self.clone(s) + # The payload of a message/rfc822 part should be a multipart sequence + # of length 1. The zeroth element of the list should be the Message + # object for the subpart. Extract that object, stringify it, and + # write it out. + # Except, it turns out, when it's a string instead, which happens when + # and only when HeaderParser is used on a message of mime type + # message/rfc822. Such messages are generated by, for example, + # Groupwise when forwarding unadorned messages. (Issue 7970.) So + # in that case we just emit the string body. + payload = msg._payload + if isinstance(payload, list): + g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL) + payload = s.getvalue() + else: + payload = self._encode(payload) + self._fp.write(payload) + + # This used to be a module level function; we use a classmethod for this + # and _compile_re so we can continue to provide the module level function + # for backward compatibility by doing + # _make_boundary = Generator._make_boundary + # at the end of the module. It *is* internal, so we could drop that... + @classmethod + def _make_boundary(cls, text=None): + # Craft a random boundary. If text is given, ensure that the chosen + # boundary doesn't appear in the text. + token = random.randrange(sys.maxsize) + boundary = ('=' * 15) + (_fmt % token) + '==' + if text is None: + return boundary + b = boundary + counter = 0 + while True: + cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE) + if not cre.search(text): + break + b = boundary + '.' + str(counter) + counter += 1 + return b + + @classmethod + def _compile_re(cls, s, flags): + return re.compile(s, flags) + + +class BytesGenerator(Generator): + """Generates a bytes version of a Message object tree. + + Functionally identical to the base Generator except that the output is + bytes and not string. When surrogates were used in the input to encode + bytes, these are decoded back to bytes for output. If the policy has + cte_type set to 7bit, then the message is transformed such that the + non-ASCII bytes are properly content transfer encoded, using the charset + unknown-8bit. + + The outfp object must accept bytes in its write method. + """ + + def write(self, s): + self._fp.write(s.encode('ascii', 'surrogateescape')) + + def _new_buffer(self): + return BytesIO() + + def _encode(self, s): + return s.encode('ascii') + + def _write_headers(self, msg): + # This is almost the same as the string version, except for handling + # strings with 8bit bytes. + for h, v in msg.raw_items(): + self._fp.write(self.policy.fold_binary(h, v)) + # A blank line always separates headers from body + self.write(self._NL) + + def _handle_text(self, msg): + # If the string has surrogates the original source was bytes, so + # just write it back out. + if msg._payload is None: + return + if _has_surrogates(msg._payload) and not self.policy.cte_type=='7bit': + if self._mangle_from_: + msg._payload = fcre.sub(">From ", msg._payload) + self._write_lines(msg._payload) + else: + super(BytesGenerator,self)._handle_text(msg) + + # Default body handler + _writeBody = _handle_text + + @classmethod + def _compile_re(cls, s, flags): + return re.compile(s.encode('ascii'), flags) + + + +_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]' + +class DecodedGenerator(Generator): + """Generates a text representation of a message. + + Like the Generator base class, except that non-text parts are substituted + with a format string representing the part. + """ + def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, fmt=None, *, + policy=None): + """Like Generator.__init__() except that an additional optional + argument is allowed. + + Walks through all subparts of a message. If the subpart is of main + type `text', then it prints the decoded payload of the subpart. + + Otherwise, fmt is a format string that is used instead of the message + payload. fmt is expanded with the following keywords (in + %(keyword)s format): + + type : Full MIME type of the non-text part + maintype : Main MIME type of the non-text part + subtype : Sub-MIME type of the non-text part + filename : Filename of the non-text part + description: Description associated with the non-text part + encoding : Content transfer encoding of the non-text part + + The default value for fmt is None, meaning + + [Non-text (%(type)s) part of message omitted, filename %(filename)s] + """ + Generator.__init__(self, outfp, mangle_from_, maxheaderlen, + policy=policy) + if fmt is None: + self._fmt = _FMT + else: + self._fmt = fmt + + def _dispatch(self, msg): + for part in msg.walk(): + maintype = part.get_content_maintype() + if maintype == 'text': + print(part.get_payload(decode=False), file=self) + elif maintype == 'multipart': + # Just skip this + pass + else: + print(self._fmt % { + 'type' : part.get_content_type(), + 'maintype' : part.get_content_maintype(), + 'subtype' : part.get_content_subtype(), + 'filename' : part.get_filename('[no filename]'), + 'description': part.get('Content-Description', + '[no description]'), + 'encoding' : part.get('Content-Transfer-Encoding', + '[no encoding]'), + }, file=self) + + + +# Helper used by Generator._make_boundary +_width = len(repr(sys.maxsize-1)) +_fmt = '%%0%dd' % _width + +# Backward compatibility +_make_boundary = Generator._make_boundary diff --git a/modules/language/python/module/email/headerregistry.py b/modules/language/python/module/email/headerregistry.py new file mode 100644 index 0000000..6b8615d --- /dev/null +++ b/modules/language/python/module/email/headerregistry.py @@ -0,0 +1,592 @@ +module(email,headerregistry) +"""Representing and manipulating email headers via custom objects. + +This module provides an implementation of the HeaderRegistry API. +The implementation is designed to flexibly follow RFC5322 rules. + +Eventually HeaderRegistry will be a public API, but it isn't yet, +and will probably change some before that happens. + +""" + +__all__=['HeaderRegistry'] + +from types import MappingProxyType +import email.utils as utils +import email.errors as errors +import email._header_value_parser as parser + +class Address: + + def __init__(self, display_name='', username='', domain='', addr_spec=None): + """Create an object representing a full email address. + + An address can have a 'display_name', a 'username', and a 'domain'. In + addition to specifying the username and domain separately, they may be + specified together by using the addr_spec keyword *instead of* the + username and domain keywords. If an addr_spec string is specified it + must be properly quoted according to RFC 5322 rules; an error will be + raised if it is not. + + An Address object has display_name, username, domain, and addr_spec + attributes, all of which are read-only. The addr_spec and the string + value of the object are both quoted according to RFC5322 rules, but + without any Content Transfer Encoding. + + """ + # This clause with its potential 'raise' may only happen when an + # application program creates an Address object using an addr_spec + # keyword. The email library code itself must always supply username + # and domain. + if addr_spec is not None: + if username or domain: + raise TypeError("addrspec specified when username and/or " + "domain also specified") + a_s, rest = parser.get_addr_spec(addr_spec) + if rest: + raise ValueError("Invalid addr_spec; only '{}' " + "could be parsed from '{}'".format( + a_s, addr_spec)) + if a_s.all_defects: + raise a_s.all_defects[0] + username = a_s.local_part + domain = a_s.domain + self._display_name = display_name + self._username = username + self._domain = domain + + @property + def display_name(self): + return self._display_name + + @property + def username(self): + return self._username + + @property + def domain(self): + return self._domain + + @property + def addr_spec(self): + """The addr_spec (username@domain) portion of the address, quoted + according to RFC 5322 rules, but with no Content Transfer Encoding. + """ + nameset = set(self.username) + if len(nameset) > len(nameset-parser.DOT_ATOM_ENDS): + lp = parser.quote_string(self.username) + else: + lp = self.username + if self.domain: + return lp + '@' + self.domain + if not lp: + return '<>' + return lp + + def __repr__(self): + return "{}(display_name={!r}, username={!r}, domain={!r})".format( + self.__class__.__name__, + self.display_name, self.username, self.domain) + + def __str__(self): + nameset = set(self.display_name) + if len(nameset) > len(nameset-parser.SPECIALS): + disp = parser.quote_string(self.display_name) + else: + disp = self.display_name + if disp: + addr_spec = '' if self.addr_spec=='<>' else self.addr_spec + return "{} <{}>".format(disp, addr_spec) + return self.addr_spec + + def __eq__(self, other): + if type(other) != type(self): + return False + return (self.display_name == other.display_name and + self.username == other.username and + self.domain == other.domain) + + +class Group: + + def __init__(self, display_name=None, addresses=None): + """Create an object representing an address group. + + An address group consists of a display_name followed by colon and a + list of addresses (see Address) terminated by a semi-colon. The Group + is created by specifying a display_name and a possibly empty list of + Address objects. A Group can also be used to represent a single + address that is not in a group, which is convenient when manipulating + lists that are a combination of Groups and individual Addresses. In + this case the display_name should be set to None. In particular, the + string representation of a Group whose display_name is None is the same + as the Address object, if there is one and only one Address object in + the addresses list. + + """ + self._display_name = display_name + self._addresses = tuple(addresses) if addresses else tuple() + + @property + def display_name(self): + return self._display_name + + @property + def addresses(self): + return self._addresses + + def __repr__(self): + return "{}(display_name={!r}, addresses={!r}".format( + self.__class__.__name__, + self.display_name, self.addresses) + + def __str__(self): + if self.display_name is None and len(self.addresses)==1: + return str(self.addresses[0]) + disp = self.display_name + if disp is not None: + nameset = set(disp) + if len(nameset) > len(nameset-parser.SPECIALS): + disp = parser.quote_string(disp) + adrstr = ", ".join(str(x) for x in self.addresses) + adrstr = ' ' + adrstr if adrstr else adrstr + return "{}:{};".format(disp, adrstr) + + def __eq__(self, other): + if type(other) != type(self): + return False + return (self.display_name == other.display_name and + self.addresses == other.addresses) + + +# Header Classes # + +class BaseHeader(str): + + """Base class for message headers. + + Implements generic behavior and provides tools for subclasses. + + A subclass must define a classmethod named 'parse' that takes an unfolded + value string and a dictionary as its arguments. The dictionary will + contain one key, 'defects', initialized to an empty list. After the call + the dictionary must contain two additional keys: parse_tree, set to the + parse tree obtained from parsing the header, and 'decoded', set to the + string value of the idealized representation of the data from the value. + (That is, encoded words are decoded, and values that have canonical + representations are so represented.) + + The defects key is intended to collect parsing defects, which the message + parser will subsequently dispose of as appropriate. The parser should not, + insofar as practical, raise any errors. Defects should be added to the + list instead. The standard header parsers register defects for RFC + compliance issues, for obsolete RFC syntax, and for unrecoverable parsing + errors. + + The parse method may add additional keys to the dictionary. In this case + the subclass must define an 'init' method, which will be passed the + dictionary as its keyword arguments. The method should use (usually by + setting them as the value of similarly named attributes) and remove all the + extra keys added by its parse method, and then use super to call its parent + class with the remaining arguments and keywords. + + The subclass should also make sure that a 'max_count' attribute is defined + that is either None or 1. XXX: need to better define this API. + + """ + + def __new__(cls, name, value): + kwds = {'defects': []} + cls.parse(value, kwds) + if utils._has_surrogates(kwds['decoded']): + kwds['decoded'] = utils._sanitize(kwds['decoded']) + self = str.__new__(cls, kwds['decoded']) + del kwds['decoded'] + self.init(name, **kwds) + return self + + def init(self, name, *, parse_tree, defects): + self._name = name + self._parse_tree = parse_tree + self._defects = defects + + @property + def name(self): + return self._name + + @property + def defects(self): + return tuple(self._defects) + + def __reduce__(self): + return ( + _reconstruct_header, + ( + self.__class__.__name__, + self.__class__.__bases__, + str(self), + ), + self.__dict__) + + @classmethod + def _reconstruct(cls, value): + return str.__new__(cls, value) + + def fold(self, *, policy): + """Fold header according to policy. + + The parsed representation of the header is folded according to + RFC5322 rules, as modified by the policy. If the parse tree + contains surrogateescaped bytes, the bytes are CTE encoded using + the charset 'unknown-8bit". + + Any non-ASCII characters in the parse tree are CTE encoded using + charset utf-8. XXX: make this a policy setting. + + The returned value is an ASCII-only string possibly containing linesep + characters, and ending with a linesep character. The string includes + the header name and the ': ' separator. + + """ + # At some point we need to put fws here iif it was in the source. + header = parser.Header([ + parser.HeaderLabel([ + parser.ValueTerminal(self.name, 'header-name'), + parser.ValueTerminal(':', 'header-sep')]), + ]) + if self._parse_tree: + header.append( + parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')])) + header.append(self._parse_tree) + return header.fold(policy=policy) + + +def _reconstruct_header(cls_name, bases, value): + return type(cls_name, bases, {})._reconstruct(value) + + +class UnstructuredHeader: + + max_count = None + value_parser = staticmethod(parser.get_unstructured) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = cls.value_parser(value) + kwds['decoded'] = str(kwds['parse_tree']) + + +class UniqueUnstructuredHeader(UnstructuredHeader): + + max_count = 1 + + +class DateHeader: + + """Header whose value consists of a single timestamp. + + Provides an additional attribute, datetime, which is either an aware + datetime using a timezone, or a naive datetime if the timezone + in the input string is -0000. Also accepts a datetime as input. + The 'value' attribute is the normalized form of the timestamp, + which means it is the output of format_datetime on the datetime. + """ + + max_count = None + + # This is used only for folding, not for creating 'decoded'. + value_parser = staticmethod(parser.get_unstructured) + + @classmethod + def parse(cls, value, kwds): + if not value: + kwds['defects'].append(errors.HeaderMissingRequiredValue()) + kwds['datetime'] = None + kwds['decoded'] = '' + kwds['parse_tree'] = parser.TokenList() + return + if isinstance(value, str): + value = utils.parsedate_to_datetime(value) + kwds['datetime'] = value + kwds['decoded'] = utils.format_datetime(kwds['datetime']) + kwds['parse_tree'] = cls.value_parser(kwds['decoded']) + + def init(self, *args, **kw): + self._datetime = kw.pop('datetime') + super().init(*args, **kw) + + @property + def datetime(self): + return self._datetime + + +class UniqueDateHeader(DateHeader): + + max_count = 1 + + +class AddressHeader: + + max_count = None + + @staticmethod + def value_parser(value): + address_list, value = parser.get_address_list(value) + assert not value, 'this should not happen' + return address_list + + @classmethod + def parse(cls, value, kwds): + if isinstance(value, str): + # We are translating here from the RFC language (address/mailbox) + # to our API language (group/address). + kwds['parse_tree'] = address_list = cls.value_parser(value) + groups = [] + for addr in address_list.addresses: + groups.append(Group(addr.display_name, + [Address(mb.display_name or '', + mb.local_part or '', + mb.domain or '') + for mb in addr.all_mailboxes])) + defects = list(address_list.all_defects) + else: + # Assume it is Address/Group stuff + if not hasattr(value, '__iter__'): + value = [value] + groups = [Group(None, [item]) if not hasattr(item, 'addresses') + else item + for item in value] + defects = [] + kwds['groups'] = groups + kwds['defects'] = defects + kwds['decoded'] = ', '.join([str(item) for item in groups]) + if 'parse_tree' not in kwds: + kwds['parse_tree'] = cls.value_parser(kwds['decoded']) + + def init(self, *args, **kw): + self._groups = tuple(kw.pop('groups')) + self._addresses = None + super().init(*args, **kw) + + @property + def groups(self): + return self._groups + + @property + def addresses(self): + if self._addresses is None: + self._addresses = tuple([address for group in self._groups + for address in group.addresses]) + return self._addresses + + +class UniqueAddressHeader(AddressHeader): + + max_count = 1 + + +class SingleAddressHeader(AddressHeader): + + @property + def address(self): + if len(self.addresses)!=1: + raise ValueError(("value of single address header {} is not " + "a single address").format(self.name)) + return self.addresses[0] + + +class UniqueSingleAddressHeader(SingleAddressHeader): + + max_count = 1 + + +class MIMEVersionHeader: + + max_count = 1 + + value_parser = staticmethod(parser.parse_mime_version) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + kwds['major'] = None if parse_tree.minor is None else parse_tree.major + kwds['minor'] = parse_tree.minor + if parse_tree.minor is not None: + kwds['version'] = '{}.{}'.format(kwds['major'], kwds['minor']) + else: + kwds['version'] = None + + def init(self, *args, **kw): + self._version = kw.pop('version') + self._major = kw.pop('major') + self._minor = kw.pop('minor') + super().init(*args, **kw) + + @property + def major(self): + return self._major + + @property + def minor(self): + return self._minor + + @property + def version(self): + return self._version + + +class ParameterizedMIMEHeader: + + # Mixin that handles the params dict. Must be subclassed and + # a property value_parser for the specific header provided. + + max_count = 1 + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + if parse_tree.params is None: + kwds['params'] = {} + else: + # The MIME RFCs specify that parameter ordering is arbitrary. + kwds['params'] = {utils._sanitize(name).lower(): + utils._sanitize(value) + for name, value in parse_tree.params} + + def init(self, *args, **kw): + self._params = kw.pop('params') + super().init(*args, **kw) + + @property + def params(self): + return MappingProxyType(self._params) + + +class ContentTypeHeader(ParameterizedMIMEHeader): + + value_parser = staticmethod(parser.parse_content_type_header) + + def init(self, *args, **kw): + super().init(*args, **kw) + self._maintype = utils._sanitize(self._parse_tree.maintype) + self._subtype = utils._sanitize(self._parse_tree.subtype) + + @property + def maintype(self): + return self._maintype + + @property + def subtype(self): + return self._subtype + + @property + def content_type(self): + return self.maintype + '/' + self.subtype + + +class ContentDispositionHeader(ParameterizedMIMEHeader): + + value_parser = staticmethod(parser.parse_content_disposition_header) + + def init(self, *args, **kw): + super().init(*args, **kw) + cd = self._parse_tree.content_disposition + self._content_disposition = cd if cd is None else utils._sanitize(cd) + + @property + def content_disposition(self): + return self._content_disposition + + +class ContentTransferEncodingHeader: + + max_count = 1 + + value_parser = staticmethod(parser.parse_content_transfer_encoding_header) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + + def init(self, *args, **kw): + super().init(*args, **kw) + self._cte = utils._sanitize(self._parse_tree.cte) + + @property + def cte(self): + return self._cte + + +# The header factory # + +_default_header_map = { + 'subject': UniqueUnstructuredHeader, + 'date': UniqueDateHeader, + 'resent-date': DateHeader, + 'orig-date': UniqueDateHeader, + 'sender': UniqueSingleAddressHeader, + 'resent-sender': SingleAddressHeader, + 'to': UniqueAddressHeader, + 'resent-to': AddressHeader, + 'cc': UniqueAddressHeader, + 'resent-cc': AddressHeader, + 'bcc': UniqueAddressHeader, + 'resent-bcc': AddressHeader, + 'from': UniqueAddressHeader, + 'resent-from': AddressHeader, + 'reply-to': UniqueAddressHeader, + 'mime-version': MIMEVersionHeader, + 'content-type': ContentTypeHeader, + 'content-disposition': ContentDispositionHeader, + 'content-transfer-encoding': ContentTransferEncodingHeader, + } + +class HeaderRegistry: + + """A header_factory and header registry.""" + + def __init__(self, base_class=BaseHeader, default_class=UnstructuredHeader, + use_default_map=True): + """Create a header_factory that works with the Policy API. + + base_class is the class that will be the last class in the created + header class's __bases__ list. default_class is the class that will be + used if "name" (see __call__) does not appear in the registry. + use_default_map controls whether or not the default mapping of names to + specialized classes is copied in to the registry when the factory is + created. The default is True. + + """ + self.registry = {} + self.base_class = base_class + self.default_class = default_class + if use_default_map: + self.registry.update(_default_header_map) + + def map_to_type(self, name, cls): + """Register cls as the specialized class for handling "name" headers. + + """ + self.registry[name.lower()] = cls + + def __getitem__(self, name): + cls = self.registry.get(name.lower(), self.default_class) + return type('_'+cls.__name__, (cls, self.base_class), {}) + + def __call__(self, name, value): + """Create a header instance for header 'name' from 'value'. + + Creates a header instance by creating a specialized class for parsing + and representing the specified header by combining the factory + base_class with a specialized class from the registry or the + default_class, and passing the name and value to the constructed + class's constructor. + + """ + return self[name](name, value) diff --git a/modules/language/python/module/email/iterators.py b/modules/language/python/module/email/iterators.py new file mode 100644 index 0000000..0773bdf --- /dev/null +++ b/modules/language/python/module/email/iterators.py @@ -0,0 +1,72 @@ +module(email,iterators) +# Copyright (C) 2001-2006 Python Software Foundation +# Author: Barry Warsaw +# Contact: email-sig@python.org + +"""Various types of useful iterators and generators.""" + +__all__ = [ + 'body_line_iterator', + 'typed_subpart_iterator', + 'walk', + # Do not include _structure() since it's part of the debugging API. + ] + +import sys +from io import StringIO + + + +# This function will become a method of the Message class +def walk(self): + """Walk over the message tree, yielding each subpart. + + The walk is performed in depth-first order. This method is a + generator. + """ + yield self + if self.is_multipart(): + for subpart in self.get_payload(): + yield from subpart.walk() + + + +# These two functions are imported into the Iterators.py interface module. +def body_line_iterator(msg, decode=False): + """Iterate over the parts, returning string payloads line-by-line. + + Optional decode (default False) is passed through to .get_payload(). + """ + for subpart in msg.walk(): + payload = subpart.get_payload(decode=decode) + if isinstance(payload, str): + yield from StringIO(payload) + + +def typed_subpart_iterator(msg, maintype='text', subtype=None): + """Iterate over the subparts with a given MIME type. + + Use `maintype' as the main MIME type to match against; this defaults to + "text". Optional `subtype' is the MIME subtype to match against; if + omitted, only the main type is matched. + """ + for subpart in msg.walk(): + if subpart.get_content_maintype() == maintype: + if subtype is None or subpart.get_content_subtype() == subtype: + yield subpart + + + +def _structure(msg, fp=None, level=0, include_default=False): + """A handy debugging aid""" + if fp is None: + fp = sys.stdout + tab = ' ' * (level * 4) + print(tab + msg.get_content_type(), end='', file=fp) + if include_default: + print(' [%s]' % msg.get_default_type(), file=fp) + else: + print(file=fp) + if msg.is_multipart(): + for subpart in msg.get_payload(): + _structure(subpart, fp, level+1, include_default) diff --git a/modules/language/python/module/email/message.py b/modules/language/python/module/email/message.py new file mode 100644 index 0000000..c73aa3c --- /dev/null +++ b/modules/language/python/module/email/message.py @@ -0,0 +1,1164 @@ +module(email,message) +# Copyright (C) 2001-2007 Python Software Foundation +# Author: Barry Warsaw +# Contact: email-sig@python.org + +"""Basic message object for the email package object model.""" + +__all__ = ['Message', 'EmailMessage'] + +import re +import uu +import quopri +from io import BytesIO, StringIO + +# Intrapackage imports +import email.utils as utils +import email.errors as errors +from email._policybase import Policy, compat32 +import email.charset as _charset +from email._encoded_words import decode_b +Charset = _charset.Charset + +SEMISPACE = '; ' + +# Regular expression that matches `special' characters in parameters, the +# existence of which force quoting of the parameter value. +tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') + + +def _splitparam(param): + # Split header parameters. BAW: this may be too simple. It isn't + # strictly RFC 2045 (section 5.1) compliant, but it catches most headers + # found in the wild. We may eventually need a full fledged parser. + # RDM: we might have a Header here; for now just stringify it. + a, sep, b = str(param).partition(';') + if not sep: + return a.strip(), None + return a.strip(), b.strip() + +def _formatparam(param, value=None, quote=True): + """Convenience function to format and return a key=value pair. + + This will quote the value if needed or if quote is true. If value is a + three tuple (charset, language, value), it will be encoded according + to RFC2231 rules. If it contains non-ascii characters it will likewise + be encoded according to RFC2231 rules, using the utf-8 charset and + a null language. + """ + if value is not None and len(value) > 0: + # A tuple is used for RFC 2231 encoded parameter values where items + # are (charset, language, value). charset is a string, not a Charset + # instance. RFC 2231 encoded values are never quoted, per RFC. + if isinstance(value, tuple): + # Encode as per RFC 2231 + param += '*' + value = utils.encode_rfc2231(value[2], value[0], value[1]) + return '%s=%s' % (param, value) + else: + try: + value.encode('ascii') + except UnicodeEncodeError: + param += '*' + value = utils.encode_rfc2231(value, 'utf-8', '') + return '%s=%s' % (param, value) + # BAW: Please check this. I think that if quote is set it should + # force quoting even if not necessary. + if quote or tspecials.search(value): + return '%s="%s"' % (param, utils.quote(value)) + else: + return '%s=%s' % (param, value) + else: + return param + +def _parseparam(s): + # RDM This might be a Header, so for now stringify it. + s = ';' + str(s) + plist = [] + while s[:1] == ';': + s = s[1:] + end = s.find(';') + while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2: + end = s.find(';', end + 1) + if end < 0: + end = len(s) + f = s[:end] + if '=' in f: + i = f.index('=') + f = f[:i].strip().lower() + '=' + f[i+1:].strip() + plist.append(f.strip()) + s = s[end:] + return plist + + +def _unquotevalue(value): + # This is different than utils.collapse_rfc2231_value() because it doesn't + # try to convert the value to a unicode. Message.get_param() and + # Message.get_params() are both currently defined to return the tuple in + # the face of RFC 2231 parameters. + if isinstance(value, tuple): + return value[0], value[1], utils.unquote(value[2]) + else: + return utils.unquote(value) + + +class Message: + """Basic message object. + + A message object is defined as something that has a bunch of RFC 2822 + headers and a payload. It may optionally have an envelope header + (a.k.a. Unix-From or From_ header). If the message is a container (i.e. a + multipart or a message/rfc822), then the payload is a list of Message + objects, otherwise it is a string. + + Message objects implement part of the `mapping' interface, which assumes + there is exactly one occurrence of the header per message. Some headers + do in fact appear multiple times (e.g. Received) and for those headers, + you must use the explicit API to set or get all the headers. Not all of + the mapping methods are implemented. + """ + def __init__(self, policy=compat32): + self.policy = policy + self._headers = [] + self._unixfrom = None + self._payload = None + self._charset = None + # Defaults for multipart messages + self.preamble = self.epilogue = None + self.defects = [] + # Default content type + self._default_type = 'text/plain' + + def __str__(self): + """Return the entire formatted message as a string. + """ + return self.as_string() + + def as_string(self, unixfrom=False, maxheaderlen=0, policy=None): + """Return the entire formatted message as a string. + + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. For backward compatibility reasons, if maxheaderlen is + not specified it defaults to 0, so you must override it explicitly + if you want a different maxheaderlen. 'policy' is passed to the + Generator instance used to serialize the mesasge; if it is not + specified the policy associated with the message instance is used. + + If the message object contains binary data that is not encoded + according to RFC standards, the non-compliant data will be replaced by + unicode "unknown character" code points. + """ + from email.generator import Generator + policy = self.policy if policy is None else policy + fp = StringIO() + g = Generator(fp, + mangle_from_=False, + maxheaderlen=maxheaderlen, + policy=policy) + g.flatten(self, unixfrom=unixfrom) + return fp.getvalue() + + def __bytes__(self): + """Return the entire formatted message as a bytes object. + """ + return self.as_bytes() + + def as_bytes(self, unixfrom=False, policy=None): + """Return the entire formatted message as a bytes object. + + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. 'policy' is passed to the BytesGenerator instance used to + serialize the message; if not specified the policy associated with + the message instance is used. + """ + from email.generator import BytesGenerator + policy = self.policy if policy is None else policy + fp = BytesIO() + g = BytesGenerator(fp, mangle_from_=False, policy=policy) + g.flatten(self, unixfrom=unixfrom) + return fp.getvalue() + + def is_multipart(self): + """Return True if the message consists of multiple parts.""" + return isinstance(self._payload, list) + + # + # Unix From_ line + # + def set_unixfrom(self, unixfrom): + self._unixfrom = unixfrom + + def get_unixfrom(self): + return self._unixfrom + + # + # Payload manipulation. + # + def attach(self, payload): + """Add the given payload to the current payload. + + The current payload will always be a list of objects after this method + is called. If you want to set the payload to a scalar object, use + set_payload() instead. + """ + if self._payload is None: + self._payload = [payload] + else: + try: + self._payload.append(payload) + except AttributeError: + raise TypeError("Attach is not valid on a message with a" + " non-multipart payload") + + def get_payload(self, i=None, decode=False): + """Return a reference to the payload. + + The payload will either be a list object or a string. If you mutate + the list object, you modify the message's payload in place. Optional + i returns that index into the payload. + + Optional decode is a flag indicating whether the payload should be + decoded or not, according to the Content-Transfer-Encoding header + (default is False). + + When True and the message is not a multipart, the payload will be + decoded if this header's value is `quoted-printable' or `base64'. If + some other encoding is used, or the header is missing, or if the + payload has bogus data (i.e. bogus base64 or uuencoded data), the + payload is returned as-is. + + If the message is a multipart and the decode flag is True, then None + is returned. + """ + # Here is the logic table for this code, based on the email5.0.0 code: + # i decode is_multipart result + # ------ ------ ------------ ------------------------------ + # None True True None + # i True True None + # None False True _payload (a list) + # i False True _payload element i (a Message) + # i False False error (not a list) + # i True False error (not a list) + # None False False _payload + # None True False _payload decoded (bytes) + # Note that Barry planned to factor out the 'decode' case, but that + # isn't so easy now that we handle the 8 bit data, which needs to be + # converted in both the decode and non-decode path. + if self.is_multipart(): + if decode: + return None + if i is None: + return self._payload + else: + return self._payload[i] + # For backward compatibility, Use isinstance and this error message + # instead of the more logical is_multipart test. + if i is not None and not isinstance(self._payload, list): + raise TypeError('Expected list, got %s' % type(self._payload)) + payload = self._payload + # cte might be a Header, so for now stringify it. + cte = str(self.get('content-transfer-encoding', '')).lower() + # payload may be bytes here. + if isinstance(payload, str): + if utils._has_surrogates(payload): + bpayload = payload.encode('ascii', 'surrogateescape') + if not decode: + try: + payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace') + except LookupError: + payload = bpayload.decode('ascii', 'replace') + elif decode: + try: + bpayload = payload.encode('ascii') + except UnicodeError: + # This won't happen for RFC compliant messages (messages + # containing only ASCII code points in the unicode input). + # If it does happen, turn the string into bytes in a way + # guaranteed not to fail. + bpayload = payload.encode('raw-unicode-escape') + if not decode: + return payload + if cte == 'quoted-printable': + return quopri.decodestring(bpayload) + elif cte == 'base64': + # XXX: this is a bit of a hack; decode_b should probably be factored + # out somewhere, but I haven't figured out where yet. + value, defects = decode_b(b''.join(bpayload.splitlines())) + for defect in defects: + self.policy.handle_defect(self, defect) + return value + elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): + in_file = BytesIO(bpayload) + out_file = BytesIO() + try: + uu.decode(in_file, out_file, quiet=True) + return out_file.getvalue() + except uu.Error: + # Some decoding problem + return bpayload + if isinstance(payload, str): + return bpayload + return payload + + def set_payload(self, payload, charset=None): + """Set the payload to the given value. + + Optional charset sets the message's default character set. See + set_charset() for details. + """ + if hasattr(payload, 'encode'): + if charset is None: + self._payload = payload + return + if not isinstance(charset, Charset): + charset = Charset(charset) + payload = payload.encode(charset.output_charset) + if hasattr(payload, 'decode'): + self._payload = payload.decode('ascii', 'surrogateescape') + else: + self._payload = payload + if charset is not None: + self.set_charset(charset) + + def set_charset(self, charset): + """Set the charset of the payload to a given character set. + + charset can be a Charset instance, a string naming a character set, or + None. If it is a string it will be converted to a Charset instance. + If charset is None, the charset parameter will be removed from the + Content-Type field. Anything else will generate a TypeError. + + The message will be assumed to be of type text/* encoded with + charset.input_charset. It will be converted to charset.output_charset + and encoded properly, if needed, when generating the plain text + representation of the message. MIME headers (MIME-Version, + Content-Type, Content-Transfer-Encoding) will be added as needed. + """ + if charset is None: + self.del_param('charset') + self._charset = None + return + if not isinstance(charset, Charset): + charset = Charset(charset) + self._charset = charset + if 'MIME-Version' not in self: + self.add_header('MIME-Version', '1.0') + if 'Content-Type' not in self: + self.add_header('Content-Type', 'text/plain', + charset=charset.get_output_charset()) + else: + self.set_param('charset', charset.get_output_charset()) + if charset != charset.get_output_charset(): + self._payload = charset.body_encode(self._payload) + if 'Content-Transfer-Encoding' not in self: + cte = charset.get_body_encoding() + try: + cte(self) + except TypeError: + # This 'if' is for backward compatibility, it allows unicode + # through even though that won't work correctly if the + # message is serialized. + payload = self._payload + if payload: + try: + payload = payload.encode('ascii', 'surrogateescape') + except UnicodeError: + payload = payload.encode(charset.output_charset) + self._payload = charset.body_encode(payload) + self.add_header('Content-Transfer-Encoding', cte) + + def get_charset(self): + """Return the Charset instance associated with the message's payload. + """ + return self._charset + + # + # MAPPING INTERFACE (partial) + # + def __len__(self): + """Return the total number of headers, including duplicates.""" + return len(self._headers) + + def __getitem__(self, name): + """Get a header value. + + Return None if the header is missing instead of raising an exception. + + Note that if the header appeared multiple times, exactly which + occurrence gets returned is undefined. Use get_all() to get all + the values matching a header field name. + """ + return self.get(name) + + def __setitem__(self, name, val): + """Set the value of a header. + + Note: this does not overwrite an existing header with the same field + name. Use __delitem__() first to delete any existing headers. + """ + max_count = self.policy.header_max_count(name) + if max_count: + lname = name.lower() + found = 0 + for k, v in self._headers: + if k.lower() == lname: + found += 1 + if found >= max_count: + raise ValueError("There may be at most {} {} headers " + "in a message".format(max_count, name)) + self._headers.append(self.policy.header_store_parse(name, val)) + + def __delitem__(self, name): + """Delete all occurrences of a header, if present. + + Does not raise an exception if the header is missing. + """ + name = name.lower() + newheaders = [] + for k, v in self._headers: + if k.lower() != name: + newheaders.append((k, v)) + self._headers = newheaders + + def __contains__(self, name): + return name.lower() in [k.lower() for k, v in self._headers] + + def __iter__(self): + for field, value in self._headers: + yield field + + def keys(self): + """Return a list of all the message's header field names. + + These will be sorted in the order they appeared in the original + message, or were added to the message, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. + """ + return [k for k, v in self._headers] + + def values(self): + """Return a list of all the message's header values. + + These will be sorted in the order they appeared in the original + message, or were added to the message, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. + """ + return [self.policy.header_fetch_parse(k, v) + for k, v in self._headers] + + def items(self): + """Get all the message's header fields and values. + + These will be sorted in the order they appeared in the original + message, or were added to the message, and may contain duplicates. + Any fields deleted and re-inserted are always appended to the header + list. + """ + return [(k, self.policy.header_fetch_parse(k, v)) + for k, v in self._headers] + + def get(self, name, failobj=None): + """Get a header value. + + Like __getitem__() but return failobj instead of None when the field + is missing. + """ + name = name.lower() + for k, v in self._headers: + if k.lower() == name: + return self.policy.header_fetch_parse(k, v) + return failobj + + # + # "Internal" methods (public API, but only intended for use by a parser + # or generator, not normal application code. + # + + def set_raw(self, name, value): + """Store name and value in the model without modification. + + This is an "internal" API, intended only for use by a parser. + """ + self._headers.append((name, value)) + + def raw_items(self): + """Return the (name, value) header pairs without modification. + + This is an "internal" API, intended only for use by a generator. + """ + return iter(self._headers.copy()) + + # + # Additional useful stuff + # + + def get_all(self, name, failobj=None): + """Return a list of all the values for the named field. + + These will be sorted in the order they appeared in the original + message, and may contain duplicates. Any fields deleted and + re-inserted are always appended to the header list. + + If no such fields exist, failobj is returned (defaults to None). + """ + values = [] + name = name.lower() + for k, v in self._headers: + if k.lower() == name: + values.append(self.policy.header_fetch_parse(k, v)) + if not values: + return failobj + return values + + def add_header(self, _name, _value, **_params): + """Extended header setting. + + name is the header field to add. keyword arguments can be used to set + additional parameters for the header field, with underscores converted + to dashes. Normally the parameter will be added as key="value" unless + value is None, in which case only the key will be added. If a + parameter value contains non-ASCII characters it can be specified as a + three-tuple of (charset, language, value), in which case it will be + encoded according to RFC2231 rules. Otherwise it will be encoded using + the utf-8 charset and a language of ''. + + Examples: + + msg.add_header('content-disposition', 'attachment', filename='bud.gif') + msg.add_header('content-disposition', 'attachment', + filename=('utf-8', '', Fußballer.ppt')) + msg.add_header('content-disposition', 'attachment', + filename='Fußballer.ppt')) + """ + parts = [] + for k, v in _params.items(): + if v is None: + parts.append(k.replace('_', '-')) + else: + parts.append(_formatparam(k.replace('_', '-'), v)) + if _value is not None: + parts.insert(0, _value) + self[_name] = SEMISPACE.join(parts) + + def replace_header(self, _name, _value): + """Replace a header. + + Replace the first matching header found in the message, retaining + header order and case. If no matching header was found, a KeyError is + raised. + """ + _name = _name.lower() + for i, (k, v) in zip(range(len(self._headers)), self._headers): + if k.lower() == _name: + self._headers[i] = self.policy.header_store_parse(k, _value) + break + else: + raise KeyError(_name) + + # + # Use these three methods instead of the three above. + # + + def get_content_type(self): + """Return the message's content type. + + The returned string is coerced to lower case of the form + `maintype/subtype'. If there was no Content-Type header in the + message, the default type as given by get_default_type() will be + returned. Since according to RFC 2045, messages always have a default + type this will always return a value. + + RFC 2045 defines a message's default type to be text/plain unless it + appears inside a multipart/digest container, in which case it would be + message/rfc822. + """ + missing = object() + value = self.get('content-type', missing) + if value is missing: + # This should have no parameters + return self.get_default_type() + ctype = _splitparam(value)[0].lower() + # RFC 2045, section 5.2 says if its invalid, use text/plain + if ctype.count('/') != 1: + return 'text/plain' + return ctype + + def get_content_maintype(self): + """Return the message's main content type. + + This is the `maintype' part of the string returned by + get_content_type(). + """ + ctype = self.get_content_type() + return ctype.split('/')[0] + + def get_content_subtype(self): + """Returns the message's sub-content type. + + This is the `subtype' part of the string returned by + get_content_type(). + """ + ctype = self.get_content_type() + return ctype.split('/')[1] + + def get_default_type(self): + """Return the `default' content type. + + Most messages have a default content type of text/plain, except for + messages that are subparts of multipart/digest containers. Such + subparts have a default content type of message/rfc822. + """ + return self._default_type + + def set_default_type(self, ctype): + """Set the `default' content type. + + ctype should be either "text/plain" or "message/rfc822", although this + is not enforced. The default content type is not stored in the + Content-Type header. + """ + self._default_type = ctype + + def _get_params_preserve(self, failobj, header): + # Like get_params() but preserves the quoting of values. BAW: + # should this be part of the public interface? + missing = object() + value = self.get(header, missing) + if value is missing: + return failobj + params = [] + for p in _parseparam(value): + try: + name, val = p.split('=', 1) + name = name.strip() + val = val.strip() + except ValueError: + # Must have been a bare attribute + name = p.strip() + val = '' + params.append((name, val)) + params = utils.decode_params(params) + return params + + def get_params(self, failobj=None, header='content-type', unquote=True): + """Return the message's Content-Type parameters, as a list. + + The elements of the returned list are 2-tuples of key/value pairs, as + split on the `=' sign. The left hand side of the `=' is the key, + while the right hand side is the value. If there is no `=' sign in + the parameter the value is the empty string. The value is as + described in the get_param() method. + + Optional failobj is the object to return if there is no Content-Type + header. Optional header is the header to search instead of + Content-Type. If unquote is True, the value is unquoted. + """ + missing = object() + params = self._get_params_preserve(missing, header) + if params is missing: + return failobj + if unquote: + return [(k, _unquotevalue(v)) for k, v in params] + else: + return params + + def get_param(self, param, failobj=None, header='content-type', + unquote=True): + """Return the parameter value if found in the Content-Type header. + + Optional failobj is the object to return if there is no Content-Type + header, or the Content-Type header has no such parameter. Optional + header is the header to search instead of Content-Type. + + Parameter keys are always compared case insensitively. The return + value can either be a string, or a 3-tuple if the parameter was RFC + 2231 encoded. When it's a 3-tuple, the elements of the value are of + the form (CHARSET, LANGUAGE, VALUE). Note that both CHARSET and + LANGUAGE can be None, in which case you should consider VALUE to be + encoded in the us-ascii charset. You can usually ignore LANGUAGE. + The parameter value (either the returned string, or the VALUE item in + the 3-tuple) is always unquoted, unless unquote is set to False. + + If your application doesn't care whether the parameter was RFC 2231 + encoded, it can turn the return value into a string as follows: + + rawparam = msg.get_param('foo') + param = email.utils.collapse_rfc2231_value(rawparam) + + """ + if header not in self: + return failobj + for k, v in self._get_params_preserve(failobj, header): + if k.lower() == param.lower(): + if unquote: + return _unquotevalue(v) + else: + return v + return failobj + + def set_param(self, param, value, header='Content-Type', requote=True, + charset=None, language='', replace=False): + """Set a parameter in the Content-Type header. + + If the parameter already exists in the header, its value will be + replaced with the new value. + + If header is Content-Type and has not yet been defined for this + message, it will be set to "text/plain" and the new parameter and + value will be appended as per RFC 2045. + + An alternate header can be specified in the header argument, and all + parameters will be quoted as necessary unless requote is False. + + If charset is specified, the parameter will be encoded according to RFC + 2231. Optional language specifies the RFC 2231 language, defaulting + to the empty string. Both charset and language should be strings. + """ + if not isinstance(value, tuple) and charset: + value = (charset, language, value) + + if header not in self and header.lower() == 'content-type': + ctype = 'text/plain' + else: + ctype = self.get(header) + if not self.get_param(param, header=header): + if not ctype: + ctype = _formatparam(param, value, requote) + else: + ctype = SEMISPACE.join( + [ctype, _formatparam(param, value, requote)]) + else: + ctype = '' + for old_param, old_value in self.get_params(header=header, + unquote=requote): + append_param = '' + if old_param.lower() == param.lower(): + append_param = _formatparam(param, value, requote) + else: + append_param = _formatparam(old_param, old_value, requote) + if not ctype: + ctype = append_param + else: + ctype = SEMISPACE.join([ctype, append_param]) + if ctype != self.get(header): + if replace: + self.replace_header(header, ctype) + else: + del self[header] + self[header] = ctype + + def del_param(self, param, header='content-type', requote=True): + """Remove the given parameter completely from the Content-Type header. + + The header will be re-written in place without the parameter or its + value. All values will be quoted as necessary unless requote is + False. Optional header specifies an alternative to the Content-Type + header. + """ + if header not in self: + return + new_ctype = '' + for p, v in self.get_params(header=header, unquote=requote): + if p.lower() != param.lower(): + if not new_ctype: + new_ctype = _formatparam(p, v, requote) + else: + new_ctype = SEMISPACE.join([new_ctype, + _formatparam(p, v, requote)]) + if new_ctype != self.get(header): + del self[header] + self[header] = new_ctype + + def set_type(self, type, header='Content-Type', requote=True): + """Set the main type and subtype for the Content-Type header. + + type must be a string in the form "maintype/subtype", otherwise a + ValueError is raised. + + This method replaces the Content-Type header, keeping all the + parameters in place. If requote is False, this leaves the existing + header's quoting as is. Otherwise, the parameters will be quoted (the + default). + + An alternative header can be specified in the header argument. When + the Content-Type header is set, we'll always also add a MIME-Version + header. + """ + # BAW: should we be strict? + if not type.count('/') == 1: + raise ValueError + # Set the Content-Type, you get a MIME-Version + if header.lower() == 'content-type': + del self['mime-version'] + self['MIME-Version'] = '1.0' + if header not in self: + self[header] = type + return + params = self.get_params(header=header, unquote=requote) + del self[header] + self[header] = type + # Skip the first param; it's the old type. + for p, v in params[1:]: + self.set_param(p, v, header, requote) + + def get_filename(self, failobj=None): + """Return the filename associated with the payload if present. + + The filename is extracted from the Content-Disposition header's + `filename' parameter, and it is unquoted. If that header is missing + the `filename' parameter, this method falls back to looking for the + `name' parameter. + """ + missing = object() + filename = self.get_param('filename', missing, 'content-disposition') + if filename is missing: + filename = self.get_param('name', missing, 'content-type') + if filename is missing: + return failobj + return utils.collapse_rfc2231_value(filename).strip() + + def get_boundary(self, failobj=None): + """Return the boundary associated with the payload if present. + + The boundary is extracted from the Content-Type header's `boundary' + parameter, and it is unquoted. + """ + missing = object() + boundary = self.get_param('boundary', missing) + if boundary is missing: + return failobj + # RFC 2046 says that boundaries may begin but not end in w/s + return utils.collapse_rfc2231_value(boundary).rstrip() + + def set_boundary(self, boundary): + """Set the boundary parameter in Content-Type to 'boundary'. + + This is subtly different than deleting the Content-Type header and + adding a new one with a new boundary parameter via add_header(). The + main difference is that using the set_boundary() method preserves the + order of the Content-Type header in the original message. + + HeaderParseError is raised if the message has no Content-Type header. + """ + missing = object() + params = self._get_params_preserve(missing, 'content-type') + if params is missing: + # There was no Content-Type header, and we don't know what type + # to set it to, so raise an exception. + raise errors.HeaderParseError('No Content-Type header found') + newparams = [] + foundp = False + for pk, pv in params: + if pk.lower() == 'boundary': + newparams.append(('boundary', '"%s"' % boundary)) + foundp = True + else: + newparams.append((pk, pv)) + if not foundp: + # The original Content-Type header had no boundary attribute. + # Tack one on the end. BAW: should we raise an exception + # instead??? + newparams.append(('boundary', '"%s"' % boundary)) + # Replace the existing Content-Type header with the new value + newheaders = [] + for h, v in self._headers: + if h.lower() == 'content-type': + parts = [] + for k, v in newparams: + if v == '': + parts.append(k) + else: + parts.append('%s=%s' % (k, v)) + val = SEMISPACE.join(parts) + newheaders.append(self.policy.header_store_parse(h, val)) + + else: + newheaders.append((h, v)) + self._headers = newheaders + + def get_content_charset(self, failobj=None): + """Return the charset parameter of the Content-Type header. + + The returned string is always coerced to lower case. If there is no + Content-Type header, or if that header has no charset parameter, + failobj is returned. + """ + missing = object() + charset = self.get_param('charset', missing) + if charset is missing: + return failobj + if isinstance(charset, tuple): + # RFC 2231 encoded, so decode it, and it better end up as ascii. + pcharset = charset[0] or 'us-ascii' + try: + # LookupError will be raised if the charset isn't known to + # Python. UnicodeError will be raised if the encoded text + # contains a character not in the charset. + as_bytes = charset[2].encode('raw-unicode-escape') + charset = str(as_bytes, pcharset) + except (LookupError, UnicodeError): + charset = charset[2] + # charset characters must be in us-ascii range + try: + charset.encode('us-ascii') + except UnicodeError: + return failobj + # RFC 2046, $4.1.2 says charsets are not case sensitive + return charset.lower() + + def get_charsets(self, failobj=None): + """Return a list containing the charset(s) used in this message. + + The returned list of items describes the Content-Type headers' + charset parameter for this message and all the subparts in its + payload. + + Each item will either be a string (the value of the charset parameter + in the Content-Type header of that part) or the value of the + 'failobj' parameter (defaults to None), if the part does not have a + main MIME type of "text", or the charset is not defined. + + The list will contain one string for each part of the message, plus + one for the container message (i.e. self), so that a non-multipart + message will still return a list of length 1. + """ + return [part.get_content_charset(failobj) for part in self.walk()] + + def get_content_disposition(self): + """Return the message's content-disposition if it exists, or None. + + The return values can be either 'inline', 'attachment' or None + according to the rfc2183. + """ + value = self.get('content-disposition') + if value is None: + return None + c_d = _splitparam(value)[0].lower() + return c_d + + # I.e. def walk(self): ... + from email.iterators import walk + + +class MIMEPart(Message): + + def __init__(self, policy=None): + if policy is None: + from email.policy import default + policy = default + Message.__init__(self, policy) + + + def as_string(self, unixfrom=False, maxheaderlen=None, policy=None): + """Return the entire formatted message as a string. + + Optional 'unixfrom', when true, means include the Unix From_ envelope + header. maxheaderlen is retained for backward compatibility with the + base Message class, but defaults to None, meaning that the policy value + for max_line_length controls the header maximum length. 'policy' is + passed to the Generator instance used to serialize the mesasge; if it + is not specified the policy associated with the message instance is + used. + """ + policy = self.policy if policy is None else policy + if maxheaderlen is None: + maxheaderlen = policy.max_line_length + return super().as_string(maxheaderlen=maxheaderlen, policy=policy) + + def __str__(self): + return self.as_string(policy=self.policy.clone(utf8=True)) + + def is_attachment(self): + c_d = self.get('content-disposition') + return False if c_d is None else c_d.content_disposition == 'attachment' + + def _find_body(self, part, preferencelist): + if part.is_attachment(): + return + maintype, subtype = part.get_content_type().split('/') + if maintype == 'text': + if subtype in preferencelist: + yield (preferencelist.index(subtype), part) + return + if maintype != 'multipart': + return + if subtype != 'related': + for subpart in part.iter_parts(): + yield from self._find_body(subpart, preferencelist) + return + if 'related' in preferencelist: + yield (preferencelist.index('related'), part) + candidate = None + start = part.get_param('start') + if start: + for subpart in part.iter_parts(): + if subpart['content-id'] == start: + candidate = subpart + break + if candidate is None: + subparts = part.get_payload() + candidate = subparts[0] if subparts else None + if candidate is not None: + yield from self._find_body(candidate, preferencelist) + + def get_body(self, preferencelist=('related', 'html', 'plain')): + """Return best candidate mime part for display as 'body' of message. + + Do a depth first search, starting with self, looking for the first part + matching each of the items in preferencelist, and return the part + corresponding to the first item that has a match, or None if no items + have a match. If 'related' is not included in preferencelist, consider + the root part of any multipart/related encountered as a candidate + match. Ignore parts with 'Content-Disposition: attachment'. + """ + best_prio = len(preferencelist) + body = None + for prio, part in self._find_body(self, preferencelist): + if prio < best_prio: + best_prio = prio + body = part + if prio == 0: + break + return body + + _body_types = {('text', 'plain'), + ('text', 'html'), + ('multipart', 'related'), + ('multipart', 'alternative')} + def iter_attachments(self): + """Return an iterator over the non-main parts of a multipart. + + Skip the first of each occurrence of text/plain, text/html, + multipart/related, or multipart/alternative in the multipart (unless + they have a 'Content-Disposition: attachment' header) and include all + remaining subparts in the returned iterator. When applied to a + multipart/related, return all parts except the root part. Return an + empty iterator when applied to a multipart/alternative or a + non-multipart. + """ + maintype, subtype = self.get_content_type().split('/') + if maintype != 'multipart' or subtype == 'alternative': + return + parts = self.get_payload().copy() + if maintype == 'multipart' and subtype == 'related': + # For related, we treat everything but the root as an attachment. + # The root may be indicated by 'start'; if there's no start or we + # can't find the named start, treat the first subpart as the root. + start = self.get_param('start') + if start: + found = False + attachments = [] + for part in parts: + if part.get('content-id') == start: + found = True + else: + attachments.append(part) + if found: + yield from attachments + return + parts.pop(0) + yield from parts + return + # Otherwise we more or less invert the remaining logic in get_body. + # This only really works in edge cases (ex: non-text related or + # alternatives) if the sending agent sets content-disposition. + seen = [] # Only skip the first example of each candidate type. + for part in parts: + maintype, subtype = part.get_content_type().split('/') + if ((maintype, subtype) in self._body_types and + not part.is_attachment() and subtype not in seen): + seen.append(subtype) + continue + yield part + + def iter_parts(self): + """Return an iterator over all immediate subparts of a multipart. + + Return an empty iterator for a non-multipart. + """ + if self.get_content_maintype() == 'multipart': + yield from self.get_payload() + + def get_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + return content_manager.get_content(self, *args, **kw) + + def set_content(self, *args, content_manager=None, **kw): + if content_manager is None: + content_manager = self.policy.content_manager + content_manager.set_content(self, *args, **kw) + + def _make_multipart(self, subtype, disallowed_subtypes, boundary): + if self.get_content_maintype() == 'multipart': + existing_subtype = self.get_content_subtype() + disallowed_subtypes = disallowed_subtypes + (subtype,) + if existing_subtype in disallowed_subtypes: + raise ValueError("Cannot convert {} to {}".format( + existing_subtype, subtype)) + keep_headers = [] + part_headers = [] + for name, value in self._headers: + if name.lower().startswith('content-'): + part_headers.append((name, value)) + else: + keep_headers.append((name, value)) + if part_headers: + # There is existing content, move it to the first subpart. + part = type(self)(policy=self.policy) + part._headers = part_headers + part._payload = self._payload + self._payload = [part] + else: + self._payload = [] + self._headers = keep_headers + self['Content-Type'] = 'multipart/' + subtype + if boundary is not None: + self.set_param('boundary', boundary) + + def make_related(self, boundary=None): + self._make_multipart('related', ('alternative', 'mixed'), boundary) + + def make_alternative(self, boundary=None): + self._make_multipart('alternative', ('mixed',), boundary) + + def make_mixed(self, boundary=None): + self._make_multipart('mixed', (), boundary) + + def _add_multipart(self, _subtype, *args, _disp=None, **kw): + if (self.get_content_maintype() != 'multipart' or + self.get_content_subtype() != _subtype): + getattr(self, 'make_' + _subtype)() + part = type(self)(policy=self.policy) + part.set_content(*args, **kw) + if _disp and 'content-disposition' not in part: + part['Content-Disposition'] = _disp + self.attach(part) + + def add_related(self, *args, **kw): + self._add_multipart('related', *args, _disp='inline', **kw) + + def add_alternative(self, *args, **kw): + self._add_multipart('alternative', *args, **kw) + + def add_attachment(self, *args, **kw): + self._add_multipart('mixed', *args, _disp='attachment', **kw) + + def clear(self): + self._headers = [] + self._payload = None + + def clear_content(self): + self._headers = [(n, v) for n, v in self._headers + if not n.lower().startswith('content-')] + self._payload = None + + +class EmailMessage(MIMEPart): + + def set_content(self, *args, **kw): + super().set_content(*args, **kw) + if 'MIME-Version' not in self: + self['MIME-Version'] = '1.0' diff --git a/modules/language/python/module/email/parser.py b/modules/language/python/module/email/parser.py index 5ec164e..962f9dc 100644 --- a/modules/language/python/module/email/parser.py +++ b/modules/language/python/module/email/parser.py @@ -9,7 +9,6 @@ __all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser', 'FeedParser', 'BytesFeedParser'] from io import StringIO, TextIOWrapper - from email.feedparser import FeedParser, BytesFeedParser from email._policybase import compat32 diff --git a/modules/language/python/module/email/policy.py b/modules/language/python/module/email/policy.py new file mode 100644 index 0000000..3f8551c --- /dev/null +++ b/modules/language/python/module/email/policy.py @@ -0,0 +1,231 @@ +module(email,policy) +"""This will be the home for the policy that hooks in the new +code that adds all the email6 features. +""" + +import re +from email._policybase import Policy, Compat32, compat32, _extend_docstrings +from email.utils import _has_surrogates +from email.headerregistry import HeaderRegistry as HeaderRegistry +from email.contentmanager import raw_data_manager +from email.message import EmailMessage + +__all__ = [ + 'Compat32', + 'compat32', + 'Policy', + 'EmailPolicy', + 'default', + 'strict', + 'SMTP', + 'HTTP', + ] + +linesep_splitter = re.compile(r'\n|\r') + +@_extend_docstrings +class EmailPolicy(Policy): + + """+ + PROVISIONAL + + The API extensions enabled by this policy are currently provisional. + Refer to the documentation for details. + + This policy adds new header parsing and folding algorithms. Instead of + simple strings, headers are custom objects with custom attributes + depending on the type of the field. The folding algorithm fully + implements RFCs 2047 and 5322. + + In addition to the settable attributes listed above that apply to + all Policies, this policy adds the following additional attributes: + + utf8 -- if False (the default) message headers will be + serialized as ASCII, using encoded words to encode + any non-ASCII characters in the source strings. If + True, the message headers will be serialized using + utf8 and will not contain encoded words (see RFC + 6532 for more on this serialization format). + + refold_source -- if the value for a header in the Message object + came from the parsing of some source, this attribute + indicates whether or not a generator should refold + that value when transforming the message back into + stream form. The possible values are: + + none -- all source values use original folding + long -- source values that have any line that is + longer than max_line_length will be + refolded + all -- all values are refolded. + + The default is 'long'. + + header_factory -- a callable that takes two arguments, 'name' and + 'value', where 'name' is a header field name and + 'value' is an unfolded header field value, and + returns a string-like object that represents that + header. A default header_factory is provided that + understands some of the RFC5322 header field types. + (Currently address fields and date fields have + special treatment, while all other fields are + treated as unstructured. This list will be + completed before the extension is marked stable.) + + content_manager -- an object with at least two methods: get_content + and set_content. When the get_content or + set_content method of a Message object is called, + it calls the corresponding method of this object, + passing it the message object as its first argument, + and any arguments or keywords that were passed to + it as additional arguments. The default + content_manager is + :data:`~email.contentmanager.raw_data_manager`. + + """ + + message_factory = EmailMessage + utf8 = False + refold_source = 'long' + header_factory = HeaderRegistry() + content_manager = raw_data_manager + + def __init__(self, **kw): + # Ensure that each new instance gets a unique header factory + # (as opposed to clones, which share the factory). + + if 'header_factory' not in kw: + object.__setattr__(self, 'header_factory', HeaderRegistry()) + + super().__init__(**kw) + + + def header_max_count(self, name): + """+ + The implementation for this class returns the max_count attribute from + the specialized header class that would be used to construct a header + of type 'name'. + """ + return self.header_factory[name].max_count + + # The logic of the next three methods is chosen such that it is possible to + # switch a Message object between a Compat32 policy and a policy derived + # from this class and have the results stay consistent. This allows a + # Message object constructed with this policy to be passed to a library + # that only handles Compat32 objects, or to receive such an object and + # convert it to use the newer style by just changing its policy. It is + # also chosen because it postpones the relatively expensive full rfc5322 + # parse until as late as possible when parsing from source, since in many + # applications only a few headers will actually be inspected. + + def header_source_parse(self, sourcelines): + """+ + The name is parsed as everything up to the ':' and returned unmodified. + The value is determined by stripping leading whitespace off the + remainder of the first line, joining all subsequent lines together, and + stripping any trailing carriage return or linefeed characters. (This + is the same as Compat32). + + """ + name, value = sourcelines[0].split(':', 1) + value = value.lstrip(' \t') + ''.join(sourcelines[1:]) + return (name, value.rstrip('\r\n')) + + def header_store_parse(self, name, value): + """+ + The name is returned unchanged. If the input value has a 'name' + attribute and it matches the name ignoring case, the value is returned + unchanged. Otherwise the name and value are passed to header_factory + method, and the resulting custom header object is returned as the + value. In this case a ValueError is raised if the input value contains + CR or LF characters. + + """ + if hasattr(value, 'name') and value.name.lower() == name.lower(): + return (name, value) + if isinstance(value, str) and len(value.splitlines())>1: + # XXX this error message isn't quite right when we use splitlines + # (see issue 22233), but I'm not sure what should happen here. + raise ValueError("Header values may not contain linefeed " + "or carriage return characters") + return (name, self.header_factory(name, value)) + + def header_fetch_parse(self, name, value): + """+ + If the value has a 'name' attribute, it is returned to unmodified. + Otherwise the name and the value with any linesep characters removed + are passed to the header_factory method, and the resulting custom + header object is returned. Any surrogateescaped bytes get turned + into the unicode unknown-character glyph. + + """ + if hasattr(value, 'name'): + return value + # We can't use splitlines here because it splits on more than \r and \n. + value = ''.join(linesep_splitter.split(value)) + return self.header_factory(name, value) + + def fold(self, name, value): + """+ + Header folding is controlled by the refold_source policy setting. A + value is considered to be a 'source value' if and only if it does not + have a 'name' attribute (having a 'name' attribute means it is a header + object of some sort). If a source value needs to be refolded according + to the policy, it is converted into a custom header object by passing + the name and the value with any linesep characters removed to the + header_factory method. Folding of a custom header object is done by + calling its fold method with the current policy. + + Source values are split into lines using splitlines. If the value is + not to be refolded, the lines are rejoined using the linesep from the + policy and returned. The exception is lines containing non-ascii + binary data. In that case the value is refolded regardless of the + refold_source setting, which causes the binary data to be CTE encoded + using the unknown-8bit charset. + + """ + return self._fold(name, value, refold_binary=True) + + def fold_binary(self, name, value): + """+ + The same as fold if cte_type is 7bit, except that the returned value is + bytes. + + If cte_type is 8bit, non-ASCII binary data is converted back into + bytes. Headers with binary data are not refolded, regardless of the + refold_header setting, since there is no way to know whether the binary + data consists of single byte characters or multibyte characters. + + If utf8 is true, headers are encoded to utf8, otherwise to ascii with + non-ASCII unicode rendered as encoded words. + + """ + folded = self._fold(name, value, refold_binary=self.cte_type=='7bit') + charset = 'utf8' if self.utf8 else 'ascii' + return folded.encode(charset, 'surrogateescape') + + def _fold(self, name, value, refold_binary=False): + if hasattr(value, 'name'): + return value.fold(policy=self) + maxlen = self.max_line_length if self.max_line_length else float('inf') + lines = value.splitlines() + refold = (self.refold_source == 'all' or + self.refold_source == 'long' and + (lines and len(lines[0])+len(name)+2 > maxlen or + any(len(x) > maxlen for x in lines[1:]))) + if refold or refold_binary and _has_surrogates(value): + return self.header_factory(name, ''.join(lines)).fold(policy=self) + return name + ': ' + self.linesep.join(lines) + self.linesep + + +default = EmailPolicy() + +# Make the default policy use the class default header_factory +del default.header_factory +strict = default.clone(raise_on_defect=True) + +SMTP = default.clone(linesep='\r\n') + +HTTP = default.clone(linesep='\r\n', max_line_length=None) + +SMTPUTF8 = SMTP.clone(utf8=True) diff --git a/modules/language/python/module/email/utils.py b/modules/language/python/module/email/utils.py index abe3895..861f1ad 100644 --- a/modules/language/python/module/email/utils.py +++ b/modules/language/python/module/email/utils.py @@ -24,6 +24,7 @@ __all__ = [ '_has_surrogates' ] + import os import re import time diff --git a/modules/language/python/module/html.py b/modules/language/python/module/html.py new file mode 100644 index 0000000..654ffb7 --- /dev/null +++ b/modules/language/python/module/html.py @@ -0,0 +1,134 @@ +module(html) + +""" +General functions for HTML manipulation. +""" + +import re as _re +from html.entities import html5 as _html5 + + +__all__ = ['escape', 'unescape'] + + +def escape(s, quote=True): + """ + Replace special characters "&", "<" and ">" to HTML-safe sequences. + If the optional flag quote is true (the default), the quotation mark + characters, both double quote (") and single quote (') characters are also + translated. + """ + s = s.replace("&", "&") # Must be done first! + s = s.replace("<", "<") + s = s.replace(">", ">") + if quote: + s = s.replace('"', """) + s = s.replace('\'', "'") + return s + + +# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references + +_invalid_charrefs = { + 0x00: '\ufffd', # REPLACEMENT CHARACTER + 0x0d: '\r', # CARRIAGE RETURN + 0x80: '\u20ac', # EURO SIGN + 0x81: '\x81', # <control> + 0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK + 0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK + 0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK + 0x85: '\u2026', # HORIZONTAL ELLIPSIS + 0x86: '\u2020', # DAGGER + 0x87: '\u2021', # DOUBLE DAGGER + 0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT + 0x89: '\u2030', # PER MILLE SIGN + 0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON + 0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE + 0x8d: '\x8d', # <control> + 0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON + 0x8f: '\x8f', # <control> + 0x90: '\x90', # <control> + 0x91: '\u2018', # LEFT SINGLE QUOTATION MARK + 0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK + 0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK + 0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK + 0x95: '\u2022', # BULLET + 0x96: '\u2013', # EN DASH + 0x97: '\u2014', # EM DASH + 0x98: '\u02dc', # SMALL TILDE + 0x99: '\u2122', # TRADE MARK SIGN + 0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON + 0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 0x9c: '\u0153', # LATIN SMALL LIGATURE OE + 0x9d: '\x9d', # <control> + 0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON + 0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS +} + +_invalid_codepoints = { + # 0x0001 to 0x0008 + 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + # 0x000E to 0x001F + 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + # 0x007F to 0x009F + 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, + 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + # 0xFDD0 to 0xFDEF + 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8, + 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1, + 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea, + 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef, + # others + 0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, + 0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, + 0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff, + 0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff, + 0x10fffe, 0x10ffff +} + + +def _replace_charref(s): + s = s.group(1) + if s[0] == '#': + # numeric charref + if s[1] in 'xX': + num = int(s[2:].rstrip(';'), 16) + else: + num = int(s[1:].rstrip(';')) + if num in _invalid_charrefs: + return _invalid_charrefs[num] + if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF: + return '\uFFFD' + if num in _invalid_codepoints: + return '' + return chr(num) + else: + # named charref + if s in _html5: + return _html5[s] + # find the longest matching name (as defined by the standard) + for x in range(len(s)-1, 1, -1): + if s[:x] in _html5: + return _html5[s[:x]] + s[x:] + else: + return '&' + s + + +_charref = _re.compile(r'&(#[0-9]+;?' + r'|#[xX][0-9a-fA-F]+;?' + r'|[^\t\n\f <&#;]{1,32};?)') + +def unescape(s): + """ + Convert all named and numeric character references (e.g. >, >, + &x3e;) in the string s to the corresponding unicode characters. + This function uses the rules defined by the HTML 5 standard + for both valid and invalid character references, and the list of + HTML 5 named character references defined in html.entities.html5. + """ + if '&' not in s: + return s + return _charref.sub(_replace_charref, s) diff --git a/modules/language/python/module/html/entities.py b/modules/language/python/module/html/entities.py new file mode 100644 index 0000000..a7a888e --- /dev/null +++ b/modules/language/python/module/html/entities.py @@ -0,0 +1,2511 @@ +module(html,entities) + +"""HTML character entity references.""" + +__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] + + +# maps the HTML entity name to the Unicode code point +name2codepoint = { + 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 + 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 + 'Acirc': 0x00c2, # latin capital letter A with circumflex, U+00C2 ISOlat1 + 'Agrave': 0x00c0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 + 'Alpha': 0x0391, # greek capital letter alpha, U+0391 + 'Aring': 0x00c5, # latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 + 'Atilde': 0x00c3, # latin capital letter A with tilde, U+00C3 ISOlat1 + 'Auml': 0x00c4, # latin capital letter A with diaeresis, U+00C4 ISOlat1 + 'Beta': 0x0392, # greek capital letter beta, U+0392 + 'Ccedil': 0x00c7, # latin capital letter C with cedilla, U+00C7 ISOlat1 + 'Chi': 0x03a7, # greek capital letter chi, U+03A7 + 'Dagger': 0x2021, # double dagger, U+2021 ISOpub + 'Delta': 0x0394, # greek capital letter delta, U+0394 ISOgrk3 + 'ETH': 0x00d0, # latin capital letter ETH, U+00D0 ISOlat1 + 'Eacute': 0x00c9, # latin capital letter E with acute, U+00C9 ISOlat1 + 'Ecirc': 0x00ca, # latin capital letter E with circumflex, U+00CA ISOlat1 + 'Egrave': 0x00c8, # latin capital letter E with grave, U+00C8 ISOlat1 + 'Epsilon': 0x0395, # greek capital letter epsilon, U+0395 + 'Eta': 0x0397, # greek capital letter eta, U+0397 + 'Euml': 0x00cb, # latin capital letter E with diaeresis, U+00CB ISOlat1 + 'Gamma': 0x0393, # greek capital letter gamma, U+0393 ISOgrk3 + 'Iacute': 0x00cd, # latin capital letter I with acute, U+00CD ISOlat1 + 'Icirc': 0x00ce, # latin capital letter I with circumflex, U+00CE ISOlat1 + 'Igrave': 0x00cc, # latin capital letter I with grave, U+00CC ISOlat1 + 'Iota': 0x0399, # greek capital letter iota, U+0399 + 'Iuml': 0x00cf, # latin capital letter I with diaeresis, U+00CF ISOlat1 + 'Kappa': 0x039a, # greek capital letter kappa, U+039A + 'Lambda': 0x039b, # greek capital letter lambda, U+039B ISOgrk3 + 'Mu': 0x039c, # greek capital letter mu, U+039C + 'Ntilde': 0x00d1, # latin capital letter N with tilde, U+00D1 ISOlat1 + 'Nu': 0x039d, # greek capital letter nu, U+039D + 'OElig': 0x0152, # latin capital ligature OE, U+0152 ISOlat2 + 'Oacute': 0x00d3, # latin capital letter O with acute, U+00D3 ISOlat1 + 'Ocirc': 0x00d4, # latin capital letter O with circumflex, U+00D4 ISOlat1 + 'Ograve': 0x00d2, # latin capital letter O with grave, U+00D2 ISOlat1 + 'Omega': 0x03a9, # greek capital letter omega, U+03A9 ISOgrk3 + 'Omicron': 0x039f, # greek capital letter omicron, U+039F + 'Oslash': 0x00d8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 + 'Otilde': 0x00d5, # latin capital letter O with tilde, U+00D5 ISOlat1 + 'Ouml': 0x00d6, # latin capital letter O with diaeresis, U+00D6 ISOlat1 + 'Phi': 0x03a6, # greek capital letter phi, U+03A6 ISOgrk3 + 'Pi': 0x03a0, # greek capital letter pi, U+03A0 ISOgrk3 + 'Prime': 0x2033, # double prime = seconds = inches, U+2033 ISOtech + 'Psi': 0x03a8, # greek capital letter psi, U+03A8 ISOgrk3 + 'Rho': 0x03a1, # greek capital letter rho, U+03A1 + 'Scaron': 0x0160, # latin capital letter S with caron, U+0160 ISOlat2 + 'Sigma': 0x03a3, # greek capital letter sigma, U+03A3 ISOgrk3 + 'THORN': 0x00de, # latin capital letter THORN, U+00DE ISOlat1 + 'Tau': 0x03a4, # greek capital letter tau, U+03A4 + 'Theta': 0x0398, # greek capital letter theta, U+0398 ISOgrk3 + 'Uacute': 0x00da, # latin capital letter U with acute, U+00DA ISOlat1 + 'Ucirc': 0x00db, # latin capital letter U with circumflex, U+00DB ISOlat1 + 'Ugrave': 0x00d9, # latin capital letter U with grave, U+00D9 ISOlat1 + 'Upsilon': 0x03a5, # greek capital letter upsilon, U+03A5 ISOgrk3 + 'Uuml': 0x00dc, # latin capital letter U with diaeresis, U+00DC ISOlat1 + 'Xi': 0x039e, # greek capital letter xi, U+039E ISOgrk3 + 'Yacute': 0x00dd, # latin capital letter Y with acute, U+00DD ISOlat1 + 'Yuml': 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2 + 'Zeta': 0x0396, # greek capital letter zeta, U+0396 + 'aacute': 0x00e1, # latin small letter a with acute, U+00E1 ISOlat1 + 'acirc': 0x00e2, # latin small letter a with circumflex, U+00E2 ISOlat1 + 'acute': 0x00b4, # acute accent = spacing acute, U+00B4 ISOdia + 'aelig': 0x00e6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 + 'agrave': 0x00e0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 + 'alefsym': 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW + 'alpha': 0x03b1, # greek small letter alpha, U+03B1 ISOgrk3 + 'amp': 0x0026, # ampersand, U+0026 ISOnum + 'and': 0x2227, # logical and = wedge, U+2227 ISOtech + 'ang': 0x2220, # angle, U+2220 ISOamso + 'aring': 0x00e5, # latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 + 'asymp': 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr + 'atilde': 0x00e3, # latin small letter a with tilde, U+00E3 ISOlat1 + 'auml': 0x00e4, # latin small letter a with diaeresis, U+00E4 ISOlat1 + 'bdquo': 0x201e, # double low-9 quotation mark, U+201E NEW + 'beta': 0x03b2, # greek small letter beta, U+03B2 ISOgrk3 + 'brvbar': 0x00a6, # broken bar = broken vertical bar, U+00A6 ISOnum + 'bull': 0x2022, # bullet = black small circle, U+2022 ISOpub + 'cap': 0x2229, # intersection = cap, U+2229 ISOtech + 'ccedil': 0x00e7, # latin small letter c with cedilla, U+00E7 ISOlat1 + 'cedil': 0x00b8, # cedilla = spacing cedilla, U+00B8 ISOdia + 'cent': 0x00a2, # cent sign, U+00A2 ISOnum + 'chi': 0x03c7, # greek small letter chi, U+03C7 ISOgrk3 + 'circ': 0x02c6, # modifier letter circumflex accent, U+02C6 ISOpub + 'clubs': 0x2663, # black club suit = shamrock, U+2663 ISOpub + 'cong': 0x2245, # approximately equal to, U+2245 ISOtech + 'copy': 0x00a9, # copyright sign, U+00A9 ISOnum + 'crarr': 0x21b5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW + 'cup': 0x222a, # union = cup, U+222A ISOtech + 'curren': 0x00a4, # currency sign, U+00A4 ISOnum + 'dArr': 0x21d3, # downwards double arrow, U+21D3 ISOamsa + 'dagger': 0x2020, # dagger, U+2020 ISOpub + 'darr': 0x2193, # downwards arrow, U+2193 ISOnum + 'deg': 0x00b0, # degree sign, U+00B0 ISOnum + 'delta': 0x03b4, # greek small letter delta, U+03B4 ISOgrk3 + 'diams': 0x2666, # black diamond suit, U+2666 ISOpub + 'divide': 0x00f7, # division sign, U+00F7 ISOnum + 'eacute': 0x00e9, # latin small letter e with acute, U+00E9 ISOlat1 + 'ecirc': 0x00ea, # latin small letter e with circumflex, U+00EA ISOlat1 + 'egrave': 0x00e8, # latin small letter e with grave, U+00E8 ISOlat1 + 'empty': 0x2205, # empty set = null set = diameter, U+2205 ISOamso + 'emsp': 0x2003, # em space, U+2003 ISOpub + 'ensp': 0x2002, # en space, U+2002 ISOpub + 'epsilon': 0x03b5, # greek small letter epsilon, U+03B5 ISOgrk3 + 'equiv': 0x2261, # identical to, U+2261 ISOtech + 'eta': 0x03b7, # greek small letter eta, U+03B7 ISOgrk3 + 'eth': 0x00f0, # latin small letter eth, U+00F0 ISOlat1 + 'euml': 0x00eb, # latin small letter e with diaeresis, U+00EB ISOlat1 + 'euro': 0x20ac, # euro sign, U+20AC NEW + 'exist': 0x2203, # there exists, U+2203 ISOtech + 'fnof': 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech + 'forall': 0x2200, # for all, U+2200 ISOtech + 'frac12': 0x00bd, # vulgar fraction one half = fraction one half, U+00BD ISOnum + 'frac14': 0x00bc, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum + 'frac34': 0x00be, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum + 'frasl': 0x2044, # fraction slash, U+2044 NEW + 'gamma': 0x03b3, # greek small letter gamma, U+03B3 ISOgrk3 + 'ge': 0x2265, # greater-than or equal to, U+2265 ISOtech + 'gt': 0x003e, # greater-than sign, U+003E ISOnum + 'hArr': 0x21d4, # left right double arrow, U+21D4 ISOamsa + 'harr': 0x2194, # left right arrow, U+2194 ISOamsa + 'hearts': 0x2665, # black heart suit = valentine, U+2665 ISOpub + 'hellip': 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub + 'iacute': 0x00ed, # latin small letter i with acute, U+00ED ISOlat1 + 'icirc': 0x00ee, # latin small letter i with circumflex, U+00EE ISOlat1 + 'iexcl': 0x00a1, # inverted exclamation mark, U+00A1 ISOnum + 'igrave': 0x00ec, # latin small letter i with grave, U+00EC ISOlat1 + 'image': 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso + 'infin': 0x221e, # infinity, U+221E ISOtech + 'int': 0x222b, # integral, U+222B ISOtech + 'iota': 0x03b9, # greek small letter iota, U+03B9 ISOgrk3 + 'iquest': 0x00bf, # inverted question mark = turned question mark, U+00BF ISOnum + 'isin': 0x2208, # element of, U+2208 ISOtech + 'iuml': 0x00ef, # latin small letter i with diaeresis, U+00EF ISOlat1 + 'kappa': 0x03ba, # greek small letter kappa, U+03BA ISOgrk3 + 'lArr': 0x21d0, # leftwards double arrow, U+21D0 ISOtech + 'lambda': 0x03bb, # greek small letter lambda, U+03BB ISOgrk3 + 'lang': 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech + 'laquo': 0x00ab, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum + 'larr': 0x2190, # leftwards arrow, U+2190 ISOnum + 'lceil': 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc + 'ldquo': 0x201c, # left double quotation mark, U+201C ISOnum + 'le': 0x2264, # less-than or equal to, U+2264 ISOtech + 'lfloor': 0x230a, # left floor = apl downstile, U+230A ISOamsc + 'lowast': 0x2217, # asterisk operator, U+2217 ISOtech + 'loz': 0x25ca, # lozenge, U+25CA ISOpub + 'lrm': 0x200e, # left-to-right mark, U+200E NEW RFC 2070 + 'lsaquo': 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed + 'lsquo': 0x2018, # left single quotation mark, U+2018 ISOnum + 'lt': 0x003c, # less-than sign, U+003C ISOnum + 'macr': 0x00af, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia + 'mdash': 0x2014, # em dash, U+2014 ISOpub + 'micro': 0x00b5, # micro sign, U+00B5 ISOnum + 'middot': 0x00b7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum + 'minus': 0x2212, # minus sign, U+2212 ISOtech + 'mu': 0x03bc, # greek small letter mu, U+03BC ISOgrk3 + 'nabla': 0x2207, # nabla = backward difference, U+2207 ISOtech + 'nbsp': 0x00a0, # no-break space = non-breaking space, U+00A0 ISOnum + 'ndash': 0x2013, # en dash, U+2013 ISOpub + 'ne': 0x2260, # not equal to, U+2260 ISOtech + 'ni': 0x220b, # contains as member, U+220B ISOtech + 'not': 0x00ac, # not sign, U+00AC ISOnum + 'notin': 0x2209, # not an element of, U+2209 ISOtech + 'nsub': 0x2284, # not a subset of, U+2284 ISOamsn + 'ntilde': 0x00f1, # latin small letter n with tilde, U+00F1 ISOlat1 + 'nu': 0x03bd, # greek small letter nu, U+03BD ISOgrk3 + 'oacute': 0x00f3, # latin small letter o with acute, U+00F3 ISOlat1 + 'ocirc': 0x00f4, # latin small letter o with circumflex, U+00F4 ISOlat1 + 'oelig': 0x0153, # latin small ligature oe, U+0153 ISOlat2 + 'ograve': 0x00f2, # latin small letter o with grave, U+00F2 ISOlat1 + 'oline': 0x203e, # overline = spacing overscore, U+203E NEW + 'omega': 0x03c9, # greek small letter omega, U+03C9 ISOgrk3 + 'omicron': 0x03bf, # greek small letter omicron, U+03BF NEW + 'oplus': 0x2295, # circled plus = direct sum, U+2295 ISOamsb + 'or': 0x2228, # logical or = vee, U+2228 ISOtech + 'ordf': 0x00aa, # feminine ordinal indicator, U+00AA ISOnum + 'ordm': 0x00ba, # masculine ordinal indicator, U+00BA ISOnum + 'oslash': 0x00f8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 + 'otilde': 0x00f5, # latin small letter o with tilde, U+00F5 ISOlat1 + 'otimes': 0x2297, # circled times = vector product, U+2297 ISOamsb + 'ouml': 0x00f6, # latin small letter o with diaeresis, U+00F6 ISOlat1 + 'para': 0x00b6, # pilcrow sign = paragraph sign, U+00B6 ISOnum + 'part': 0x2202, # partial differential, U+2202 ISOtech + 'permil': 0x2030, # per mille sign, U+2030 ISOtech + 'perp': 0x22a5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech + 'phi': 0x03c6, # greek small letter phi, U+03C6 ISOgrk3 + 'pi': 0x03c0, # greek small letter pi, U+03C0 ISOgrk3 + 'piv': 0x03d6, # greek pi symbol, U+03D6 ISOgrk3 + 'plusmn': 0x00b1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum + 'pound': 0x00a3, # pound sign, U+00A3 ISOnum + 'prime': 0x2032, # prime = minutes = feet, U+2032 ISOtech + 'prod': 0x220f, # n-ary product = product sign, U+220F ISOamsb + 'prop': 0x221d, # proportional to, U+221D ISOtech + 'psi': 0x03c8, # greek small letter psi, U+03C8 ISOgrk3 + 'quot': 0x0022, # quotation mark = APL quote, U+0022 ISOnum + 'rArr': 0x21d2, # rightwards double arrow, U+21D2 ISOtech + 'radic': 0x221a, # square root = radical sign, U+221A ISOtech + 'rang': 0x232a, # right-pointing angle bracket = ket, U+232A ISOtech + 'raquo': 0x00bb, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum + 'rarr': 0x2192, # rightwards arrow, U+2192 ISOnum + 'rceil': 0x2309, # right ceiling, U+2309 ISOamsc + 'rdquo': 0x201d, # right double quotation mark, U+201D ISOnum + 'real': 0x211c, # blackletter capital R = real part symbol, U+211C ISOamso + 'reg': 0x00ae, # registered sign = registered trade mark sign, U+00AE ISOnum + 'rfloor': 0x230b, # right floor, U+230B ISOamsc + 'rho': 0x03c1, # greek small letter rho, U+03C1 ISOgrk3 + 'rlm': 0x200f, # right-to-left mark, U+200F NEW RFC 2070 + 'rsaquo': 0x203a, # single right-pointing angle quotation mark, U+203A ISO proposed + 'rsquo': 0x2019, # right single quotation mark, U+2019 ISOnum + 'sbquo': 0x201a, # single low-9 quotation mark, U+201A NEW + 'scaron': 0x0161, # latin small letter s with caron, U+0161 ISOlat2 + 'sdot': 0x22c5, # dot operator, U+22C5 ISOamsb + 'sect': 0x00a7, # section sign, U+00A7 ISOnum + 'shy': 0x00ad, # soft hyphen = discretionary hyphen, U+00AD ISOnum + 'sigma': 0x03c3, # greek small letter sigma, U+03C3 ISOgrk3 + 'sigmaf': 0x03c2, # greek small letter final sigma, U+03C2 ISOgrk3 + 'sim': 0x223c, # tilde operator = varies with = similar to, U+223C ISOtech + 'spades': 0x2660, # black spade suit, U+2660 ISOpub + 'sub': 0x2282, # subset of, U+2282 ISOtech + 'sube': 0x2286, # subset of or equal to, U+2286 ISOtech + 'sum': 0x2211, # n-ary summation, U+2211 ISOamsb + 'sup': 0x2283, # superset of, U+2283 ISOtech + 'sup1': 0x00b9, # superscript one = superscript digit one, U+00B9 ISOnum + 'sup2': 0x00b2, # superscript two = superscript digit two = squared, U+00B2 ISOnum + 'sup3': 0x00b3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum + 'supe': 0x2287, # superset of or equal to, U+2287 ISOtech + 'szlig': 0x00df, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 + 'tau': 0x03c4, # greek small letter tau, U+03C4 ISOgrk3 + 'there4': 0x2234, # therefore, U+2234 ISOtech + 'theta': 0x03b8, # greek small letter theta, U+03B8 ISOgrk3 + 'thetasym': 0x03d1, # greek small letter theta symbol, U+03D1 NEW + 'thinsp': 0x2009, # thin space, U+2009 ISOpub + 'thorn': 0x00fe, # latin small letter thorn with, U+00FE ISOlat1 + 'tilde': 0x02dc, # small tilde, U+02DC ISOdia + 'times': 0x00d7, # multiplication sign, U+00D7 ISOnum + 'trade': 0x2122, # trade mark sign, U+2122 ISOnum + 'uArr': 0x21d1, # upwards double arrow, U+21D1 ISOamsa + 'uacute': 0x00fa, # latin small letter u with acute, U+00FA ISOlat1 + 'uarr': 0x2191, # upwards arrow, U+2191 ISOnum + 'ucirc': 0x00fb, # latin small letter u with circumflex, U+00FB ISOlat1 + 'ugrave': 0x00f9, # latin small letter u with grave, U+00F9 ISOlat1 + 'uml': 0x00a8, # diaeresis = spacing diaeresis, U+00A8 ISOdia + 'upsih': 0x03d2, # greek upsilon with hook symbol, U+03D2 NEW + 'upsilon': 0x03c5, # greek small letter upsilon, U+03C5 ISOgrk3 + 'uuml': 0x00fc, # latin small letter u with diaeresis, U+00FC ISOlat1 + 'weierp': 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso + 'xi': 0x03be, # greek small letter xi, U+03BE ISOgrk3 + 'yacute': 0x00fd, # latin small letter y with acute, U+00FD ISOlat1 + 'yen': 0x00a5, # yen sign = yuan sign, U+00A5 ISOnum + 'yuml': 0x00ff, # latin small letter y with diaeresis, U+00FF ISOlat1 + 'zeta': 0x03b6, # greek small letter zeta, U+03B6 ISOgrk3 + 'zwj': 0x200d, # zero width joiner, U+200D NEW RFC 2070 + 'zwnj': 0x200c, # zero width non-joiner, U+200C NEW RFC 2070 +} + + +# maps the HTML5 named character references to the equivalent Unicode character(s) +html5 = { + 'Aacute': '\xc1', + 'aacute': '\xe1', + 'Aacute;': '\xc1', + 'aacute;': '\xe1', + 'Abreve;': '\u0102', + 'abreve;': '\u0103', + 'ac;': '\u223e', + 'acd;': '\u223f', + 'acE;': '\u223e\u0333', + 'Acirc': '\xc2', + 'acirc': '\xe2', + 'Acirc;': '\xc2', + 'acirc;': '\xe2', + 'acute': '\xb4', + 'acute;': '\xb4', + 'Acy;': '\u0410', + 'acy;': '\u0430', + 'AElig': '\xc6', + 'aelig': '\xe6', + 'AElig;': '\xc6', + 'aelig;': '\xe6', + 'af;': '\u2061', + 'Afr;': '\U0001d504', + 'afr;': '\U0001d51e', + 'Agrave': '\xc0', + 'agrave': '\xe0', + 'Agrave;': '\xc0', + 'agrave;': '\xe0', + 'alefsym;': '\u2135', + 'aleph;': '\u2135', + 'Alpha;': '\u0391', + 'alpha;': '\u03b1', + 'Amacr;': '\u0100', + 'amacr;': '\u0101', + 'amalg;': '\u2a3f', + 'AMP': '&', + 'amp': '&', + 'AMP;': '&', + 'amp;': '&', + 'And;': '\u2a53', + 'and;': '\u2227', + 'andand;': '\u2a55', + 'andd;': '\u2a5c', + 'andslope;': '\u2a58', + 'andv;': '\u2a5a', + 'ang;': '\u2220', + 'ange;': '\u29a4', + 'angle;': '\u2220', + 'angmsd;': '\u2221', + 'angmsdaa;': '\u29a8', + 'angmsdab;': '\u29a9', + 'angmsdac;': '\u29aa', + 'angmsdad;': '\u29ab', + 'angmsdae;': '\u29ac', + 'angmsdaf;': '\u29ad', + 'angmsdag;': '\u29ae', + 'angmsdah;': '\u29af', + 'angrt;': '\u221f', + 'angrtvb;': '\u22be', + 'angrtvbd;': '\u299d', + 'angsph;': '\u2222', + 'angst;': '\xc5', + 'angzarr;': '\u237c', + 'Aogon;': '\u0104', + 'aogon;': '\u0105', + 'Aopf;': '\U0001d538', + 'aopf;': '\U0001d552', + 'ap;': '\u2248', + 'apacir;': '\u2a6f', + 'apE;': '\u2a70', + 'ape;': '\u224a', + 'apid;': '\u224b', + 'apos;': "'", + 'ApplyFunction;': '\u2061', + 'approx;': '\u2248', + 'approxeq;': '\u224a', + 'Aring': '\xc5', + 'aring': '\xe5', + 'Aring;': '\xc5', + 'aring;': '\xe5', + 'Ascr;': '\U0001d49c', + 'ascr;': '\U0001d4b6', + 'Assign;': '\u2254', + 'ast;': '*', + 'asymp;': '\u2248', + 'asympeq;': '\u224d', + 'Atilde': '\xc3', + 'atilde': '\xe3', + 'Atilde;': '\xc3', + 'atilde;': '\xe3', + 'Auml': '\xc4', + 'auml': '\xe4', + 'Auml;': '\xc4', + 'auml;': '\xe4', + 'awconint;': '\u2233', + 'awint;': '\u2a11', + 'backcong;': '\u224c', + 'backepsilon;': '\u03f6', + 'backprime;': '\u2035', + 'backsim;': '\u223d', + 'backsimeq;': '\u22cd', + 'Backslash;': '\u2216', + 'Barv;': '\u2ae7', + 'barvee;': '\u22bd', + 'Barwed;': '\u2306', + 'barwed;': '\u2305', + 'barwedge;': '\u2305', + 'bbrk;': '\u23b5', + 'bbrktbrk;': '\u23b6', + 'bcong;': '\u224c', + 'Bcy;': '\u0411', + 'bcy;': '\u0431', + 'bdquo;': '\u201e', + 'becaus;': '\u2235', + 'Because;': '\u2235', + 'because;': '\u2235', + 'bemptyv;': '\u29b0', + 'bepsi;': '\u03f6', + 'bernou;': '\u212c', + 'Bernoullis;': '\u212c', + 'Beta;': '\u0392', + 'beta;': '\u03b2', + 'beth;': '\u2136', + 'between;': '\u226c', + 'Bfr;': '\U0001d505', + 'bfr;': '\U0001d51f', + 'bigcap;': '\u22c2', + 'bigcirc;': '\u25ef', + 'bigcup;': '\u22c3', + 'bigodot;': '\u2a00', + 'bigoplus;': '\u2a01', + 'bigotimes;': '\u2a02', + 'bigsqcup;': '\u2a06', + 'bigstar;': '\u2605', + 'bigtriangledown;': '\u25bd', + 'bigtriangleup;': '\u25b3', + 'biguplus;': '\u2a04', + 'bigvee;': '\u22c1', + 'bigwedge;': '\u22c0', + 'bkarow;': '\u290d', + 'blacklozenge;': '\u29eb', + 'blacksquare;': '\u25aa', + 'blacktriangle;': '\u25b4', + 'blacktriangledown;': '\u25be', + 'blacktriangleleft;': '\u25c2', + 'blacktriangleright;': '\u25b8', + 'blank;': '\u2423', + 'blk12;': '\u2592', + 'blk14;': '\u2591', + 'blk34;': '\u2593', + 'block;': '\u2588', + 'bne;': '=\u20e5', + 'bnequiv;': '\u2261\u20e5', + 'bNot;': '\u2aed', + 'bnot;': '\u2310', + 'Bopf;': '\U0001d539', + 'bopf;': '\U0001d553', + 'bot;': '\u22a5', + 'bottom;': '\u22a5', + 'bowtie;': '\u22c8', + 'boxbox;': '\u29c9', + 'boxDL;': '\u2557', + 'boxDl;': '\u2556', + 'boxdL;': '\u2555', + 'boxdl;': '\u2510', + 'boxDR;': '\u2554', + 'boxDr;': '\u2553', + 'boxdR;': '\u2552', + 'boxdr;': '\u250c', + 'boxH;': '\u2550', + 'boxh;': '\u2500', + 'boxHD;': '\u2566', + 'boxHd;': '\u2564', + 'boxhD;': '\u2565', + 'boxhd;': '\u252c', + 'boxHU;': '\u2569', + 'boxHu;': '\u2567', + 'boxhU;': '\u2568', + 'boxhu;': '\u2534', + 'boxminus;': '\u229f', + 'boxplus;': '\u229e', + 'boxtimes;': '\u22a0', + 'boxUL;': '\u255d', + 'boxUl;': '\u255c', + 'boxuL;': '\u255b', + 'boxul;': '\u2518', + 'boxUR;': '\u255a', + 'boxUr;': '\u2559', + 'boxuR;': '\u2558', + 'boxur;': '\u2514', + 'boxV;': '\u2551', + 'boxv;': '\u2502', + 'boxVH;': '\u256c', + 'boxVh;': '\u256b', + 'boxvH;': '\u256a', + 'boxvh;': '\u253c', + 'boxVL;': '\u2563', + 'boxVl;': '\u2562', + 'boxvL;': '\u2561', + 'boxvl;': '\u2524', + 'boxVR;': '\u2560', + 'boxVr;': '\u255f', + 'boxvR;': '\u255e', + 'boxvr;': '\u251c', + 'bprime;': '\u2035', + 'Breve;': '\u02d8', + 'breve;': '\u02d8', + 'brvbar': '\xa6', + 'brvbar;': '\xa6', + 'Bscr;': '\u212c', + 'bscr;': '\U0001d4b7', + 'bsemi;': '\u204f', + 'bsim;': '\u223d', + 'bsime;': '\u22cd', + 'bsol;': '\\', + 'bsolb;': '\u29c5', + 'bsolhsub;': '\u27c8', + 'bull;': '\u2022', + 'bullet;': '\u2022', + 'bump;': '\u224e', + 'bumpE;': '\u2aae', + 'bumpe;': '\u224f', + 'Bumpeq;': '\u224e', + 'bumpeq;': '\u224f', + 'Cacute;': '\u0106', + 'cacute;': '\u0107', + 'Cap;': '\u22d2', + 'cap;': '\u2229', + 'capand;': '\u2a44', + 'capbrcup;': '\u2a49', + 'capcap;': '\u2a4b', + 'capcup;': '\u2a47', + 'capdot;': '\u2a40', + 'CapitalDifferentialD;': '\u2145', + 'caps;': '\u2229\ufe00', + 'caret;': '\u2041', + 'caron;': '\u02c7', + 'Cayleys;': '\u212d', + 'ccaps;': '\u2a4d', + 'Ccaron;': '\u010c', + 'ccaron;': '\u010d', + 'Ccedil': '\xc7', + 'ccedil': '\xe7', + 'Ccedil;': '\xc7', + 'ccedil;': '\xe7', + 'Ccirc;': '\u0108', + 'ccirc;': '\u0109', + 'Cconint;': '\u2230', + 'ccups;': '\u2a4c', + 'ccupssm;': '\u2a50', + 'Cdot;': '\u010a', + 'cdot;': '\u010b', + 'cedil': '\xb8', + 'cedil;': '\xb8', + 'Cedilla;': '\xb8', + 'cemptyv;': '\u29b2', + 'cent': '\xa2', + 'cent;': '\xa2', + 'CenterDot;': '\xb7', + 'centerdot;': '\xb7', + 'Cfr;': '\u212d', + 'cfr;': '\U0001d520', + 'CHcy;': '\u0427', + 'chcy;': '\u0447', + 'check;': '\u2713', + 'checkmark;': '\u2713', + 'Chi;': '\u03a7', + 'chi;': '\u03c7', + 'cir;': '\u25cb', + 'circ;': '\u02c6', + 'circeq;': '\u2257', + 'circlearrowleft;': '\u21ba', + 'circlearrowright;': '\u21bb', + 'circledast;': '\u229b', + 'circledcirc;': '\u229a', + 'circleddash;': '\u229d', + 'CircleDot;': '\u2299', + 'circledR;': '\xae', + 'circledS;': '\u24c8', + 'CircleMinus;': '\u2296', + 'CirclePlus;': '\u2295', + 'CircleTimes;': '\u2297', + 'cirE;': '\u29c3', + 'cire;': '\u2257', + 'cirfnint;': '\u2a10', + 'cirmid;': '\u2aef', + 'cirscir;': '\u29c2', + 'ClockwiseContourIntegral;': '\u2232', + 'CloseCurlyDoubleQuote;': '\u201d', + 'CloseCurlyQuote;': '\u2019', + 'clubs;': '\u2663', + 'clubsuit;': '\u2663', + 'Colon;': '\u2237', + 'colon;': ':', + 'Colone;': '\u2a74', + 'colone;': '\u2254', + 'coloneq;': '\u2254', + 'comma;': ',', + 'commat;': '@', + 'comp;': '\u2201', + 'compfn;': '\u2218', + 'complement;': '\u2201', + 'complexes;': '\u2102', + 'cong;': '\u2245', + 'congdot;': '\u2a6d', + 'Congruent;': '\u2261', + 'Conint;': '\u222f', + 'conint;': '\u222e', + 'ContourIntegral;': '\u222e', + 'Copf;': '\u2102', + 'copf;': '\U0001d554', + 'coprod;': '\u2210', + 'Coproduct;': '\u2210', + 'COPY': '\xa9', + 'copy': '\xa9', + 'COPY;': '\xa9', + 'copy;': '\xa9', + 'copysr;': '\u2117', + 'CounterClockwiseContourIntegral;': '\u2233', + 'crarr;': '\u21b5', + 'Cross;': '\u2a2f', + 'cross;': '\u2717', + 'Cscr;': '\U0001d49e', + 'cscr;': '\U0001d4b8', + 'csub;': '\u2acf', + 'csube;': '\u2ad1', + 'csup;': '\u2ad0', + 'csupe;': '\u2ad2', + 'ctdot;': '\u22ef', + 'cudarrl;': '\u2938', + 'cudarrr;': '\u2935', + 'cuepr;': '\u22de', + 'cuesc;': '\u22df', + 'cularr;': '\u21b6', + 'cularrp;': '\u293d', + 'Cup;': '\u22d3', + 'cup;': '\u222a', + 'cupbrcap;': '\u2a48', + 'CupCap;': '\u224d', + 'cupcap;': '\u2a46', + 'cupcup;': '\u2a4a', + 'cupdot;': '\u228d', + 'cupor;': '\u2a45', + 'cups;': '\u222a\ufe00', + 'curarr;': '\u21b7', + 'curarrm;': '\u293c', + 'curlyeqprec;': '\u22de', + 'curlyeqsucc;': '\u22df', + 'curlyvee;': '\u22ce', + 'curlywedge;': '\u22cf', + 'curren': '\xa4', + 'curren;': '\xa4', + 'curvearrowleft;': '\u21b6', + 'curvearrowright;': '\u21b7', + 'cuvee;': '\u22ce', + 'cuwed;': '\u22cf', + 'cwconint;': '\u2232', + 'cwint;': '\u2231', + 'cylcty;': '\u232d', + 'Dagger;': '\u2021', + 'dagger;': '\u2020', + 'daleth;': '\u2138', + 'Darr;': '\u21a1', + 'dArr;': '\u21d3', + 'darr;': '\u2193', + 'dash;': '\u2010', + 'Dashv;': '\u2ae4', + 'dashv;': '\u22a3', + 'dbkarow;': '\u290f', + 'dblac;': '\u02dd', + 'Dcaron;': '\u010e', + 'dcaron;': '\u010f', + 'Dcy;': '\u0414', + 'dcy;': '\u0434', + 'DD;': '\u2145', + 'dd;': '\u2146', + 'ddagger;': '\u2021', + 'ddarr;': '\u21ca', + 'DDotrahd;': '\u2911', + 'ddotseq;': '\u2a77', + 'deg': '\xb0', + 'deg;': '\xb0', + 'Del;': '\u2207', + 'Delta;': '\u0394', + 'delta;': '\u03b4', + 'demptyv;': '\u29b1', + 'dfisht;': '\u297f', + 'Dfr;': '\U0001d507', + 'dfr;': '\U0001d521', + 'dHar;': '\u2965', + 'dharl;': '\u21c3', + 'dharr;': '\u21c2', + 'DiacriticalAcute;': '\xb4', + 'DiacriticalDot;': '\u02d9', + 'DiacriticalDoubleAcute;': '\u02dd', + 'DiacriticalGrave;': '`', + 'DiacriticalTilde;': '\u02dc', + 'diam;': '\u22c4', + 'Diamond;': '\u22c4', + 'diamond;': '\u22c4', + 'diamondsuit;': '\u2666', + 'diams;': '\u2666', + 'die;': '\xa8', + 'DifferentialD;': '\u2146', + 'digamma;': '\u03dd', + 'disin;': '\u22f2', + 'div;': '\xf7', + 'divide': '\xf7', + 'divide;': '\xf7', + 'divideontimes;': '\u22c7', + 'divonx;': '\u22c7', + 'DJcy;': '\u0402', + 'djcy;': '\u0452', + 'dlcorn;': '\u231e', + 'dlcrop;': '\u230d', + 'dollar;': '$', + 'Dopf;': '\U0001d53b', + 'dopf;': '\U0001d555', + 'Dot;': '\xa8', + 'dot;': '\u02d9', + 'DotDot;': '\u20dc', + 'doteq;': '\u2250', + 'doteqdot;': '\u2251', + 'DotEqual;': '\u2250', + 'dotminus;': '\u2238', + 'dotplus;': '\u2214', + 'dotsquare;': '\u22a1', + 'doublebarwedge;': '\u2306', + 'DoubleContourIntegral;': '\u222f', + 'DoubleDot;': '\xa8', + 'DoubleDownArrow;': '\u21d3', + 'DoubleLeftArrow;': '\u21d0', + 'DoubleLeftRightArrow;': '\u21d4', + 'DoubleLeftTee;': '\u2ae4', + 'DoubleLongLeftArrow;': '\u27f8', + 'DoubleLongLeftRightArrow;': '\u27fa', + 'DoubleLongRightArrow;': '\u27f9', + 'DoubleRightArrow;': '\u21d2', + 'DoubleRightTee;': '\u22a8', + 'DoubleUpArrow;': '\u21d1', + 'DoubleUpDownArrow;': '\u21d5', + 'DoubleVerticalBar;': '\u2225', + 'DownArrow;': '\u2193', + 'Downarrow;': '\u21d3', + 'downarrow;': '\u2193', + 'DownArrowBar;': '\u2913', + 'DownArrowUpArrow;': '\u21f5', + 'DownBreve;': '\u0311', + 'downdownarrows;': '\u21ca', + 'downharpoonleft;': '\u21c3', + 'downharpoonright;': '\u21c2', + 'DownLeftRightVector;': '\u2950', + 'DownLeftTeeVector;': '\u295e', + 'DownLeftVector;': '\u21bd', + 'DownLeftVectorBar;': '\u2956', + 'DownRightTeeVector;': '\u295f', + 'DownRightVector;': '\u21c1', + 'DownRightVectorBar;': '\u2957', + 'DownTee;': '\u22a4', + 'DownTeeArrow;': '\u21a7', + 'drbkarow;': '\u2910', + 'drcorn;': '\u231f', + 'drcrop;': '\u230c', + 'Dscr;': '\U0001d49f', + 'dscr;': '\U0001d4b9', + 'DScy;': '\u0405', + 'dscy;': '\u0455', + 'dsol;': '\u29f6', + 'Dstrok;': '\u0110', + 'dstrok;': '\u0111', + 'dtdot;': '\u22f1', + 'dtri;': '\u25bf', + 'dtrif;': '\u25be', + 'duarr;': '\u21f5', + 'duhar;': '\u296f', + 'dwangle;': '\u29a6', + 'DZcy;': '\u040f', + 'dzcy;': '\u045f', + 'dzigrarr;': '\u27ff', + 'Eacute': '\xc9', + 'eacute': '\xe9', + 'Eacute;': '\xc9', + 'eacute;': '\xe9', + 'easter;': '\u2a6e', + 'Ecaron;': '\u011a', + 'ecaron;': '\u011b', + 'ecir;': '\u2256', + 'Ecirc': '\xca', + 'ecirc': '\xea', + 'Ecirc;': '\xca', + 'ecirc;': '\xea', + 'ecolon;': '\u2255', + 'Ecy;': '\u042d', + 'ecy;': '\u044d', + 'eDDot;': '\u2a77', + 'Edot;': '\u0116', + 'eDot;': '\u2251', + 'edot;': '\u0117', + 'ee;': '\u2147', + 'efDot;': '\u2252', + 'Efr;': '\U0001d508', + 'efr;': '\U0001d522', + 'eg;': '\u2a9a', + 'Egrave': '\xc8', + 'egrave': '\xe8', + 'Egrave;': '\xc8', + 'egrave;': '\xe8', + 'egs;': '\u2a96', + 'egsdot;': '\u2a98', + 'el;': '\u2a99', + 'Element;': '\u2208', + 'elinters;': '\u23e7', + 'ell;': '\u2113', + 'els;': '\u2a95', + 'elsdot;': '\u2a97', + 'Emacr;': '\u0112', + 'emacr;': '\u0113', + 'empty;': '\u2205', + 'emptyset;': '\u2205', + 'EmptySmallSquare;': '\u25fb', + 'emptyv;': '\u2205', + 'EmptyVerySmallSquare;': '\u25ab', + 'emsp13;': '\u2004', + 'emsp14;': '\u2005', + 'emsp;': '\u2003', + 'ENG;': '\u014a', + 'eng;': '\u014b', + 'ensp;': '\u2002', + 'Eogon;': '\u0118', + 'eogon;': '\u0119', + 'Eopf;': '\U0001d53c', + 'eopf;': '\U0001d556', + 'epar;': '\u22d5', + 'eparsl;': '\u29e3', + 'eplus;': '\u2a71', + 'epsi;': '\u03b5', + 'Epsilon;': '\u0395', + 'epsilon;': '\u03b5', + 'epsiv;': '\u03f5', + 'eqcirc;': '\u2256', + 'eqcolon;': '\u2255', + 'eqsim;': '\u2242', + 'eqslantgtr;': '\u2a96', + 'eqslantless;': '\u2a95', + 'Equal;': '\u2a75', + 'equals;': '=', + 'EqualTilde;': '\u2242', + 'equest;': '\u225f', + 'Equilibrium;': '\u21cc', + 'equiv;': '\u2261', + 'equivDD;': '\u2a78', + 'eqvparsl;': '\u29e5', + 'erarr;': '\u2971', + 'erDot;': '\u2253', + 'Escr;': '\u2130', + 'escr;': '\u212f', + 'esdot;': '\u2250', + 'Esim;': '\u2a73', + 'esim;': '\u2242', + 'Eta;': '\u0397', + 'eta;': '\u03b7', + 'ETH': '\xd0', + 'eth': '\xf0', + 'ETH;': '\xd0', + 'eth;': '\xf0', + 'Euml': '\xcb', + 'euml': '\xeb', + 'Euml;': '\xcb', + 'euml;': '\xeb', + 'euro;': '\u20ac', + 'excl;': '!', + 'exist;': '\u2203', + 'Exists;': '\u2203', + 'expectation;': '\u2130', + 'ExponentialE;': '\u2147', + 'exponentiale;': '\u2147', + 'fallingdotseq;': '\u2252', + 'Fcy;': '\u0424', + 'fcy;': '\u0444', + 'female;': '\u2640', + 'ffilig;': '\ufb03', + 'fflig;': '\ufb00', + 'ffllig;': '\ufb04', + 'Ffr;': '\U0001d509', + 'ffr;': '\U0001d523', + 'filig;': '\ufb01', + 'FilledSmallSquare;': '\u25fc', + 'FilledVerySmallSquare;': '\u25aa', + 'fjlig;': 'fj', + 'flat;': '\u266d', + 'fllig;': '\ufb02', + 'fltns;': '\u25b1', + 'fnof;': '\u0192', + 'Fopf;': '\U0001d53d', + 'fopf;': '\U0001d557', + 'ForAll;': '\u2200', + 'forall;': '\u2200', + 'fork;': '\u22d4', + 'forkv;': '\u2ad9', + 'Fouriertrf;': '\u2131', + 'fpartint;': '\u2a0d', + 'frac12': '\xbd', + 'frac12;': '\xbd', + 'frac13;': '\u2153', + 'frac14': '\xbc', + 'frac14;': '\xbc', + 'frac15;': '\u2155', + 'frac16;': '\u2159', + 'frac18;': '\u215b', + 'frac23;': '\u2154', + 'frac25;': '\u2156', + 'frac34': '\xbe', + 'frac34;': '\xbe', + 'frac35;': '\u2157', + 'frac38;': '\u215c', + 'frac45;': '\u2158', + 'frac56;': '\u215a', + 'frac58;': '\u215d', + 'frac78;': '\u215e', + 'frasl;': '\u2044', + 'frown;': '\u2322', + 'Fscr;': '\u2131', + 'fscr;': '\U0001d4bb', + 'gacute;': '\u01f5', + 'Gamma;': '\u0393', + 'gamma;': '\u03b3', + 'Gammad;': '\u03dc', + 'gammad;': '\u03dd', + 'gap;': '\u2a86', + 'Gbreve;': '\u011e', + 'gbreve;': '\u011f', + 'Gcedil;': '\u0122', + 'Gcirc;': '\u011c', + 'gcirc;': '\u011d', + 'Gcy;': '\u0413', + 'gcy;': '\u0433', + 'Gdot;': '\u0120', + 'gdot;': '\u0121', + 'gE;': '\u2267', + 'ge;': '\u2265', + 'gEl;': '\u2a8c', + 'gel;': '\u22db', + 'geq;': '\u2265', + 'geqq;': '\u2267', + 'geqslant;': '\u2a7e', + 'ges;': '\u2a7e', + 'gescc;': '\u2aa9', + 'gesdot;': '\u2a80', + 'gesdoto;': '\u2a82', + 'gesdotol;': '\u2a84', + 'gesl;': '\u22db\ufe00', + 'gesles;': '\u2a94', + 'Gfr;': '\U0001d50a', + 'gfr;': '\U0001d524', + 'Gg;': '\u22d9', + 'gg;': '\u226b', + 'ggg;': '\u22d9', + 'gimel;': '\u2137', + 'GJcy;': '\u0403', + 'gjcy;': '\u0453', + 'gl;': '\u2277', + 'gla;': '\u2aa5', + 'glE;': '\u2a92', + 'glj;': '\u2aa4', + 'gnap;': '\u2a8a', + 'gnapprox;': '\u2a8a', + 'gnE;': '\u2269', + 'gne;': '\u2a88', + 'gneq;': '\u2a88', + 'gneqq;': '\u2269', + 'gnsim;': '\u22e7', + 'Gopf;': '\U0001d53e', + 'gopf;': '\U0001d558', + 'grave;': '`', + 'GreaterEqual;': '\u2265', + 'GreaterEqualLess;': '\u22db', + 'GreaterFullEqual;': '\u2267', + 'GreaterGreater;': '\u2aa2', + 'GreaterLess;': '\u2277', + 'GreaterSlantEqual;': '\u2a7e', + 'GreaterTilde;': '\u2273', + 'Gscr;': '\U0001d4a2', + 'gscr;': '\u210a', + 'gsim;': '\u2273', + 'gsime;': '\u2a8e', + 'gsiml;': '\u2a90', + 'GT': '>', + 'gt': '>', + 'GT;': '>', + 'Gt;': '\u226b', + 'gt;': '>', + 'gtcc;': '\u2aa7', + 'gtcir;': '\u2a7a', + 'gtdot;': '\u22d7', + 'gtlPar;': '\u2995', + 'gtquest;': '\u2a7c', + 'gtrapprox;': '\u2a86', + 'gtrarr;': '\u2978', + 'gtrdot;': '\u22d7', + 'gtreqless;': '\u22db', + 'gtreqqless;': '\u2a8c', + 'gtrless;': '\u2277', + 'gtrsim;': '\u2273', + 'gvertneqq;': '\u2269\ufe00', + 'gvnE;': '\u2269\ufe00', + 'Hacek;': '\u02c7', + 'hairsp;': '\u200a', + 'half;': '\xbd', + 'hamilt;': '\u210b', + 'HARDcy;': '\u042a', + 'hardcy;': '\u044a', + 'hArr;': '\u21d4', + 'harr;': '\u2194', + 'harrcir;': '\u2948', + 'harrw;': '\u21ad', + 'Hat;': '^', + 'hbar;': '\u210f', + 'Hcirc;': '\u0124', + 'hcirc;': '\u0125', + 'hearts;': '\u2665', + 'heartsuit;': '\u2665', + 'hellip;': '\u2026', + 'hercon;': '\u22b9', + 'Hfr;': '\u210c', + 'hfr;': '\U0001d525', + 'HilbertSpace;': '\u210b', + 'hksearow;': '\u2925', + 'hkswarow;': '\u2926', + 'hoarr;': '\u21ff', + 'homtht;': '\u223b', + 'hookleftarrow;': '\u21a9', + 'hookrightarrow;': '\u21aa', + 'Hopf;': '\u210d', + 'hopf;': '\U0001d559', + 'horbar;': '\u2015', + 'HorizontalLine;': '\u2500', + 'Hscr;': '\u210b', + 'hscr;': '\U0001d4bd', + 'hslash;': '\u210f', + 'Hstrok;': '\u0126', + 'hstrok;': '\u0127', + 'HumpDownHump;': '\u224e', + 'HumpEqual;': '\u224f', + 'hybull;': '\u2043', + 'hyphen;': '\u2010', + 'Iacute': '\xcd', + 'iacute': '\xed', + 'Iacute;': '\xcd', + 'iacute;': '\xed', + 'ic;': '\u2063', + 'Icirc': '\xce', + 'icirc': '\xee', + 'Icirc;': '\xce', + 'icirc;': '\xee', + 'Icy;': '\u0418', + 'icy;': '\u0438', + 'Idot;': '\u0130', + 'IEcy;': '\u0415', + 'iecy;': '\u0435', + 'iexcl': '\xa1', + 'iexcl;': '\xa1', + 'iff;': '\u21d4', + 'Ifr;': '\u2111', + 'ifr;': '\U0001d526', + 'Igrave': '\xcc', + 'igrave': '\xec', + 'Igrave;': '\xcc', + 'igrave;': '\xec', + 'ii;': '\u2148', + 'iiiint;': '\u2a0c', + 'iiint;': '\u222d', + 'iinfin;': '\u29dc', + 'iiota;': '\u2129', + 'IJlig;': '\u0132', + 'ijlig;': '\u0133', + 'Im;': '\u2111', + 'Imacr;': '\u012a', + 'imacr;': '\u012b', + 'image;': '\u2111', + 'ImaginaryI;': '\u2148', + 'imagline;': '\u2110', + 'imagpart;': '\u2111', + 'imath;': '\u0131', + 'imof;': '\u22b7', + 'imped;': '\u01b5', + 'Implies;': '\u21d2', + 'in;': '\u2208', + 'incare;': '\u2105', + 'infin;': '\u221e', + 'infintie;': '\u29dd', + 'inodot;': '\u0131', + 'Int;': '\u222c', + 'int;': '\u222b', + 'intcal;': '\u22ba', + 'integers;': '\u2124', + 'Integral;': '\u222b', + 'intercal;': '\u22ba', + 'Intersection;': '\u22c2', + 'intlarhk;': '\u2a17', + 'intprod;': '\u2a3c', + 'InvisibleComma;': '\u2063', + 'InvisibleTimes;': '\u2062', + 'IOcy;': '\u0401', + 'iocy;': '\u0451', + 'Iogon;': '\u012e', + 'iogon;': '\u012f', + 'Iopf;': '\U0001d540', + 'iopf;': '\U0001d55a', + 'Iota;': '\u0399', + 'iota;': '\u03b9', + 'iprod;': '\u2a3c', + 'iquest': '\xbf', + 'iquest;': '\xbf', + 'Iscr;': '\u2110', + 'iscr;': '\U0001d4be', + 'isin;': '\u2208', + 'isindot;': '\u22f5', + 'isinE;': '\u22f9', + 'isins;': '\u22f4', + 'isinsv;': '\u22f3', + 'isinv;': '\u2208', + 'it;': '\u2062', + 'Itilde;': '\u0128', + 'itilde;': '\u0129', + 'Iukcy;': '\u0406', + 'iukcy;': '\u0456', + 'Iuml': '\xcf', + 'iuml': '\xef', + 'Iuml;': '\xcf', + 'iuml;': '\xef', + 'Jcirc;': '\u0134', + 'jcirc;': '\u0135', + 'Jcy;': '\u0419', + 'jcy;': '\u0439', + 'Jfr;': '\U0001d50d', + 'jfr;': '\U0001d527', + 'jmath;': '\u0237', + 'Jopf;': '\U0001d541', + 'jopf;': '\U0001d55b', + 'Jscr;': '\U0001d4a5', + 'jscr;': '\U0001d4bf', + 'Jsercy;': '\u0408', + 'jsercy;': '\u0458', + 'Jukcy;': '\u0404', + 'jukcy;': '\u0454', + 'Kappa;': '\u039a', + 'kappa;': '\u03ba', + 'kappav;': '\u03f0', + 'Kcedil;': '\u0136', + 'kcedil;': '\u0137', + 'Kcy;': '\u041a', + 'kcy;': '\u043a', + 'Kfr;': '\U0001d50e', + 'kfr;': '\U0001d528', + 'kgreen;': '\u0138', + 'KHcy;': '\u0425', + 'khcy;': '\u0445', + 'KJcy;': '\u040c', + 'kjcy;': '\u045c', + 'Kopf;': '\U0001d542', + 'kopf;': '\U0001d55c', + 'Kscr;': '\U0001d4a6', + 'kscr;': '\U0001d4c0', + 'lAarr;': '\u21da', + 'Lacute;': '\u0139', + 'lacute;': '\u013a', + 'laemptyv;': '\u29b4', + 'lagran;': '\u2112', + 'Lambda;': '\u039b', + 'lambda;': '\u03bb', + 'Lang;': '\u27ea', + 'lang;': '\u27e8', + 'langd;': '\u2991', + 'langle;': '\u27e8', + 'lap;': '\u2a85', + 'Laplacetrf;': '\u2112', + 'laquo': '\xab', + 'laquo;': '\xab', + 'Larr;': '\u219e', + 'lArr;': '\u21d0', + 'larr;': '\u2190', + 'larrb;': '\u21e4', + 'larrbfs;': '\u291f', + 'larrfs;': '\u291d', + 'larrhk;': '\u21a9', + 'larrlp;': '\u21ab', + 'larrpl;': '\u2939', + 'larrsim;': '\u2973', + 'larrtl;': '\u21a2', + 'lat;': '\u2aab', + 'lAtail;': '\u291b', + 'latail;': '\u2919', + 'late;': '\u2aad', + 'lates;': '\u2aad\ufe00', + 'lBarr;': '\u290e', + 'lbarr;': '\u290c', + 'lbbrk;': '\u2772', + 'lbrace;': '{', + 'lbrack;': '[', + 'lbrke;': '\u298b', + 'lbrksld;': '\u298f', + 'lbrkslu;': '\u298d', + 'Lcaron;': '\u013d', + 'lcaron;': '\u013e', + 'Lcedil;': '\u013b', + 'lcedil;': '\u013c', + 'lceil;': '\u2308', + 'lcub;': '{', + 'Lcy;': '\u041b', + 'lcy;': '\u043b', + 'ldca;': '\u2936', + 'ldquo;': '\u201c', + 'ldquor;': '\u201e', + 'ldrdhar;': '\u2967', + 'ldrushar;': '\u294b', + 'ldsh;': '\u21b2', + 'lE;': '\u2266', + 'le;': '\u2264', + 'LeftAngleBracket;': '\u27e8', + 'LeftArrow;': '\u2190', + 'Leftarrow;': '\u21d0', + 'leftarrow;': '\u2190', + 'LeftArrowBar;': '\u21e4', + 'LeftArrowRightArrow;': '\u21c6', + 'leftarrowtail;': '\u21a2', + 'LeftCeiling;': '\u2308', + 'LeftDoubleBracket;': '\u27e6', + 'LeftDownTeeVector;': '\u2961', + 'LeftDownVector;': '\u21c3', + 'LeftDownVectorBar;': '\u2959', + 'LeftFloor;': '\u230a', + 'leftharpoondown;': '\u21bd', + 'leftharpoonup;': '\u21bc', + 'leftleftarrows;': '\u21c7', + 'LeftRightArrow;': '\u2194', + 'Leftrightarrow;': '\u21d4', + 'leftrightarrow;': '\u2194', + 'leftrightarrows;': '\u21c6', + 'leftrightharpoons;': '\u21cb', + 'leftrightsquigarrow;': '\u21ad', + 'LeftRightVector;': '\u294e', + 'LeftTee;': '\u22a3', + 'LeftTeeArrow;': '\u21a4', + 'LeftTeeVector;': '\u295a', + 'leftthreetimes;': '\u22cb', + 'LeftTriangle;': '\u22b2', + 'LeftTriangleBar;': '\u29cf', + 'LeftTriangleEqual;': '\u22b4', + 'LeftUpDownVector;': '\u2951', + 'LeftUpTeeVector;': '\u2960', + 'LeftUpVector;': '\u21bf', + 'LeftUpVectorBar;': '\u2958', + 'LeftVector;': '\u21bc', + 'LeftVectorBar;': '\u2952', + 'lEg;': '\u2a8b', + 'leg;': '\u22da', + 'leq;': '\u2264', + 'leqq;': '\u2266', + 'leqslant;': '\u2a7d', + 'les;': '\u2a7d', + 'lescc;': '\u2aa8', + 'lesdot;': '\u2a7f', + 'lesdoto;': '\u2a81', + 'lesdotor;': '\u2a83', + 'lesg;': '\u22da\ufe00', + 'lesges;': '\u2a93', + 'lessapprox;': '\u2a85', + 'lessdot;': '\u22d6', + 'lesseqgtr;': '\u22da', + 'lesseqqgtr;': '\u2a8b', + 'LessEqualGreater;': '\u22da', + 'LessFullEqual;': '\u2266', + 'LessGreater;': '\u2276', + 'lessgtr;': '\u2276', + 'LessLess;': '\u2aa1', + 'lesssim;': '\u2272', + 'LessSlantEqual;': '\u2a7d', + 'LessTilde;': '\u2272', + 'lfisht;': '\u297c', + 'lfloor;': '\u230a', + 'Lfr;': '\U0001d50f', + 'lfr;': '\U0001d529', + 'lg;': '\u2276', + 'lgE;': '\u2a91', + 'lHar;': '\u2962', + 'lhard;': '\u21bd', + 'lharu;': '\u21bc', + 'lharul;': '\u296a', + 'lhblk;': '\u2584', + 'LJcy;': '\u0409', + 'ljcy;': '\u0459', + 'Ll;': '\u22d8', + 'll;': '\u226a', + 'llarr;': '\u21c7', + 'llcorner;': '\u231e', + 'Lleftarrow;': '\u21da', + 'llhard;': '\u296b', + 'lltri;': '\u25fa', + 'Lmidot;': '\u013f', + 'lmidot;': '\u0140', + 'lmoust;': '\u23b0', + 'lmoustache;': '\u23b0', + 'lnap;': '\u2a89', + 'lnapprox;': '\u2a89', + 'lnE;': '\u2268', + 'lne;': '\u2a87', + 'lneq;': '\u2a87', + 'lneqq;': '\u2268', + 'lnsim;': '\u22e6', + 'loang;': '\u27ec', + 'loarr;': '\u21fd', + 'lobrk;': '\u27e6', + 'LongLeftArrow;': '\u27f5', + 'Longleftarrow;': '\u27f8', + 'longleftarrow;': '\u27f5', + 'LongLeftRightArrow;': '\u27f7', + 'Longleftrightarrow;': '\u27fa', + 'longleftrightarrow;': '\u27f7', + 'longmapsto;': '\u27fc', + 'LongRightArrow;': '\u27f6', + 'Longrightarrow;': '\u27f9', + 'longrightarrow;': '\u27f6', + 'looparrowleft;': '\u21ab', + 'looparrowright;': '\u21ac', + 'lopar;': '\u2985', + 'Lopf;': '\U0001d543', + 'lopf;': '\U0001d55d', + 'loplus;': '\u2a2d', + 'lotimes;': '\u2a34', + 'lowast;': '\u2217', + 'lowbar;': '_', + 'LowerLeftArrow;': '\u2199', + 'LowerRightArrow;': '\u2198', + 'loz;': '\u25ca', + 'lozenge;': '\u25ca', + 'lozf;': '\u29eb', + 'lpar;': '(', + 'lparlt;': '\u2993', + 'lrarr;': '\u21c6', + 'lrcorner;': '\u231f', + 'lrhar;': '\u21cb', + 'lrhard;': '\u296d', + 'lrm;': '\u200e', + 'lrtri;': '\u22bf', + 'lsaquo;': '\u2039', + 'Lscr;': '\u2112', + 'lscr;': '\U0001d4c1', + 'Lsh;': '\u21b0', + 'lsh;': '\u21b0', + 'lsim;': '\u2272', + 'lsime;': '\u2a8d', + 'lsimg;': '\u2a8f', + 'lsqb;': '[', + 'lsquo;': '\u2018', + 'lsquor;': '\u201a', + 'Lstrok;': '\u0141', + 'lstrok;': '\u0142', + 'LT': '<', + 'lt': '<', + 'LT;': '<', + 'Lt;': '\u226a', + 'lt;': '<', + 'ltcc;': '\u2aa6', + 'ltcir;': '\u2a79', + 'ltdot;': '\u22d6', + 'lthree;': '\u22cb', + 'ltimes;': '\u22c9', + 'ltlarr;': '\u2976', + 'ltquest;': '\u2a7b', + 'ltri;': '\u25c3', + 'ltrie;': '\u22b4', + 'ltrif;': '\u25c2', + 'ltrPar;': '\u2996', + 'lurdshar;': '\u294a', + 'luruhar;': '\u2966', + 'lvertneqq;': '\u2268\ufe00', + 'lvnE;': '\u2268\ufe00', + 'macr': '\xaf', + 'macr;': '\xaf', + 'male;': '\u2642', + 'malt;': '\u2720', + 'maltese;': '\u2720', + 'Map;': '\u2905', + 'map;': '\u21a6', + 'mapsto;': '\u21a6', + 'mapstodown;': '\u21a7', + 'mapstoleft;': '\u21a4', + 'mapstoup;': '\u21a5', + 'marker;': '\u25ae', + 'mcomma;': '\u2a29', + 'Mcy;': '\u041c', + 'mcy;': '\u043c', + 'mdash;': '\u2014', + 'mDDot;': '\u223a', + 'measuredangle;': '\u2221', + 'MediumSpace;': '\u205f', + 'Mellintrf;': '\u2133', + 'Mfr;': '\U0001d510', + 'mfr;': '\U0001d52a', + 'mho;': '\u2127', + 'micro': '\xb5', + 'micro;': '\xb5', + 'mid;': '\u2223', + 'midast;': '*', + 'midcir;': '\u2af0', + 'middot': '\xb7', + 'middot;': '\xb7', + 'minus;': '\u2212', + 'minusb;': '\u229f', + 'minusd;': '\u2238', + 'minusdu;': '\u2a2a', + 'MinusPlus;': '\u2213', + 'mlcp;': '\u2adb', + 'mldr;': '\u2026', + 'mnplus;': '\u2213', + 'models;': '\u22a7', + 'Mopf;': '\U0001d544', + 'mopf;': '\U0001d55e', + 'mp;': '\u2213', + 'Mscr;': '\u2133', + 'mscr;': '\U0001d4c2', + 'mstpos;': '\u223e', + 'Mu;': '\u039c', + 'mu;': '\u03bc', + 'multimap;': '\u22b8', + 'mumap;': '\u22b8', + 'nabla;': '\u2207', + 'Nacute;': '\u0143', + 'nacute;': '\u0144', + 'nang;': '\u2220\u20d2', + 'nap;': '\u2249', + 'napE;': '\u2a70\u0338', + 'napid;': '\u224b\u0338', + 'napos;': '\u0149', + 'napprox;': '\u2249', + 'natur;': '\u266e', + 'natural;': '\u266e', + 'naturals;': '\u2115', + 'nbsp': '\xa0', + 'nbsp;': '\xa0', + 'nbump;': '\u224e\u0338', + 'nbumpe;': '\u224f\u0338', + 'ncap;': '\u2a43', + 'Ncaron;': '\u0147', + 'ncaron;': '\u0148', + 'Ncedil;': '\u0145', + 'ncedil;': '\u0146', + 'ncong;': '\u2247', + 'ncongdot;': '\u2a6d\u0338', + 'ncup;': '\u2a42', + 'Ncy;': '\u041d', + 'ncy;': '\u043d', + 'ndash;': '\u2013', + 'ne;': '\u2260', + 'nearhk;': '\u2924', + 'neArr;': '\u21d7', + 'nearr;': '\u2197', + 'nearrow;': '\u2197', + 'nedot;': '\u2250\u0338', + 'NegativeMediumSpace;': '\u200b', + 'NegativeThickSpace;': '\u200b', + 'NegativeThinSpace;': '\u200b', + 'NegativeVeryThinSpace;': '\u200b', + 'nequiv;': '\u2262', + 'nesear;': '\u2928', + 'nesim;': '\u2242\u0338', + 'NestedGreaterGreater;': '\u226b', + 'NestedLessLess;': '\u226a', + 'NewLine;': '\n', + 'nexist;': '\u2204', + 'nexists;': '\u2204', + 'Nfr;': '\U0001d511', + 'nfr;': '\U0001d52b', + 'ngE;': '\u2267\u0338', + 'nge;': '\u2271', + 'ngeq;': '\u2271', + 'ngeqq;': '\u2267\u0338', + 'ngeqslant;': '\u2a7e\u0338', + 'nges;': '\u2a7e\u0338', + 'nGg;': '\u22d9\u0338', + 'ngsim;': '\u2275', + 'nGt;': '\u226b\u20d2', + 'ngt;': '\u226f', + 'ngtr;': '\u226f', + 'nGtv;': '\u226b\u0338', + 'nhArr;': '\u21ce', + 'nharr;': '\u21ae', + 'nhpar;': '\u2af2', + 'ni;': '\u220b', + 'nis;': '\u22fc', + 'nisd;': '\u22fa', + 'niv;': '\u220b', + 'NJcy;': '\u040a', + 'njcy;': '\u045a', + 'nlArr;': '\u21cd', + 'nlarr;': '\u219a', + 'nldr;': '\u2025', + 'nlE;': '\u2266\u0338', + 'nle;': '\u2270', + 'nLeftarrow;': '\u21cd', + 'nleftarrow;': '\u219a', + 'nLeftrightarrow;': '\u21ce', + 'nleftrightarrow;': '\u21ae', + 'nleq;': '\u2270', + 'nleqq;': '\u2266\u0338', + 'nleqslant;': '\u2a7d\u0338', + 'nles;': '\u2a7d\u0338', + 'nless;': '\u226e', + 'nLl;': '\u22d8\u0338', + 'nlsim;': '\u2274', + 'nLt;': '\u226a\u20d2', + 'nlt;': '\u226e', + 'nltri;': '\u22ea', + 'nltrie;': '\u22ec', + 'nLtv;': '\u226a\u0338', + 'nmid;': '\u2224', + 'NoBreak;': '\u2060', + 'NonBreakingSpace;': '\xa0', + 'Nopf;': '\u2115', + 'nopf;': '\U0001d55f', + 'not': '\xac', + 'Not;': '\u2aec', + 'not;': '\xac', + 'NotCongruent;': '\u2262', + 'NotCupCap;': '\u226d', + 'NotDoubleVerticalBar;': '\u2226', + 'NotElement;': '\u2209', + 'NotEqual;': '\u2260', + 'NotEqualTilde;': '\u2242\u0338', + 'NotExists;': '\u2204', + 'NotGreater;': '\u226f', + 'NotGreaterEqual;': '\u2271', + 'NotGreaterFullEqual;': '\u2267\u0338', + 'NotGreaterGreater;': '\u226b\u0338', + 'NotGreaterLess;': '\u2279', + 'NotGreaterSlantEqual;': '\u2a7e\u0338', + 'NotGreaterTilde;': '\u2275', + 'NotHumpDownHump;': '\u224e\u0338', + 'NotHumpEqual;': '\u224f\u0338', + 'notin;': '\u2209', + 'notindot;': '\u22f5\u0338', + 'notinE;': '\u22f9\u0338', + 'notinva;': '\u2209', + 'notinvb;': '\u22f7', + 'notinvc;': '\u22f6', + 'NotLeftTriangle;': '\u22ea', + 'NotLeftTriangleBar;': '\u29cf\u0338', + 'NotLeftTriangleEqual;': '\u22ec', + 'NotLess;': '\u226e', + 'NotLessEqual;': '\u2270', + 'NotLessGreater;': '\u2278', + 'NotLessLess;': '\u226a\u0338', + 'NotLessSlantEqual;': '\u2a7d\u0338', + 'NotLessTilde;': '\u2274', + 'NotNestedGreaterGreater;': '\u2aa2\u0338', + 'NotNestedLessLess;': '\u2aa1\u0338', + 'notni;': '\u220c', + 'notniva;': '\u220c', + 'notnivb;': '\u22fe', + 'notnivc;': '\u22fd', + 'NotPrecedes;': '\u2280', + 'NotPrecedesEqual;': '\u2aaf\u0338', + 'NotPrecedesSlantEqual;': '\u22e0', + 'NotReverseElement;': '\u220c', + 'NotRightTriangle;': '\u22eb', + 'NotRightTriangleBar;': '\u29d0\u0338', + 'NotRightTriangleEqual;': '\u22ed', + 'NotSquareSubset;': '\u228f\u0338', + 'NotSquareSubsetEqual;': '\u22e2', + 'NotSquareSuperset;': '\u2290\u0338', + 'NotSquareSupersetEqual;': '\u22e3', + 'NotSubset;': '\u2282\u20d2', + 'NotSubsetEqual;': '\u2288', + 'NotSucceeds;': '\u2281', + 'NotSucceedsEqual;': '\u2ab0\u0338', + 'NotSucceedsSlantEqual;': '\u22e1', + 'NotSucceedsTilde;': '\u227f\u0338', + 'NotSuperset;': '\u2283\u20d2', + 'NotSupersetEqual;': '\u2289', + 'NotTilde;': '\u2241', + 'NotTildeEqual;': '\u2244', + 'NotTildeFullEqual;': '\u2247', + 'NotTildeTilde;': '\u2249', + 'NotVerticalBar;': '\u2224', + 'npar;': '\u2226', + 'nparallel;': '\u2226', + 'nparsl;': '\u2afd\u20e5', + 'npart;': '\u2202\u0338', + 'npolint;': '\u2a14', + 'npr;': '\u2280', + 'nprcue;': '\u22e0', + 'npre;': '\u2aaf\u0338', + 'nprec;': '\u2280', + 'npreceq;': '\u2aaf\u0338', + 'nrArr;': '\u21cf', + 'nrarr;': '\u219b', + 'nrarrc;': '\u2933\u0338', + 'nrarrw;': '\u219d\u0338', + 'nRightarrow;': '\u21cf', + 'nrightarrow;': '\u219b', + 'nrtri;': '\u22eb', + 'nrtrie;': '\u22ed', + 'nsc;': '\u2281', + 'nsccue;': '\u22e1', + 'nsce;': '\u2ab0\u0338', + 'Nscr;': '\U0001d4a9', + 'nscr;': '\U0001d4c3', + 'nshortmid;': '\u2224', + 'nshortparallel;': '\u2226', + 'nsim;': '\u2241', + 'nsime;': '\u2244', + 'nsimeq;': '\u2244', + 'nsmid;': '\u2224', + 'nspar;': '\u2226', + 'nsqsube;': '\u22e2', + 'nsqsupe;': '\u22e3', + 'nsub;': '\u2284', + 'nsubE;': '\u2ac5\u0338', + 'nsube;': '\u2288', + 'nsubset;': '\u2282\u20d2', + 'nsubseteq;': '\u2288', + 'nsubseteqq;': '\u2ac5\u0338', + 'nsucc;': '\u2281', + 'nsucceq;': '\u2ab0\u0338', + 'nsup;': '\u2285', + 'nsupE;': '\u2ac6\u0338', + 'nsupe;': '\u2289', + 'nsupset;': '\u2283\u20d2', + 'nsupseteq;': '\u2289', + 'nsupseteqq;': '\u2ac6\u0338', + 'ntgl;': '\u2279', + 'Ntilde': '\xd1', + 'ntilde': '\xf1', + 'Ntilde;': '\xd1', + 'ntilde;': '\xf1', + 'ntlg;': '\u2278', + 'ntriangleleft;': '\u22ea', + 'ntrianglelefteq;': '\u22ec', + 'ntriangleright;': '\u22eb', + 'ntrianglerighteq;': '\u22ed', + 'Nu;': '\u039d', + 'nu;': '\u03bd', + 'num;': '#', + 'numero;': '\u2116', + 'numsp;': '\u2007', + 'nvap;': '\u224d\u20d2', + 'nVDash;': '\u22af', + 'nVdash;': '\u22ae', + 'nvDash;': '\u22ad', + 'nvdash;': '\u22ac', + 'nvge;': '\u2265\u20d2', + 'nvgt;': '>\u20d2', + 'nvHarr;': '\u2904', + 'nvinfin;': '\u29de', + 'nvlArr;': '\u2902', + 'nvle;': '\u2264\u20d2', + 'nvlt;': '<\u20d2', + 'nvltrie;': '\u22b4\u20d2', + 'nvrArr;': '\u2903', + 'nvrtrie;': '\u22b5\u20d2', + 'nvsim;': '\u223c\u20d2', + 'nwarhk;': '\u2923', + 'nwArr;': '\u21d6', + 'nwarr;': '\u2196', + 'nwarrow;': '\u2196', + 'nwnear;': '\u2927', + 'Oacute': '\xd3', + 'oacute': '\xf3', + 'Oacute;': '\xd3', + 'oacute;': '\xf3', + 'oast;': '\u229b', + 'ocir;': '\u229a', + 'Ocirc': '\xd4', + 'ocirc': '\xf4', + 'Ocirc;': '\xd4', + 'ocirc;': '\xf4', + 'Ocy;': '\u041e', + 'ocy;': '\u043e', + 'odash;': '\u229d', + 'Odblac;': '\u0150', + 'odblac;': '\u0151', + 'odiv;': '\u2a38', + 'odot;': '\u2299', + 'odsold;': '\u29bc', + 'OElig;': '\u0152', + 'oelig;': '\u0153', + 'ofcir;': '\u29bf', + 'Ofr;': '\U0001d512', + 'ofr;': '\U0001d52c', + 'ogon;': '\u02db', + 'Ograve': '\xd2', + 'ograve': '\xf2', + 'Ograve;': '\xd2', + 'ograve;': '\xf2', + 'ogt;': '\u29c1', + 'ohbar;': '\u29b5', + 'ohm;': '\u03a9', + 'oint;': '\u222e', + 'olarr;': '\u21ba', + 'olcir;': '\u29be', + 'olcross;': '\u29bb', + 'oline;': '\u203e', + 'olt;': '\u29c0', + 'Omacr;': '\u014c', + 'omacr;': '\u014d', + 'Omega;': '\u03a9', + 'omega;': '\u03c9', + 'Omicron;': '\u039f', + 'omicron;': '\u03bf', + 'omid;': '\u29b6', + 'ominus;': '\u2296', + 'Oopf;': '\U0001d546', + 'oopf;': '\U0001d560', + 'opar;': '\u29b7', + 'OpenCurlyDoubleQuote;': '\u201c', + 'OpenCurlyQuote;': '\u2018', + 'operp;': '\u29b9', + 'oplus;': '\u2295', + 'Or;': '\u2a54', + 'or;': '\u2228', + 'orarr;': '\u21bb', + 'ord;': '\u2a5d', + 'order;': '\u2134', + 'orderof;': '\u2134', + 'ordf': '\xaa', + 'ordf;': '\xaa', + 'ordm': '\xba', + 'ordm;': '\xba', + 'origof;': '\u22b6', + 'oror;': '\u2a56', + 'orslope;': '\u2a57', + 'orv;': '\u2a5b', + 'oS;': '\u24c8', + 'Oscr;': '\U0001d4aa', + 'oscr;': '\u2134', + 'Oslash': '\xd8', + 'oslash': '\xf8', + 'Oslash;': '\xd8', + 'oslash;': '\xf8', + 'osol;': '\u2298', + 'Otilde': '\xd5', + 'otilde': '\xf5', + 'Otilde;': '\xd5', + 'otilde;': '\xf5', + 'Otimes;': '\u2a37', + 'otimes;': '\u2297', + 'otimesas;': '\u2a36', + 'Ouml': '\xd6', + 'ouml': '\xf6', + 'Ouml;': '\xd6', + 'ouml;': '\xf6', + 'ovbar;': '\u233d', + 'OverBar;': '\u203e', + 'OverBrace;': '\u23de', + 'OverBracket;': '\u23b4', + 'OverParenthesis;': '\u23dc', + 'par;': '\u2225', + 'para': '\xb6', + 'para;': '\xb6', + 'parallel;': '\u2225', + 'parsim;': '\u2af3', + 'parsl;': '\u2afd', + 'part;': '\u2202', + 'PartialD;': '\u2202', + 'Pcy;': '\u041f', + 'pcy;': '\u043f', + 'percnt;': '%', + 'period;': '.', + 'permil;': '\u2030', + 'perp;': '\u22a5', + 'pertenk;': '\u2031', + 'Pfr;': '\U0001d513', + 'pfr;': '\U0001d52d', + 'Phi;': '\u03a6', + 'phi;': '\u03c6', + 'phiv;': '\u03d5', + 'phmmat;': '\u2133', + 'phone;': '\u260e', + 'Pi;': '\u03a0', + 'pi;': '\u03c0', + 'pitchfork;': '\u22d4', + 'piv;': '\u03d6', + 'planck;': '\u210f', + 'planckh;': '\u210e', + 'plankv;': '\u210f', + 'plus;': '+', + 'plusacir;': '\u2a23', + 'plusb;': '\u229e', + 'pluscir;': '\u2a22', + 'plusdo;': '\u2214', + 'plusdu;': '\u2a25', + 'pluse;': '\u2a72', + 'PlusMinus;': '\xb1', + 'plusmn': '\xb1', + 'plusmn;': '\xb1', + 'plussim;': '\u2a26', + 'plustwo;': '\u2a27', + 'pm;': '\xb1', + 'Poincareplane;': '\u210c', + 'pointint;': '\u2a15', + 'Popf;': '\u2119', + 'popf;': '\U0001d561', + 'pound': '\xa3', + 'pound;': '\xa3', + 'Pr;': '\u2abb', + 'pr;': '\u227a', + 'prap;': '\u2ab7', + 'prcue;': '\u227c', + 'prE;': '\u2ab3', + 'pre;': '\u2aaf', + 'prec;': '\u227a', + 'precapprox;': '\u2ab7', + 'preccurlyeq;': '\u227c', + 'Precedes;': '\u227a', + 'PrecedesEqual;': '\u2aaf', + 'PrecedesSlantEqual;': '\u227c', + 'PrecedesTilde;': '\u227e', + 'preceq;': '\u2aaf', + 'precnapprox;': '\u2ab9', + 'precneqq;': '\u2ab5', + 'precnsim;': '\u22e8', + 'precsim;': '\u227e', + 'Prime;': '\u2033', + 'prime;': '\u2032', + 'primes;': '\u2119', + 'prnap;': '\u2ab9', + 'prnE;': '\u2ab5', + 'prnsim;': '\u22e8', + 'prod;': '\u220f', + 'Product;': '\u220f', + 'profalar;': '\u232e', + 'profline;': '\u2312', + 'profsurf;': '\u2313', + 'prop;': '\u221d', + 'Proportion;': '\u2237', + 'Proportional;': '\u221d', + 'propto;': '\u221d', + 'prsim;': '\u227e', + 'prurel;': '\u22b0', + 'Pscr;': '\U0001d4ab', + 'pscr;': '\U0001d4c5', + 'Psi;': '\u03a8', + 'psi;': '\u03c8', + 'puncsp;': '\u2008', + 'Qfr;': '\U0001d514', + 'qfr;': '\U0001d52e', + 'qint;': '\u2a0c', + 'Qopf;': '\u211a', + 'qopf;': '\U0001d562', + 'qprime;': '\u2057', + 'Qscr;': '\U0001d4ac', + 'qscr;': '\U0001d4c6', + 'quaternions;': '\u210d', + 'quatint;': '\u2a16', + 'quest;': '?', + 'questeq;': '\u225f', + 'QUOT': '"', + 'quot': '"', + 'QUOT;': '"', + 'quot;': '"', + 'rAarr;': '\u21db', + 'race;': '\u223d\u0331', + 'Racute;': '\u0154', + 'racute;': '\u0155', + 'radic;': '\u221a', + 'raemptyv;': '\u29b3', + 'Rang;': '\u27eb', + 'rang;': '\u27e9', + 'rangd;': '\u2992', + 'range;': '\u29a5', + 'rangle;': '\u27e9', + 'raquo': '\xbb', + 'raquo;': '\xbb', + 'Rarr;': '\u21a0', + 'rArr;': '\u21d2', + 'rarr;': '\u2192', + 'rarrap;': '\u2975', + 'rarrb;': '\u21e5', + 'rarrbfs;': '\u2920', + 'rarrc;': '\u2933', + 'rarrfs;': '\u291e', + 'rarrhk;': '\u21aa', + 'rarrlp;': '\u21ac', + 'rarrpl;': '\u2945', + 'rarrsim;': '\u2974', + 'Rarrtl;': '\u2916', + 'rarrtl;': '\u21a3', + 'rarrw;': '\u219d', + 'rAtail;': '\u291c', + 'ratail;': '\u291a', + 'ratio;': '\u2236', + 'rationals;': '\u211a', + 'RBarr;': '\u2910', + 'rBarr;': '\u290f', + 'rbarr;': '\u290d', + 'rbbrk;': '\u2773', + 'rbrace;': '}', + 'rbrack;': ']', + 'rbrke;': '\u298c', + 'rbrksld;': '\u298e', + 'rbrkslu;': '\u2990', + 'Rcaron;': '\u0158', + 'rcaron;': '\u0159', + 'Rcedil;': '\u0156', + 'rcedil;': '\u0157', + 'rceil;': '\u2309', + 'rcub;': '}', + 'Rcy;': '\u0420', + 'rcy;': '\u0440', + 'rdca;': '\u2937', + 'rdldhar;': '\u2969', + 'rdquo;': '\u201d', + 'rdquor;': '\u201d', + 'rdsh;': '\u21b3', + 'Re;': '\u211c', + 'real;': '\u211c', + 'realine;': '\u211b', + 'realpart;': '\u211c', + 'reals;': '\u211d', + 'rect;': '\u25ad', + 'REG': '\xae', + 'reg': '\xae', + 'REG;': '\xae', + 'reg;': '\xae', + 'ReverseElement;': '\u220b', + 'ReverseEquilibrium;': '\u21cb', + 'ReverseUpEquilibrium;': '\u296f', + 'rfisht;': '\u297d', + 'rfloor;': '\u230b', + 'Rfr;': '\u211c', + 'rfr;': '\U0001d52f', + 'rHar;': '\u2964', + 'rhard;': '\u21c1', + 'rharu;': '\u21c0', + 'rharul;': '\u296c', + 'Rho;': '\u03a1', + 'rho;': '\u03c1', + 'rhov;': '\u03f1', + 'RightAngleBracket;': '\u27e9', + 'RightArrow;': '\u2192', + 'Rightarrow;': '\u21d2', + 'rightarrow;': '\u2192', + 'RightArrowBar;': '\u21e5', + 'RightArrowLeftArrow;': '\u21c4', + 'rightarrowtail;': '\u21a3', + 'RightCeiling;': '\u2309', + 'RightDoubleBracket;': '\u27e7', + 'RightDownTeeVector;': '\u295d', + 'RightDownVector;': '\u21c2', + 'RightDownVectorBar;': '\u2955', + 'RightFloor;': '\u230b', + 'rightharpoondown;': '\u21c1', + 'rightharpoonup;': '\u21c0', + 'rightleftarrows;': '\u21c4', + 'rightleftharpoons;': '\u21cc', + 'rightrightarrows;': '\u21c9', + 'rightsquigarrow;': '\u219d', + 'RightTee;': '\u22a2', + 'RightTeeArrow;': '\u21a6', + 'RightTeeVector;': '\u295b', + 'rightthreetimes;': '\u22cc', + 'RightTriangle;': '\u22b3', + 'RightTriangleBar;': '\u29d0', + 'RightTriangleEqual;': '\u22b5', + 'RightUpDownVector;': '\u294f', + 'RightUpTeeVector;': '\u295c', + 'RightUpVector;': '\u21be', + 'RightUpVectorBar;': '\u2954', + 'RightVector;': '\u21c0', + 'RightVectorBar;': '\u2953', + 'ring;': '\u02da', + 'risingdotseq;': '\u2253', + 'rlarr;': '\u21c4', + 'rlhar;': '\u21cc', + 'rlm;': '\u200f', + 'rmoust;': '\u23b1', + 'rmoustache;': '\u23b1', + 'rnmid;': '\u2aee', + 'roang;': '\u27ed', + 'roarr;': '\u21fe', + 'robrk;': '\u27e7', + 'ropar;': '\u2986', + 'Ropf;': '\u211d', + 'ropf;': '\U0001d563', + 'roplus;': '\u2a2e', + 'rotimes;': '\u2a35', + 'RoundImplies;': '\u2970', + 'rpar;': ')', + 'rpargt;': '\u2994', + 'rppolint;': '\u2a12', + 'rrarr;': '\u21c9', + 'Rrightarrow;': '\u21db', + 'rsaquo;': '\u203a', + 'Rscr;': '\u211b', + 'rscr;': '\U0001d4c7', + 'Rsh;': '\u21b1', + 'rsh;': '\u21b1', + 'rsqb;': ']', + 'rsquo;': '\u2019', + 'rsquor;': '\u2019', + 'rthree;': '\u22cc', + 'rtimes;': '\u22ca', + 'rtri;': '\u25b9', + 'rtrie;': '\u22b5', + 'rtrif;': '\u25b8', + 'rtriltri;': '\u29ce', + 'RuleDelayed;': '\u29f4', + 'ruluhar;': '\u2968', + 'rx;': '\u211e', + 'Sacute;': '\u015a', + 'sacute;': '\u015b', + 'sbquo;': '\u201a', + 'Sc;': '\u2abc', + 'sc;': '\u227b', + 'scap;': '\u2ab8', + 'Scaron;': '\u0160', + 'scaron;': '\u0161', + 'sccue;': '\u227d', + 'scE;': '\u2ab4', + 'sce;': '\u2ab0', + 'Scedil;': '\u015e', + 'scedil;': '\u015f', + 'Scirc;': '\u015c', + 'scirc;': '\u015d', + 'scnap;': '\u2aba', + 'scnE;': '\u2ab6', + 'scnsim;': '\u22e9', + 'scpolint;': '\u2a13', + 'scsim;': '\u227f', + 'Scy;': '\u0421', + 'scy;': '\u0441', + 'sdot;': '\u22c5', + 'sdotb;': '\u22a1', + 'sdote;': '\u2a66', + 'searhk;': '\u2925', + 'seArr;': '\u21d8', + 'searr;': '\u2198', + 'searrow;': '\u2198', + 'sect': '\xa7', + 'sect;': '\xa7', + 'semi;': ';', + 'seswar;': '\u2929', + 'setminus;': '\u2216', + 'setmn;': '\u2216', + 'sext;': '\u2736', + 'Sfr;': '\U0001d516', + 'sfr;': '\U0001d530', + 'sfrown;': '\u2322', + 'sharp;': '\u266f', + 'SHCHcy;': '\u0429', + 'shchcy;': '\u0449', + 'SHcy;': '\u0428', + 'shcy;': '\u0448', + 'ShortDownArrow;': '\u2193', + 'ShortLeftArrow;': '\u2190', + 'shortmid;': '\u2223', + 'shortparallel;': '\u2225', + 'ShortRightArrow;': '\u2192', + 'ShortUpArrow;': '\u2191', + 'shy': '\xad', + 'shy;': '\xad', + 'Sigma;': '\u03a3', + 'sigma;': '\u03c3', + 'sigmaf;': '\u03c2', + 'sigmav;': '\u03c2', + 'sim;': '\u223c', + 'simdot;': '\u2a6a', + 'sime;': '\u2243', + 'simeq;': '\u2243', + 'simg;': '\u2a9e', + 'simgE;': '\u2aa0', + 'siml;': '\u2a9d', + 'simlE;': '\u2a9f', + 'simne;': '\u2246', + 'simplus;': '\u2a24', + 'simrarr;': '\u2972', + 'slarr;': '\u2190', + 'SmallCircle;': '\u2218', + 'smallsetminus;': '\u2216', + 'smashp;': '\u2a33', + 'smeparsl;': '\u29e4', + 'smid;': '\u2223', + 'smile;': '\u2323', + 'smt;': '\u2aaa', + 'smte;': '\u2aac', + 'smtes;': '\u2aac\ufe00', + 'SOFTcy;': '\u042c', + 'softcy;': '\u044c', + 'sol;': '/', + 'solb;': '\u29c4', + 'solbar;': '\u233f', + 'Sopf;': '\U0001d54a', + 'sopf;': '\U0001d564', + 'spades;': '\u2660', + 'spadesuit;': '\u2660', + 'spar;': '\u2225', + 'sqcap;': '\u2293', + 'sqcaps;': '\u2293\ufe00', + 'sqcup;': '\u2294', + 'sqcups;': '\u2294\ufe00', + 'Sqrt;': '\u221a', + 'sqsub;': '\u228f', + 'sqsube;': '\u2291', + 'sqsubset;': '\u228f', + 'sqsubseteq;': '\u2291', + 'sqsup;': '\u2290', + 'sqsupe;': '\u2292', + 'sqsupset;': '\u2290', + 'sqsupseteq;': '\u2292', + 'squ;': '\u25a1', + 'Square;': '\u25a1', + 'square;': '\u25a1', + 'SquareIntersection;': '\u2293', + 'SquareSubset;': '\u228f', + 'SquareSubsetEqual;': '\u2291', + 'SquareSuperset;': '\u2290', + 'SquareSupersetEqual;': '\u2292', + 'SquareUnion;': '\u2294', + 'squarf;': '\u25aa', + 'squf;': '\u25aa', + 'srarr;': '\u2192', + 'Sscr;': '\U0001d4ae', + 'sscr;': '\U0001d4c8', + 'ssetmn;': '\u2216', + 'ssmile;': '\u2323', + 'sstarf;': '\u22c6', + 'Star;': '\u22c6', + 'star;': '\u2606', + 'starf;': '\u2605', + 'straightepsilon;': '\u03f5', + 'straightphi;': '\u03d5', + 'strns;': '\xaf', + 'Sub;': '\u22d0', + 'sub;': '\u2282', + 'subdot;': '\u2abd', + 'subE;': '\u2ac5', + 'sube;': '\u2286', + 'subedot;': '\u2ac3', + 'submult;': '\u2ac1', + 'subnE;': '\u2acb', + 'subne;': '\u228a', + 'subplus;': '\u2abf', + 'subrarr;': '\u2979', + 'Subset;': '\u22d0', + 'subset;': '\u2282', + 'subseteq;': '\u2286', + 'subseteqq;': '\u2ac5', + 'SubsetEqual;': '\u2286', + 'subsetneq;': '\u228a', + 'subsetneqq;': '\u2acb', + 'subsim;': '\u2ac7', + 'subsub;': '\u2ad5', + 'subsup;': '\u2ad3', + 'succ;': '\u227b', + 'succapprox;': '\u2ab8', + 'succcurlyeq;': '\u227d', + 'Succeeds;': '\u227b', + 'SucceedsEqual;': '\u2ab0', + 'SucceedsSlantEqual;': '\u227d', + 'SucceedsTilde;': '\u227f', + 'succeq;': '\u2ab0', + 'succnapprox;': '\u2aba', + 'succneqq;': '\u2ab6', + 'succnsim;': '\u22e9', + 'succsim;': '\u227f', + 'SuchThat;': '\u220b', + 'Sum;': '\u2211', + 'sum;': '\u2211', + 'sung;': '\u266a', + 'sup1': '\xb9', + 'sup1;': '\xb9', + 'sup2': '\xb2', + 'sup2;': '\xb2', + 'sup3': '\xb3', + 'sup3;': '\xb3', + 'Sup;': '\u22d1', + 'sup;': '\u2283', + 'supdot;': '\u2abe', + 'supdsub;': '\u2ad8', + 'supE;': '\u2ac6', + 'supe;': '\u2287', + 'supedot;': '\u2ac4', + 'Superset;': '\u2283', + 'SupersetEqual;': '\u2287', + 'suphsol;': '\u27c9', + 'suphsub;': '\u2ad7', + 'suplarr;': '\u297b', + 'supmult;': '\u2ac2', + 'supnE;': '\u2acc', + 'supne;': '\u228b', + 'supplus;': '\u2ac0', + 'Supset;': '\u22d1', + 'supset;': '\u2283', + 'supseteq;': '\u2287', + 'supseteqq;': '\u2ac6', + 'supsetneq;': '\u228b', + 'supsetneqq;': '\u2acc', + 'supsim;': '\u2ac8', + 'supsub;': '\u2ad4', + 'supsup;': '\u2ad6', + 'swarhk;': '\u2926', + 'swArr;': '\u21d9', + 'swarr;': '\u2199', + 'swarrow;': '\u2199', + 'swnwar;': '\u292a', + 'szlig': '\xdf', + 'szlig;': '\xdf', + 'Tab;': '\t', + 'target;': '\u2316', + 'Tau;': '\u03a4', + 'tau;': '\u03c4', + 'tbrk;': '\u23b4', + 'Tcaron;': '\u0164', + 'tcaron;': '\u0165', + 'Tcedil;': '\u0162', + 'tcedil;': '\u0163', + 'Tcy;': '\u0422', + 'tcy;': '\u0442', + 'tdot;': '\u20db', + 'telrec;': '\u2315', + 'Tfr;': '\U0001d517', + 'tfr;': '\U0001d531', + 'there4;': '\u2234', + 'Therefore;': '\u2234', + 'therefore;': '\u2234', + 'Theta;': '\u0398', + 'theta;': '\u03b8', + 'thetasym;': '\u03d1', + 'thetav;': '\u03d1', + 'thickapprox;': '\u2248', + 'thicksim;': '\u223c', + 'ThickSpace;': '\u205f\u200a', + 'thinsp;': '\u2009', + 'ThinSpace;': '\u2009', + 'thkap;': '\u2248', + 'thksim;': '\u223c', + 'THORN': '\xde', + 'thorn': '\xfe', + 'THORN;': '\xde', + 'thorn;': '\xfe', + 'Tilde;': '\u223c', + 'tilde;': '\u02dc', + 'TildeEqual;': '\u2243', + 'TildeFullEqual;': '\u2245', + 'TildeTilde;': '\u2248', + 'times': '\xd7', + 'times;': '\xd7', + 'timesb;': '\u22a0', + 'timesbar;': '\u2a31', + 'timesd;': '\u2a30', + 'tint;': '\u222d', + 'toea;': '\u2928', + 'top;': '\u22a4', + 'topbot;': '\u2336', + 'topcir;': '\u2af1', + 'Topf;': '\U0001d54b', + 'topf;': '\U0001d565', + 'topfork;': '\u2ada', + 'tosa;': '\u2929', + 'tprime;': '\u2034', + 'TRADE;': '\u2122', + 'trade;': '\u2122', + 'triangle;': '\u25b5', + 'triangledown;': '\u25bf', + 'triangleleft;': '\u25c3', + 'trianglelefteq;': '\u22b4', + 'triangleq;': '\u225c', + 'triangleright;': '\u25b9', + 'trianglerighteq;': '\u22b5', + 'tridot;': '\u25ec', + 'trie;': '\u225c', + 'triminus;': '\u2a3a', + 'TripleDot;': '\u20db', + 'triplus;': '\u2a39', + 'trisb;': '\u29cd', + 'tritime;': '\u2a3b', + 'trpezium;': '\u23e2', + 'Tscr;': '\U0001d4af', + 'tscr;': '\U0001d4c9', + 'TScy;': '\u0426', + 'tscy;': '\u0446', + 'TSHcy;': '\u040b', + 'tshcy;': '\u045b', + 'Tstrok;': '\u0166', + 'tstrok;': '\u0167', + 'twixt;': '\u226c', + 'twoheadleftarrow;': '\u219e', + 'twoheadrightarrow;': '\u21a0', + 'Uacute': '\xda', + 'uacute': '\xfa', + 'Uacute;': '\xda', + 'uacute;': '\xfa', + 'Uarr;': '\u219f', + 'uArr;': '\u21d1', + 'uarr;': '\u2191', + 'Uarrocir;': '\u2949', + 'Ubrcy;': '\u040e', + 'ubrcy;': '\u045e', + 'Ubreve;': '\u016c', + 'ubreve;': '\u016d', + 'Ucirc': '\xdb', + 'ucirc': '\xfb', + 'Ucirc;': '\xdb', + 'ucirc;': '\xfb', + 'Ucy;': '\u0423', + 'ucy;': '\u0443', + 'udarr;': '\u21c5', + 'Udblac;': '\u0170', + 'udblac;': '\u0171', + 'udhar;': '\u296e', + 'ufisht;': '\u297e', + 'Ufr;': '\U0001d518', + 'ufr;': '\U0001d532', + 'Ugrave': '\xd9', + 'ugrave': '\xf9', + 'Ugrave;': '\xd9', + 'ugrave;': '\xf9', + 'uHar;': '\u2963', + 'uharl;': '\u21bf', + 'uharr;': '\u21be', + 'uhblk;': '\u2580', + 'ulcorn;': '\u231c', + 'ulcorner;': '\u231c', + 'ulcrop;': '\u230f', + 'ultri;': '\u25f8', + 'Umacr;': '\u016a', + 'umacr;': '\u016b', + 'uml': '\xa8', + 'uml;': '\xa8', + 'UnderBar;': '_', + 'UnderBrace;': '\u23df', + 'UnderBracket;': '\u23b5', + 'UnderParenthesis;': '\u23dd', + 'Union;': '\u22c3', + 'UnionPlus;': '\u228e', + 'Uogon;': '\u0172', + 'uogon;': '\u0173', + 'Uopf;': '\U0001d54c', + 'uopf;': '\U0001d566', + 'UpArrow;': '\u2191', + 'Uparrow;': '\u21d1', + 'uparrow;': '\u2191', + 'UpArrowBar;': '\u2912', + 'UpArrowDownArrow;': '\u21c5', + 'UpDownArrow;': '\u2195', + 'Updownarrow;': '\u21d5', + 'updownarrow;': '\u2195', + 'UpEquilibrium;': '\u296e', + 'upharpoonleft;': '\u21bf', + 'upharpoonright;': '\u21be', + 'uplus;': '\u228e', + 'UpperLeftArrow;': '\u2196', + 'UpperRightArrow;': '\u2197', + 'Upsi;': '\u03d2', + 'upsi;': '\u03c5', + 'upsih;': '\u03d2', + 'Upsilon;': '\u03a5', + 'upsilon;': '\u03c5', + 'UpTee;': '\u22a5', + 'UpTeeArrow;': '\u21a5', + 'upuparrows;': '\u21c8', + 'urcorn;': '\u231d', + 'urcorner;': '\u231d', + 'urcrop;': '\u230e', + 'Uring;': '\u016e', + 'uring;': '\u016f', + 'urtri;': '\u25f9', + 'Uscr;': '\U0001d4b0', + 'uscr;': '\U0001d4ca', + 'utdot;': '\u22f0', + 'Utilde;': '\u0168', + 'utilde;': '\u0169', + 'utri;': '\u25b5', + 'utrif;': '\u25b4', + 'uuarr;': '\u21c8', + 'Uuml': '\xdc', + 'uuml': '\xfc', + 'Uuml;': '\xdc', + 'uuml;': '\xfc', + 'uwangle;': '\u29a7', + 'vangrt;': '\u299c', + 'varepsilon;': '\u03f5', + 'varkappa;': '\u03f0', + 'varnothing;': '\u2205', + 'varphi;': '\u03d5', + 'varpi;': '\u03d6', + 'varpropto;': '\u221d', + 'vArr;': '\u21d5', + 'varr;': '\u2195', + 'varrho;': '\u03f1', + 'varsigma;': '\u03c2', + 'varsubsetneq;': '\u228a\ufe00', + 'varsubsetneqq;': '\u2acb\ufe00', + 'varsupsetneq;': '\u228b\ufe00', + 'varsupsetneqq;': '\u2acc\ufe00', + 'vartheta;': '\u03d1', + 'vartriangleleft;': '\u22b2', + 'vartriangleright;': '\u22b3', + 'Vbar;': '\u2aeb', + 'vBar;': '\u2ae8', + 'vBarv;': '\u2ae9', + 'Vcy;': '\u0412', + 'vcy;': '\u0432', + 'VDash;': '\u22ab', + 'Vdash;': '\u22a9', + 'vDash;': '\u22a8', + 'vdash;': '\u22a2', + 'Vdashl;': '\u2ae6', + 'Vee;': '\u22c1', + 'vee;': '\u2228', + 'veebar;': '\u22bb', + 'veeeq;': '\u225a', + 'vellip;': '\u22ee', + 'Verbar;': '\u2016', + 'verbar;': '|', + 'Vert;': '\u2016', + 'vert;': '|', + 'VerticalBar;': '\u2223', + 'VerticalLine;': '|', + 'VerticalSeparator;': '\u2758', + 'VerticalTilde;': '\u2240', + 'VeryThinSpace;': '\u200a', + 'Vfr;': '\U0001d519', + 'vfr;': '\U0001d533', + 'vltri;': '\u22b2', + 'vnsub;': '\u2282\u20d2', + 'vnsup;': '\u2283\u20d2', + 'Vopf;': '\U0001d54d', + 'vopf;': '\U0001d567', + 'vprop;': '\u221d', + 'vrtri;': '\u22b3', + 'Vscr;': '\U0001d4b1', + 'vscr;': '\U0001d4cb', + 'vsubnE;': '\u2acb\ufe00', + 'vsubne;': '\u228a\ufe00', + 'vsupnE;': '\u2acc\ufe00', + 'vsupne;': '\u228b\ufe00', + 'Vvdash;': '\u22aa', + 'vzigzag;': '\u299a', + 'Wcirc;': '\u0174', + 'wcirc;': '\u0175', + 'wedbar;': '\u2a5f', + 'Wedge;': '\u22c0', + 'wedge;': '\u2227', + 'wedgeq;': '\u2259', + 'weierp;': '\u2118', + 'Wfr;': '\U0001d51a', + 'wfr;': '\U0001d534', + 'Wopf;': '\U0001d54e', + 'wopf;': '\U0001d568', + 'wp;': '\u2118', + 'wr;': '\u2240', + 'wreath;': '\u2240', + 'Wscr;': '\U0001d4b2', + 'wscr;': '\U0001d4cc', + 'xcap;': '\u22c2', + 'xcirc;': '\u25ef', + 'xcup;': '\u22c3', + 'xdtri;': '\u25bd', + 'Xfr;': '\U0001d51b', + 'xfr;': '\U0001d535', + 'xhArr;': '\u27fa', + 'xharr;': '\u27f7', + 'Xi;': '\u039e', + 'xi;': '\u03be', + 'xlArr;': '\u27f8', + 'xlarr;': '\u27f5', + 'xmap;': '\u27fc', + 'xnis;': '\u22fb', + 'xodot;': '\u2a00', + 'Xopf;': '\U0001d54f', + 'xopf;': '\U0001d569', + 'xoplus;': '\u2a01', + 'xotime;': '\u2a02', + 'xrArr;': '\u27f9', + 'xrarr;': '\u27f6', + 'Xscr;': '\U0001d4b3', + 'xscr;': '\U0001d4cd', + 'xsqcup;': '\u2a06', + 'xuplus;': '\u2a04', + 'xutri;': '\u25b3', + 'xvee;': '\u22c1', + 'xwedge;': '\u22c0', + 'Yacute': '\xdd', + 'yacute': '\xfd', + 'Yacute;': '\xdd', + 'yacute;': '\xfd', + 'YAcy;': '\u042f', + 'yacy;': '\u044f', + 'Ycirc;': '\u0176', + 'ycirc;': '\u0177', + 'Ycy;': '\u042b', + 'ycy;': '\u044b', + 'yen': '\xa5', + 'yen;': '\xa5', + 'Yfr;': '\U0001d51c', + 'yfr;': '\U0001d536', + 'YIcy;': '\u0407', + 'yicy;': '\u0457', + 'Yopf;': '\U0001d550', + 'yopf;': '\U0001d56a', + 'Yscr;': '\U0001d4b4', + 'yscr;': '\U0001d4ce', + 'YUcy;': '\u042e', + 'yucy;': '\u044e', + 'yuml': '\xff', + 'Yuml;': '\u0178', + 'yuml;': '\xff', + 'Zacute;': '\u0179', + 'zacute;': '\u017a', + 'Zcaron;': '\u017d', + 'zcaron;': '\u017e', + 'Zcy;': '\u0417', + 'zcy;': '\u0437', + 'Zdot;': '\u017b', + 'zdot;': '\u017c', + 'zeetrf;': '\u2128', + 'ZeroWidthSpace;': '\u200b', + 'Zeta;': '\u0396', + 'zeta;': '\u03b6', + 'Zfr;': '\u2128', + 'zfr;': '\U0001d537', + 'ZHcy;': '\u0416', + 'zhcy;': '\u0436', + 'zigrarr;': '\u21dd', + 'Zopf;': '\u2124', + 'zopf;': '\U0001d56b', + 'Zscr;': '\U0001d4b5', + 'zscr;': '\U0001d4cf', + 'zwj;': '\u200d', + 'zwnj;': '\u200c', +} + +# maps the Unicode code point to the HTML entity name +codepoint2name = {} + +# maps the HTML entity name to the character +# (or a character reference if the character is outside the Latin-1 range) +entitydefs = {} + +for (name, codepoint) in name2codepoint.items(): + codepoint2name[codepoint] = name + entitydefs[name] = chr(codepoint) + +del name, codepoint diff --git a/modules/language/python/module/html/parser.py b/modules/language/python/module/html/parser.py new file mode 100644 index 0000000..c46eda8 --- /dev/null +++ b/modules/language/python/module/html/parser.py @@ -0,0 +1,472 @@ +module(html,parser) + +"""A parser for HTML and XHTML.""" + +# This file is based on sgmllib.py, but the API is slightly different. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). + + +import re +import warnings +import _markupbase + +from html import unescape + + +__all__ = ['HTMLParser'] + +# Regular expressions used for parsing + +interesting_normal = re.compile('[&<]') +incomplete = re.compile('&[a-zA-Z#]') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') + +starttagopen = re.compile('<[a-zA-Z]') +piclose = re.compile('>') +commentclose = re.compile(r'--\s*>') +# Note: +# 1) if you change tagfind/attrfind remember to update locatestarttagend too; +# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will +# explode, so don't do it. +# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state +# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state +tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') +attrfind_tolerant = re.compile( + r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') +locatestarttagend_tolerant = re.compile(r""" + <[a-zA-Z][^\t\n\r\f />\x00]* # tag name + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value + ) + (?:\s*,)* # possibly followed by a comma + )?(?:\s|/(?!>))* + )* + )? + \s* # trailing whitespace +""", re.VERBOSE) +endendtag = re.compile('>') +# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between +# </ and the tag name, so maybe this should be fixed +endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') + + + +class HTMLParser(_markupbase.ParserBase): + """Find tags and other markup and call handler functions. + + Usage: + p = HTMLParser() + p.feed(data) + ... + p.close() + + Start tags are handled by calling self.handle_starttag() or + self.handle_startendtag(); end tags by self.handle_endtag(). The + data between tags is passed from the parser to the derived class + by calling self.handle_data() with the data as argument (the data + may be split up in arbitrary chunks). If convert_charrefs is + True the character references are converted automatically to the + corresponding Unicode character (and self.handle_data() is no + longer split in chunks), otherwise they are passed by calling + self.handle_entityref() or self.handle_charref() with the string + containing respectively the named or numeric reference as the + argument. + """ + + CDATA_CONTENT_ELEMENTS = ("script", "style") + + def __init__(self, *, convert_charrefs=True): + """Initialize and reset this instance. + + If convert_charrefs is True (the default), all character references + are automatically converted to the corresponding Unicode characters. + """ + self.convert_charrefs = convert_charrefs + self.reset() + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.rawdata = '' + self.lasttag = '???' + self.interesting = interesting_normal + self.cdata_elem = None + _markupbase.ParserBase.reset(self) + + def feed(self, data): + r"""Feed data to the parser. + + Call this as often as you want, with as little or as much text + as you want (may include '\n'). + """ + self.rawdata = self.rawdata + data + self.goahead(0) + + def close(self): + """Handle any buffered data.""" + self.goahead(1) + + __starttag_text = None + + def get_starttag_text(self): + """Return full source of start tag: '<...>'.""" + return self.__starttag_text + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) + + def clear_cdata_mode(self): + self.interesting = interesting_normal + self.cdata_elem = None + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. + def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.convert_charrefs and not self.cdata_elem: + j = rawdata.find('<', i) + if j < 0: + # if we can't find the next <, either we are at the end + # or there's more text incoming. If the latter is True, + # we can't pass the text to handle_data in case we have + # a charref cut in half at end. Try to determine if + # this is the case before proceeding by looking for an + # & near the end and see if it's followed by a space or ;. + amppos = rawdata.rfind('&', max(i, n-34)) + if (amppos >= 0 and + not re.compile(r'[\s;]').search(rawdata, amppos)): + break # wait till we get all the text + j = n + else: + match = self.interesting.search(rawdata, i) # < or & + if match: + j = match.start() + else: + if self.cdata_elem: + break + j = n + if i < j: + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:j])) + else: + self.handle_data(rawdata[i:j]) + i = self.updatepos(i, j) + if i == n: break + startswith = rawdata.startswith + if startswith('<', i): + if starttagopen.match(rawdata, i): # < + letter + k = self.parse_starttag(i) + elif startswith("</", i): + k = self.parse_endtag(i) + elif startswith("<!--", i): + k = self.parse_comment(i) + elif startswith("<?", i): + k = self.parse_pi(i) + elif startswith("<!", i): + k = self.parse_html_declaration(i) + elif (i + 1) < n: + self.handle_data("<") + k = i + 1 + else: + break + if k < 0: + if not end: + break + k = rawdata.find('>', i + 1) + if k < 0: + k = rawdata.find('<', i + 1) + if k < 0: + k = i + 1 + else: + k += 1 + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:k])) + else: + self.handle_data(rawdata[i:k]) + i = self.updatepos(i, k) + elif startswith("&#", i): + match = charref.match(rawdata, i) + if match: + name = match.group()[2:-1] + self.handle_charref(name) + k = match.end() + if not startswith(';', k-1): + k = k - 1 + i = self.updatepos(i, k) + continue + else: + if ";" in rawdata[i:]: # bail by consuming &# + self.handle_data(rawdata[i:i+2]) + i = self.updatepos(i, i+2) + break + elif startswith('&', i): + match = entityref.match(rawdata, i) + if match: + name = match.group(1) + self.handle_entityref(name) + k = match.end() + if not startswith(';', k-1): + k = k - 1 + i = self.updatepos(i, k) + continue + match = incomplete.match(rawdata, i) + if match: + # match.group() will contain at least 2 chars + if end and match.group() == rawdata[i:]: + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) + # incomplete + break + elif (i + 1) < n: + # not the end of the buffer, and can't be confused + # with some other construct + self.handle_data("&") + i = self.updatepos(i, i + 1) + else: + break + else: + assert 0, "interesting.search() lied" + # end while + if end and i < n and not self.cdata_elem: + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:n])) + else: + self.handle_data(rawdata[i:n]) + i = self.updatepos(i, n) + self.rawdata = rawdata[i:] + + # Internal -- parse html declarations, return length or -1 if not terminated + # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state + # See also parse_declaration in _markupbase + def parse_html_declaration(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == '<!', ('unexpected call to ' + 'parse_html_declaration()') + if rawdata[i:i+4] == '<!--': + # this case is actually already handled in goahead() + return self.parse_comment(i) + elif rawdata[i:i+3] == '<![': + return self.parse_marked_section(i) + elif rawdata[i:i+9].lower() == '<!doctype': + # find the closing > + gtpos = rawdata.find('>', i+9) + if gtpos == -1: + return -1 + self.handle_decl(rawdata[i+2:gtpos]) + return gtpos+1 + else: + return self.parse_bogus_comment(i) + + # Internal -- parse bogus comment, return length or -1 if not terminated + # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + def parse_bogus_comment(self, i, report=1): + rawdata = self.rawdata + assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to ' + 'parse_comment()') + pos = rawdata.find('>', i+2) + if pos == -1: + return -1 + if report: + self.handle_comment(rawdata[i+2:pos]) + return pos + 1 + + # Internal -- parse processing instr, return end or -1 if not terminated + def parse_pi(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' + match = piclose.search(rawdata, i+2) # > + if not match: + return -1 + j = match.start() + self.handle_pi(rawdata[i+2: j]) + j = match.end() + return j + + # Internal -- handle starttag, return end or -1 if not terminated + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind_tolerant.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = match.group(1).lower() + while k < endpos: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: <span attr="value" /> + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + # Internal -- check to see if we have a complete starttag; return end + # or -1 if incomplete. + def check_for_whole_start_tag(self, i): + rawdata = self.rawdata + m = locatestarttagend_tolerant.match(rawdata, i) + if m: + j = m.end() + next = rawdata[j:j+1] + if next == ">": + return j + 1 + if next == "/": + if rawdata.startswith("/>", j): + return j + 2 + if rawdata.startswith("/", j): + # buffer boundary + return -1 + # else bogus input + if j > i: + return j + else: + return i + 1 + if next == "": + # end of input + return -1 + if next in ("abcdefghijklmnopqrstuvwxyz=/" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + # end of input in or before attribute value, or we have the + # '/' from a '/>' ending + return -1 + if j > i: + return j + else: + return i + 1 + raise AssertionError("we should not get here!") + + # Internal -- parse endtag, return end or -1 if incomplete + def parse_endtag(self, i): + rawdata = self.rawdata + assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" + match = endendtag.search(rawdata, i+1) # > + if not match: + return -1 + gtpos = match.end() + match = endtagfind.match(rawdata, i) # </ + tag + > + if not match: + if self.cdata_elem is not None: + self.handle_data(rawdata[i:gtpos]) + return gtpos + # find the name: w3.org/TR/html5/tokenization.html#tag-name-state + namematch = tagfind_tolerant.match(rawdata, i+2) + if not namematch: + # w3.org/TR/html5/tokenization.html#end-tag-open-state + if rawdata[i:i+3] == '</>': + return i+3 + else: + return self.parse_bogus_comment(i) + tagname = namematch.group(1).lower() + # consume and ignore other stuff between the name and the > + # Note: this is not 100% correct, since we might have things like + # </tag attr=">">, but looking for > after tha name should cover + # most of the cases and is much simpler + gtpos = rawdata.find('>', namematch.end()) + self.handle_endtag(tagname) + return gtpos+1 + + elem = match.group(1).lower() # script or style + if self.cdata_elem is not None: + if elem != self.cdata_elem: + self.handle_data(rawdata[i:gtpos]) + return gtpos + + self.handle_endtag(elem.lower()) + self.clear_cdata_mode() + return gtpos + + # Overridable -- finish processing of start+end tag: <tag.../> + def handle_startendtag(self, tag, attrs): + self.handle_starttag(tag, attrs) + self.handle_endtag(tag) + + # Overridable -- handle start tag + def handle_starttag(self, tag, attrs): + pass + + # Overridable -- handle end tag + def handle_endtag(self, tag): + pass + + # Overridable -- handle character reference + def handle_charref(self, name): + pass + + # Overridable -- handle entity reference + def handle_entityref(self, name): + pass + + # Overridable -- handle data + def handle_data(self, data): + pass + + # Overridable -- handle comment + def handle_comment(self, data): + pass + + # Overridable -- handle declaration + def handle_decl(self, decl): + pass + + # Overridable -- handle processing instruction + def handle_pi(self, data): + pass + + def unknown_decl(self, data): + pass + + # Internal -- helper to remove special character quoting + def unescape(self, s): + warnings.warn('The unescape method is deprecated and will be removed ' + 'in 3.5, use html.unescape() instead.', + DeprecationWarning, stacklevel=2) + return unescape(s) diff --git a/modules/language/python/module/http.py b/modules/language/python/module/http.py new file mode 100644 index 0000000..bd9a971 --- /dev/null +++ b/modules/language/python/module/http.py @@ -0,0 +1,136 @@ +module(http) + +from enum import IntEnum + +__all__ = ['HTTPStatus'] + +class HTTPStatus(IntEnum): + """HTTP status codes and reason phrases + + Status codes from the following RFCs are all observed: + + * RFC 7231: Hypertext Transfer Protocol (HTTP/1.1), obsoletes 2616 + * RFC 6585: Additional HTTP Status Codes + * RFC 3229: Delta encoding in HTTP + * RFC 4918: HTTP Extensions for WebDAV, obsoletes 2518 + * RFC 5842: Binding Extensions to WebDAV + * RFC 7238: Permanent Redirect + * RFC 2295: Transparent Content Negotiation in HTTP + * RFC 2774: An HTTP Extension Framework + """ + def __new__(cls, value, phrase, description=''): + obj = int.__newobj__(cls, value) + obj._value_ = value + + obj.phrase = phrase + obj.description = description + return obj + + # informational + CONTINUE = 100, 'Continue', 'Request received, please continue' + SWITCHING_PROTOCOLS = (101, 'Switching Protocols', + 'Switching to new protocol; obey Upgrade header') + PROCESSING = 102, 'Processing' + + # success + OK = 200, 'OK', 'Request fulfilled, document follows' + CREATED = 201, 'Created', 'Document created, URL follows' + ACCEPTED = (202, 'Accepted', + 'Request accepted, processing continues off-line') + NON_AUTHORITATIVE_INFORMATION = (203, + 'Non-Authoritative Information', 'Request fulfilled from cache') + NO_CONTENT = 204, 'No Content', 'Request fulfilled, nothing follows' + RESET_CONTENT = 205, 'Reset Content', 'Clear input form for further input' + PARTIAL_CONTENT = 206, 'Partial Content', 'Partial content follows' + MULTI_STATUS = 207, 'Multi-Status' + ALREADY_REPORTED = 208, 'Already Reported' + IM_USED = 226, 'IM Used' + + # redirection + MULTIPLE_CHOICES = (300, 'Multiple Choices', + 'Object has several resources -- see URI list') + MOVED_PERMANENTLY = (301, 'Moved Permanently', + 'Object moved permanently -- see URI list') + FOUND = 302, 'Found', 'Object moved temporarily -- see URI list' + SEE_OTHER = 303, 'See Other', 'Object moved -- see Method and URL list' + NOT_MODIFIED = (304, 'Not Modified', + 'Document has not changed since given time') + USE_PROXY = (305, 'Use Proxy', + 'You must use proxy specified in Location to access this resource') + TEMPORARY_REDIRECT = (307, 'Temporary Redirect', + 'Object moved temporarily -- see URI list') + PERMANENT_REDIRECT = (308, 'Permanent Redirect', + 'Object moved temporarily -- see URI list') + + # client error + BAD_REQUEST = (400, 'Bad Request', + 'Bad request syntax or unsupported method') + UNAUTHORIZED = (401, 'Unauthorized', + 'No permission -- see authorization schemes') + PAYMENT_REQUIRED = (402, 'Payment Required', + 'No payment -- see charging schemes') + FORBIDDEN = (403, 'Forbidden', + 'Request forbidden -- authorization will not help') + NOT_FOUND = (404, 'Not Found', + 'Nothing matches the given URI') + METHOD_NOT_ALLOWED = (405, 'Method Not Allowed', + 'Specified method is invalid for this resource') + NOT_ACCEPTABLE = (406, 'Not Acceptable', + 'URI not available in preferred format') + PROXY_AUTHENTICATION_REQUIRED = (407, + 'Proxy Authentication Required', + 'You must authenticate with this proxy before proceeding') + REQUEST_TIMEOUT = (408, 'Request Timeout', + 'Request timed out; try again later') + CONFLICT = 409, 'Conflict', 'Request conflict' + GONE = (410, 'Gone', + 'URI no longer exists and has been permanently removed') + LENGTH_REQUIRED = (411, 'Length Required', + 'Client must specify Content-Length') + PRECONDITION_FAILED = (412, 'Precondition Failed', + 'Precondition in headers is false') + REQUEST_ENTITY_TOO_LARGE = (413, 'Request Entity Too Large', + 'Entity is too large') + REQUEST_URI_TOO_LONG = (414, 'Request-URI Too Long', + 'URI is too long') + UNSUPPORTED_MEDIA_TYPE = (415, 'Unsupported Media Type', + 'Entity body in unsupported format') + REQUESTED_RANGE_NOT_SATISFIABLE = (416, + 'Requested Range Not Satisfiable', + 'Cannot satisfy request range') + EXPECTATION_FAILED = (417, 'Expectation Failed', + 'Expect condition could not be satisfied') + UNPROCESSABLE_ENTITY = 422, 'Unprocessable Entity' + LOCKED = 423, 'Locked' + FAILED_DEPENDENCY = 424, 'Failed Dependency' + UPGRADE_REQUIRED = 426, 'Upgrade Required' + PRECONDITION_REQUIRED = (428, 'Precondition Required', + 'The origin server requires the request to be conditional') + TOO_MANY_REQUESTS = (429, 'Too Many Requests', + 'The user has sent too many requests in ' + 'a given amount of time ("rate limiting")') + REQUEST_HEADER_FIELDS_TOO_LARGE = (431, + 'Request Header Fields Too Large', + 'The server is unwilling to process the request because its header ' + 'fields are too large') + + # server errors + INTERNAL_SERVER_ERROR = (500, 'Internal Server Error', + 'Server got itself in trouble') + NOT_IMPLEMENTED = (501, 'Not Implemented', + 'Server does not support this operation') + BAD_GATEWAY = (502, 'Bad Gateway', + 'Invalid responses from another server/proxy') + SERVICE_UNAVAILABLE = (503, 'Service Unavailable', + 'The server cannot process the request due to a high load') + GATEWAY_TIMEOUT = (504, 'Gateway Timeout', + 'The gateway server did not receive a timely response') + HTTP_VERSION_NOT_SUPPORTED = (505, 'HTTP Version Not Supported', + 'Cannot fulfill request') + VARIANT_ALSO_NEGOTIATES = 506, 'Variant Also Negotiates' + INSUFFICIENT_STORAGE = 507, 'Insufficient Storage' + LOOP_DETECTED = 508, 'Loop Detected' + NOT_EXTENDED = 510, 'Not Extended' + NETWORK_AUTHENTICATION_REQUIRED = (511, + 'Network Authentication Required', + 'The client needs to authenticate to gain network access') diff --git a/modules/language/python/module/http/client.py b/modules/language/python/module/http/client.py new file mode 100644 index 0000000..8dd6028 --- /dev/null +++ b/modules/language/python/module/http/client.py @@ -0,0 +1,1480 @@ +module(http,client) + +r"""HTTP/1.1 client library + +<intro stuff goes here> +<other stuff, too> + +HTTPConnection goes through a number of "states", which define when a client +may legally make another request or fetch the response for a particular +request. This diagram details these state transitions: + + (null) + | + | HTTPConnection() + v + Idle + | + | putrequest() + v + Request-started + | + | ( putheader() )* endheaders() + v + Request-sent + |\_____________________________ + | | getresponse() raises + | response = getresponse() | ConnectionError + v v + Unread-response Idle + [Response-headers-read] + |\____________________ + | | + | response.read() | putrequest() + v v + Idle Req-started-unread-response + ______/| + / | + response.read() | | ( putheader() )* endheaders() + v v + Request-started Req-sent-unread-response + | + | response.read() + v + Request-sent + +This diagram presents the following rules: + -- a second request may not be started until {response-headers-read} + -- a response [object] cannot be retrieved until {request-sent} + -- there is no differentiation between an unread response body and a + partially read response body + +Note: this enforcement is applied by the HTTPConnection class. The + HTTPResponse class does not enforce this state machine, which + implies sophisticated clients may accelerate the request/response + pipeline. Caution should be taken, though: accelerating the states + beyond the above pattern may imply knowledge of the server's + connection-close behavior for certain requests. For example, it + is impossible to tell whether the server will close the connection + UNTIL the response headers have been read; this means that further + requests cannot be placed into the pipeline until it is known that + the server will NOT be closing the connection. + +Logical State __state __response +------------- ------- ---------- +Idle _CS_IDLE None +Request-started _CS_REQ_STARTED None +Request-sent _CS_REQ_SENT None +Unread-response _CS_IDLE <response_class> +Req-started-unread-response _CS_REQ_STARTED <response_class> +Req-sent-unread-response _CS_REQ_SENT <response_class> +""" + +import email.parser +import email.message +import http +import io +import os +import re +import socket +import collections +from urllib.parse import urlsplit + +# HTTPMessage, parse_headers(), and the HTTP status code constants are +# intentionally omitted for simplicity +__all__ = ["HTTPResponse", "HTTPConnection", + "HTTPException", "NotConnected", "UnknownProtocol", + "UnknownTransferEncoding", "UnimplementedFileMode", + "IncompleteRead", "InvalidURL", "ImproperConnectionState", + "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", + "BadStatusLine", "LineTooLong", "RemoteDisconnected", "error", + "responses"] + +HTTP_PORT = 80 +HTTPS_PORT = 443 + +_UNKNOWN = 'UNKNOWN' + +# connection states +_CS_IDLE = 'Idle' +_CS_REQ_STARTED = 'Request-started' +_CS_REQ_SENT = 'Request-sent' + +# hack to maintain backwards compatibility +globals().update(http.HTTPStatus.__members__) + +# another hack to maintain backwards compatibility +# Mapping status codes to official W3C names +pk(6) +responses = {v: v.phrase for v in http.HTTPStatus.__members__.values()} +pk(7) +# maximal amount of data to read at one time in _safe_read +MAXAMOUNT = 1048576 + +# maximal line length when calling readline(). +_MAXLINE = 65536 +_MAXHEADERS = 100 + +# Header name/value ABNF (http://tools.ietf.org/html/rfc7230#section-3.2) +# +# VCHAR = %x21-7E +# obs-text = %x80-FF +# header-field = field-name ":" OWS field-value OWS +# field-name = token +# field-value = *( field-content / obs-fold ) +# field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ] +# field-vchar = VCHAR / obs-text +# +# obs-fold = CRLF 1*( SP / HTAB ) +# ; obsolete line folding +# ; see Section 3.2.4 + +# token = 1*tchar +# +# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" +# / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" +# / DIGIT / ALPHA +# ; any VCHAR, except delimiters +# +# VCHAR defined in http://tools.ietf.org/html/rfc5234#appendix-B.1 + +# the patterns for both name and value are more lenient than RFC +# definitions to allow for backwards compatibility +_is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch +_is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search + +# We always set the Content-Length header for these methods because some +# servers will otherwise respond with a 411 +_METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} + +pk(8) +def _encode(data, name='data'): + """Call data.encode("latin-1") but show a better error message.""" + try: + return data.encode("latin-1") + except UnicodeEncodeError as err: + raise UnicodeEncodeError( + err.encoding, + err.object, + err.start, + err.end, + "%s (%.20r) is not valid Latin-1. Use %s.encode('utf-8') " + "if you want to send it encoded in UTF-8." % + (name.title(), data[err.start:err.end], name)) from None + + +class HTTPMessage(email.message.Message): + # XXX The only usage of this method is in + # http.server.CGIHTTPRequestHandler. Maybe move the code there so + # that it doesn't need to be part of the public API. The API has + # never been defined so this could cause backwards compatibility + # issues. + + def getallmatchingheaders(self, name): + """Find all header lines matching a given header name. + + Look through the list of headers and find all lines matching a given + header name (and their continuation lines). A list of the lines is + returned, without interpretation. If the header does not occur, an + empty list is returned. If the header occurs multiple times, all + occurrences are returned. Case is not important in the header name. + + """ + name = name.lower() + ':' + n = len(name) + lst = [] + hit = 0 + for line in self.keys(): + if line[:n].lower() == name: + hit = 1 + elif not line[:1].isspace(): + hit = 0 + if hit: + lst.append(line) + return lst + +def parse_headers(fp, _class=HTTPMessage): + """Parses only RFC2822 headers from a file pointer. + + email Parser wants to see strings rather than bytes. + But a TextIOWrapper around self.rfile would buffer too many bytes + from the stream, bytes which we later need to read as bytes. + So we read the correct bytes here, as bytes, for email Parser + to parse. + + """ + headers = [] + while True: + line = fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("header line") + headers.append(line) + if len(headers) > _MAXHEADERS: + raise HTTPException("got more than %d headers" % _MAXHEADERS) + if line in (b'\r\n', b'\n', b''): + break + hstring = b''.join(headers).decode('iso-8859-1') + return email.parser.Parser(_class=_class).parsestr(hstring) + + +class HTTPResponse(io.BufferedIOBase): + + # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details. + + # The bytes from the socket object are iso-8859-1 strings. + # See RFC 2616 sec 2.2 which notes an exception for MIME-encoded + # text following RFC 2047. The basic status line parsing only + # accepts iso-8859-1. + + def __init__(self, sock, debuglevel=0, method=None, url=None): + # If the response includes a content-length header, we need to + # make sure that the client doesn't read more than the + # specified number of bytes. If it does, it will block until + # the server times out and closes the connection. This will + # happen if a self.fp.read() is done (without a size) whether + # self.fp is buffered or not. So, no self.fp.read() by + # clients unless they know what they are doing. + self.fp = sock.makefile("rb") + self.debuglevel = debuglevel + self._method = method + + # The HTTPResponse object is returned via urllib. The clients + # of http and urllib expect different attributes for the + # headers. headers is used here and supports urllib. msg is + # provided as a backwards compatibility layer for http + # clients. + + self.headers = self.msg = None + + # from the Status-Line of the response + self.version = _UNKNOWN # HTTP-Version + self.status = _UNKNOWN # Status-Code + self.reason = _UNKNOWN # Reason-Phrase + + self.chunked = _UNKNOWN # is "chunked" being used? + self.chunk_left = _UNKNOWN # bytes left to read in current chunk + self.length = _UNKNOWN # number of bytes left in response + self.will_close = _UNKNOWN # conn will close at end of response + + def _read_status(self): + line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") + if len(line) > _MAXLINE: + raise LineTooLong("status line") + if self.debuglevel > 0: + print("reply:", repr(line)) + if not line: + # Presumably, the server closed the connection before + # sending a valid response. + raise RemoteDisconnected("Remote end closed connection without" + " response") + try: + version, status, reason = line.split(None, 2) + except ValueError: + try: + version, status = line.split(None, 1) + reason = "" + except ValueError: + # empty version will cause next test to fail. + version = "" + if not version.startswith("HTTP/"): + self._close_conn() + raise BadStatusLine(line) + + # The status code is a three-digit number + try: + status = int(status) + if status < 100 or status > 999: + raise BadStatusLine(line) + except ValueError: + raise BadStatusLine(line) + return version, status, reason + + def begin(self): + if self.headers is not None: + # we've already started reading the response + return + + # read until we get a non-100 response + while True: + version, status, reason = self._read_status() + if status != CONTINUE: + break + # skip the header from the 100 response + while True: + skip = self.fp.readline(_MAXLINE + 1) + if len(skip) > _MAXLINE: + raise LineTooLong("header line") + skip = skip.strip() + if not skip: + break + if self.debuglevel > 0: + print("header:", skip) + + self.code = self.status = status + self.reason = reason.strip() + if version in ("HTTP/1.0", "HTTP/0.9"): + # Some servers might still return "0.9", treat it as 1.0 anyway + self.version = 10 + elif version.startswith("HTTP/1."): + self.version = 11 # use HTTP/1.1 code for HTTP/1.x where x>=1 + else: + raise UnknownProtocol(version) + + self.headers = self.msg = parse_headers(self.fp) + + if self.debuglevel > 0: + for hdr in self.headers: + print("header:", hdr, end=" ") + + # are we using the chunked-style of transfer encoding? + tr_enc = self.headers.get("transfer-encoding") + if tr_enc and tr_enc.lower() == "chunked": + self.chunked = True + self.chunk_left = None + else: + self.chunked = False + + # will the connection close at the end of the response? + self.will_close = self._check_close() + + # do we have a Content-Length? + # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" + self.length = None + length = self.headers.get("content-length") + + # are we using the chunked-style of transfer encoding? + tr_enc = self.headers.get("transfer-encoding") + if length and not self.chunked: + try: + self.length = int(length) + except ValueError: + self.length = None + else: + if self.length < 0: # ignore nonsensical negative lengths + self.length = None + else: + self.length = None + + # does the body have a fixed length? (of zero) + if (status == NO_CONTENT or status == NOT_MODIFIED or + 100 <= status < 200 or # 1xx codes + self._method == "HEAD"): + self.length = 0 + + # if the connection remains open, and we aren't using chunked, and + # a content-length was not provided, then assume that the connection + # WILL close. + if (not self.will_close and + not self.chunked and + self.length is None): + self.will_close = True + + def _check_close(self): + conn = self.headers.get("connection") + if self.version == 11: + # An HTTP/1.1 proxy is assumed to stay open unless + # explicitly closed. + conn = self.headers.get("connection") + if conn and "close" in conn.lower(): + return True + return False + + # Some HTTP/1.0 implementations have support for persistent + # connections, using rules different than HTTP/1.1. + + # For older HTTP, Keep-Alive indicates persistent connection. + if self.headers.get("keep-alive"): + return False + + # At least Akamai returns a "Connection: Keep-Alive" header, + # which was supposed to be sent by the client. + if conn and "keep-alive" in conn.lower(): + return False + + # Proxy-Connection is a netscape hack. + pconn = self.headers.get("proxy-connection") + if pconn and "keep-alive" in pconn.lower(): + return False + + # otherwise, assume it will close + return True + + def _close_conn(self): + fp = self.fp + self.fp = None + fp.close() + + def close(self): + try: + super().close() # set "closed" flag + finally: + if self.fp: + self._close_conn() + + # These implementations are for the benefit of io.BufferedReader. + + # XXX This class should probably be revised to act more like + # the "raw stream" that BufferedReader expects. + + def flush(self): + super().flush() + if self.fp: + self.fp.flush() + + def readable(self): + """Always returns True""" + return True + + # End of "raw stream" methods + + def isclosed(self): + """True if the connection is closed.""" + # NOTE: it is possible that we will not ever call self.close(). This + # case occurs when will_close is TRUE, length is None, and we + # read up to the last byte, but NOT past it. + # + # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be + # called, meaning self.isclosed() is meaningful. + return self.fp is None + + def read(self, amt=None): + if self.fp is None: + return b"" + + if self._method == "HEAD": + self._close_conn() + return b"" + + if amt is not None: + # Amount is given, implement using readinto + b = bytearray(amt) + n = self.readinto(b) + return memoryview(b)[:n].tobytes() + else: + # Amount is not given (unbounded read) so we must check self.length + # and self.chunked + + if self.chunked: + return self._readall_chunked() + + if self.length is None: + s = self.fp.read() + else: + try: + s = self._safe_read(self.length) + except IncompleteRead: + self._close_conn() + raise + self.length = 0 + self._close_conn() # we read everything + return s + + def readinto(self, b): + """Read up to len(b) bytes into bytearray b and return the number + of bytes read. + """ + + if self.fp is None: + return 0 + + if self._method == "HEAD": + self._close_conn() + return 0 + + if self.chunked: + return self._readinto_chunked(b) + + if self.length is not None: + if len(b) > self.length: + # clip the read to the "end of response" + b = memoryview(b)[0:self.length] + + # we do not use _safe_read() here because this may be a .will_close + # connection, and the user is reading more bytes than will be provided + # (for example, reading in 1k chunks) + n = self.fp.readinto(b) + if not n and b: + # Ideally, we would raise IncompleteRead if the content-length + # wasn't satisfied, but it might break compatibility. + self._close_conn() + elif self.length is not None: + self.length -= n + if not self.length: + self._close_conn() + return n + + def _read_next_chunk_size(self): + # Read the next chunk size from the file + line = self.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("chunk size") + i = line.find(b";") + if i >= 0: + line = line[:i] # strip chunk-extensions + try: + return int(line, 16) + except ValueError: + # close the connection as protocol synchronisation is + # probably lost + self._close_conn() + raise + + def _read_and_discard_trailer(self): + # read and discard trailer up to the CRLF terminator + ### note: we shouldn't have any trailers! + while True: + line = self.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("trailer line") + if not line: + # a vanishingly small number of sites EOF without + # sending the trailer + break + if line in (b'\r\n', b'\n', b''): + break + + def _get_chunk_left(self): + # return self.chunk_left, reading a new chunk if necessary. + # chunk_left == 0: at the end of the current chunk, need to close it + # chunk_left == None: No current chunk, should read next. + # This function returns non-zero or None if the last chunk has + # been read. + chunk_left = self.chunk_left + if not chunk_left: # Can be 0 or None + if chunk_left is not None: + # We are at the end of chunk, discard chunk end + self._safe_read(2) # toss the CRLF at the end of the chunk + try: + chunk_left = self._read_next_chunk_size() + except ValueError: + raise IncompleteRead(b'') + if chunk_left == 0: + # last chunk: 1*("0") [ chunk-extension ] CRLF + self._read_and_discard_trailer() + # we read everything; close the "file" + self._close_conn() + chunk_left = None + self.chunk_left = chunk_left + return chunk_left + + def _readall_chunked(self): + assert self.chunked != _UNKNOWN + value = [] + try: + while True: + chunk_left = self._get_chunk_left() + if chunk_left is None: + break + value.append(self._safe_read(chunk_left)) + self.chunk_left = 0 + return b''.join(value) + except IncompleteRead: + raise IncompleteRead(b''.join(value)) + + def _readinto_chunked(self, b): + assert self.chunked != _UNKNOWN + total_bytes = 0 + mvb = memoryview(b) + try: + while True: + chunk_left = self._get_chunk_left() + if chunk_left is None: + return total_bytes + + if len(mvb) <= chunk_left: + n = self._safe_readinto(mvb) + self.chunk_left = chunk_left - n + return total_bytes + n + + temp_mvb = mvb[:chunk_left] + n = self._safe_readinto(temp_mvb) + mvb = mvb[n:] + total_bytes += n + self.chunk_left = 0 + + except IncompleteRead: + raise IncompleteRead(bytes(b[0:total_bytes])) + + def _safe_read(self, amt): + """Read the number of bytes requested, compensating for partial reads. + + Normally, we have a blocking socket, but a read() can be interrupted + by a signal (resulting in a partial read). + + Note that we cannot distinguish between EOF and an interrupt when zero + bytes have been read. IncompleteRead() will be raised in this + situation. + + This function should be used when <amt> bytes "should" be present for + reading. If the bytes are truly not available (due to EOF), then the + IncompleteRead exception can be used to detect the problem. + """ + s = [] + while amt > 0: + chunk = self.fp.read(min(amt, MAXAMOUNT)) + if not chunk: + raise IncompleteRead(b''.join(s), amt) + s.append(chunk) + amt -= len(chunk) + return b"".join(s) + + def _safe_readinto(self, b): + """Same as _safe_read, but for reading into a buffer.""" + total_bytes = 0 + mvb = memoryview(b) + while total_bytes < len(b): + if MAXAMOUNT < len(mvb): + temp_mvb = mvb[0:MAXAMOUNT] + n = self.fp.readinto(temp_mvb) + else: + n = self.fp.readinto(mvb) + if not n: + raise IncompleteRead(bytes(mvb[0:total_bytes]), len(b)) + mvb = mvb[n:] + total_bytes += n + return total_bytes + + def read1(self, n=-1): + """Read with at most one underlying system call. If at least one + byte is buffered, return that instead. + """ + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + return self._read1_chunked(n) + if self.length is not None and (n < 0 or n > self.length): + n = self.length + try: + result = self.fp.read1(n) + except ValueError: + if n >= 0: + raise + # some implementations, like BufferedReader, don't support -1 + # Read an arbitrarily selected largeish chunk. + result = self.fp.read1(16*1024) + if not result and n: + self._close_conn() + elif self.length is not None: + self.length -= len(result) + return result + + def peek(self, n=-1): + # Having this enables IOBase.readline() to read more than one + # byte at a time + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + return self._peek_chunked(n) + return self.fp.peek(n) + + def readline(self, limit=-1): + if self.fp is None or self._method == "HEAD": + return b"" + if self.chunked: + # Fallback to IOBase readline which uses peek() and read() + return super().readline(limit) + if self.length is not None and (limit < 0 or limit > self.length): + limit = self.length + result = self.fp.readline(limit) + if not result and limit: + self._close_conn() + elif self.length is not None: + self.length -= len(result) + return result + + def _read1_chunked(self, n): + # Strictly speaking, _get_chunk_left() may cause more than one read, + # but that is ok, since that is to satisfy the chunked protocol. + chunk_left = self._get_chunk_left() + if chunk_left is None or n == 0: + return b'' + if not (0 <= n <= chunk_left): + n = chunk_left # if n is negative or larger than chunk_left + read = self.fp.read1(n) + self.chunk_left -= len(read) + if not read: + raise IncompleteRead(b"") + return read + + def _peek_chunked(self, n): + # Strictly speaking, _get_chunk_left() may cause more than one read, + # but that is ok, since that is to satisfy the chunked protocol. + try: + chunk_left = self._get_chunk_left() + except IncompleteRead: + return b'' # peek doesn't worry about protocol + if chunk_left is None: + return b'' # eof + # peek is allowed to return more than requested. Just request the + # entire chunk, and truncate what we get. + return self.fp.peek(chunk_left)[:chunk_left] + + def fileno(self): + return self.fp.fileno() + + def getheader(self, name, default=None): + '''Returns the value of the header matching *name*. + + If there are multiple matching headers, the values are + combined into a single string separated by commas and spaces. + + If no matching header is found, returns *default* or None if + the *default* is not specified. + + If the headers are unknown, raises http.client.ResponseNotReady. + + ''' + if self.headers is None: + raise ResponseNotReady() + headers = self.headers.get_all(name) or default + if isinstance(headers, str) or not hasattr(headers, '__iter__'): + return headers + else: + return ', '.join(headers) + + def getheaders(self): + """Return list of (header, value) tuples.""" + if self.headers is None: + raise ResponseNotReady() + return list(self.headers.items()) + + # We override IOBase.__iter__ so that it doesn't check for closed-ness + + def __iter__(self): + return self + + # For compatibility with old-style urllib responses. + + def info(self): + '''Returns an instance of the class mimetools.Message containing + meta-information associated with the URL. + + When the method is HTTP, these headers are those returned by + the server at the head of the retrieved HTML page (including + Content-Length and Content-Type). + + When the method is FTP, a Content-Length header will be + present if (as is now usual) the server passed back a file + length in response to the FTP retrieval request. A + Content-Type header will be present if the MIME type can be + guessed. + + When the method is local-file, returned headers will include + a Date representing the file's last-modified time, a + Content-Length giving file size, and a Content-Type + containing a guess at the file's type. See also the + description of the mimetools module. + + ''' + return self.headers + + def geturl(self): + '''Return the real URL of the page. + + In some cases, the HTTP server redirects a client to another + URL. The urlopen() function handles this transparently, but in + some cases the caller needs to know which URL the client was + redirected to. The geturl() method can be used to get at this + redirected URL. + + ''' + return self.url + + def getcode(self): + '''Return the HTTP status code that was sent with the response, + or None if the URL is not an HTTP URL. + + ''' + return self.status + +class HTTPConnection: + + _http_vsn = 11 + _http_vsn_str = 'HTTP/1.1' + + response_class = HTTPResponse + default_port = HTTP_PORT + auto_open = 1 + debuglevel = 0 + + @staticmethod + def _is_textIO(stream): + """Test whether a file-like object is a text or a binary stream. + """ + return isinstance(stream, io.TextIOBase) + + @staticmethod + def _get_content_length(body, method): + """Get the content-length based on the body. + + If the body is None, we set Content-Length: 0 for methods that expect + a body (RFC 7230, Section 3.3.2). We also set the Content-Length for + any method if the body is a str or bytes-like object and not a file. + """ + if body is None: + # do an explicit check for not None here to distinguish + # between unset and set but empty + if method.upper() in _METHODS_EXPECTING_BODY: + return 0 + else: + return None + + if hasattr(body, 'read'): + # file-like object. + return None + + try: + # does it implement the buffer protocol (bytes, bytearray, array)? + mv = memoryview(body) + return mv.nbytes + except TypeError: + pass + + if isinstance(body, str): + return len(body) + + return None + + def __init__(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None): + self.timeout = timeout + self.source_address = source_address + self.sock = None + self._buffer = [] + self.__response = None + self.__state = _CS_IDLE + self._method = None + self._tunnel_host = None + self._tunnel_port = None + self._tunnel_headers = {} + + (self.host, self.port) = self._get_hostport(host, port) + + # This is stored as an instance variable to allow unit + # tests to replace it with a suitable mockup + self._create_connection = socket.create_connection + + def set_tunnel(self, host, port=None, headers=None): + """Set up host and port for HTTP CONNECT tunnelling. + + In a connection that uses HTTP CONNECT tunneling, the host passed to the + constructor is used as a proxy server that relays all communication to + the endpoint passed to `set_tunnel`. This done by sending an HTTP + CONNECT request to the proxy server when the connection is established. + + This method must be called before the HTML connection has been + established. + + The headers argument should be a mapping of extra HTTP headers to send + with the CONNECT request. + """ + + if self.sock: + raise RuntimeError("Can't set up tunnel for established connection") + + self._tunnel_host, self._tunnel_port = self._get_hostport(host, port) + if headers: + self._tunnel_headers = headers + else: + self._tunnel_headers.clear() + + def _get_hostport(self, host, port): + if port is None: + i = host.rfind(':') + j = host.rfind(']') # ipv6 addresses have [...] + if i > j: + try: + port = int(host[i+1:]) + except ValueError: + if host[i+1:] == "": # http://foo.com:/ == http://foo.com/ + port = self.default_port + else: + raise InvalidURL("nonnumeric port: '%s'" % host[i+1:]) + host = host[:i] + else: + port = self.default_port + if host and host[0] == '[' and host[-1] == ']': + host = host[1:-1] + + return (host, port) + + def set_debuglevel(self, level): + self.debuglevel = level + + def _tunnel(self): + connect_str = "CONNECT %s:%d HTTP/1.0\r\n" % (self._tunnel_host, + self._tunnel_port) + connect_bytes = connect_str.encode("ascii") + self.send(connect_bytes) + for header, value in self._tunnel_headers.items(): + header_str = "%s: %s\r\n" % (header, value) + header_bytes = header_str.encode("latin-1") + self.send(header_bytes) + self.send(b'\r\n') + + response = self.response_class(self.sock, method=self._method) + (version, code, message) = response._read_status() + + if code != http.HTTPStatus.OK: + self.close() + raise OSError("Tunnel connection failed: %d %s" % (code, + message.strip())) + while True: + line = response.fp.readline(_MAXLINE + 1) + if len(line) > _MAXLINE: + raise LineTooLong("header line") + if not line: + # for sites which EOF without sending a trailer + break + if line in (b'\r\n', b'\n', b''): + break + + if self.debuglevel > 0: + print('header:', line.decode()) + + def connect(self): + """Connect to the host and port specified in __init__.""" + self.sock = self._create_connection( + (self.host,self.port), self.timeout, self.source_address) + self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + if self._tunnel_host: + self._tunnel() + + def close(self): + """Close the connection to the HTTP server.""" + self.__state = _CS_IDLE + try: + sock = self.sock + if sock: + self.sock = None + sock.close() # close it manually... there may be other refs + finally: + response = self.__response + if response: + self.__response = None + response.close() + + def send(self, data): + """Send `data' to the server. + ``data`` can be a string object, a bytes object, an array object, a + file-like object that supports a .read() method, or an iterable object. + """ + + if self.sock is None: + if self.auto_open: + self.connect() + else: + raise NotConnected() + + if self.debuglevel > 0: + print("send:", repr(data)) + blocksize = 8192 + if hasattr(data, "read") : + if self.debuglevel > 0: + print("sendIng a read()able") + encode = self._is_textIO(data) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") + while 1: + datablock = data.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode("iso-8859-1") + self.sock.sendall(datablock) + return + try: + self.sock.sendall(data) + except TypeError: + if isinstance(data, collections.Iterable): + for d in data: + self.sock.sendall(d) + else: + raise TypeError("data should be a bytes-like object " + "or an iterable, got %r" % type(data)) + + def _output(self, s): + """Add a line of output to the current request buffer. + + Assumes that the line does *not* end with \\r\\n. + """ + self._buffer.append(s) + + def _read_readable(self, readable): + blocksize = 8192 + if self.debuglevel > 0: + print("sendIng a read()able") + encode = self._is_textIO(readable) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") + while True: + datablock = readable.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode("iso-8859-1") + yield datablock + + def _send_output(self, message_body=None, encode_chunked=False): + """Send the currently buffered request and clear the buffer. + + Appends an extra \\r\\n to the buffer. + A message_body may be specified, to be appended to the request. + """ + self._buffer.extend((b"", b"")) + msg = b"\r\n".join(self._buffer) + del self._buffer[:] + self.send(msg) + + if message_body is not None: + + # create a consistent interface to message_body + if hasattr(message_body, 'read'): + # Let file-like take precedence over byte-like. This + # is needed to allow the current position of mmap'ed + # files to be taken into account. + chunks = self._read_readable(message_body) + else: + try: + # this is solely to check to see if message_body + # implements the buffer API. it /would/ be easier + # to capture if PyObject_CheckBuffer was exposed + # to Python. + memoryview(message_body) + except TypeError: + try: + chunks = iter(message_body) + except TypeError: + raise TypeError("message_body should be a bytes-like " + "object or an iterable, got %r" + % type(message_body)) + else: + # the object implements the buffer interface and + # can be passed directly into socket methods + chunks = (message_body,) + + for chunk in chunks: + if not chunk: + if self.debuglevel > 0: + print('Zero length chunk ignored') + continue + + if encode_chunked and self._http_vsn == 11: + # chunked encoding + chunk = f'{len(chunk):X}\r\n'.encode('ascii') + chunk \ + + b'\r\n' + self.send(chunk) + + if encode_chunked and self._http_vsn == 11: + # end chunked transfer + self.send(b'0\r\n\r\n') + + def putrequest(self, method, url, skip_host=False, + skip_accept_encoding=False): + """Send a request to the server. + + `method' specifies an HTTP request method, e.g. 'GET'. + `url' specifies the object being requested, e.g. '/index.html'. + `skip_host' if True does not add automatically a 'Host:' header + `skip_accept_encoding' if True does not add automatically an + 'Accept-Encoding:' header + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + + # in certain cases, we cannot issue another request on this connection. + # this occurs when: + # 1) we are in the process of sending a request. (_CS_REQ_STARTED) + # 2) a response to a previous request has signalled that it is going + # to close the connection upon completion. + # 3) the headers for the previous response have not been read, thus + # we cannot determine whether point (2) is true. (_CS_REQ_SENT) + # + # if there is no prior response, then we can request at will. + # + # if point (2) is true, then we will have passed the socket to the + # response (effectively meaning, "there is no prior response"), and + # will open a new one when a new request is made. + # + # Note: if a prior response exists, then we *can* start a new request. + # We are not allowed to begin fetching the response to this new + # request, however, until that prior response is complete. + # + if self.__state == _CS_IDLE: + self.__state = _CS_REQ_STARTED + else: + raise CannotSendRequest(self.__state) + + # Save the method we use, we need it later in the response phase + self._method = method + if not url: + url = '/' + request = '%s %s %s' % (method, url, self._http_vsn_str) + + # Non-ASCII characters should have been eliminated earlier + self._output(request.encode('ascii')) + + if self._http_vsn == 11: + # Issue some standard headers for better HTTP/1.1 compliance + + if not skip_host: + # this header is issued *only* for HTTP/1.1 + # connections. more specifically, this means it is + # only issued when the client uses the new + # HTTPConnection() class. backwards-compat clients + # will be using HTTP/1.0 and those clients may be + # issuing this header themselves. we should NOT issue + # it twice; some web servers (such as Apache) barf + # when they see two Host: headers + + # If we need a non-standard port,include it in the + # header. If the request is going through a proxy, + # but the host of the actual URL, not the host of the + # proxy. + + netloc = '' + if url.startswith('http'): + nil, netloc, nil, nil, nil = urlsplit(url) + + if netloc: + try: + netloc_enc = netloc.encode("ascii") + except UnicodeEncodeError: + netloc_enc = netloc.encode("idna") + self.putheader('Host', netloc_enc) + else: + if self._tunnel_host: + host = self._tunnel_host + port = self._tunnel_port + else: + host = self.host + port = self.port + + try: + host_enc = host.encode("ascii") + except UnicodeEncodeError: + host_enc = host.encode("idna") + + # As per RFC 273, IPv6 address should be wrapped with [] + # when used as Host header + + if host.find(':') >= 0: + host_enc = b'[' + host_enc + b']' + + if port == self.default_port: + self.putheader('Host', host_enc) + else: + host_enc = host_enc.decode("ascii") + self.putheader('Host', "%s:%s" % (host_enc, port)) + + # note: we are assuming that clients will not attempt to set these + # headers since *this* library must deal with the + # consequences. this also means that when the supporting + # libraries are updated to recognize other forms, then this + # code should be changed (removed or updated). + + # we only want a Content-Encoding of "identity" since we don't + # support encodings such as x-gzip or x-deflate. + if not skip_accept_encoding: + self.putheader('Accept-Encoding', 'identity') + + # we can accept "chunked" Transfer-Encodings, but no others + # NOTE: no TE header implies *only* "chunked" + #self.putheader('TE', 'chunked') + + # if TE is supplied in the header, then it must appear in a + # Connection header. + #self.putheader('Connection', 'TE') + + else: + # For HTTP/1.0, the server will assume "not chunked" + pass + + def putheader(self, header, *values): + """Send a request header line to the server. + + For example: h.putheader('Accept', 'text/html') + """ + if self.__state != _CS_REQ_STARTED: + raise CannotSendHeader() + + if hasattr(header, 'encode'): + header = header.encode('ascii') + + if not _is_legal_header_name(header): + raise ValueError('Invalid header name %r' % (header,)) + + values = list(values) + for i, one_value in enumerate(values): + if hasattr(one_value, 'encode'): + values[i] = one_value.encode('latin-1') + elif isinstance(one_value, int): + values[i] = str(one_value).encode('ascii') + + if _is_illegal_header_value(values[i]): + raise ValueError('Invalid header value %r' % (values[i],)) + + value = b'\r\n\t'.join(values) + header = header + b': ' + value + self._output(header) + + def endheaders(self, message_body=None, *, encode_chunked=False): + """Indicate that the last header line has been sent to the server. + + This method sends the request to the server. The optional message_body + argument can be used to pass a message body associated with the + request. + """ + if self.__state == _CS_REQ_STARTED: + self.__state = _CS_REQ_SENT + else: + raise CannotSendHeader() + self._send_output(message_body, encode_chunked=encode_chunked) + + def request(self, method, url, body=None, headers={}, *, + encode_chunked=False): + """Send a complete request to the server.""" + self._send_request(method, url, body, headers, encode_chunked) + + def _send_request(self, method, url, body, headers, encode_chunked): + # Honor explicitly requested Host: and Accept-Encoding: headers. + header_names = frozenset(k.lower() for k in headers) + skips = {} + if 'host' in header_names: + skips['skip_host'] = 1 + if 'accept-encoding' in header_names: + skips['skip_accept_encoding'] = 1 + + self.putrequest(method, url, **skips) + + # chunked encoding will happen if HTTP/1.1 is used and either + # the caller passes encode_chunked=True or the following + # conditions hold: + # 1. content-length has not been explicitly set + # 2. the body is a file or iterable, but not a str or bytes-like + # 3. Transfer-Encoding has NOT been explicitly set by the caller + + if 'content-length' not in header_names: + # only chunk body if not explicitly set for backwards + # compatibility, assuming the client code is already handling the + # chunking + if 'transfer-encoding' not in header_names: + # if content-length cannot be automatically determined, fall + # back to chunked encoding + encode_chunked = False + content_length = self._get_content_length(body, method) + if content_length is None: + if body is not None: + if self.debuglevel > 0: + print('Unable to determine size of %r' % body) + encode_chunked = True + self.putheader('Transfer-Encoding', 'chunked') + else: + self.putheader('Content-Length', str(content_length)) + else: + encode_chunked = False + + for hdr, value in headers.items(): + self.putheader(hdr, value) + if isinstance(body, str): + # RFC 2616 Section 3.7.1 says that text default has a + # default charset of iso-8859-1. + body = _encode(body, 'body') + self.endheaders(body, encode_chunked=encode_chunked) + + def getresponse(self): + """Get the response from the server. + + If the HTTPConnection is in the correct state, returns an + instance of HTTPResponse or of whatever object is returned by + the response_class variable. + + If a request has not been sent or if a previous response has + not be handled, ResponseNotReady is raised. If the HTTP + response indicates that the connection should be closed, then + it will be closed before the response is returned. When the + connection is closed, the underlying socket is closed. + """ + + # if a prior response has been completed, then forget about it. + if self.__response and self.__response.isclosed(): + self.__response = None + + # if a prior response exists, then it must be completed (otherwise, we + # cannot read this response's header to determine the connection-close + # behavior) + # + # note: if a prior response existed, but was connection-close, then the + # socket and response were made independent of this HTTPConnection + # object since a new request requires that we open a whole new + # connection + # + # this means the prior response had one of two states: + # 1) will_close: this connection was reset and the prior socket and + # response operate independently + # 2) persistent: the response was retained and we await its + # isclosed() status to become true. + # + if self.__state != _CS_REQ_SENT or self.__response: + raise ResponseNotReady(self.__state) + + if self.debuglevel > 0: + response = self.response_class(self.sock, self.debuglevel, + method=self._method) + else: + response = self.response_class(self.sock, method=self._method) + + try: + try: + response.begin() + except ConnectionError: + self.close() + raise + assert response.will_close != _UNKNOWN + self.__state = _CS_IDLE + + if response.will_close: + # this effectively passes the connection to the response + self.close() + else: + # remember this, so we can tell when it is complete + self.__response = response + + return response + except: + response.close() + raise + +try: + import ssl +except ImportError: + pass +else: + class HTTPSConnection(HTTPConnection): + "This class allows communication via SSL." + + default_port = HTTPS_PORT + + # XXX Should key_file and cert_file be deprecated in favour of context? + + def __init__(self, host, port=None, key_file=None, cert_file=None, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None, *, context=None, + check_hostname=None): + super(HTTPSConnection, self).__init__(host, port, timeout, + source_address) + if (key_file is not None or cert_file is not None or + check_hostname is not None): + import warnings + warnings.warn("key_file, cert_file and check_hostname are " + "deprecated, use a custom context instead.", + DeprecationWarning, 2) + self.key_file = key_file + self.cert_file = cert_file + if context is None: + context = ssl._create_default_https_context() + will_verify = context.verify_mode != ssl.CERT_NONE + if check_hostname is None: + check_hostname = context.check_hostname + if check_hostname and not will_verify: + raise ValueError("check_hostname needs a SSL context with " + "either CERT_OPTIONAL or CERT_REQUIRED") + if key_file or cert_file: + context.load_cert_chain(cert_file, key_file) + self._context = context + self._check_hostname = check_hostname + + def connect(self): + "Connect to a host on a given (SSL) port." + + super().connect() + + if self._tunnel_host: + server_hostname = self._tunnel_host + else: + server_hostname = self.host + + self.sock = self._context.wrap_socket(self.sock, + server_hostname=server_hostname) + if not self._context.check_hostname and self._check_hostname: + try: + ssl.match_hostname(self.sock.getpeercert(), server_hostname) + except Exception: + self.sock.shutdown(socket.SHUT_RDWR) + self.sock.close() + raise + + __all__.append("HTTPSConnection") + +class HTTPException(Exception): + # Subclasses that define an __init__ must call Exception.__init__ + # or define self.args. Otherwise, str() will fail. + pass + +class NotConnected(HTTPException): + pass + +class InvalidURL(HTTPException): + pass + +class UnknownProtocol(HTTPException): + def __init__(self, version): + self.args = version, + self.version = version + +class UnknownTransferEncoding(HTTPException): + pass + +class UnimplementedFileMode(HTTPException): + pass + +class IncompleteRead(HTTPException): + def __init__(self, partial, expected=None): + self.args = partial, + self.partial = partial + self.expected = expected + def __repr__(self): + if self.expected is not None: + e = ', %i more expected' % self.expected + else: + e = '' + return '%s(%i bytes read%s)' % (self.__class__.__name__, + len(self.partial), e) + def __str__(self): + return repr(self) + +class ImproperConnectionState(HTTPException): + pass + +class CannotSendRequest(ImproperConnectionState): + pass + +class CannotSendHeader(ImproperConnectionState): + pass + +class ResponseNotReady(ImproperConnectionState): + pass + +class BadStatusLine(HTTPException): + def __init__(self, line): + if not line: + line = repr(line) + self.args = line, + self.line = line + +class LineTooLong(HTTPException): + def __init__(self, line_type): + HTTPException.__init__(self, "got more than %d bytes when reading %s" + % (_MAXLINE, line_type)) + +class RemoteDisconnected(ConnectionResetError, BadStatusLine): + def __init__(self, *pos, **kw): + BadStatusLine.__init__(self, "") + ConnectionResetError.__init__(self, *pos, **kw) + +# for backwards compatibility +error = HTTPException diff --git a/modules/language/python/module/python.scm b/modules/language/python/module/python.scm index 5c962e6..6f78f8f 100644 --- a/modules/language/python/module/python.scm +++ b/modules/language/python/module/python.scm @@ -2,6 +2,7 @@ #:use-module (language python module _python) #:use-module (language python compile ) #:use-module (language python module ) + #:use-module ((language python module string) #:select ()) #:use-module (language python memoryview ) #:use-module ((oop pf-objects) #:select (define-python-class)) #:use-module ((language python format2) #:select ()) diff --git a/modules/language/python/module/re/parser.scm b/modules/language/python/module/re/parser.scm index 40469e2..765500d 100644 --- a/modules/language/python/module/re/parser.scm +++ b/modules/language/python/module/re/parser.scm @@ -14,7 +14,7 @@ (mk q+? #:+? "+?") (define subexpr (f-list #:sub - (f-seq (f-tag "(") (Ds ee) (f-tag ")")))) + (f-seq (f-tag "(") (Ds ee-) (f-tag ")")))) (define f-back (f-or (f-list #:class (mk-token (f-reg! "[AZbBdDsSwntr]"))) @@ -58,6 +58,7 @@ ")")) (define (bch f) (f-or! (f-seq (f-or! (f-tag "\\n") f-nl) (f-out (list->string (list #\newline)))) + (f-seq (f-char #\\) f-back) f)) (define bbody (f-cons (f-or! (f-list #:range (bch (mk-token (f-reg! "."))) @@ -83,7 +84,7 @@ (define f-bar (f-tag "|")) -(define qq (ch (f-reg "[][?+|*.$^()\\]"))) +(define qq (ch (f-reg "[[?+|*.$^()\\]"))) (define atom (f-or qq f-. flags2 choice subexpr anongroup namegroup incant coment lookh lookh! rev rev! f-^ f-$ flags)) (define spec (f-list #:op atom (f-or! q+? q?? q*? q* q? q+ repn? repnm? repn repnm))) @@ -93,7 +94,9 @@ (f* (f-or! f-com f-nl (f-reg "[ \t\r]"))) f-true))) (define line (f-cons* #:seq ws aatom ws (ff* (f-seq ws aatom ws) ))) +(define line- (f-cons* #:seq aatom (ff* (f-seq aatom) ))) (define ee (f-cons* #:or line (ff* (f-seq f-bar line)))) +(define ee- (f-cons* #:or line- (ff* (f-seq f-bar line-)))) (define pretty (make-fluid #f)) (define (parse-reg str) (with-fluids ((*whitespace* ws)) diff --git a/modules/language/python/module/socketserver.py b/modules/language/python/module/socketserver.py new file mode 100644 index 0000000..516f718 --- /dev/null +++ b/modules/language/python/module/socketserver.py @@ -0,0 +1,795 @@ +module(socketserver) + +"""Generic socket server classes. + +This module tries to capture the various aspects of defining a server: + +For socket-based servers: + +- address family: + - AF_INET{,6}: IP (Internet Protocol) sockets (default) + - AF_UNIX: Unix domain sockets + - others, e.g. AF_DECNET are conceivable (see <socket.h> +- socket type: + - SOCK_STREAM (reliable stream, e.g. TCP) + - SOCK_DGRAM (datagrams, e.g. UDP) + +For request-based servers (including socket-based): + +- client address verification before further looking at the request + (This is actually a hook for any processing that needs to look + at the request before anything else, e.g. logging) +- how to handle multiple requests: + - synchronous (one request is handled at a time) + - forking (each request is handled by a new process) + - threading (each request is handled by a new thread) + +The classes in this module favor the server type that is simplest to +write: a synchronous TCP/IP server. This is bad class design, but +save some typing. (There's also the issue that a deep class hierarchy +slows down method lookups.) + +There are five classes in an inheritance diagram, four of which represent +synchronous servers of four types: + + +------------+ + | BaseServer | + +------------+ + | + v + +-----------+ +------------------+ + | TCPServer |------->| UnixStreamServer | + +-----------+ +------------------+ + | + v + +-----------+ +--------------------+ + | UDPServer |------->| UnixDatagramServer | + +-----------+ +--------------------+ + +Note that UnixDatagramServer derives from UDPServer, not from +UnixStreamServer -- the only difference between an IP and a Unix +stream server is the address family, which is simply repeated in both +unix server classes. + +Forking and threading versions of each type of server can be created +using the ForkingMixIn and ThreadingMixIn mix-in classes. For +instance, a threading UDP server class is created as follows: + + class ThreadingUDPServer(ThreadingMixIn, UDPServer): pass + +The Mix-in class must come first, since it overrides a method defined +in UDPServer! Setting the various member variables also changes +the behavior of the underlying server mechanism. + +To implement a service, you must derive a class from +BaseRequestHandler and redefine its handle() method. You can then run +various versions of the service by combining one of the server classes +with your request handler class. + +The request handler class must be different for datagram or stream +services. This can be hidden by using the request handler +subclasses StreamRequestHandler or DatagramRequestHandler. + +Of course, you still have to use your head! + +For instance, it makes no sense to use a forking server if the service +contains state in memory that can be modified by requests (since the +modifications in the child process would never reach the initial state +kept in the parent process and passed to each child). In this case, +you can use a threading server, but you will probably have to use +locks to avoid two requests that come in nearly simultaneous to apply +conflicting changes to the server state. + +On the other hand, if you are building e.g. an HTTP server, where all +data is stored externally (e.g. in the file system), a synchronous +class will essentially render the service "deaf" while one request is +being handled -- which may be for a very long time if a client is slow +to read all the data it has requested. Here a threading or forking +server is appropriate. + +In some cases, it may be appropriate to process part of a request +synchronously, but to finish processing in a forked child depending on +the request data. This can be implemented by using a synchronous +server and doing an explicit fork in the request handler class +handle() method. + +Another approach to handling multiple simultaneous requests in an +environment that supports neither threads nor fork (or where these are +too expensive or inappropriate for the service) is to maintain an +explicit table of partially finished requests and to use a selector to +decide which request to work on next (or whether to handle a new +incoming request). This is particularly important for stream services +where each client can potentially be connected for a long time (if +threads or subprocesses cannot be used). + +Future work: +- Standard classes for Sun RPC (which uses either UDP or TCP) +- Standard mix-in classes to implement various authentication + and encryption schemes + +XXX Open problems: +- What to do with out-of-band data? + +BaseServer: +- split generic "request" functionality out into BaseServer class. + Copyright (C) 2000 Luke Kenneth Casson Leighton <lkcl@samba.org> + + example: read entries from a SQL database (requires overriding + get_request() to return a table entry from the database). + entry is processed by a RequestHandlerClass. + +""" + +# Author of the BaseServer patch: Luke Kenneth Casson Leighton + +__version__ = "0.4" + + +import socket +import selectors +import os +import errno +import sys +try: + import threading +except ImportError: + import dummy_threading as threading +from io import BufferedIOBase +from time import monotonic as time + +__all__ = ["BaseServer", "TCPServer", "UDPServer", + "ThreadingUDPServer", "ThreadingTCPServer", + "BaseRequestHandler", "StreamRequestHandler", + "DatagramRequestHandler", "ThreadingMixIn"] +if hasattr(os, "fork"): + __all__.extend(["ForkingUDPServer","ForkingTCPServer", "ForkingMixIn"]) +if hasattr(socket, "AF_UNIX"): + __all__.extend(["UnixStreamServer","UnixDatagramServer", + "ThreadingUnixStreamServer", + "ThreadingUnixDatagramServer"]) + +# poll/select have the advantage of not requiring any extra file descriptor, +# contrarily to epoll/kqueue (also, they require a single syscall). +if hasattr(selectors, 'PollSelector'): + _ServerSelector = selectors.PollSelector +else: + _ServerSelector = selectors.SelectSelector + + +class BaseServer: + + """Base class for server classes. + + Methods for the caller: + + - __init__(server_address, RequestHandlerClass) + - serve_forever(poll_interval=0.5) + - shutdown() + - handle_request() # if you do not use serve_forever() + - fileno() -> int # for selector + + Methods that may be overridden: + + - server_bind() + - server_activate() + - get_request() -> request, client_address + - handle_timeout() + - verify_request(request, client_address) + - server_close() + - process_request(request, client_address) + - shutdown_request(request) + - close_request(request) + - service_actions() + - handle_error() + + Methods for derived classes: + + - finish_request(request, client_address) + + Class variables that may be overridden by derived classes or + instances: + + - timeout + - address_family + - socket_type + - allow_reuse_address + + Instance variables: + + - RequestHandlerClass + - socket + + """ + + timeout = None + + def __init__(self, server_address, RequestHandlerClass): + """Constructor. May be extended, do not override.""" + self.server_address = server_address + self.RequestHandlerClass = RequestHandlerClass + self.__is_shut_down = threading.Event() + self.__shutdown_request = False + + def server_activate(self): + """Called by constructor to activate the server. + + May be overridden. + + """ + pass + + def serve_forever(self, poll_interval=0.5): + """Handle one request at a time until shutdown. + + Polls for shutdown every poll_interval seconds. Ignores + self.timeout. If you need to do periodic tasks, do them in + another thread. + """ + self.__is_shut_down.clear() + try: + # XXX: Consider using another file descriptor or connecting to the + # socket to wake this up instead of polling. Polling reduces our + # responsiveness to a shutdown request and wastes cpu at all other + # times. + with _ServerSelector() as selector: + selector.register(self, selectors.EVENT_READ) + + while not self.__shutdown_request: + ready = selector.select(poll_interval) + if ready: + self._handle_request_noblock() + + self.service_actions() + finally: + self.__shutdown_request = False + self.__is_shut_down.set() + + def shutdown(self): + """Stops the serve_forever loop. + + Blocks until the loop has finished. This must be called while + serve_forever() is running in another thread, or it will + deadlock. + """ + self.__shutdown_request = True + self.__is_shut_down.wait() + + def service_actions(self): + """Called by the serve_forever() loop. + + May be overridden by a subclass / Mixin to implement any code that + needs to be run during the loop. + """ + pass + + # The distinction between handling, getting, processing and finishing a + # request is fairly arbitrary. Remember: + # + # - handle_request() is the top-level call. It calls selector.select(), + # get_request(), verify_request() and process_request() + # - get_request() is different for stream or datagram sockets + # - process_request() is the place that may fork a new process or create a + # new thread to finish the request + # - finish_request() instantiates the request handler class; this + # constructor will handle the request all by itself + + def handle_request(self): + """Handle one request, possibly blocking. + + Respects self.timeout. + """ + # Support people who used socket.settimeout() to escape + # handle_request before self.timeout was available. + timeout = self.socket.gettimeout() + if timeout is None: + timeout = self.timeout + elif self.timeout is not None: + timeout = min(timeout, self.timeout) + if timeout is not None: + deadline = time() + timeout + + # Wait until a request arrives or the timeout expires - the loop is + # necessary to accommodate early wakeups due to EINTR. + with _ServerSelector() as selector: + selector.register(self, selectors.EVENT_READ) + + while True: + ready = selector.select(timeout) + if ready: + return self._handle_request_noblock() + else: + if timeout is not None: + timeout = deadline - time() + if timeout < 0: + return self.handle_timeout() + + def _handle_request_noblock(self): + """Handle one request, without blocking. + + I assume that selector.select() has returned that the socket is + readable before this function was called, so there should be no risk of + blocking in get_request(). + """ + try: + request, client_address = self.get_request() + except OSError: + return + if self.verify_request(request, client_address): + try: + self.process_request(request, client_address) + except Exception: + self.handle_error(request, client_address) + self.shutdown_request(request) + except: + self.shutdown_request(request) + raise + else: + self.shutdown_request(request) + + def handle_timeout(self): + """Called if no new request arrives within self.timeout. + + Overridden by ForkingMixIn. + """ + pass + + def verify_request(self, request, client_address): + """Verify the request. May be overridden. + + Return True if we should proceed with this request. + + """ + return True + + def process_request(self, request, client_address): + """Call finish_request. + + Overridden by ForkingMixIn and ThreadingMixIn. + + """ + self.finish_request(request, client_address) + self.shutdown_request(request) + + def server_close(self): + """Called to clean-up the server. + + May be overridden. + + """ + pass + + def finish_request(self, request, client_address): + """Finish one request by instantiating RequestHandlerClass.""" + self.RequestHandlerClass(request, client_address, self) + + def shutdown_request(self, request): + """Called to shutdown and close an individual request.""" + self.close_request(request) + + def close_request(self, request): + """Called to clean up an individual request.""" + pass + + def handle_error(self, request, client_address): + """Handle an error gracefully. May be overridden. + + The default is to print a traceback and continue. + + """ + print('-'*40, file=sys.stderr) + print('Exception happened during processing of request from', + client_address, file=sys.stderr) + import traceback + traceback.print_exc() + print('-'*40, file=sys.stderr) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.server_close() + + +class TCPServer(BaseServer): + + """Base class for various socket-based server classes. + + Defaults to synchronous IP stream (i.e., TCP). + + Methods for the caller: + + - __init__(server_address, RequestHandlerClass, bind_and_activate=True) + - serve_forever(poll_interval=0.5) + - shutdown() + - handle_request() # if you don't use serve_forever() + - fileno() -> int # for selector + + Methods that may be overridden: + + - server_bind() + - server_activate() + - get_request() -> request, client_address + - handle_timeout() + - verify_request(request, client_address) + - process_request(request, client_address) + - shutdown_request(request) + - close_request(request) + - handle_error() + + Methods for derived classes: + + - finish_request(request, client_address) + + Class variables that may be overridden by derived classes or + instances: + + - timeout + - address_family + - socket_type + - request_queue_size (only for stream sockets) + - allow_reuse_address + + Instance variables: + + - server_address + - RequestHandlerClass + - socket + + """ + + address_family = socket.AF_INET + + socket_type = socket.SOCK_STREAM + + request_queue_size = 5 + + allow_reuse_address = False + + def __init__(self, server_address, RequestHandlerClass, bind_and_activate=True): + """Constructor. May be extended, do not override.""" + BaseServer.__init__(self, server_address, RequestHandlerClass) + self.socket = socket.socket(self.address_family, + self.socket_type) + if bind_and_activate: + try: + self.server_bind() + self.server_activate() + except: + self.server_close() + raise + + def server_bind(self): + """Called by constructor to bind the socket. + + May be overridden. + + """ + if self.allow_reuse_address: + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind(self.server_address) + self.server_address = self.socket.getsockname() + + def server_activate(self): + """Called by constructor to activate the server. + + May be overridden. + + """ + self.socket.listen(self.request_queue_size) + + def server_close(self): + """Called to clean-up the server. + + May be overridden. + + """ + self.socket.close() + + def fileno(self): + """Return socket file number. + + Interface required by selector. + + """ + return self.socket.fileno() + + def get_request(self): + """Get the request and client address from the socket. + + May be overridden. + + """ + return self.socket.accept() + + def shutdown_request(self, request): + """Called to shutdown and close an individual request.""" + try: + #explicitly shutdown. socket.close() merely releases + #the socket and waits for GC to perform the actual close. + request.shutdown(socket.SHUT_WR) + except OSError: + pass #some platforms may raise ENOTCONN here + self.close_request(request) + + def close_request(self, request): + """Called to clean up an individual request.""" + request.close() + + +class UDPServer(TCPServer): + + """UDP server class.""" + + allow_reuse_address = False + + socket_type = socket.SOCK_DGRAM + + max_packet_size = 8192 + + def get_request(self): + data, client_addr = self.socket.recvfrom(self.max_packet_size) + return (data, self.socket), client_addr + + def server_activate(self): + # No need to call listen() for UDP. + pass + + def shutdown_request(self, request): + # No need to shutdown anything. + self.close_request(request) + + def close_request(self, request): + # No need to close anything. + pass + +if hasattr(os, "fork"): + class ForkingMixIn: + """Mix-in class to handle each request in a new process.""" + + timeout = 300 + active_children = None + max_children = 40 + + def collect_children(self): + """Internal routine to wait for children that have exited.""" + if self.active_children is None: + return + + # If we're above the max number of children, wait and reap them until + # we go back below threshold. Note that we use waitpid(-1) below to be + # able to collect children in size(<defunct children>) syscalls instead + # of size(<children>): the downside is that this might reap children + # which we didn't spawn, which is why we only resort to this when we're + # above max_children. + while len(self.active_children) >= self.max_children: + try: + pid, _ = os.waitpid(-1, 0) + self.active_children.discard(pid) + except ChildProcessError: + # we don't have any children, we're done + self.active_children.clear() + except OSError: + break + + # Now reap all defunct children. + for pid in self.active_children.copy(): + try: + pid, _ = os.waitpid(pid, os.WNOHANG) + # if the child hasn't exited yet, pid will be 0 and ignored by + # discard() below + self.active_children.discard(pid) + except ChildProcessError: + # someone else reaped it + self.active_children.discard(pid) + except OSError: + pass + + def handle_timeout(self): + """Wait for zombies after self.timeout seconds of inactivity. + + May be extended, do not override. + """ + self.collect_children() + + def service_actions(self): + """Collect the zombie child processes regularly in the ForkingMixIn. + + service_actions is called in the BaseServer's serve_forver loop. + """ + self.collect_children() + + def process_request(self, request, client_address): + """Fork a new subprocess to process the request.""" + pid = os.fork() + if pid: + # Parent process + if self.active_children is None: + self.active_children = set() + self.active_children.add(pid) + self.close_request(request) + return + else: + # Child process. + # This must never return, hence os._exit()! + status = 1 + try: + self.finish_request(request, client_address) + status = 0 + except Exception: + self.handle_error(request, client_address) + finally: + try: + self.shutdown_request(request) + finally: + os._exit(status) + + +class ThreadingMixIn: + """Mix-in class to handle each request in a new thread.""" + + # Decides how threads will act upon termination of the + # main process + daemon_threads = False + + def process_request_thread(self, request, client_address): + """Same as in BaseServer but as a thread. + + In addition, exception handling is done here. + + """ + try: + self.finish_request(request, client_address) + except Exception: + self.handle_error(request, client_address) + finally: + self.shutdown_request(request) + + def process_request(self, request, client_address): + """Start a new thread to process the request.""" + t = threading.Thread(target = self.process_request_thread, + args = (request, client_address)) + t.daemon = self.daemon_threads + t.start() + + +if hasattr(os, "fork"): + class ForkingUDPServer(ForkingMixIn, UDPServer): pass + class ForkingTCPServer(ForkingMixIn, TCPServer): pass + +class ThreadingUDPServer(ThreadingMixIn, UDPServer): pass +class ThreadingTCPServer(ThreadingMixIn, TCPServer): pass + +if hasattr(socket, 'AF_UNIX'): + + class UnixStreamServer(TCPServer): + address_family = socket.AF_UNIX + + class UnixDatagramServer(UDPServer): + address_family = socket.AF_UNIX + + class ThreadingUnixStreamServer(ThreadingMixIn, UnixStreamServer): pass + + class ThreadingUnixDatagramServer(ThreadingMixIn, UnixDatagramServer): pass + +class BaseRequestHandler: + + """Base class for request handler classes. + + This class is instantiated for each request to be handled. The + constructor sets the instance variables request, client_address + and server, and then calls the handle() method. To implement a + specific service, all you need to do is to derive a class which + defines a handle() method. + + The handle() method can find the request as self.request, the + client address as self.client_address, and the server (in case it + needs access to per-server information) as self.server. Since a + separate instance is created for each request, the handle() method + can define other arbitrary instance variables. + + """ + + def __init__(self, request, client_address, server): + self.request = request + self.client_address = client_address + self.server = server + self.setup() + try: + self.handle() + finally: + self.finish() + + def setup(self): + pass + + def handle(self): + pass + + def finish(self): + pass + + +# The following two classes make it possible to use the same service +# class for stream or datagram servers. +# Each class sets up these instance variables: +# - rfile: a file object from which receives the request is read +# - wfile: a file object to which the reply is written +# When the handle() method returns, wfile is flushed properly + + +class StreamRequestHandler(BaseRequestHandler): + + """Define self.rfile and self.wfile for stream sockets.""" + + # Default buffer sizes for rfile, wfile. + # We default rfile to buffered because otherwise it could be + # really slow for large data (a getc() call per byte); we make + # wfile unbuffered because (a) often after a write() we want to + # read and we need to flush the line; (b) big writes to unbuffered + # files are typically optimized by stdio even when big reads + # aren't. + rbufsize = -1 + wbufsize = 0 + + # A timeout to apply to the request socket, if not None. + timeout = None + + # Disable nagle algorithm for this socket, if True. + # Use only when wbufsize != 0, to avoid small packets. + disable_nagle_algorithm = False + + def setup(self): + self.connection = self.request + if self.timeout is not None: + self.connection.settimeout(self.timeout) + if self.disable_nagle_algorithm: + self.connection.setsockopt(socket.IPPROTO_TCP, + socket.TCP_NODELAY, True) + self.rfile = self.connection.makefile('rb', self.rbufsize) + if self.wbufsize == 0: + self.wfile = _SocketWriter(self.connection) + else: + self.wfile = self.connection.makefile('wb', self.wbufsize) + + def finish(self): + if not self.wfile.closed: + try: + self.wfile.flush() + except socket.error: + # A final socket error may have occurred here, such as + # the local error ECONNABORTED. + pass + self.wfile.close() + self.rfile.close() + +class _SocketWriter(BufferedIOBase): + """Simple writable BufferedIOBase implementation for a socket + + Does not hold data in a buffer, avoiding any need to call flush().""" + + def __init__(self, sock): + self._sock = sock + + def writable(self): + return True + + def write(self, b): + self._sock.sendall(b) + with memoryview(b) as view: + return view.nbytes + + def fileno(self): + return self._sock.fileno() + +class DatagramRequestHandler(BaseRequestHandler): + + """Define self.rfile and self.wfile for datagram sockets.""" + + def setup(self): + from io import BytesIO + self.packet, self.socket = self.request + self.rfile = BytesIO(self.packet) + self.wfile = BytesIO() + + def finish(self): + self.socket.sendto(self.wfile.getvalue(), self.client_address) diff --git a/modules/language/python/module/string.scm b/modules/language/python/module/string.scm index 9c0d818..63adfcd 100644 --- a/modules/language/python/module/string.scm +++ b/modules/language/python/module/string.scm @@ -338,7 +338,7 @@ (lambda (self format_string args kwargs) (set self '_args '()) (for ((s fn fo co : ((ref self 'parse) format_string))) ((ss '(""))) - (vformat self s fn fo co ss args kwargs) + (vformat1 self s fn fo co ss args kwargs) #:final (begin ((ref self 'check_unused_args) (ref self '_args) args kwargs) diff --git a/modules/language/python/module/threading.scm b/modules/language/python/module/threading.scm index 7a7d669..1f1f83b 100644 --- a/modules/language/python/module/threading.scm +++ b/modules/language/python/module/threading.scm @@ -2,8 +2,9 @@ #:use-module (ice-9 threads) #:use-module (oop pf-objects) #:use-module (language python def) + #:use-module (language python exceptions) #:use-module (language python list) - #:export (RLock start_new_thread allocate_lock)) + #:export (RLock start_new_thread allocate_lock Thread)) (define-python-class RLock () (define __init__ @@ -40,3 +41,17 @@ (call-with-new-thread (lambda () (apply fkn (to-list args))))) + +(define-python-class Thread () + (define __init__ + (lam (self (= target None) (= args '())) + (set self 'target target) + (set self 'args args))) + + (define start + (lambda (self) + (call-with-new-thread + (lambda () + (apply (ref self 'target) + (to-list (ref self 'args)))))))) + diff --git a/modules/language/python/module/urllib.py b/modules/language/python/module/urllib.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/modules/language/python/module/urllib.py diff --git a/modules/language/python/module/urllib/error.py b/modules/language/python/module/urllib/error.py new file mode 100644 index 0000000..b6b1211 --- /dev/null +++ b/modules/language/python/module/urllib/error.py @@ -0,0 +1,78 @@ +module(urllib,error) + +"""Exception classes raised by urllib. + +The base exception class is URLError, which inherits from OSError. It +doesn't define any behavior of its own, but is the base class for all +exceptions defined in this package. + +HTTPError is an exception class that is also a valid HTTP response +instance. It behaves this way because HTTP protocol errors are valid +responses, with a status code, headers, and a body. In some contexts, +an application may want to handle an exception like a regular +response. +""" + +import urllib.response + +__all__ = ['URLError', 'HTTPError', 'ContentTooShortError'] + + +class URLError(OSError): + # URLError is a sub-type of OSError, but it doesn't share any of + # the implementation. need to override __init__ and __str__. + # It sets self.args for compatibility with other EnvironmentError + # subclasses, but args doesn't have the typical format with errno in + # slot 0 and strerror in slot 1. This may be better than nothing. + def __init__(self, reason, filename=None): + self.args = reason, + self.reason = reason + if filename is not None: + self.filename = filename + + def __str__(self): + return '<urlopen error %s>' % self.reason + + +class HTTPError(URLError, urllib.response.addinfourl): + """Raised when HTTP error occurs, but also acts like non-error return""" + __super_init = urllib.response.addinfourl.__init__ + + def __init__(self, url, code, msg, hdrs, fp): + self.code = code + self.msg = msg + self.hdrs = hdrs + self.fp = fp + self.filename = url + # The addinfourl classes depend on fp being a valid file + # object. In some cases, the HTTPError may not have a valid + # file object. If this happens, the simplest workaround is to + # not initialize the base classes. + if fp is not None: + self.__super_init(fp, hdrs, url, code) + + def __str__(self): + return 'HTTP Error %s: %s' % (self.code, self.msg) + + def __repr__(self): + return '<HTTPError %s: %r>' % (self.code, self.msg) + + # since URLError specifies a .reason attribute, HTTPError should also + # provide this attribute. See issue13211 for discussion. + @property + def reason(self): + return self.msg + + @property + def headers(self): + return self.hdrs + + @headers.setter + def headers(self, headers): + self.hdrs = headers + +class ContentTooShortError(URLError): + """Exception raised when downloaded size does not match content-length.""" + def __init__(self, message, content): + URLError.__init__(self, message) + self.content = content diff --git a/modules/language/python/number.scm b/modules/language/python/number.scm index 3d95ded..6ede96c 100644 --- a/modules/language/python/number.scm +++ b/modules/language/python/number.scm @@ -402,6 +402,12 @@ (let ((mask (ash 1 (- i 1)))) (- s mask)) s)))))))) + (define __newobj__ + (lambda (cls n) + (let ((obj ((rawref object '__new__) cls))) + (slot-set! obj 'x (__new__ cls n)) + obj))) + (define __new__ (letrec ((__new__ (case-lambda diff --git a/modules/language/python/string.scm b/modules/language/python/string.scm index ad80a87..864a7a1 100644 --- a/modules/language/python/string.scm +++ b/modules/language/python/string.scm @@ -409,10 +409,8 @@ (define-py (py-split split s . l) (define N 1000000000000) - (define ws (list (list (char->integer #\space)) - (list (char->integer #\newline)) - (list (char->integer #\tab)))) - + (define ws? #f) + (define ws (list #\space #\newline #\tab #\return)) (define (to-ch x) (string-ref (scm-str x) 0)) (define (mksep sep) @@ -426,38 +424,61 @@ (else (to-ch x))) l) #:final - (list (reverse l)))) + (reverse l))) (call-with-values (lambda () (match l - (() (values (list ws ) N)) - ((sep) (values (mksep sep) N)) - ((sep n) (values (mksep sep) n)))) + (() + (set! ws? #t) + (values '() N)) + + ((sep) + (values (mksep sep) N)) + + ((sep n) + (values (mksep sep) n)))) + (lambda (sep n) - (let lp ((l (to-list s)) (i 0) (v '()) (r '())) + (let lp ((l (string->list (scm-str s))) (i 0) (v '()) (r '())) (if (= i n) - (reverse r) + (reverse (cons (list->string l) r)) (if (pair? l) - (let ((ch (to-ch (car l)))) - (let lp2 ((ss sep)) - (if (pair? ss) - (let lp3 ((sl (car ss)) (l3 l)) - (if (pair? sl) - (if (pair? l3) - (let ((s (car sl))) - (if (eqv? s ch) - (lp3 (cdr sl) (cdr l3)) - (lp2 (cdr ss)))) - (lp2 (cdr ss))) - (lp l3 (+ i 1) - '() - (cons - (list->string (reverse v)) - r)))) - (lp (cdr l) i (cons ch v) r)))) + (let ((ch (car l))) + (if ws? + (if (member ch ws) + (let lp2 ((l (cdr l))) + (if (pair? l) + (let ((ch (car l))) + (if (member ch ws) + (lp2 (cdr l)) + (lp l (+ i 1) '() + (cons + (list->string (reverse v)) + r)))) + (lp l (+ i 1) '() + (cons + (list->string (reverse v)) + r)))) + (lp (cdr l) i (cons ch v) r)) + (if (eq? ch (car sep)) + (let lp2 ((ll (cdr l)) (s (cdr sep))) + (if (pair? s) + (if (pair? ll) + (let ((ch2 (car ll))) + (if (eq? ch2 (car s)) + (lp2 (cdr ll) (cdr sep)) + (lp (cdr l) i (cons ch v) r))) + (lp (cdr l) i (cons ch v) r)) + (lp ll (+ i 1) '() + (cons + (list->string (reverse v)) + r)))) + (lp (cdr l) i (cons ch v) r)))) (reverse (cons (list->string (reverse v)) r)))))))) + + @@ -598,7 +619,7 @@ (define b? #f) (define b-decode #f) -(define-python-class string (<py-string>) +(define-python-class string (<py> <py-string>) (define __init__ (case-lambda ((self) @@ -620,7 +641,13 @@ (define __new__ (lambda x (apply __init__ x))) - + + (define __newobj__ + (lambda (cls value) + (let ((obj ((rawref object '__new__) cls))) + (slot-set! obj 'str (__new__ cls value)) + obj))) + (define __repr__ (lambda (self) (slot-ref self 'str)))) |