summaryrefslogtreecommitdiff
path: root/modules/language/python/module/email
diff options
context:
space:
mode:
authorStefan Israelsson Tampe <stefan.itampe@gmail.com>2018-09-06 23:40:42 +0200
committerStefan Israelsson Tampe <stefan.itampe@gmail.com>2018-09-06 23:40:42 +0200
commit8733038b4de44a8cfb669420621597457eeef88a (patch)
tree9f7345b4b407ea0919320e17955b058cb2f2a5bc /modules/language/python/module/email
parentb950c0d70a1d2c95da8d60aca7af02bc50542a2b (diff)
email policy works mm
Diffstat (limited to 'modules/language/python/module/email')
-rw-r--r--modules/language/python/module/email/_encoded_words.py222
-rw-r--r--modules/language/python/module/email/_header_value_parser.py2820
-rw-r--r--modules/language/python/module/email/_policybase.py6
-rw-r--r--modules/language/python/module/email/contentmanager.py254
-rw-r--r--modules/language/python/module/email/feedparser.py1
-rw-r--r--modules/language/python/module/email/generator.py509
-rw-r--r--modules/language/python/module/email/headerregistry.py592
-rw-r--r--modules/language/python/module/email/iterators.py72
-rw-r--r--modules/language/python/module/email/message.py1164
-rw-r--r--modules/language/python/module/email/parser.py1
-rw-r--r--modules/language/python/module/email/policy.py231
-rw-r--r--modules/language/python/module/email/utils.py1
12 files changed, 5868 insertions, 5 deletions
diff --git a/modules/language/python/module/email/_encoded_words.py b/modules/language/python/module/email/_encoded_words.py
new file mode 100644
index 0000000..8c02709
--- /dev/null
+++ b/modules/language/python/module/email/_encoded_words.py
@@ -0,0 +1,222 @@
+module(email,_encoded_words)
+""" Routines for manipulating RFC2047 encoded words.
+
+This is currently a package-private API, but will be considered for promotion
+to a public API if there is demand.
+
+"""
+
+# An ecoded word looks like this:
+#
+# =?charset[*lang]?cte?encoded_string?=
+#
+# for more information about charset see the charset module. Here it is one
+# of the preferred MIME charset names (hopefully; you never know when parsing).
+# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In
+# theory other letters could be used for other encodings, but in practice this
+# (almost?) never happens. There could be a public API for adding entries
+# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is
+# Base64. The meaning of encoded_string should be obvious. 'lang' is optional
+# as indicated by the brackets (they are not part of the syntax) but is almost
+# never encountered in practice.
+#
+# The general interface for a CTE decoder is that it takes the encoded_string
+# as its argument, and returns a tuple (cte_decoded_string, defects). The
+# cte_decoded_string is the original binary that was encoded using the
+# specified cte. 'defects' is a list of MessageDefect instances indicating any
+# problems encountered during conversion. 'charset' and 'lang' are the
+# corresponding strings extracted from the EW, case preserved.
+#
+# The general interface for a CTE encoder is that it takes a binary sequence
+# as input and returns the cte_encoded_string, which is an ascii-only string.
+#
+# Each decoder must also supply a length function that takes the binary
+# sequence as its argument and returns the length of the resulting encoded
+# string.
+#
+# The main API functions for the module are decode, which calls the decoder
+# referenced by the cte specifier, and encode, which adds the appropriate
+# RFC 2047 "chrome" to the encoded string, and can optionally automatically
+# select the shortest possible encoding. See their docstrings below for
+# details.
+
+import re
+import base64
+import binascii
+import functools
+from string import ascii_letters, digits
+import email.errors as errors
+
+__all__ = ['decode_q',
+ 'encode_q',
+ 'decode_b',
+ 'encode_b',
+ 'len_q',
+ 'len_b',
+ 'decode',
+ 'encode',
+ ]
+
+#
+# Quoted Printable
+#
+
+# regex based decoder.
+_q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub,
+ lambda m: bytes([int(m.group(1), 16)]))
+
+def decode_q(encoded):
+ encoded = encoded.replace(b'_', b' ')
+ return _q_byte_subber(encoded), []
+
+
+# dict mapping bytes to their encoded form
+class _QByteMap(dict):
+
+ safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii')
+
+ def __missing__(self, key):
+ if key in self.safe:
+ self[key] = chr(key)
+ else:
+ self[key] = "={:02X}".format(key)
+ return self[key]
+
+_q_byte_map = _QByteMap()
+
+# In headers spaces are mapped to '_'.
+_q_byte_map[ord(' ')] = '_'
+
+def encode_q(bstring):
+ return ''.join(_q_byte_map[x] for x in bstring)
+
+def len_q(bstring):
+ return sum(len(_q_byte_map[x]) for x in bstring)
+
+
+#
+# Base64
+#
+
+def decode_b(encoded):
+ defects = []
+ pad_err = len(encoded) % 4
+ if pad_err:
+ defects.append(errors.InvalidBase64PaddingDefect())
+ padded_encoded = encoded + b'==='[:4-pad_err]
+ else:
+ padded_encoded = encoded
+ try:
+ return base64.b64decode(padded_encoded, validate=True), defects
+ except binascii.Error:
+ # Since we had correct padding, this must an invalid char error.
+ defects = [errors.InvalidBase64CharactersDefect()]
+ # The non-alphabet characters are ignored as far as padding
+ # goes, but we don't know how many there are. So we'll just
+ # try various padding lengths until something works.
+ for i in 0, 1, 2, 3:
+ try:
+ return base64.b64decode(encoded+b'='*i, validate=False), defects
+ except binascii.Error:
+ if i==0:
+ defects.append(errors.InvalidBase64PaddingDefect())
+ else:
+ # This should never happen.
+ raise AssertionError("unexpected binascii.Error")
+
+def encode_b(bstring):
+ return base64.b64encode(bstring).decode('ascii')
+
+def len_b(bstring):
+ groups_of_3, leftover = divmod(len(bstring), 3)
+ # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
+ return groups_of_3 * 4 + (4 if leftover else 0)
+
+
+_cte_decoders = {
+ 'q': decode_q,
+ 'b': decode_b,
+ }
+
+def decode(ew):
+ """Decode encoded word and return (string, charset, lang, defects) tuple.
+
+ An RFC 2047/2243 encoded word has the form:
+
+ =?charset*lang?cte?encoded_string?=
+
+ where '*lang' may be omitted but the other parts may not be.
+
+ This function expects exactly such a string (that is, it does not check the
+ syntax and may raise errors if the string is not well formed), and returns
+ the encoded_string decoded first from its Content Transfer Encoding and
+ then from the resulting bytes into unicode using the specified charset. If
+ the cte-decoded string does not successfully decode using the specified
+ character set, a defect is added to the defects list and the unknown octets
+ are replaced by the unicode 'unknown' character \\uFDFF.
+
+ The specified charset and language are returned. The default for language,
+ which is rarely if ever encountered, is the empty string.
+
+ """
+ _, charset, cte, cte_string, _ = ew.split('?')
+ charset, _, lang = charset.partition('*')
+ cte = cte.lower()
+ # Recover the original bytes and do CTE decoding.
+ bstring = cte_string.encode('ascii', 'surrogateescape')
+ bstring, defects = _cte_decoders[cte](bstring)
+ # Turn the CTE decoded bytes into unicode.
+ try:
+ string = bstring.decode(charset)
+ except UnicodeError:
+ defects.append(errors.UndecodableBytesDefect("Encoded word "
+ "contains bytes not decodable using {} charset".format(charset)))
+ string = bstring.decode(charset, 'surrogateescape')
+ except LookupError:
+ string = bstring.decode('ascii', 'surrogateescape')
+ if charset.lower() != 'unknown-8bit':
+ defects.append(errors.CharsetError("Unknown charset {} "
+ "in encoded word; decoded as unknown bytes".format(charset)))
+ return string, charset, lang, defects
+
+
+_cte_encoders = {
+ 'q': encode_q,
+ 'b': encode_b,
+ }
+
+_cte_encode_length = {
+ 'q': len_q,
+ 'b': len_b,
+ }
+
+def encode(string, charset='utf-8', encoding=None, lang=''):
+ """Encode string using the CTE encoding that produces the shorter result.
+
+ Produces an RFC 2047/2243 encoded word of the form:
+
+ =?charset*lang?cte?encoded_string?=
+
+ where '*lang' is omitted unless the 'lang' parameter is given a value.
+ Optional argument charset (defaults to utf-8) specifies the charset to use
+ to encode the string to binary before CTE encoding it. Optional argument
+ 'encoding' is the cte specifier for the encoding that should be used ('q'
+ or 'b'); if it is None (the default) the encoding which produces the
+ shortest encoded sequence is used, except that 'q' is preferred if it is up
+ to five characters longer. Optional argument 'lang' (default '') gives the
+ RFC 2243 language string to specify in the encoded word.
+
+ """
+ if charset == 'unknown-8bit':
+ bstring = string.encode('ascii', 'surrogateescape')
+ else:
+ bstring = string.encode(charset)
+ if encoding is None:
+ qlen = _cte_encode_length['q'](bstring)
+ blen = _cte_encode_length['b'](bstring)
+ # Bias toward q. 5 is arbitrary.
+ encoding = 'q' if qlen - blen < 5 else 'b'
+ encoded = _cte_encoders[encoding](bstring)
+ if lang:
+ lang = '*' + lang
+ return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded)
diff --git a/modules/language/python/module/email/_header_value_parser.py b/modules/language/python/module/email/_header_value_parser.py
new file mode 100644
index 0000000..0f4a1cb
--- /dev/null
+++ b/modules/language/python/module/email/_header_value_parser.py
@@ -0,0 +1,2820 @@
+module(email,_header_value_parser)
+
+"""Header value parser implementing various email-related RFC parsing rules.
+
+The parsing methods defined in this module implement various email related
+parsing rules. Principal among them is RFC 5322, which is the followon
+to RFC 2822 and primarily a clarification of the former. It also implements
+RFC 2047 encoded word decoding.
+
+RFC 5322 goes to considerable trouble to maintain backward compatibility with
+RFC 822 in the parse phase, while cleaning up the structure on the generation
+phase. This parser supports correct RFC 5322 generation by tagging white space
+as folding white space only when folding is allowed in the non-obsolete rule
+sets. Actually, the parser is even more generous when accepting input than RFC
+5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
+Where possible deviations from the standard are annotated on the 'defects'
+attribute of tokens that deviate.
+
+The general structure of the parser follows RFC 5322, and uses its terminology
+where there is a direct correspondence. Where the implementation requires a
+somewhat different structure than that used by the formal grammar, new terms
+that mimic the closest existing terms are used. Thus, it really helps to have
+a copy of RFC 5322 handy when studying this code.
+
+Input to the parser is a string that has already been unfolded according to
+RFC 5322 rules. According to the RFC this unfolding is the very first step, and
+this parser leaves the unfolding step to a higher level message parser, which
+will have already detected the line breaks that need unfolding while
+determining the beginning and end of each header.
+
+The output of the parser is a TokenList object, which is a list subclass. A
+TokenList is a recursive data structure. The terminal nodes of the structure
+are Terminal objects, which are subclasses of str. These do not correspond
+directly to terminal objects in the formal grammar, but are instead more
+practical higher level combinations of true terminals.
+
+All TokenList and Terminal objects have a 'value' attribute, which produces the
+semantically meaningful value of that part of the parse subtree. The value of
+all whitespace tokens (no matter how many sub-tokens they may contain) is a
+single space, as per the RFC rules. This includes 'CFWS', which is herein
+included in the general class of whitespace tokens. There is one exception to
+the rule that whitespace tokens are collapsed into single spaces in values: in
+the value of a 'bare-quoted-string' (a quoted-string with no leading or
+trailing whitespace), any whitespace that appeared between the quotation marks
+is preserved in the returned value. Note that in all Terminal strings quoted
+pairs are turned into their unquoted values.
+
+All TokenList and Terminal objects also have a string value, which attempts to
+be a "canonical" representation of the RFC-compliant form of the substring that
+produced the parsed subtree, including minimal use of quoted pair quoting.
+Whitespace runs are not collapsed.
+
+Comment tokens also have a 'content' attribute providing the string found
+between the parens (including any nested comments) with whitespace preserved.
+
+All TokenList and Terminal objects have a 'defects' attribute which is a
+possibly empty list all of the defects found while creating the token. Defects
+may appear on any token in the tree, and a composite list of all defects in the
+subtree is available through the 'all_defects' attribute of any node. (For
+Terminal notes x.defects == x.all_defects.)
+
+Each object in a parse tree is called a 'token', and each has a 'token_type'
+attribute that gives the name from the RFC 5322 grammar that it represents.
+Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
+may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
+It is returned in place of lists of (ctext/quoted-pair) and
+(qtext/quoted-pair).
+
+XXX: provide complete list of token types.
+"""
+
+import re
+import urllib.parse as parse # For urllib.parse.unquote
+from string import hexdigits
+from collections import OrderedDict
+from operator import itemgetter
+import email._encoded_words as _ew
+import email.errors as errors
+import email.utils as utils
+
+#
+# Useful constants and functions
+#
+
+WSP = set(' \t')
+CFWS_LEADER = WSP | set('(')
+SPECIALS = set(r'()<>@,:;.\"[]')
+ATOM_ENDS = SPECIALS | WSP
+DOT_ATOM_ENDS = ATOM_ENDS - set('.')
+# '.', '"', and '(' do not end phrases in order to support obs-phrase
+PHRASE_ENDS = SPECIALS - set('."(')
+TSPECIALS = (SPECIALS | set('/?=')) - set('.')
+TOKEN_ENDS = TSPECIALS | WSP
+ASPECIALS = TSPECIALS | set("*'%")
+ATTRIBUTE_ENDS = ASPECIALS | WSP
+EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
+
+def quote_string(value):
+ return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
+
+#
+# TokenList and its subclasses
+#
+
+class TokenList(list):
+
+ token_type = None
+ syntactic_break = True
+ ew_combine_allowed = True
+
+ def __init__(self, *args, **kw):
+ super().__init__(*args, **kw)
+ self.defects = []
+
+ def __str__(self):
+ return ''.join(str(x) for x in self)
+
+ def __repr__(self):
+ return '{}({})'.format(self.__class__.__name__,
+ super().__repr__())
+
+ @property
+ def value(self):
+ return ''.join(x.value for x in self if x.value)
+
+ @property
+ def all_defects(self):
+ return sum((x.all_defects for x in self), self.defects)
+
+ def startswith_fws(self):
+ return self[0].startswith_fws()
+
+ @property
+ def as_ew_allowed(self):
+ """True if all top level tokens of this part may be RFC2047 encoded."""
+ return all(part.as_ew_allowed for part in self)
+
+ @property
+ def comments(self):
+ comments = []
+ for token in self:
+ comments.extend(token.comments)
+ return comments
+
+ def fold(self, *, policy):
+ return _refold_parse_tree(self, policy=policy)
+
+ def pprint(self, indent=''):
+ print(self.ppstr(indent=indent))
+
+ def ppstr(self, indent=''):
+ return '\n'.join(self._pp(indent=indent))
+
+ def _pp(self, indent=''):
+ yield '{}{}/{}('.format(
+ indent,
+ self.__class__.__name__,
+ self.token_type)
+ for token in self:
+ if not hasattr(token, '_pp'):
+ yield (indent + ' !! invalid element in token '
+ 'list: {!r}'.format(token))
+ else:
+ yield from token._pp(indent+' ')
+ if self.defects:
+ extra = ' Defects: {}'.format(self.defects)
+ else:
+ extra = ''
+ yield '{}){}'.format(indent, extra)
+
+
+class WhiteSpaceTokenList(TokenList):
+
+ @property
+ def value(self):
+ return ' '
+
+ @property
+ def comments(self):
+ return [x.content for x in self if x.token_type=='comment']
+
+
+class UnstructuredTokenList(TokenList):
+
+ token_type = 'unstructured'
+
+
+class Phrase(TokenList):
+
+ token_type = 'phrase'
+
+class Word(TokenList):
+
+ token_type = 'word'
+
+
+class CFWSList(WhiteSpaceTokenList):
+
+ token_type = 'cfws'
+
+
+class Atom(TokenList):
+
+ token_type = 'atom'
+
+
+class Token(TokenList):
+
+ token_type = 'token'
+ encode_as_ew = False
+
+
+class EncodedWord(TokenList):
+
+ token_type = 'encoded-word'
+ cte = None
+ charset = None
+ lang = None
+
+
+class QuotedString(TokenList):
+
+ token_type = 'quoted-string'
+
+ @property
+ def content(self):
+ for x in self:
+ if x.token_type == 'bare-quoted-string':
+ return x.value
+
+ @property
+ def quoted_value(self):
+ res = []
+ for x in self:
+ if x.token_type == 'bare-quoted-string':
+ res.append(str(x))
+ else:
+ res.append(x.value)
+ return ''.join(res)
+
+ @property
+ def stripped_value(self):
+ for token in self:
+ if token.token_type == 'bare-quoted-string':
+ return token.value
+
+
+class BareQuotedString(QuotedString):
+
+ token_type = 'bare-quoted-string'
+
+ def __str__(self):
+ return quote_string(''.join(str(x) for x in self))
+
+ @property
+ def value(self):
+ return ''.join(str(x) for x in self)
+
+
+class Comment(WhiteSpaceTokenList):
+
+ token_type = 'comment'
+
+ def __str__(self):
+ return ''.join(sum([
+ ["("],
+ [self.quote(x) for x in self],
+ [")"],
+ ], []))
+
+ def quote(self, value):
+ if value.token_type == 'comment':
+ return str(value)
+ return str(value).replace('\\', '\\\\').replace(
+ '(', r'\(').replace(
+ ')', r'\)')
+
+ @property
+ def content(self):
+ return ''.join(str(x) for x in self)
+
+ @property
+ def comments(self):
+ return [self.content]
+
+class AddressList(TokenList):
+
+ token_type = 'address-list'
+
+ @property
+ def addresses(self):
+ return [x for x in self if x.token_type=='address']
+
+ @property
+ def mailboxes(self):
+ return sum((x.mailboxes
+ for x in self if x.token_type=='address'), [])
+
+ @property
+ def all_mailboxes(self):
+ return sum((x.all_mailboxes
+ for x in self if x.token_type=='address'), [])
+
+
+class Address(TokenList):
+
+ token_type = 'address'
+
+ @property
+ def display_name(self):
+ if self[0].token_type == 'group':
+ return self[0].display_name
+
+ @property
+ def mailboxes(self):
+ if self[0].token_type == 'mailbox':
+ return [self[0]]
+ elif self[0].token_type == 'invalid-mailbox':
+ return []
+ return self[0].mailboxes
+
+ @property
+ def all_mailboxes(self):
+ if self[0].token_type == 'mailbox':
+ return [self[0]]
+ elif self[0].token_type == 'invalid-mailbox':
+ return [self[0]]
+ return self[0].all_mailboxes
+
+class MailboxList(TokenList):
+
+ token_type = 'mailbox-list'
+
+ @property
+ def mailboxes(self):
+ return [x for x in self if x.token_type=='mailbox']
+
+ @property
+ def all_mailboxes(self):
+ return [x for x in self
+ if x.token_type in ('mailbox', 'invalid-mailbox')]
+
+
+class GroupList(TokenList):
+
+ token_type = 'group-list'
+
+ @property
+ def mailboxes(self):
+ if not self or self[0].token_type != 'mailbox-list':
+ return []
+ return self[0].mailboxes
+
+ @property
+ def all_mailboxes(self):
+ if not self or self[0].token_type != 'mailbox-list':
+ return []
+ return self[0].all_mailboxes
+
+
+class Group(TokenList):
+
+ token_type = "group"
+
+ @property
+ def mailboxes(self):
+ if self[2].token_type != 'group-list':
+ return []
+ return self[2].mailboxes
+
+ @property
+ def all_mailboxes(self):
+ if self[2].token_type != 'group-list':
+ return []
+ return self[2].all_mailboxes
+
+ @property
+ def display_name(self):
+ return self[0].display_name
+
+
+class NameAddr(TokenList):
+
+ token_type = 'name-addr'
+
+ @property
+ def display_name(self):
+ if len(self) == 1:
+ return None
+ return self[0].display_name
+
+ @property
+ def local_part(self):
+ return self[-1].local_part
+
+ @property
+ def domain(self):
+ return self[-1].domain
+
+ @property
+ def route(self):
+ return self[-1].route
+
+ @property
+ def addr_spec(self):
+ return self[-1].addr_spec
+
+
+class AngleAddr(TokenList):
+
+ token_type = 'angle-addr'
+
+ @property
+ def local_part(self):
+ for x in self:
+ if x.token_type == 'addr-spec':
+ return x.local_part
+
+ @property
+ def domain(self):
+ for x in self:
+ if x.token_type == 'addr-spec':
+ return x.domain
+
+ @property
+ def route(self):
+ for x in self:
+ if x.token_type == 'obs-route':
+ return x.domains
+
+ @property
+ def addr_spec(self):
+ for x in self:
+ if x.token_type == 'addr-spec':
+ if x.local_part:
+ return x.addr_spec
+ else:
+ return quote_string(x.local_part) + x.addr_spec
+ else:
+ return '<>'
+
+
+class ObsRoute(TokenList):
+
+ token_type = 'obs-route'
+
+ @property
+ def domains(self):
+ return [x.domain for x in self if x.token_type == 'domain']
+
+
+class Mailbox(TokenList):
+
+ token_type = 'mailbox'
+
+ @property
+ def display_name(self):
+ if self[0].token_type == 'name-addr':
+ return self[0].display_name
+
+ @property
+ def local_part(self):
+ return self[0].local_part
+
+ @property
+ def domain(self):
+ return self[0].domain
+
+ @property
+ def route(self):
+ if self[0].token_type == 'name-addr':
+ return self[0].route
+
+ @property
+ def addr_spec(self):
+ return self[0].addr_spec
+
+
+class InvalidMailbox(TokenList):
+
+ token_type = 'invalid-mailbox'
+
+ @property
+ def display_name(self):
+ return None
+
+ local_part = domain = route = addr_spec = display_name
+
+
+class Domain(TokenList):
+
+ token_type = 'domain'
+ as_ew_allowed = False
+
+ @property
+ def domain(self):
+ return ''.join(super().value.split())
+
+
+class DotAtom(TokenList):
+
+ token_type = 'dot-atom'
+
+
+class DotAtomText(TokenList):
+
+ token_type = 'dot-atom-text'
+ as_ew_allowed = True
+
+
+class AddrSpec(TokenList):
+
+ token_type = 'addr-spec'
+ as_ew_allowed = False
+
+ @property
+ def local_part(self):
+ return self[0].local_part
+
+ @property
+ def domain(self):
+ if len(self) < 3:
+ return None
+ return self[-1].domain
+
+ @property
+ def value(self):
+ if len(self) < 3:
+ return self[0].value
+ return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
+
+ @property
+ def addr_spec(self):
+ nameset = set(self.local_part)
+ if len(nameset) > len(nameset-DOT_ATOM_ENDS):
+ lp = quote_string(self.local_part)
+ else:
+ lp = self.local_part
+ if self.domain is not None:
+ return lp + '@' + self.domain
+ return lp
+
+
+class ObsLocalPart(TokenList):
+
+ token_type = 'obs-local-part'
+ as_ew_allowed = False
+
+
+class DisplayName(Phrase):
+
+ token_type = 'display-name'
+ ew_combine_allowed = False
+
+ @property
+ def display_name(self):
+ res = TokenList(self)
+ if res[0].token_type == 'cfws':
+ res.pop(0)
+ else:
+ if res[0][0].token_type == 'cfws':
+ res[0] = TokenList(res[0][1:])
+ if res[-1].token_type == 'cfws':
+ res.pop()
+ else:
+ if res[-1][-1].token_type == 'cfws':
+ res[-1] = TokenList(res[-1][:-1])
+ return res.value
+
+ @property
+ def value(self):
+ quote = False
+ if self.defects:
+ quote = True
+ else:
+ for x in self:
+ if x.token_type == 'quoted-string':
+ quote = True
+ if quote:
+ pre = post = ''
+ if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
+ pre = ' '
+ if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
+ post = ' '
+ return pre+quote_string(self.display_name)+post
+ else:
+ return super().value
+
+
+class LocalPart(TokenList):
+
+ token_type = 'local-part'
+ as_ew_allowed = False
+
+ @property
+ def value(self):
+ if self[0].token_type == "quoted-string":
+ return self[0].quoted_value
+ else:
+ return self[0].value
+
+ @property
+ def local_part(self):
+ # Strip whitespace from front, back, and around dots.
+ res = [DOT]
+ last = DOT
+ last_is_tl = False
+ for tok in self[0] + [DOT]:
+ if tok.token_type == 'cfws':
+ continue
+ if (last_is_tl and tok.token_type == 'dot' and
+ last[-1].token_type == 'cfws'):
+ res[-1] = TokenList(last[:-1])
+ is_tl = isinstance(tok, TokenList)
+ if (is_tl and last.token_type == 'dot' and
+ tok[0].token_type == 'cfws'):
+ res.append(TokenList(tok[1:]))
+ else:
+ res.append(tok)
+ last = res[-1]
+ last_is_tl = is_tl
+ res = TokenList(res[1:-1])
+ return res.value
+
+
+class DomainLiteral(TokenList):
+
+ token_type = 'domain-literal'
+ as_ew_allowed = False
+
+ @property
+ def domain(self):
+ return ''.join(super().value.split())
+
+ @property
+ def ip(self):
+ for x in self:
+ if x.token_type == 'ptext':
+ return x.value
+
+
+class MIMEVersion(TokenList):
+
+ token_type = 'mime-version'
+ major = None
+ minor = None
+
+
+class Parameter(TokenList):
+
+ token_type = 'parameter'
+ sectioned = False
+ extended = False
+ charset = 'us-ascii'
+
+ @property
+ def section_number(self):
+ # Because the first token, the attribute (name) eats CFWS, the second
+ # token is always the section if there is one.
+ return self[1].number if self.sectioned else 0
+
+ @property
+ def param_value(self):
+ # This is part of the "handle quoted extended parameters" hack.
+ for token in self:
+ if token.token_type == 'value':
+ return token.stripped_value
+ if token.token_type == 'quoted-string':
+ for token in token:
+ if token.token_type == 'bare-quoted-string':
+ for token in token:
+ if token.token_type == 'value':
+ return token.stripped_value
+ return ''
+
+
+class InvalidParameter(Parameter):
+
+ token_type = 'invalid-parameter'
+
+
+class Attribute(TokenList):
+
+ token_type = 'attribute'
+
+ @property
+ def stripped_value(self):
+ for token in self:
+ if token.token_type.endswith('attrtext'):
+ return token.value
+
+class Section(TokenList):
+
+ token_type = 'section'
+ number = None
+
+
+class Value(TokenList):
+
+ token_type = 'value'
+
+ @property
+ def stripped_value(self):
+ token = self[0]
+ if token.token_type == 'cfws':
+ token = self[1]
+ if token.token_type.endswith(
+ ('quoted-string', 'attribute', 'extended-attribute')):
+ return token.stripped_value
+ return self.value
+
+
+class MimeParameters(TokenList):
+
+ token_type = 'mime-parameters'
+ syntactic_break = False
+
+ @property
+ def params(self):
+ # The RFC specifically states that the ordering of parameters is not
+ # guaranteed and may be reordered by the transport layer. So we have
+ # to assume the RFC 2231 pieces can come in any order. However, we
+ # output them in the order that we first see a given name, which gives
+ # us a stable __str__.
+ params = OrderedDict()
+ for token in self:
+ if not token.token_type.endswith('parameter'):
+ continue
+ if token[0].token_type != 'attribute':
+ continue
+ name = token[0].value.strip()
+ if name not in params:
+ params[name] = []
+ params[name].append((token.section_number, token))
+ for name, parts in params.items():
+ parts = sorted(parts, key=itemgetter(0))
+ first_param = parts[0][1]
+ charset = first_param.charset
+ # Our arbitrary error recovery is to ignore duplicate parameters,
+ # to use appearance order if there are duplicate rfc 2231 parts,
+ # and to ignore gaps. This mimics the error recovery of get_param.
+ if not first_param.extended and len(parts) > 1:
+ if parts[1][0] == 0:
+ parts[1][1].defects.append(errors.InvalidHeaderDefect(
+ 'duplicate parameter name; duplicate(s) ignored'))
+ parts = parts[:1]
+ # Else assume the *0* was missing...note that this is different
+ # from get_param, but we registered a defect for this earlier.
+ value_parts = []
+ i = 0
+ for section_number, param in parts:
+ if section_number != i:
+ # We could get fancier here and look for a complete
+ # duplicate extended parameter and ignore the second one
+ # seen. But we're not doing that. The old code didn't.
+ if not param.extended:
+ param.defects.append(errors.InvalidHeaderDefect(
+ 'duplicate parameter name; duplicate ignored'))
+ continue
+ else:
+ param.defects.append(errors.InvalidHeaderDefect(
+ "inconsistent RFC2231 parameter numbering"))
+ i += 1
+ value = param.param_value
+ if param.extended:
+ try:
+ value = parse.unquote_to_bytes(value)
+ except UnicodeEncodeError:
+ # source had surrogate escaped bytes. What we do now
+ # is a bit of an open question. I'm not sure this is
+ # the best choice, but it is what the old algorithm did
+ value = parse.unquote(value, encoding='latin-1')
+ else:
+ try:
+ value = value.decode(charset, 'surrogateescape')
+ except LookupError:
+ # XXX: there should really be a custom defect for
+ # unknown character set to make it easy to find,
+ # because otherwise unknown charset is a silent
+ # failure.
+ value = value.decode('us-ascii', 'surrogateescape')
+ if utils._has_surrogates(value):
+ param.defects.append(errors.UndecodableBytesDefect())
+ value_parts.append(value)
+ value = ''.join(value_parts)
+ yield name, value
+
+ def __str__(self):
+ params = []
+ for name, value in self.params:
+ if value:
+ params.append('{}={}'.format(name, quote_string(value)))
+ else:
+ params.append(name)
+ params = '; '.join(params)
+ return ' ' + params if params else ''
+
+
+class ParameterizedHeaderValue(TokenList):
+
+ # Set this false so that the value doesn't wind up on a new line even
+ # if it and the parameters would fit there but not on the first line.
+ syntactic_break = False
+
+ @property
+ def params(self):
+ for token in reversed(self):
+ if token.token_type == 'mime-parameters':
+ return token.params
+ return {}
+
+
+class ContentType(ParameterizedHeaderValue):
+
+ token_type = 'content-type'
+ as_ew_allowed = False
+ maintype = 'text'
+ subtype = 'plain'
+
+
+class ContentDisposition(ParameterizedHeaderValue):
+
+ token_type = 'content-disposition'
+ as_ew_allowed = False
+ content_disposition = None
+
+
+class ContentTransferEncoding(TokenList):
+
+ token_type = 'content-transfer-encoding'
+ as_ew_allowed = False
+ cte = '7bit'
+
+
+class HeaderLabel(TokenList):
+
+ token_type = 'header-label'
+ as_ew_allowed = False
+
+
+class Header(TokenList):
+
+ token_type = 'header'
+
+
+#
+# Terminal classes and instances
+#
+
+class Terminal(str):
+
+ as_ew_allowed = True
+ ew_combine_allowed = True
+ syntactic_break = True
+
+ def __new__(cls, value, token_type):
+ self = str.__newobj__(cls, value)
+ self.token_type = token_type
+ self.defects = []
+ return self
+
+ def __repr__(self):
+ return "{}({})".format(self.__class__.__name__, super().__repr__())
+
+ def pprint(self):
+ print(self.__class__.__name__ + '/' + self.token_type)
+
+ @property
+ def all_defects(self):
+ return list(self.defects)
+
+ def _pp(self, indent=''):
+ return ["{}{}/{}({}){}".format(
+ indent,
+ self.__class__.__name__,
+ self.token_type,
+ super().__repr__(),
+ '' if not self.defects else ' {}'.format(self.defects),
+ )]
+
+ def pop_trailing_ws(self):
+ # This terminates the recursion.
+ return None
+
+ @property
+ def comments(self):
+ return []
+
+ def __getnewargs__(self):
+ return(str(self), self.token_type)
+
+
+class WhiteSpaceTerminal(Terminal):
+
+ @property
+ def value(self):
+ return ' '
+
+ def startswith_fws(self):
+ return True
+
+
+class ValueTerminal(Terminal):
+
+ @property
+ def value(self):
+ return self
+
+ def startswith_fws(self):
+ return False
+
+
+class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
+
+ @property
+ def value(self):
+ return ''
+
+ def __str__(self):
+ return ''
+
+
+# XXX these need to become classes and used as instances so
+# that a program can't change them in a parse tree and screw
+# up other parse trees. Maybe should have tests for that, too.
+DOT = ValueTerminal('.', 'dot')
+ListSeparator = ValueTerminal(',', 'list-separator')
+RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
+
+#
+# Parser
+#
+
+# Parse strings according to RFC822/2047/2822/5322 rules.
+#
+# This is a stateless parser. Each get_XXX function accepts a string and
+# returns either a Terminal or a TokenList representing the RFC object named
+# by the method and a string containing the remaining unparsed characters
+# from the input. Thus a parser method consumes the next syntactic construct
+# of a given type and returns a token representing the construct plus the
+# unparsed remainder of the input string.
+#
+# For example, if the first element of a structured header is a 'phrase',
+# then:
+#
+# phrase, value = get_phrase(value)
+#
+# returns the complete phrase from the start of the string value, plus any
+# characters left in the string after the phrase is removed.
+
+
+_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
+_non_atom_end_matcher = re.compile(r"[^{}]+".format(
+ ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
+_non_token_end_matcher = re.compile(r"[^{}]+".format(
+ ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
+ ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
+_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
+ ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
+ '\\','\\\\').replace(']',r'\]'))).match
+
+def _validate_xtext(xtext):
+ """If input token contains ASCII non-printables, register a defect."""
+
+ non_printables = _non_printable_finder(xtext)
+ if non_printables:
+ xtext.defects.append(errors.NonPrintableDefect(non_printables))
+ if utils._has_surrogates(xtext):
+ xtext.defects.append(errors.UndecodableBytesDefect(
+ "Non-ASCII characters found in header token"))
+
+def _get_ptext_to_endchars(value, endchars):
+ """Scan printables/quoted-pairs until endchars and return unquoted ptext.
+
+ This function turns a run of qcontent, ccontent-without-comments, or
+ dtext-with-quoted-printables into a single string by unquoting any
+ quoted printables. It returns the string, the remaining value, and
+ a flag that is True iff there were any quoted printables decoded.
+
+ """
+ fragment, *remainder = _wsp_splitter(value, 1)
+ vchars = []
+ escape = False
+ had_qp = False
+ for pos in range(len(fragment)):
+ if fragment[pos] == '\\':
+ if escape:
+ escape = False
+ had_qp = True
+ else:
+ escape = True
+ continue
+ if escape:
+ escape = False
+ elif fragment[pos] in endchars:
+ break
+ vchars.append(fragment[pos])
+ else:
+ pos = pos + 1
+ return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
+
+def get_fws(value):
+ """FWS = 1*WSP
+
+ This isn't the RFC definition. We're using fws to represent tokens where
+ folding can be done, but when we are parsing the *un*folding has already
+ been done so we don't need to watch out for CRLF.
+
+ """
+ newvalue = value.lstrip()
+ fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
+ return fws, newvalue
+
+def get_encoded_word(value):
+ """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
+
+ """
+ ew = EncodedWord()
+ if not value.startswith('=?'):
+ raise errors.HeaderParseError(
+ "expected encoded word but found {}".format(value))
+ tok, *remainder = value[2:].split('?=', 1)
+ if tok == value[2:]:
+ raise errors.HeaderParseError(
+ "expected encoded word but found {}".format(value))
+ remstr = ''.join(remainder)
+ if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
+ # The ? after the CTE was followed by an encoded word escape (=XX).
+ rest, *remainder = remstr.split('?=', 1)
+ tok = tok + '?=' + rest
+ if len(tok.split()) > 1:
+ ew.defects.append(errors.InvalidHeaderDefect(
+ "whitespace inside encoded word"))
+ ew.cte = value
+ value = ''.join(remainder)
+ try:
+ text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
+ except ValueError:
+ raise errors.HeaderParseError(
+ "encoded word format invalid: '{}'".format(ew.cte))
+ ew.charset = charset
+ ew.lang = lang
+ ew.defects.extend(defects)
+ while text:
+ if text[0] in WSP:
+ token, text = get_fws(text)
+ ew.append(token)
+ continue
+ chars, *remainder = _wsp_splitter(text, 1)
+ vtext = ValueTerminal(chars, 'vtext')
+ _validate_xtext(vtext)
+ ew.append(vtext)
+ text = ''.join(remainder)
+ return ew, value
+
+def get_unstructured(value):
+ """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
+ obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
+ obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
+
+ obs-NO-WS-CTL is control characters except WSP/CR/LF.
+
+ So, basically, we have printable runs, plus control characters or nulls in
+ the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
+ obsolete syntax in its specification, but requires whitespace on either
+ side of the encoded words, I can see no reason to need to separate the
+ non-printable-non-whitespace from the printable runs if they occur, so we
+ parse this into xtext tokens separated by WSP tokens.
+
+ Because an 'unstructured' value must by definition constitute the entire
+ value, this 'get' routine does not return a remaining value, only the
+ parsed TokenList.
+
+ """
+ # XXX: but what about bare CR and LF? They might signal the start or
+ # end of an encoded word. YAGNI for now, since our current parsers
+ # will never send us strings with bare CR or LF.
+
+ unstructured = UnstructuredTokenList()
+ while value:
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ unstructured.append(token)
+ continue
+ if value.startswith('=?'):
+ try:
+ token, value = get_encoded_word(value)
+ except errors.HeaderParseError:
+ # XXX: Need to figure out how to register defects when
+ # appropriate here.
+ pass
+ else:
+ have_ws = True
+ if len(unstructured) > 0:
+ if unstructured[-1].token_type != 'fws':
+ unstructured.defects.append(errors.InvalidHeaderDefect(
+ "missing whitespace before encoded word"))
+ have_ws = False
+ if have_ws and len(unstructured) > 1:
+ if unstructured[-2].token_type == 'encoded-word':
+ unstructured[-1] = EWWhiteSpaceTerminal(
+ unstructured[-1], 'fws')
+ unstructured.append(token)
+ continue
+ tok, *remainder = _wsp_splitter(value, 1)
+ vtext = ValueTerminal(tok, 'vtext')
+ _validate_xtext(vtext)
+ unstructured.append(vtext)
+ value = ''.join(remainder)
+ return unstructured
+
+def get_qp_ctext(value):
+ r"""ctext = <printable ascii except \ ( )>
+
+ This is not the RFC ctext, since we are handling nested comments in comment
+ and unquoting quoted-pairs here. We allow anything except the '()'
+ characters, but if we find any ASCII other than the RFC defined printable
+ ASCII, a NonPrintableDefect is added to the token's defects list. Since
+ quoted pairs are converted to their unquoted values, what is returned is
+ a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value
+ is ' '.
+
+ """
+ ptext, value, _ = _get_ptext_to_endchars(value, '()')
+ ptext = WhiteSpaceTerminal(ptext, 'ptext')
+ _validate_xtext(ptext)
+ return ptext, value
+
+def get_qcontent(value):
+ """qcontent = qtext / quoted-pair
+
+ We allow anything except the DQUOTE character, but if we find any ASCII
+ other than the RFC defined printable ASCII, a NonPrintableDefect is
+ added to the token's defects list. Any quoted pairs are converted to their
+ unquoted values, so what is returned is a 'ptext' token. In this case it
+ is a ValueTerminal.
+
+ """
+ ptext, value, _ = _get_ptext_to_endchars(value, '"')
+ ptext = ValueTerminal(ptext, 'ptext')
+ _validate_xtext(ptext)
+ return ptext, value
+
+def get_atext(value):
+ """atext = <matches _atext_matcher>
+
+ We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
+ the token's defects list if we find non-atext characters.
+ """
+ m = _non_atom_end_matcher(value)
+ if not m:
+ raise errors.HeaderParseError(
+ "expected atext but found '{}'".format(value))
+ atext = m.group()
+ value = value[len(atext):]
+ atext = ValueTerminal(atext, 'atext')
+ _validate_xtext(atext)
+ return atext, value
+
+def get_bare_quoted_string(value):
+ """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
+
+ A quoted-string without the leading or trailing white space. Its
+ value is the text between the quote marks, with whitespace
+ preserved and quoted pairs decoded.
+ """
+ if value[0] != '"':
+ raise errors.HeaderParseError(
+ "expected '\"' but found '{}'".format(value))
+ bare_quoted_string = BareQuotedString()
+ value = value[1:]
+ if value[0] == '"':
+ token, value = get_qcontent(value)
+ bare_quoted_string.append(token)
+ while value and value[0] != '"':
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ elif value[:2] == '=?':
+ try:
+ token, value = get_encoded_word(value)
+ bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
+ "encoded word inside quoted string"))
+ except errors.HeaderParseError:
+ token, value = get_qcontent(value)
+ else:
+ token, value = get_qcontent(value)
+ bare_quoted_string.append(token)
+ if not value:
+ bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
+ "end of header inside quoted string"))
+ return bare_quoted_string, value
+ return bare_quoted_string, value[1:]
+
+def get_comment(value):
+ """comment = "(" *([FWS] ccontent) [FWS] ")"
+ ccontent = ctext / quoted-pair / comment
+
+ We handle nested comments here, and quoted-pair in our qp-ctext routine.
+ """
+ if value and value[0] != '(':
+ raise errors.HeaderParseError(
+ "expected '(' but found '{}'".format(value))
+ comment = Comment()
+ value = value[1:]
+ while value and value[0] != ")":
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ elif value[0] == '(':
+ token, value = get_comment(value)
+ else:
+ token, value = get_qp_ctext(value)
+ comment.append(token)
+ if not value:
+ comment.defects.append(errors.InvalidHeaderDefect(
+ "end of header inside comment"))
+ return comment, value
+ return comment, value[1:]
+
+def get_cfws(value):
+ """CFWS = (1*([FWS] comment) [FWS]) / FWS
+
+ """
+ cfws = CFWSList()
+ while value and value[0] in CFWS_LEADER:
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ else:
+ token, value = get_comment(value)
+ cfws.append(token)
+ return cfws, value
+
+def get_quoted_string(value):
+ """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
+
+ 'bare-quoted-string' is an intermediate class defined by this
+ parser and not by the RFC grammar. It is the quoted string
+ without any attached CFWS.
+ """
+ quoted_string = QuotedString()
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ quoted_string.append(token)
+ token, value = get_bare_quoted_string(value)
+ quoted_string.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ quoted_string.append(token)
+ return quoted_string, value
+
+def get_atom(value):
+ """atom = [CFWS] 1*atext [CFWS]
+
+ An atom could be an rfc2047 encoded word.
+ """
+ atom = Atom()
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ atom.append(token)
+ if value and value[0] in ATOM_ENDS:
+ raise errors.HeaderParseError(
+ "expected atom but found '{}'".format(value))
+ if value.startswith('=?'):
+ try:
+ token, value = get_encoded_word(value)
+ except errors.HeaderParseError:
+ # XXX: need to figure out how to register defects when
+ # appropriate here.
+ token, value = get_atext(value)
+ else:
+ token, value = get_atext(value)
+ atom.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ atom.append(token)
+ return atom, value
+
+def get_dot_atom_text(value):
+ """ dot-text = 1*atext *("." 1*atext)
+
+ """
+ dot_atom_text = DotAtomText()
+ if not value or value[0] in ATOM_ENDS:
+ raise errors.HeaderParseError("expected atom at a start of "
+ "dot-atom-text but found '{}'".format(value))
+ while value and value[0] not in ATOM_ENDS:
+ token, value = get_atext(value)
+ dot_atom_text.append(token)
+ if value and value[0] == '.':
+ dot_atom_text.append(DOT)
+ value = value[1:]
+ if dot_atom_text[-1] is DOT:
+ raise errors.HeaderParseError("expected atom at end of dot-atom-text "
+ "but found '{}'".format('.'+value))
+ return dot_atom_text, value
+
+def get_dot_atom(value):
+ """ dot-atom = [CFWS] dot-atom-text [CFWS]
+
+ Any place we can have a dot atom, we could instead have an rfc2047 encoded
+ word.
+ """
+ dot_atom = DotAtom()
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ dot_atom.append(token)
+ if value.startswith('=?'):
+ try:
+ token, value = get_encoded_word(value)
+ except errors.HeaderParseError:
+ # XXX: need to figure out how to register defects when
+ # appropriate here.
+ token, value = get_dot_atom_text(value)
+ else:
+ token, value = get_dot_atom_text(value)
+ dot_atom.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ dot_atom.append(token)
+ return dot_atom, value
+
+def get_word(value):
+ """word = atom / quoted-string
+
+ Either atom or quoted-string may start with CFWS. We have to peel off this
+ CFWS first to determine which type of word to parse. Afterward we splice
+ the leading CFWS, if any, into the parsed sub-token.
+
+ If neither an atom or a quoted-string is found before the next special, a
+ HeaderParseError is raised.
+
+ The token returned is either an Atom or a QuotedString, as appropriate.
+ This means the 'word' level of the formal grammar is not represented in the
+ parse tree; this is because having that extra layer when manipulating the
+ parse tree is more confusing than it is helpful.
+
+ """
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ else:
+ leader = None
+ if value[0]=='"':
+ token, value = get_quoted_string(value)
+ elif value[0] in SPECIALS:
+ raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
+ "but found '{}'".format(value))
+ else:
+ token, value = get_atom(value)
+ if leader is not None:
+ token[:0] = [leader]
+ return token, value
+
+def get_phrase(value):
+ """ phrase = 1*word / obs-phrase
+ obs-phrase = word *(word / "." / CFWS)
+
+ This means a phrase can be a sequence of words, periods, and CFWS in any
+ order as long as it starts with at least one word. If anything other than
+ words is detected, an ObsoleteHeaderDefect is added to the token's defect
+ list. We also accept a phrase that starts with CFWS followed by a dot;
+ this is registered as an InvalidHeaderDefect, since it is not supported by
+ even the obsolete grammar.
+
+ """
+ phrase = Phrase()
+ try:
+ token, value = get_word(value)
+ phrase.append(token)
+ except errors.HeaderParseError:
+ phrase.defects.append(errors.InvalidHeaderDefect(
+ "phrase does not start with word"))
+ while value and value[0] not in PHRASE_ENDS:
+ if value[0]=='.':
+ phrase.append(DOT)
+ phrase.defects.append(errors.ObsoleteHeaderDefect(
+ "period in 'phrase'"))
+ value = value[1:]
+ else:
+ try:
+ token, value = get_word(value)
+ except errors.HeaderParseError:
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ phrase.defects.append(errors.ObsoleteHeaderDefect(
+ "comment found without atom"))
+ else:
+ raise
+ phrase.append(token)
+ return phrase, value
+
+def get_local_part(value):
+ """ local-part = dot-atom / quoted-string / obs-local-part
+
+ """
+ local_part = LocalPart()
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ raise errors.HeaderParseError(
+ "expected local-part but found '{}'".format(value))
+ try:
+ token, value = get_dot_atom(value)
+ except errors.HeaderParseError:
+ try:
+ token, value = get_word(value)
+ except errors.HeaderParseError:
+ if value[0] != '\\' and value[0] in PHRASE_ENDS:
+ raise
+ token = TokenList()
+ if leader is not None:
+ token[:0] = [leader]
+ local_part.append(token)
+ if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
+ obs_local_part, value = get_obs_local_part(str(local_part) + value)
+ if obs_local_part.token_type == 'invalid-obs-local-part':
+ local_part.defects.append(errors.InvalidHeaderDefect(
+ "local-part is not dot-atom, quoted-string, or obs-local-part"))
+ else:
+ local_part.defects.append(errors.ObsoleteHeaderDefect(
+ "local-part is not a dot-atom (contains CFWS)"))
+ local_part[0] = obs_local_part
+ try:
+ local_part.value.encode('ascii')
+ except UnicodeEncodeError:
+ local_part.defects.append(errors.NonASCIILocalPartDefect(
+ "local-part contains non-ASCII characters)"))
+ return local_part, value
+
+def get_obs_local_part(value):
+ """ obs-local-part = word *("." word)
+ """
+ obs_local_part = ObsLocalPart()
+ last_non_ws_was_dot = False
+ while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
+ if value[0] == '.':
+ if last_non_ws_was_dot:
+ obs_local_part.defects.append(errors.InvalidHeaderDefect(
+ "invalid repeated '.'"))
+ obs_local_part.append(DOT)
+ last_non_ws_was_dot = True
+ value = value[1:]
+ continue
+ elif value[0]=='\\':
+ obs_local_part.append(ValueTerminal(value[0],
+ 'misplaced-special'))
+ value = value[1:]
+ obs_local_part.defects.append(errors.InvalidHeaderDefect(
+ "'\\' character outside of quoted-string/ccontent"))
+ last_non_ws_was_dot = False
+ continue
+ if obs_local_part and obs_local_part[-1].token_type != 'dot':
+ obs_local_part.defects.append(errors.InvalidHeaderDefect(
+ "missing '.' between words"))
+ try:
+ token, value = get_word(value)
+ last_non_ws_was_dot = False
+ except errors.HeaderParseError:
+ if value[0] not in CFWS_LEADER:
+ raise
+ token, value = get_cfws(value)
+ obs_local_part.append(token)
+ if (obs_local_part[0].token_type == 'dot' or
+ obs_local_part[0].token_type=='cfws' and
+ obs_local_part[1].token_type=='dot'):
+ obs_local_part.defects.append(errors.InvalidHeaderDefect(
+ "Invalid leading '.' in local part"))
+ if (obs_local_part[-1].token_type == 'dot' or
+ obs_local_part[-1].token_type=='cfws' and
+ obs_local_part[-2].token_type=='dot'):
+ obs_local_part.defects.append(errors.InvalidHeaderDefect(
+ "Invalid trailing '.' in local part"))
+ if obs_local_part.defects:
+ obs_local_part.token_type = 'invalid-obs-local-part'
+ return obs_local_part, value
+
+def get_dtext(value):
+ r""" dtext = <printable ascii except \ [ ]> / obs-dtext
+ obs-dtext = obs-NO-WS-CTL / quoted-pair
+
+ We allow anything except the excluded characters, but if we find any
+ ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
+ added to the token's defects list. Quoted pairs are converted to their
+ unquoted values, so what is returned is a ptext token, in this case a
+ ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
+ added to the returned token's defect list.
+
+ """
+ ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
+ ptext = ValueTerminal(ptext, 'ptext')
+ if had_qp:
+ ptext.defects.append(errors.ObsoleteHeaderDefect(
+ "quoted printable found in domain-literal"))
+ _validate_xtext(ptext)
+ return ptext, value
+
+def _check_for_early_dl_end(value, domain_literal):
+ if value:
+ return False
+ domain_literal.append(errors.InvalidHeaderDefect(
+ "end of input inside domain-literal"))
+ domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
+ return True
+
+def get_domain_literal(value):
+ """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
+
+ """
+ domain_literal = DomainLiteral()
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ domain_literal.append(token)
+ if not value:
+ raise errors.HeaderParseError("expected domain-literal")
+ if value[0] != '[':
+ raise errors.HeaderParseError("expected '[' at start of domain-literal "
+ "but found '{}'".format(value))
+ value = value[1:]
+ if _check_for_early_dl_end(value, domain_literal):
+ return domain_literal, value
+ domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ domain_literal.append(token)
+ token, value = get_dtext(value)
+ domain_literal.append(token)
+ if _check_for_early_dl_end(value, domain_literal):
+ return domain_literal, value
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ domain_literal.append(token)
+ if _check_for_early_dl_end(value, domain_literal):
+ return domain_literal, value
+ if value[0] != ']':
+ raise errors.HeaderParseError("expected ']' at end of domain-literal "
+ "but found '{}'".format(value))
+ domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
+ value = value[1:]
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ domain_literal.append(token)
+ return domain_literal, value
+
+def get_domain(value):
+ """ domain = dot-atom / domain-literal / obs-domain
+ obs-domain = atom *("." atom))
+
+ """
+ domain = Domain()
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ raise errors.HeaderParseError(
+ "expected domain but found '{}'".format(value))
+ if value[0] == '[':
+ token, value = get_domain_literal(value)
+ if leader is not None:
+ token[:0] = [leader]
+ domain.append(token)
+ return domain, value
+ try:
+ token, value = get_dot_atom(value)
+ except errors.HeaderParseError:
+ token, value = get_atom(value)
+ if leader is not None:
+ token[:0] = [leader]
+ domain.append(token)
+ if value and value[0] == '.':
+ domain.defects.append(errors.ObsoleteHeaderDefect(
+ "domain is not a dot-atom (contains CFWS)"))
+ if domain[0].token_type == 'dot-atom':
+ domain[:] = domain[0]
+ while value and value[0] == '.':
+ domain.append(DOT)
+ token, value = get_atom(value[1:])
+ domain.append(token)
+ return domain, value
+
+def get_addr_spec(value):
+ """ addr-spec = local-part "@" domain
+
+ """
+ addr_spec = AddrSpec()
+ token, value = get_local_part(value)
+ addr_spec.append(token)
+ if not value or value[0] != '@':
+ addr_spec.defects.append(errors.InvalidHeaderDefect(
+ "add-spec local part with no domain"))
+ return addr_spec, value
+ addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
+ token, value = get_domain(value[1:])
+ addr_spec.append(token)
+ return addr_spec, value
+
+def get_obs_route(value):
+ """ obs-route = obs-domain-list ":"
+ obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
+
+ Returns an obs-route token with the appropriate sub-tokens (that is,
+ there is no obs-domain-list in the parse tree).
+ """
+ obs_route = ObsRoute()
+ while value and (value[0]==',' or value[0] in CFWS_LEADER):
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ obs_route.append(token)
+ elif value[0] == ',':
+ obs_route.append(ListSeparator)
+ value = value[1:]
+ if not value or value[0] != '@':
+ raise errors.HeaderParseError(
+ "expected obs-route domain but found '{}'".format(value))
+ obs_route.append(RouteComponentMarker)
+ token, value = get_domain(value[1:])
+ obs_route.append(token)
+ while value and value[0]==',':
+ obs_route.append(ListSeparator)
+ value = value[1:]
+ if not value:
+ break
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ obs_route.append(token)
+ if value[0] == '@':
+ obs_route.append(RouteComponentMarker)
+ token, value = get_domain(value[1:])
+ obs_route.append(token)
+ if not value:
+ raise errors.HeaderParseError("end of header while parsing obs-route")
+ if value[0] != ':':
+ raise errors.HeaderParseError( "expected ':' marking end of "
+ "obs-route but found '{}'".format(value))
+ obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
+ return obs_route, value[1:]
+
+def get_angle_addr(value):
+ """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
+ obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
+
+ """
+ angle_addr = AngleAddr()
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ angle_addr.append(token)
+ if not value or value[0] != '<':
+ raise errors.HeaderParseError(
+ "expected angle-addr but found '{}'".format(value))
+ angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
+ value = value[1:]
+ # Although it is not legal per RFC5322, SMTP uses '<>' in certain
+ # circumstances.
+ if value[0] == '>':
+ angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
+ angle_addr.defects.append(errors.InvalidHeaderDefect(
+ "null addr-spec in angle-addr"))
+ value = value[1:]
+ return angle_addr, value
+ try:
+ token, value = get_addr_spec(value)
+ except errors.HeaderParseError:
+ try:
+ token, value = get_obs_route(value)
+ angle_addr.defects.append(errors.ObsoleteHeaderDefect(
+ "obsolete route specification in angle-addr"))
+ except errors.HeaderParseError:
+ raise errors.HeaderParseError(
+ "expected addr-spec or obs-route but found '{}'".format(value))
+ angle_addr.append(token)
+ token, value = get_addr_spec(value)
+ angle_addr.append(token)
+ if value and value[0] == '>':
+ value = value[1:]
+ else:
+ angle_addr.defects.append(errors.InvalidHeaderDefect(
+ "missing trailing '>' on angle-addr"))
+ angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ angle_addr.append(token)
+ return angle_addr, value
+
+def get_display_name(value):
+ """ display-name = phrase
+
+ Because this is simply a name-rule, we don't return a display-name
+ token containing a phrase, but rather a display-name token with
+ the content of the phrase.
+
+ """
+ display_name = DisplayName()
+ token, value = get_phrase(value)
+ display_name.extend(token[:])
+ display_name.defects = token.defects[:]
+ return display_name, value
+
+
+def get_name_addr(value):
+ """ name-addr = [display-name] angle-addr
+
+ """
+ name_addr = NameAddr()
+ # Both the optional display name and the angle-addr can start with cfws.
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ raise errors.HeaderParseError(
+ "expected name-addr but found '{}'".format(leader))
+ if value[0] != '<':
+ if value[0] in PHRASE_ENDS:
+ raise errors.HeaderParseError(
+ "expected name-addr but found '{}'".format(value))
+ token, value = get_display_name(value)
+ if not value:
+ raise errors.HeaderParseError(
+ "expected name-addr but found '{}'".format(token))
+ if leader is not None:
+ token[0][:0] = [leader]
+ leader = None
+ name_addr.append(token)
+ token, value = get_angle_addr(value)
+ if leader is not None:
+ token[:0] = [leader]
+ name_addr.append(token)
+ return name_addr, value
+
+def get_mailbox(value):
+ """ mailbox = name-addr / addr-spec
+
+ """
+ # The only way to figure out if we are dealing with a name-addr or an
+ # addr-spec is to try parsing each one.
+ mailbox = Mailbox()
+ try:
+ token, value = get_name_addr(value)
+ except errors.HeaderParseError:
+ try:
+ token, value = get_addr_spec(value)
+ except errors.HeaderParseError:
+ raise errors.HeaderParseError(
+ "expected mailbox but found '{}'".format(value))
+ if any(isinstance(x, errors.InvalidHeaderDefect)
+ for x in token.all_defects):
+ mailbox.token_type = 'invalid-mailbox'
+ mailbox.append(token)
+ return mailbox, value
+
+def get_invalid_mailbox(value, endchars):
+ """ Read everything up to one of the chars in endchars.
+
+ This is outside the formal grammar. The InvalidMailbox TokenList that is
+ returned acts like a Mailbox, but the data attributes are None.
+
+ """
+ invalid_mailbox = InvalidMailbox()
+ while value and value[0] not in endchars:
+ if value[0] in PHRASE_ENDS:
+ invalid_mailbox.append(ValueTerminal(value[0],
+ 'misplaced-special'))
+ value = value[1:]
+ else:
+ token, value = get_phrase(value)
+ invalid_mailbox.append(token)
+ return invalid_mailbox, value
+
+def get_mailbox_list(value):
+ """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
+ obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
+
+ For this routine we go outside the formal grammar in order to improve error
+ handling. We recognize the end of the mailbox list only at the end of the
+ value or at a ';' (the group terminator). This is so that we can turn
+ invalid mailboxes into InvalidMailbox tokens and continue parsing any
+ remaining valid mailboxes. We also allow all mailbox entries to be null,
+ and this condition is handled appropriately at a higher level.
+
+ """
+ mailbox_list = MailboxList()
+ while value and value[0] != ';':
+ try:
+ token, value = get_mailbox(value)
+ mailbox_list.append(token)
+ except errors.HeaderParseError:
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value or value[0] in ',;':
+ mailbox_list.append(leader)
+ mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
+ "empty element in mailbox-list"))
+ else:
+ token, value = get_invalid_mailbox(value, ',;')
+ if leader is not None:
+ token[:0] = [leader]
+ mailbox_list.append(token)
+ mailbox_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid mailbox in mailbox-list"))
+ elif value[0] == ',':
+ mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
+ "empty element in mailbox-list"))
+ else:
+ token, value = get_invalid_mailbox(value, ',;')
+ if leader is not None:
+ token[:0] = [leader]
+ mailbox_list.append(token)
+ mailbox_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid mailbox in mailbox-list"))
+ if value and value[0] not in ',;':
+ # Crap after mailbox; treat it as an invalid mailbox.
+ # The mailbox info will still be available.
+ mailbox = mailbox_list[-1]
+ mailbox.token_type = 'invalid-mailbox'
+ token, value = get_invalid_mailbox(value, ',;')
+ mailbox.extend(token)
+ mailbox_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid mailbox in mailbox-list"))
+ if value and value[0] == ',':
+ mailbox_list.append(ListSeparator)
+ value = value[1:]
+ return mailbox_list, value
+
+
+def get_group_list(value):
+ """ group-list = mailbox-list / CFWS / obs-group-list
+ obs-group-list = 1*([CFWS] ",") [CFWS]
+
+ """
+ group_list = GroupList()
+ if not value:
+ group_list.defects.append(errors.InvalidHeaderDefect(
+ "end of header before group-list"))
+ return group_list, value
+ leader = None
+ if value and value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ # This should never happen in email parsing, since CFWS-only is a
+ # legal alternative to group-list in a group, which is the only
+ # place group-list appears.
+ group_list.defects.append(errors.InvalidHeaderDefect(
+ "end of header in group-list"))
+ group_list.append(leader)
+ return group_list, value
+ if value[0] == ';':
+ group_list.append(leader)
+ return group_list, value
+ token, value = get_mailbox_list(value)
+ if len(token.all_mailboxes)==0:
+ if leader is not None:
+ group_list.append(leader)
+ group_list.extend(token)
+ group_list.defects.append(errors.ObsoleteHeaderDefect(
+ "group-list with empty entries"))
+ return group_list, value
+ if leader is not None:
+ token[:0] = [leader]
+ group_list.append(token)
+ return group_list, value
+
+def get_group(value):
+ """ group = display-name ":" [group-list] ";" [CFWS]
+
+ """
+ group = Group()
+ token, value = get_display_name(value)
+ if not value or value[0] != ':':
+ raise errors.HeaderParseError("expected ':' at end of group "
+ "display name but found '{}'".format(value))
+ group.append(token)
+ group.append(ValueTerminal(':', 'group-display-name-terminator'))
+ value = value[1:]
+ if value and value[0] == ';':
+ group.append(ValueTerminal(';', 'group-terminator'))
+ return group, value[1:]
+ token, value = get_group_list(value)
+ group.append(token)
+ if not value:
+ group.defects.append(errors.InvalidHeaderDefect(
+ "end of header in group"))
+ if value[0] != ';':
+ raise errors.HeaderParseError(
+ "expected ';' at end of group but found {}".format(value))
+ group.append(ValueTerminal(';', 'group-terminator'))
+ value = value[1:]
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ group.append(token)
+ return group, value
+
+def get_address(value):
+ """ address = mailbox / group
+
+ Note that counter-intuitively, an address can be either a single address or
+ a list of addresses (a group). This is why the returned Address object has
+ a 'mailboxes' attribute which treats a single address as a list of length
+ one. When you need to differentiate between to two cases, extract the single
+ element, which is either a mailbox or a group token.
+
+ """
+ # The formal grammar isn't very helpful when parsing an address. mailbox
+ # and group, especially when allowing for obsolete forms, start off very
+ # similarly. It is only when you reach one of @, <, or : that you know
+ # what you've got. So, we try each one in turn, starting with the more
+ # likely of the two. We could perhaps make this more efficient by looking
+ # for a phrase and then branching based on the next character, but that
+ # would be a premature optimization.
+ address = Address()
+ try:
+ token, value = get_group(value)
+ except errors.HeaderParseError:
+ try:
+ token, value = get_mailbox(value)
+ except errors.HeaderParseError:
+ raise errors.HeaderParseError(
+ "expected address but found '{}'".format(value))
+ address.append(token)
+ return address, value
+
+def get_address_list(value):
+ """ address_list = (address *("," address)) / obs-addr-list
+ obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
+
+ We depart from the formal grammar here by continuing to parse until the end
+ of the input, assuming the input to be entirely composed of an
+ address-list. This is always true in email parsing, and allows us
+ to skip invalid addresses to parse additional valid ones.
+
+ """
+ address_list = AddressList()
+ while value:
+ try:
+ token, value = get_address(value)
+ address_list.append(token)
+ except errors.HeaderParseError as err:
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value or value[0] == ',':
+ address_list.append(leader)
+ address_list.defects.append(errors.ObsoleteHeaderDefect(
+ "address-list entry with no content"))
+ else:
+ token, value = get_invalid_mailbox(value, ',')
+ if leader is not None:
+ token[:0] = [leader]
+ address_list.append(Address([token]))
+ address_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid address in address-list"))
+ elif value[0] == ',':
+ address_list.defects.append(errors.ObsoleteHeaderDefect(
+ "empty element in address-list"))
+ else:
+ token, value = get_invalid_mailbox(value, ',')
+ if leader is not None:
+ token[:0] = [leader]
+ address_list.append(Address([token]))
+ address_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid address in address-list"))
+ if value and value[0] != ',':
+ # Crap after address; treat it as an invalid mailbox.
+ # The mailbox info will still be available.
+ mailbox = address_list[-1][0]
+ mailbox.token_type = 'invalid-mailbox'
+ token, value = get_invalid_mailbox(value, ',')
+ mailbox.extend(token)
+ address_list.defects.append(errors.InvalidHeaderDefect(
+ "invalid address in address-list"))
+ if value: # Must be a , at this point.
+ address_list.append(ValueTerminal(',', 'list-separator'))
+ value = value[1:]
+ return address_list, value
+
+#
+# XXX: As I begin to add additional header parsers, I'm realizing we probably
+# have two level of parser routines: the get_XXX methods that get a token in
+# the grammar, and parse_XXX methods that parse an entire field value. So
+# get_address_list above should really be a parse_ method, as probably should
+# be get_unstructured.
+#
+
+def parse_mime_version(value):
+ """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
+
+ """
+ # The [CFWS] is implicit in the RFC 2045 BNF.
+ # XXX: This routine is a bit verbose, should factor out a get_int method.
+ mime_version = MIMEVersion()
+ if not value:
+ mime_version.defects.append(errors.HeaderMissingRequiredValue(
+ "Missing MIME version number (eg: 1.0)"))
+ return mime_version
+ if value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mime_version.append(token)
+ if not value:
+ mime_version.defects.append(errors.HeaderMissingRequiredValue(
+ "Expected MIME version number but found only CFWS"))
+ digits = ''
+ while value and value[0] != '.' and value[0] not in CFWS_LEADER:
+ digits += value[0]
+ value = value[1:]
+ if not digits.isdigit():
+ mime_version.defects.append(errors.InvalidHeaderDefect(
+ "Expected MIME major version number but found {!r}".format(digits)))
+ mime_version.append(ValueTerminal(digits, 'xtext'))
+ else:
+ mime_version.major = int(digits)
+ mime_version.append(ValueTerminal(digits, 'digits'))
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mime_version.append(token)
+ if not value or value[0] != '.':
+ if mime_version.major is not None:
+ mime_version.defects.append(errors.InvalidHeaderDefect(
+ "Incomplete MIME version; found only major number"))
+ if value:
+ mime_version.append(ValueTerminal(value, 'xtext'))
+ return mime_version
+ mime_version.append(ValueTerminal('.', 'version-separator'))
+ value = value[1:]
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mime_version.append(token)
+ if not value:
+ if mime_version.major is not None:
+ mime_version.defects.append(errors.InvalidHeaderDefect(
+ "Incomplete MIME version; found only major number"))
+ return mime_version
+ digits = ''
+ while value and value[0] not in CFWS_LEADER:
+ digits += value[0]
+ value = value[1:]
+ if not digits.isdigit():
+ mime_version.defects.append(errors.InvalidHeaderDefect(
+ "Expected MIME minor version number but found {!r}".format(digits)))
+ mime_version.append(ValueTerminal(digits, 'xtext'))
+ else:
+ mime_version.minor = int(digits)
+ mime_version.append(ValueTerminal(digits, 'digits'))
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mime_version.append(token)
+ if value:
+ mime_version.defects.append(errors.InvalidHeaderDefect(
+ "Excess non-CFWS text after MIME version"))
+ mime_version.append(ValueTerminal(value, 'xtext'))
+ return mime_version
+
+def get_invalid_parameter(value):
+ """ Read everything up to the next ';'.
+
+ This is outside the formal grammar. The InvalidParameter TokenList that is
+ returned acts like a Parameter, but the data attributes are None.
+
+ """
+ invalid_parameter = InvalidParameter()
+ while value and value[0] != ';':
+ if value[0] in PHRASE_ENDS:
+ invalid_parameter.append(ValueTerminal(value[0],
+ 'misplaced-special'))
+ value = value[1:]
+ else:
+ token, value = get_phrase(value)
+ invalid_parameter.append(token)
+ return invalid_parameter, value
+
+def get_ttext(value):
+ """ttext = <matches _ttext_matcher>
+
+ We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
+ defects list if we find non-ttext characters. We also register defects for
+ *any* non-printables even though the RFC doesn't exclude all of them,
+ because we follow the spirit of RFC 5322.
+
+ """
+ m = _non_token_end_matcher(value)
+ if not m:
+ raise errors.HeaderParseError(
+ "expected ttext but found '{}'".format(value))
+ ttext = m.group()
+ value = value[len(ttext):]
+ ttext = ValueTerminal(ttext, 'ttext')
+ _validate_xtext(ttext)
+ return ttext, value
+
+def get_token(value):
+ """token = [CFWS] 1*ttext [CFWS]
+
+ The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
+ tspecials. We also exclude tabs even though the RFC doesn't.
+
+ The RFC implies the CFWS but is not explicit about it in the BNF.
+
+ """
+ mtoken = Token()
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mtoken.append(token)
+ if value and value[0] in TOKEN_ENDS:
+ raise errors.HeaderParseError(
+ "expected token but found '{}'".format(value))
+ token, value = get_ttext(value)
+ mtoken.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ mtoken.append(token)
+ return mtoken, value
+
+def get_attrtext(value):
+ """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
+
+ We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
+ token's defects list if we find non-attrtext characters. We also register
+ defects for *any* non-printables even though the RFC doesn't exclude all of
+ them, because we follow the spirit of RFC 5322.
+
+ """
+ m = _non_attribute_end_matcher(value)
+ if not m:
+ raise errors.HeaderParseError(
+ "expected attrtext but found {!r}".format(value))
+ attrtext = m.group()
+ value = value[len(attrtext):]
+ attrtext = ValueTerminal(attrtext, 'attrtext')
+ _validate_xtext(attrtext)
+ return attrtext, value
+
+def get_attribute(value):
+ """ [CFWS] 1*attrtext [CFWS]
+
+ This version of the BNF makes the CFWS explicit, and as usual we use a
+ value terminal for the actual run of characters. The RFC equivalent of
+ attrtext is the token characters, with the subtraction of '*', "'", and '%'.
+ We include tab in the excluded set just as we do for token.
+
+ """
+ attribute = Attribute()
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ attribute.append(token)
+ if value and value[0] in ATTRIBUTE_ENDS:
+ raise errors.HeaderParseError(
+ "expected token but found '{}'".format(value))
+ token, value = get_attrtext(value)
+ attribute.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ attribute.append(token)
+ return attribute, value
+
+def get_extended_attrtext(value):
+ """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
+
+ This is a special parsing routine so that we get a value that
+ includes % escapes as a single string (which we decode as a single
+ string later).
+
+ """
+ m = _non_extended_attribute_end_matcher(value)
+ if not m:
+ raise errors.HeaderParseError(
+ "expected extended attrtext but found {!r}".format(value))
+ attrtext = m.group()
+ value = value[len(attrtext):]
+ attrtext = ValueTerminal(attrtext, 'extended-attrtext')
+ _validate_xtext(attrtext)
+ return attrtext, value
+
+def get_extended_attribute(value):
+ """ [CFWS] 1*extended_attrtext [CFWS]
+
+ This is like the non-extended version except we allow % characters, so that
+ we can pick up an encoded value as a single string.
+
+ """
+ # XXX: should we have an ExtendedAttribute TokenList?
+ attribute = Attribute()
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ attribute.append(token)
+ if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
+ raise errors.HeaderParseError(
+ "expected token but found '{}'".format(value))
+ token, value = get_extended_attrtext(value)
+ attribute.append(token)
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ attribute.append(token)
+ return attribute, value
+
+def get_section(value):
+ """ '*' digits
+
+ The formal BNF is more complicated because leading 0s are not allowed. We
+ check for that and add a defect. We also assume no CFWS is allowed between
+ the '*' and the digits, though the RFC is not crystal clear on that.
+ The caller should already have dealt with leading CFWS.
+
+ """
+ section = Section()
+ if not value or value[0] != '*':
+ raise errors.HeaderParseError("Expected section but found {}".format(
+ value))
+ section.append(ValueTerminal('*', 'section-marker'))
+ value = value[1:]
+ if not value or not value[0].isdigit():
+ raise errors.HeaderParseError("Expected section number but "
+ "found {}".format(value))
+ digits = ''
+ while value and value[0].isdigit():
+ digits += value[0]
+ value = value[1:]
+ if digits[0] == '0' and digits != '0':
+ section.defects.append(errors.InvalidHeaderError("section number"
+ "has an invalid leading 0"))
+ section.number = int(digits)
+ section.append(ValueTerminal(digits, 'digits'))
+ return section, value
+
+
+def get_value(value):
+ """ quoted-string / attribute
+
+ """
+ v = Value()
+ if not value:
+ raise errors.HeaderParseError("Expected value but found end of string")
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ raise errors.HeaderParseError("Expected value but found "
+ "only {}".format(leader))
+ if value[0] == '"':
+ token, value = get_quoted_string(value)
+ else:
+ token, value = get_extended_attribute(value)
+ if leader is not None:
+ token[:0] = [leader]
+ v.append(token)
+ return v, value
+
+def get_parameter(value):
+ """ attribute [section] ["*"] [CFWS] "=" value
+
+ The CFWS is implied by the RFC but not made explicit in the BNF. This
+ simplified form of the BNF from the RFC is made to conform with the RFC BNF
+ through some extra checks. We do it this way because it makes both error
+ recovery and working with the resulting parse tree easier.
+ """
+ # It is possible CFWS would also be implicitly allowed between the section
+ # and the 'extended-attribute' marker (the '*') , but we've never seen that
+ # in the wild and we will therefore ignore the possibility.
+ param = Parameter()
+ token, value = get_attribute(value)
+ param.append(token)
+ if not value or value[0] == ';':
+ param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
+ "name ({}) but no value".format(token)))
+ return param, value
+ if value[0] == '*':
+ try:
+ token, value = get_section(value)
+ param.sectioned = True
+ param.append(token)
+ except errors.HeaderParseError:
+ pass
+ if not value:
+ raise errors.HeaderParseError("Incomplete parameter")
+ if value[0] == '*':
+ param.append(ValueTerminal('*', 'extended-parameter-marker'))
+ value = value[1:]
+ param.extended = True
+ if value[0] != '=':
+ raise errors.HeaderParseError("Parameter not followed by '='")
+ param.append(ValueTerminal('=', 'parameter-separator'))
+ value = value[1:]
+ leader = None
+ if value and value[0] in CFWS_LEADER:
+ token, value = get_cfws(value)
+ param.append(token)
+ remainder = None
+ appendto = param
+ if param.extended and value and value[0] == '"':
+ # Now for some serious hackery to handle the common invalid case of
+ # double quotes around an extended value. We also accept (with defect)
+ # a value marked as encoded that isn't really.
+ qstring, remainder = get_quoted_string(value)
+ inner_value = qstring.stripped_value
+ semi_valid = False
+ if param.section_number == 0:
+ if inner_value and inner_value[0] == "'":
+ semi_valid = True
+ else:
+ token, rest = get_attrtext(inner_value)
+ if rest and rest[0] == "'":
+ semi_valid = True
+ else:
+ try:
+ token, rest = get_extended_attrtext(inner_value)
+ except:
+ pass
+ else:
+ if not rest:
+ semi_valid = True
+ if semi_valid:
+ param.defects.append(errors.InvalidHeaderDefect(
+ "Quoted string value for extended parameter is invalid"))
+ param.append(qstring)
+ for t in qstring:
+ if t.token_type == 'bare-quoted-string':
+ t[:] = []
+ appendto = t
+ break
+ value = inner_value
+ else:
+ remainder = None
+ param.defects.append(errors.InvalidHeaderDefect(
+ "Parameter marked as extended but appears to have a "
+ "quoted string value that is non-encoded"))
+ if value and value[0] == "'":
+ token = None
+ else:
+ token, value = get_value(value)
+ if not param.extended or param.section_number > 0:
+ if not value or value[0] != "'":
+ appendto.append(token)
+ if remainder is not None:
+ assert not value, value
+ value = remainder
+ return param, value
+ param.defects.append(errors.InvalidHeaderDefect(
+ "Apparent initial-extended-value but attribute "
+ "was not marked as extended or was not initial section"))
+ if not value:
+ # Assume the charset/lang is missing and the token is the value.
+ param.defects.append(errors.InvalidHeaderDefect(
+ "Missing required charset/lang delimiters"))
+ appendto.append(token)
+ if remainder is None:
+ return param, value
+ else:
+ if token is not None:
+ for t in token:
+ if t.token_type == 'extended-attrtext':
+ break
+ t.token_type == 'attrtext'
+ appendto.append(t)
+ param.charset = t.value
+ if value[0] != "'":
+ raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
+ "delimiter, but found {!r}".format(value))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
+ value = value[1:]
+ if value and value[0] != "'":
+ token, value = get_attrtext(value)
+ appendto.append(token)
+ param.lang = token.value
+ if not value or value[0] != "'":
+ raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
+ "delimiter, but found {}".format(value))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
+ value = value[1:]
+ if remainder is not None:
+ # Treat the rest of value as bare quoted string content.
+ v = Value()
+ while value:
+ if value[0] in WSP:
+ token, value = get_fws(value)
+ else:
+ token, value = get_qcontent(value)
+ v.append(token)
+ token = v
+ else:
+ token, value = get_value(value)
+ appendto.append(token)
+ if remainder is not None:
+ assert not value, value
+ value = remainder
+ return param, value
+
+def parse_mime_parameters(value):
+ """ parameter *( ";" parameter )
+
+ That BNF is meant to indicate this routine should only be called after
+ finding and handling the leading ';'. There is no corresponding rule in
+ the formal RFC grammar, but it is more convenient for us for the set of
+ parameters to be treated as its own TokenList.
+
+ This is 'parse' routine because it consumes the reminaing value, but it
+ would never be called to parse a full header. Instead it is called to
+ parse everything after the non-parameter value of a specific MIME header.
+
+ """
+ mime_parameters = MimeParameters()
+ while value:
+ try:
+ token, value = get_parameter(value)
+ mime_parameters.append(token)
+ except errors.HeaderParseError as err:
+ leader = None
+ if value[0] in CFWS_LEADER:
+ leader, value = get_cfws(value)
+ if not value:
+ mime_parameters.append(leader)
+ return mime_parameters
+ if value[0] == ';':
+ if leader is not None:
+ mime_parameters.append(leader)
+ mime_parameters.defects.append(errors.InvalidHeaderDefect(
+ "parameter entry with no content"))
+ else:
+ token, value = get_invalid_parameter(value)
+ if leader:
+ token[:0] = [leader]
+ mime_parameters.append(token)
+ mime_parameters.defects.append(errors.InvalidHeaderDefect(
+ "invalid parameter {!r}".format(token)))
+ if value and value[0] != ';':
+ # Junk after the otherwise valid parameter. Mark it as
+ # invalid, but it will have a value.
+ param = mime_parameters[-1]
+ param.token_type = 'invalid-parameter'
+ token, value = get_invalid_parameter(value)
+ param.extend(token)
+ mime_parameters.defects.append(errors.InvalidHeaderDefect(
+ "parameter with invalid trailing text {!r}".format(token)))
+ if value:
+ # Must be a ';' at this point.
+ mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
+ value = value[1:]
+ return mime_parameters
+
+def _find_mime_parameters(tokenlist, value):
+ """Do our best to find the parameters in an invalid MIME header
+
+ """
+ while value and value[0] != ';':
+ if value[0] in PHRASE_ENDS:
+ tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
+ value = value[1:]
+ else:
+ token, value = get_phrase(value)
+ tokenlist.append(token)
+ if not value:
+ return
+ tokenlist.append(ValueTerminal(';', 'parameter-separator'))
+ tokenlist.append(parse_mime_parameters(value[1:]))
+
+def parse_content_type_header(value):
+ """ maintype "/" subtype *( ";" parameter )
+
+ The maintype and substype are tokens. Theoretically they could
+ be checked against the official IANA list + x-token, but we
+ don't do that.
+ """
+ ctype = ContentType()
+ recover = False
+ if not value:
+ ctype.defects.append(errors.HeaderMissingRequiredValue(
+ "Missing content type specification"))
+ return ctype
+ try:
+ token, value = get_token(value)
+ except errors.HeaderParseError:
+ ctype.defects.append(errors.InvalidHeaderDefect(
+ "Expected content maintype but found {!r}".format(value)))
+ _find_mime_parameters(ctype, value)
+ return ctype
+ ctype.append(token)
+ # XXX: If we really want to follow the formal grammar we should make
+ # mantype and subtype specialized TokenLists here. Probably not worth it.
+ if not value or value[0] != '/':
+ ctype.defects.append(errors.InvalidHeaderDefect(
+ "Invalid content type"))
+ if value:
+ _find_mime_parameters(ctype, value)
+ return ctype
+ ctype.maintype = token.value.strip().lower()
+ ctype.append(ValueTerminal('/', 'content-type-separator'))
+ value = value[1:]
+ try:
+ token, value = get_token(value)
+ except errors.HeaderParseError:
+ ctype.defects.append(errors.InvalidHeaderDefect(
+ "Expected content subtype but found {!r}".format(value)))
+ _find_mime_parameters(ctype, value)
+ return ctype
+ ctype.append(token)
+ ctype.subtype = token.value.strip().lower()
+ if not value:
+ return ctype
+ if value[0] != ';':
+ ctype.defects.append(errors.InvalidHeaderDefect(
+ "Only parameters are valid after content type, but "
+ "found {!r}".format(value)))
+ # The RFC requires that a syntactically invalid content-type be treated
+ # as text/plain. Perhaps we should postel this, but we should probably
+ # only do that if we were checking the subtype value against IANA.
+ del ctype.maintype, ctype.subtype
+ _find_mime_parameters(ctype, value)
+ return ctype
+ ctype.append(ValueTerminal(';', 'parameter-separator'))
+ ctype.append(parse_mime_parameters(value[1:]))
+ return ctype
+
+def parse_content_disposition_header(value):
+ """ disposition-type *( ";" parameter )
+
+ """
+ disp_header = ContentDisposition()
+ if not value:
+ disp_header.defects.append(errors.HeaderMissingRequiredValue(
+ "Missing content disposition"))
+ return disp_header
+ try:
+ token, value = get_token(value)
+ except errors.HeaderParseError:
+ disp_header.defects.append(errors.InvalidHeaderDefect(
+ "Expected content disposition but found {!r}".format(value)))
+ _find_mime_parameters(disp_header, value)
+ return disp_header
+ disp_header.append(token)
+ disp_header.content_disposition = token.value.strip().lower()
+ if not value:
+ return disp_header
+ if value[0] != ';':
+ disp_header.defects.append(errors.InvalidHeaderDefect(
+ "Only parameters are valid after content disposition, but "
+ "found {!r}".format(value)))
+ _find_mime_parameters(disp_header, value)
+ return disp_header
+ disp_header.append(ValueTerminal(';', 'parameter-separator'))
+ disp_header.append(parse_mime_parameters(value[1:]))
+ return disp_header
+
+def parse_content_transfer_encoding_header(value):
+ """ mechanism
+
+ """
+ # We should probably validate the values, since the list is fixed.
+ cte_header = ContentTransferEncoding()
+ if not value:
+ cte_header.defects.append(errors.HeaderMissingRequiredValue(
+ "Missing content transfer encoding"))
+ return cte_header
+ try:
+ token, value = get_token(value)
+ except errors.HeaderParseError:
+ cte_header.defects.append(errors.InvalidHeaderDefect(
+ "Expected content transfer encoding but found {!r}".format(value)))
+ else:
+ cte_header.append(token)
+ cte_header.cte = token.value.strip().lower()
+ if not value:
+ return cte_header
+ while value:
+ cte_header.defects.append(errors.InvalidHeaderDefect(
+ "Extra text after content transfer encoding"))
+ if value[0] in PHRASE_ENDS:
+ cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
+ value = value[1:]
+ else:
+ token, value = get_phrase(value)
+ cte_header.append(token)
+ return cte_header
+
+
+#
+# Header folding
+#
+# Header folding is complex, with lots of rules and corner cases. The
+# following code does its best to obey the rules and handle the corner
+# cases, but you can be sure there are few bugs:)
+#
+# This folder generally canonicalizes as it goes, preferring the stringified
+# version of each token. The tokens contain information that supports the
+# folder, including which tokens can be encoded in which ways.
+#
+# Folded text is accumulated in a simple list of strings ('lines'), each
+# one of which should be less than policy.max_line_length ('maxlen').
+#
+
+def _steal_trailing_WSP_if_exists(lines):
+ wsp = ''
+ if lines and lines[-1] and lines[-1][-1] in WSP:
+ wsp = lines[-1][-1]
+ lines[-1] = lines[-1][:-1]
+ return wsp
+
+def _refold_parse_tree(parse_tree, *, policy):
+ """Return string of contents of parse_tree folded according to RFC rules.
+
+ """
+ # max_line_length 0/None means no limit, ie: infinitely long.
+ maxlen = policy.max_line_length or float("+inf")
+ encoding = 'utf-8' if policy.utf8 else 'us-ascii'
+ lines = ['']
+ last_ew = None
+ wrap_as_ew_blocked = 0
+ want_encoding = False
+ end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
+ parts = list(parse_tree)
+ while parts:
+ part = parts.pop(0)
+ if part is end_ew_not_allowed:
+ wrap_as_ew_blocked -= 1
+ continue
+ tstr = str(part)
+ try:
+ tstr.encode(encoding)
+ charset = encoding
+ except UnicodeEncodeError:
+ if any(isinstance(x, errors.UndecodableBytesDefect)
+ for x in part.all_defects):
+ charset = 'unknown-8bit'
+ else:
+ # If policy.utf8 is false this should really be taken from a
+ # 'charset' property on the policy.
+ charset = 'utf-8'
+ want_encoding = True
+ if part.token_type == 'mime-parameters':
+ # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding)
+ continue
+ if want_encoding and not wrap_as_ew_blocked:
+ if not part.as_ew_allowed:
+ want_encoding = False
+ last_ew = None
+ if part.syntactic_break:
+ encoded_part = part.fold(policy=policy)[:-1] # strip nl
+ if policy.linesep not in encoded_part:
+ # It fits on a single line
+ if len(encoded_part) > maxlen - len(lines[-1]):
+ # But not on this one, so start a new one.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ # XXX what if encoded_part has no leading FWS?
+ lines.append(newline)
+ lines[-1] += encoded_part
+ continue
+ # Either this is not a major syntactic break, so we don't
+ # want it on a line by itself even if it fits, or it
+ # doesn't fit on a line by itself. Either way, fall through
+ # to unpacking the subparts and wrapping them.
+ if not hasattr(part, 'encode'):
+ # It's not a Terminal, do each piece individually.
+ parts = list(part) + parts
+ else:
+ # It's a terminal, wrap it as an encoded word, possibly
+ # combining it with previously encoded words if allowed.
+ last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
+ part.ew_combine_allowed, charset)
+ want_encoding = False
+ continue
+ if len(tstr) <= maxlen - len(lines[-1]):
+ lines[-1] += tstr
+ continue
+ # This part is too long to fit. The RFC wants us to break at
+ # "major syntactic breaks", so unless we don't consider this
+ # to be one, check if it will fit on the next line by itself.
+ if (part.syntactic_break and
+ len(tstr) + 1 <= maxlen):
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ continue
+ if not hasattr(part, 'encode'):
+ # It's not a terminal, try folding the subparts.
+ newparts = list(part)
+ if not part.as_ew_allowed:
+ wrap_as_ew_blocked += 1
+ newparts.append(end_ew_not_allowed)
+ parts = newparts + parts
+ continue
+ if part.as_ew_allowed and not wrap_as_ew_blocked:
+ # It doesn't need CTE encoding, but encode it anyway so we can
+ # wrap it.
+ parts.insert(0, part)
+ want_encoding = True
+ continue
+ # We can't figure out how to wrap, it, so give up.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ else:
+ # We can't fold it onto the next line either...
+ lines[-1] += tstr
+ return policy.linesep.join(lines) + policy.linesep
+
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+ """Fold string to_encode into lines as encoded word, combining if allowed.
+ Return the new value for last_ew, or None if ew_combine_allowed is False.
+
+ If there is already an encoded word in the last line of lines (indicated by
+ a non-None value for last_ew) and ew_combine_allowed is true, decode the
+ existing ew, combine it with to_encode, and re-encode. Otherwise, encode
+ to_encode. In either case, split to_encode as necessary so that the
+ encoded segments fit within maxlen.
+
+ """
+ if last_ew is not None and ew_combine_allowed:
+ to_encode = str(
+ get_unstructured(lines[-1][last_ew:] + to_encode))
+ lines[-1] = lines[-1][:last_ew]
+ if to_encode[0] in WSP:
+ # We're joining this to non-encoded text, so don't encode
+ # the leading blank.
+ leading_wsp = to_encode[0]
+ to_encode = to_encode[1:]
+ if (len(lines[-1]) == maxlen):
+ lines.append(_steal_trailing_WSP_if_exists(lines))
+ lines[-1] += leading_wsp
+ trailing_wsp = ''
+ if to_encode[-1] in WSP:
+ # Likewise for the trailing space.
+ trailing_wsp = to_encode[-1]
+ to_encode = to_encode[:-1]
+ new_last_ew = len(lines[-1]) if last_ew is None else last_ew
+ while to_encode:
+ remaining_space = maxlen - len(lines[-1])
+ # The RFC2047 chrome takes up 7 characters plus the length
+ # of the charset name.
+ encode_as = 'utf-8' if charset == 'us-ascii' else charset
+ text_space = remaining_space - len(encode_as) - 7
+ if text_space <= 0:
+ lines.append(' ')
+ # XXX We'll get an infinite loop here if maxlen is <= 7
+ continue
+ first_part = to_encode[:text_space]
+ ew = _ew.encode(first_part, charset=encode_as)
+ excess = len(ew) - remaining_space
+ if excess > 0:
+ # encode always chooses the shortest encoding, so this
+ # is guaranteed to fit at this point.
+ first_part = first_part[:-excess]
+ ew = _ew.encode(first_part)
+ lines[-1] += ew
+ to_encode = to_encode[len(first_part):]
+ if to_encode:
+ lines.append(' ')
+ new_last_ew = len(lines[-1])
+ lines[-1] += trailing_wsp
+ return new_last_ew if ew_combine_allowed else None
+
+def _fold_mime_parameters(part, lines, maxlen, encoding):
+ """Fold TokenList 'part' into the 'lines' list as mime parameters.
+
+ Using the decoded list of parameters and values, format them according to
+ the RFC rules, including using RFC2231 encoding if the value cannot be
+ expressed in 'encoding' and/or the parameter+value is too long to fit
+ within 'maxlen'.
+
+ """
+ # Special case for RFC2231 encoding: start from decoded values and use
+ # RFC2231 encoding iff needed.
+ #
+ # Note that the 1 and 2s being added to the length calculations are
+ # accounting for the possibly-needed spaces and semicolons we'll be adding.
+ #
+ for name, value in part.params:
+ # XXX What if this ';' puts us over maxlen the first time through the
+ # loop? We should split the header value onto a newline in that case,
+ # but to do that we need to recognize the need earlier or reparse the
+ # header, so I'm going to ignore that bug for now. It'll only put us
+ # one character over.
+ if not lines[-1].rstrip().endswith(';'):
+ lines[-1] += ';'
+ charset = encoding
+ error_handler = 'strict'
+ try:
+ value.encode(encoding)
+ encoding_required = False
+ except UnicodeEncodeError:
+ encoding_required = True
+ if utils._has_surrogates(value):
+ charset = 'unknown-8bit'
+ error_handler = 'surrogateescape'
+ else:
+ charset = 'utf-8'
+ if encoding_required:
+ encoded_value = parse.quote(
+ value, safe='', errors=error_handler)
+ tstr = "{}*={}''{}".format(name, charset, encoded_value)
+ else:
+ tstr = '{}={}'.format(name, quote_string(value))
+ if len(lines[-1]) + len(tstr) + 1 < maxlen:
+ lines[-1] = lines[-1] + ' ' + tstr
+ continue
+ elif len(tstr) + 2 <= maxlen:
+ lines.append(' ' + tstr)
+ continue
+ # We need multiple sections. We are allowed to mix encoded and
+ # non-encoded sections, but we aren't going to. We'll encode them all.
+ section = 0
+ extra_chrome = charset + "''"
+ while value:
+ chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
+ if maxlen <= chrome_len + 3:
+ # We need room for the leading blank, the trailing semicolon,
+ # and at least one character of the value. If we don't
+ # have that, we'd be stuck, so in that case fall back to
+ # the RFC standard width.
+ maxlen = 78
+ splitpoint = maxchars = maxlen - chrome_len - 2
+ while True:
+ partial = value[:splitpoint]
+ encoded_value = parse.quote(
+ partial, safe='', errors=error_handler)
+ if len(encoded_value) <= maxchars:
+ break
+ splitpoint -= 1
+ lines.append(" {}*{}*={}{}".format(
+ name, section, extra_chrome, encoded_value))
+ extra_chrome = ''
+ section += 1
+ value = value[splitpoint:]
+ if value:
+ lines[-1] += ';'
diff --git a/modules/language/python/module/email/_policybase.py b/modules/language/python/module/email/_policybase.py
index a8633e2..bbc8b3c 100644
--- a/modules/language/python/module/email/_policybase.py
+++ b/modules/language/python/module/email/_policybase.py
@@ -3,7 +3,6 @@ module(email,_policybase)
Allows fine grained feature control of how the package parses and emits data.
"""
-
import abc
import email.header as header
import email.charset as _charset
@@ -13,6 +12,7 @@ __all__ = [
'Policy',
'Compat32',
'compat32',
+ '_extend_docstrings'
]
@@ -101,9 +101,9 @@ def _extend_docstrings(cls):
if cls.__doc__ and cls.__doc__.startswith('+'):
cls.__doc__ = _append_doc(cls.__bases__[0].__doc__, cls.__doc__)
for name, attr in cls.__dict__.items():
- if attr.__doc__ and attr.__doc__.startswith('+'):
+ if getattr(attr, '__doc__',False) and attr.__doc__.startswith('+'):
for c in (c for base in cls.__bases__ for c in base.mro()):
- doc = getattr(getattr(c, name), '__doc__')
+ doc = getattr(getattr(c, name, False), '__doc__', False)
if doc:
attr.__doc__ = _append_doc(doc, attr.__doc__)
break
diff --git a/modules/language/python/module/email/contentmanager.py b/modules/language/python/module/email/contentmanager.py
new file mode 100644
index 0000000..5eaf2ee
--- /dev/null
+++ b/modules/language/python/module/email/contentmanager.py
@@ -0,0 +1,254 @@
+module(email,contentmanager)
+
+__all__=['ContentManager','raw_data_manager']
+
+import binascii
+import email.charset as charset
+import email.message as message
+import email.errors as errors
+import email.quoprimime as quoprimime
+
+class ContentManager:
+
+ def __init__(self):
+ self.get_handlers = {}
+ self.set_handlers = {}
+
+ def add_get_handler(self, key, handler):
+ self.get_handlers[key] = handler
+
+ def get_content(self, msg, *args, **kw):
+ content_type = msg.get_content_type()
+ if content_type in self.get_handlers:
+ return self.get_handlers[content_type](msg, *args, **kw)
+ maintype = msg.get_content_maintype()
+ if maintype in self.get_handlers:
+ return self.get_handlers[maintype](msg, *args, **kw)
+ if '' in self.get_handlers:
+ return self.get_handlers[''](msg, *args, **kw)
+ raise KeyError(content_type)
+
+ def add_set_handler(self, typekey, handler):
+ self.set_handlers[typekey] = handler
+
+ def set_content(self, msg, obj, *args, **kw):
+ if msg.get_content_maintype() == 'multipart':
+ # XXX: is this error a good idea or not? We can remove it later,
+ # but we can't add it later, so do it for now.
+ raise TypeError("set_content not valid on multipart")
+ handler = self._find_set_handler(msg, obj)
+ msg.clear_content()
+ handler(msg, obj, *args, **kw)
+
+ def _find_set_handler(self, msg, obj):
+ full_path_for_error = None
+ for typ in type(obj).__mro__:
+ if typ in self.set_handlers:
+ return self.set_handlers[typ]
+ qname = typ.__qualname__
+ modname = getattr(typ, '__module__', '')
+ full_path = '.'.join((modname, qname)) if modname else qname
+ if full_path_for_error is None:
+ full_path_for_error = full_path
+ if full_path in self.set_handlers:
+ return self.set_handlers[full_path]
+ if qname in self.set_handlers:
+ return self.set_handlers[qname]
+ name = typ.__name__
+ if name in self.set_handlers:
+ return self.set_handlers[name]
+ if None in self.set_handlers:
+ return self.set_handlers[None]
+ raise KeyError(full_path_for_error)
+
+
+raw_data_manager = ContentManager()
+
+
+def get_text_content(msg, errors='replace'):
+ content = msg.get_payload(decode=True)
+ charset = msg.get_param('charset', 'ASCII')
+ return content.decode(charset, errors=errors)
+raw_data_manager.add_get_handler('text', get_text_content)
+
+
+def get_non_text_content(msg):
+ return msg.get_payload(decode=True)
+for maintype in 'audio image video application'.split():
+ raw_data_manager.add_get_handler(maintype, get_non_text_content)
+
+
+def get_message_content(msg):
+ return msg.get_payload(0)
+for subtype in 'rfc822 external-body'.split():
+ raw_data_manager.add_get_handler('message/'+subtype, get_message_content)
+
+
+def get_and_fixup_unknown_message_content(msg):
+ # If we don't understand a message subtype, we are supposed to treat it as
+ # if it were application/octet-stream, per
+ # tools.ietf.org/html/rfc2046#section-5.2.4. Feedparser doesn't do that,
+ # so do our best to fix things up. Note that it is *not* appropriate to
+ # model message/partial content as Message objects, so they are handled
+ # here as well. (How to reassemble them is out of scope for this comment :)
+ return bytes(msg.get_payload(0))
+raw_data_manager.add_get_handler('message',
+ get_and_fixup_unknown_message_content)
+
+
+def _prepare_set(msg, maintype, subtype, headers):
+ msg['Content-Type'] = '/'.join((maintype, subtype))
+ if headers:
+ if not hasattr(headers[0], 'name'):
+ mp = msg.policy
+ headers = [mp.header_factory(*mp.header_source_parse([header]))
+ for header in headers]
+ try:
+ for header in headers:
+ if header.defects:
+ raise header.defects[0]
+ msg[header.name] = header
+ except errors.HeaderDefect as exc:
+ raise ValueError("Invalid header: {}".format(
+ header.fold(policy=msg.policy))) from exc
+
+
+def _finalize_set(msg, disposition, filename, cid, params):
+ if disposition is None and filename is not None:
+ disposition = 'attachment'
+ if disposition is not None:
+ msg['Content-Disposition'] = disposition
+ if filename is not None:
+ msg.set_param('filename',
+ filename,
+ header='Content-Disposition',
+ replace=True)
+ if cid is not None:
+ msg['Content-ID'] = cid
+ if params is not None:
+ for key, value in params.items():
+ msg.set_param(key, value)
+
+
+# XXX: This is a cleaned-up version of base64mime.body_encode (including a bug
+# fix in the calculation of unencoded_bytes_per_line). It would be nice to
+# drop both this and quoprimime.body_encode in favor of enhanced binascii
+# routines that accepted a max_line_length parameter.
+def _encode_base64(data, max_line_length):
+ encoded_lines = []
+ unencoded_bytes_per_line = max_line_length // 4 * 3
+ for i in range(0, len(data), unencoded_bytes_per_line):
+ thisline = data[i:i+unencoded_bytes_per_line]
+ encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii'))
+ return ''.join(encoded_lines)
+
+
+def _encode_text(string, charset, cte, policy):
+ lines = string.encode(charset).splitlines()
+ linesep = policy.linesep.encode('ascii')
+ def embedded_body(lines): return linesep.join(lines) + linesep
+ def normal_body(lines): return b'\n'.join(lines) + b'\n'
+ if cte==None:
+ # Use heuristics to decide on the "best" encoding.
+ try:
+ return '7bit', normal_body(lines).decode('ascii')
+ except UnicodeDecodeError:
+ pass
+ if (policy.cte_type == '8bit' and
+ max(len(x) for x in lines) <= policy.max_line_length):
+ return '8bit', normal_body(lines).decode('ascii', 'surrogateescape')
+ sniff = embedded_body(lines[:10])
+ sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'),
+ policy.max_line_length)
+ sniff_base64 = binascii.b2a_base64(sniff)
+ # This is a little unfair to qp; it includes lineseps, base64 doesn't.
+ if len(sniff_qp) > len(sniff_base64):
+ cte = 'base64'
+ else:
+ cte = 'quoted-printable'
+ if len(lines) <= 10:
+ return cte, sniff_qp
+ if cte == '7bit':
+ data = normal_body(lines).decode('ascii')
+ elif cte == '8bit':
+ data = normal_body(lines).decode('ascii', 'surrogateescape')
+ elif cte == 'quoted-printable':
+ data = quoprimime.body_encode(normal_body(lines).decode('latin-1'),
+ policy.max_line_length)
+ elif cte == 'base64':
+ data = _encode_base64(embedded_body(lines), policy.max_line_length)
+ else:
+ raise ValueError("Unknown content transfer encoding {}".format(cte))
+ return cte, data
+
+
+def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ _prepare_set(msg, 'text', subtype, headers)
+ cte, payload = _encode_text(string, charset, cte, msg.policy)
+ msg.set_payload(payload)
+ msg.set_param('charset',
+ charset.ALIASES.get(charset, charset),
+ replace=True)
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(str, set_text_content)
+
+
+def set_message_content(msg, message, subtype="rfc822", cte=None,
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ if subtype == 'partial':
+ raise ValueError("message/partial is not supported for Message objects")
+ if subtype == 'rfc822':
+ if cte not in (None, '7bit', '8bit', 'binary'):
+ # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate.
+ raise ValueError(
+ "message/rfc822 parts do not support cte={}".format(cte))
+ # 8bit will get coerced on serialization if policy.cte_type='7bit'. We
+ # may end up claiming 8bit when it isn't needed, but the only negative
+ # result of that should be a gateway that needs to coerce to 7bit
+ # having to look through the whole embedded message to discover whether
+ # or not it actually has to do anything.
+ cte = '8bit' if cte is None else cte
+ elif subtype == 'external-body':
+ if cte not in (None, '7bit'):
+ # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate.
+ raise ValueError(
+ "message/external-body parts do not support cte={}".format(cte))
+ cte = '7bit'
+ elif cte is None:
+ # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future
+ # subtypes should be restricted to 7bit, so assume that.
+ cte = '7bit'
+ _prepare_set(msg, 'message', subtype, headers)
+ msg.set_payload([message])
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(message.Message, set_message_content)
+
+
+def set_bytes_content(msg, data, maintype, subtype, cte='base64',
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ _prepare_set(msg, maintype, subtype, headers)
+ if cte == 'base64':
+ data = _encode_base64(data, max_line_length=msg.policy.max_line_length)
+ elif cte == 'quoted-printable':
+ # XXX: quoprimime.body_encode won't encode newline characters in data,
+ # so we can't use it. This means max_line_length is ignored. Another
+ # bug to fix later. (Note: encoders.quopri is broken on line ends.)
+ data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True)
+ data = data.decode('ascii')
+ elif cte == '7bit':
+ # Make sure it really is only ASCII. The early warning here seems
+ # worth the overhead...if you care write your own content manager :).
+ data.encode('ascii')
+ elif cte in ('8bit', 'binary'):
+ data = data.decode('ascii', 'surrogateescape')
+ msg.set_payload(data)
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+for typ in (bytes, bytearray, memoryview):
+ raw_data_manager.add_set_handler(typ, set_bytes_content)
diff --git a/modules/language/python/module/email/feedparser.py b/modules/language/python/module/email/feedparser.py
index f07448a..2f31633 100644
--- a/modules/language/python/module/email/feedparser.py
+++ b/modules/language/python/module/email/feedparser.py
@@ -23,7 +23,6 @@ object's .defects attribute.
__all__ = ['FeedParser', 'BytesFeedParser']
import re
-
import email.errors as errors
from email._policybase import compat32
from collections import deque
diff --git a/modules/language/python/module/email/generator.py b/modules/language/python/module/email/generator.py
new file mode 100644
index 0000000..2b90fe5
--- /dev/null
+++ b/modules/language/python/module/email/generator.py
@@ -0,0 +1,509 @@
+module(email,generator)
+# Copyright (C) 2001-2010 Python Software Foundation
+# Author: Barry Warsaw
+# Contact: email-sig@python.org
+
+"""Classes to generate plain text from a message object tree."""
+
+__all__ = ['Generator', 'DecodedGenerator', 'BytesGenerator']
+
+import re
+import sys
+import time
+import random
+
+from copy import deepcopy
+from io import StringIO, BytesIO
+from email.utils import _has_surrogates
+
+UNDERSCORE = '_'
+NL = '\n' # XXX: no longer used by the code below.
+
+NLCRE = re.compile(r'\r\n|\r|\n')
+fcre = re.compile(r'^From ', re.MULTILINE)
+
+
+
+class Generator:
+ """Generates output from a Message object tree.
+
+ This basic generator writes the message to the given file object as plain
+ text.
+ """
+ #
+ # Public interface
+ #
+
+ def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, *,
+ policy=None):
+ """Create the generator for message flattening.
+
+ outfp is the output file-like object for writing the message to. It
+ must have a write() method.
+
+ Optional mangle_from_ is a flag that, when True (the default if policy
+ is not set), escapes From_ lines in the body of the message by putting
+ a `>' in front of them.
+
+ Optional maxheaderlen specifies the longest length for a non-continued
+ header. When a header line is longer (in characters, with tabs
+ expanded to 8 spaces) than maxheaderlen, the header will split as
+ defined in the Header class. Set maxheaderlen to zero to disable
+ header wrapping. The default is 78, as recommended (but not required)
+ by RFC 2822.
+
+ The policy keyword specifies a policy object that controls a number of
+ aspects of the generator's operation. If no policy is specified,
+ the policy associated with the Message object passed to the
+ flatten method is used.
+
+ """
+
+ if mangle_from_ is None:
+ mangle_from_ = True if policy is None else policy.mangle_from_
+ self._fp = outfp
+ self._mangle_from_ = mangle_from_
+ self.maxheaderlen = maxheaderlen
+ self.policy = policy
+
+ def write(self, s):
+ # Just delegate to the file object
+ self._fp.write(s)
+
+ def flatten(self, msg, unixfrom=False, linesep=None):
+ r"""Print the message object tree rooted at msg to the output file
+ specified when the Generator instance was created.
+
+ unixfrom is a flag that forces the printing of a Unix From_ delimiter
+ before the first object in the message tree. If the original message
+ has no From_ delimiter, a `standard' one is crafted. By default, this
+ is False to inhibit the printing of any From_ delimiter.
+
+ Note that for subobjects, no From_ line is printed.
+
+ linesep specifies the characters used to indicate a new line in
+ the output. The default value is determined by the policy specified
+ when the Generator instance was created or, if none was specified,
+ from the policy associated with the msg.
+
+ """
+ # We use the _XXX constants for operating on data that comes directly
+ # from the msg, and _encoded_XXX constants for operating on data that
+ # has already been converted (to bytes in the BytesGenerator) and
+ # inserted into a temporary buffer.
+ policy = msg.policy if self.policy is None else self.policy
+ if linesep is not None:
+ policy = policy.clone(linesep=linesep)
+ if self.maxheaderlen is not None:
+ policy = policy.clone(max_line_length=self.maxheaderlen)
+ self._NL = policy.linesep
+ self._encoded_NL = self._encode(self._NL)
+ self._EMPTY = ''
+ self._encoded_EMPTY = self._encode(self._EMPTY)
+ # Because we use clone (below) when we recursively process message
+ # subparts, and because clone uses the computed policy (not None),
+ # submessages will automatically get set to the computed policy when
+ # they are processed by this code.
+ old_gen_policy = self.policy
+ old_msg_policy = msg.policy
+ try:
+ self.policy = policy
+ msg.policy = policy
+ if unixfrom:
+ ufrom = msg.get_unixfrom()
+ if not ufrom:
+ ufrom = 'From nobody ' + time.ctime(time.time())
+ self.write(ufrom + self._NL)
+ self._write(msg)
+ finally:
+ self.policy = old_gen_policy
+ msg.policy = old_msg_policy
+
+ def clone(self, fp):
+ """Clone this generator with the exact same options."""
+ return self.__class__(fp,
+ self._mangle_from_,
+ None, # Use policy setting, which we've adjusted
+ policy=self.policy)
+
+ #
+ # Protected interface - undocumented ;/
+ #
+
+ # Note that we use 'self.write' when what we are writing is coming from
+ # the source, and self._fp.write when what we are writing is coming from a
+ # buffer (because the Bytes subclass has already had a chance to transform
+ # the data in its write method in that case). This is an entirely
+ # pragmatic split determined by experiment; we could be more general by
+ # always using write and having the Bytes subclass write method detect when
+ # it has already transformed the input; but, since this whole thing is a
+ # hack anyway this seems good enough.
+
+ def _new_buffer(self):
+ # BytesGenerator overrides this to return BytesIO.
+ return StringIO()
+
+ def _encode(self, s):
+ # BytesGenerator overrides this to encode strings to bytes.
+ return s
+
+ def _write_lines(self, lines):
+ # We have to transform the line endings.
+ if not lines:
+ return
+ lines = NLCRE.split(lines)
+ for line in lines[:-1]:
+ self.write(line)
+ self.write(self._NL)
+ if lines[-1]:
+ self.write(lines[-1])
+ # XXX logic tells me this else should be needed, but the tests fail
+ # with it and pass without it. (NLCRE.split ends with a blank element
+ # if and only if there was a trailing newline.)
+ #else:
+ # self.write(self._NL)
+
+ def _write(self, msg):
+ # We can't write the headers yet because of the following scenario:
+ # say a multipart message includes the boundary string somewhere in
+ # its body. We'd have to calculate the new boundary /before/ we write
+ # the headers so that we can write the correct Content-Type:
+ # parameter.
+ #
+ # The way we do this, so as to make the _handle_*() methods simpler,
+ # is to cache any subpart writes into a buffer. The we write the
+ # headers and the buffer contents. That way, subpart handlers can
+ # Do The Right Thing, and can still modify the Content-Type: header if
+ # necessary.
+ oldfp = self._fp
+ try:
+ self._munge_cte = None
+ self._fp = sfp = self._new_buffer()
+ self._dispatch(msg)
+ finally:
+ self._fp = oldfp
+ munge_cte = self._munge_cte
+ del self._munge_cte
+ # If we munged the cte, copy the message again and re-fix the CTE.
+ if munge_cte:
+ msg = deepcopy(msg)
+ msg.replace_header('content-transfer-encoding', munge_cte[0])
+ msg.replace_header('content-type', munge_cte[1])
+ # Write the headers. First we see if the message object wants to
+ # handle that itself. If not, we'll do it generically.
+ meth = getattr(msg, '_write_headers', None)
+ if meth is None:
+ self._write_headers(msg)
+ else:
+ meth(self)
+ self._fp.write(sfp.getvalue())
+
+ def _dispatch(self, msg):
+ # Get the Content-Type: for the message, then try to dispatch to
+ # self._handle_<maintype>_<subtype>(). If there's no handler for the
+ # full MIME type, then dispatch to self._handle_<maintype>(). If
+ # that's missing too, then dispatch to self._writeBody().
+ main = msg.get_content_maintype()
+ sub = msg.get_content_subtype()
+ specific = UNDERSCORE.join((main, sub)).replace('-', '_')
+ meth = getattr(self, '_handle_' + specific, None)
+ if meth is None:
+ generic = main.replace('-', '_')
+ meth = getattr(self, '_handle_' + generic, None)
+ if meth is None:
+ meth = self._writeBody
+ meth(msg)
+
+ #
+ # Default handlers
+ #
+
+ def _write_headers(self, msg):
+ for h, v in msg.raw_items():
+ self.write(self.policy.fold(h, v))
+ # A blank line always separates headers from body
+ self.write(self._NL)
+
+ #
+ # Handlers for writing types and subtypes
+ #
+
+ def _handle_text(self, msg):
+ payload = msg.get_payload()
+ if payload is None:
+ return
+ if not isinstance(payload, str):
+ raise TypeError('string payload expected: %s' % type(payload))
+ if _has_surrogates(msg._payload):
+ charset = msg.get_param('charset')
+ if charset is not None:
+ # XXX: This copy stuff is an ugly hack to avoid modifying the
+ # existing message.
+ msg = deepcopy(msg)
+ del msg['content-transfer-encoding']
+ msg.set_payload(payload, charset)
+ payload = msg.get_payload()
+ self._munge_cte = (msg['content-transfer-encoding'],
+ msg['content-type'])
+ if self._mangle_from_:
+ payload = fcre.sub('>From ', payload)
+ self._write_lines(payload)
+
+ # Default body handler
+ _writeBody = _handle_text
+
+ def _handle_multipart(self, msg):
+ # The trick here is to write out each part separately, merge them all
+ # together, and then make sure that the boundary we've chosen isn't
+ # present in the payload.
+ msgtexts = []
+ subparts = msg.get_payload()
+ if subparts is None:
+ subparts = []
+ elif isinstance(subparts, str):
+ # e.g. a non-strict parse of a message with no starting boundary.
+ self.write(subparts)
+ return
+ elif not isinstance(subparts, list):
+ # Scalar payload
+ subparts = [subparts]
+ for part in subparts:
+ s = self._new_buffer()
+ g = self.clone(s)
+ g.flatten(part, unixfrom=False, linesep=self._NL)
+ msgtexts.append(s.getvalue())
+ # BAW: What about boundaries that are wrapped in double-quotes?
+ boundary = msg.get_boundary()
+ if not boundary:
+ # Create a boundary that doesn't appear in any of the
+ # message texts.
+ alltext = self._encoded_NL.join(msgtexts)
+ boundary = self._make_boundary(alltext)
+ msg.set_boundary(boundary)
+ # If there's a preamble, write it out, with a trailing CRLF
+ if msg.preamble is not None:
+ if self._mangle_from_:
+ preamble = fcre.sub('>From ', msg.preamble)
+ else:
+ preamble = msg.preamble
+ self._write_lines(preamble)
+ self.write(self._NL)
+ # dash-boundary transport-padding CRLF
+ self.write('--' + boundary + self._NL)
+ # body-part
+ if msgtexts:
+ self._fp.write(msgtexts.pop(0))
+ # *encapsulation
+ # --> delimiter transport-padding
+ # --> CRLF body-part
+ for body_part in msgtexts:
+ # delimiter transport-padding CRLF
+ self.write(self._NL + '--' + boundary + self._NL)
+ # body-part
+ self._fp.write(body_part)
+ # close-delimiter transport-padding
+ self.write(self._NL + '--' + boundary + '--' + self._NL)
+ if msg.epilogue is not None:
+ if self._mangle_from_:
+ epilogue = fcre.sub('>From ', msg.epilogue)
+ else:
+ epilogue = msg.epilogue
+ self._write_lines(epilogue)
+
+ def _handle_multipart_signed(self, msg):
+ # The contents of signed parts has to stay unmodified in order to keep
+ # the signature intact per RFC1847 2.1, so we disable header wrapping.
+ # RDM: This isn't enough to completely preserve the part, but it helps.
+ p = self.policy
+ self.policy = p.clone(max_line_length=0)
+ try:
+ self._handle_multipart(msg)
+ finally:
+ self.policy = p
+
+ def _handle_message_delivery_status(self, msg):
+ # We can't just write the headers directly to self's file object
+ # because this will leave an extra newline between the last header
+ # block and the boundary. Sigh.
+ blocks = []
+ for part in msg.get_payload():
+ s = self._new_buffer()
+ g = self.clone(s)
+ g.flatten(part, unixfrom=False, linesep=self._NL)
+ text = s.getvalue()
+ lines = text.split(self._encoded_NL)
+ # Strip off the unnecessary trailing empty line
+ if lines and lines[-1] == self._encoded_EMPTY:
+ blocks.append(self._encoded_NL.join(lines[:-1]))
+ else:
+ blocks.append(text)
+ # Now join all the blocks with an empty line. This has the lovely
+ # effect of separating each block with an empty line, but not adding
+ # an extra one after the last one.
+ self._fp.write(self._encoded_NL.join(blocks))
+
+ def _handle_message(self, msg):
+ s = self._new_buffer()
+ g = self.clone(s)
+ # The payload of a message/rfc822 part should be a multipart sequence
+ # of length 1. The zeroth element of the list should be the Message
+ # object for the subpart. Extract that object, stringify it, and
+ # write it out.
+ # Except, it turns out, when it's a string instead, which happens when
+ # and only when HeaderParser is used on a message of mime type
+ # message/rfc822. Such messages are generated by, for example,
+ # Groupwise when forwarding unadorned messages. (Issue 7970.) So
+ # in that case we just emit the string body.
+ payload = msg._payload
+ if isinstance(payload, list):
+ g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL)
+ payload = s.getvalue()
+ else:
+ payload = self._encode(payload)
+ self._fp.write(payload)
+
+ # This used to be a module level function; we use a classmethod for this
+ # and _compile_re so we can continue to provide the module level function
+ # for backward compatibility by doing
+ # _make_boundary = Generator._make_boundary
+ # at the end of the module. It *is* internal, so we could drop that...
+ @classmethod
+ def _make_boundary(cls, text=None):
+ # Craft a random boundary. If text is given, ensure that the chosen
+ # boundary doesn't appear in the text.
+ token = random.randrange(sys.maxsize)
+ boundary = ('=' * 15) + (_fmt % token) + '=='
+ if text is None:
+ return boundary
+ b = boundary
+ counter = 0
+ while True:
+ cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
+ if not cre.search(text):
+ break
+ b = boundary + '.' + str(counter)
+ counter += 1
+ return b
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s, flags)
+
+
+class BytesGenerator(Generator):
+ """Generates a bytes version of a Message object tree.
+
+ Functionally identical to the base Generator except that the output is
+ bytes and not string. When surrogates were used in the input to encode
+ bytes, these are decoded back to bytes for output. If the policy has
+ cte_type set to 7bit, then the message is transformed such that the
+ non-ASCII bytes are properly content transfer encoded, using the charset
+ unknown-8bit.
+
+ The outfp object must accept bytes in its write method.
+ """
+
+ def write(self, s):
+ self._fp.write(s.encode('ascii', 'surrogateescape'))
+
+ def _new_buffer(self):
+ return BytesIO()
+
+ def _encode(self, s):
+ return s.encode('ascii')
+
+ def _write_headers(self, msg):
+ # This is almost the same as the string version, except for handling
+ # strings with 8bit bytes.
+ for h, v in msg.raw_items():
+ self._fp.write(self.policy.fold_binary(h, v))
+ # A blank line always separates headers from body
+ self.write(self._NL)
+
+ def _handle_text(self, msg):
+ # If the string has surrogates the original source was bytes, so
+ # just write it back out.
+ if msg._payload is None:
+ return
+ if _has_surrogates(msg._payload) and not self.policy.cte_type=='7bit':
+ if self._mangle_from_:
+ msg._payload = fcre.sub(">From ", msg._payload)
+ self._write_lines(msg._payload)
+ else:
+ super(BytesGenerator,self)._handle_text(msg)
+
+ # Default body handler
+ _writeBody = _handle_text
+
+ @classmethod
+ def _compile_re(cls, s, flags):
+ return re.compile(s.encode('ascii'), flags)
+
+
+
+_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
+
+class DecodedGenerator(Generator):
+ """Generates a text representation of a message.
+
+ Like the Generator base class, except that non-text parts are substituted
+ with a format string representing the part.
+ """
+ def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, fmt=None, *,
+ policy=None):
+ """Like Generator.__init__() except that an additional optional
+ argument is allowed.
+
+ Walks through all subparts of a message. If the subpart is of main
+ type `text', then it prints the decoded payload of the subpart.
+
+ Otherwise, fmt is a format string that is used instead of the message
+ payload. fmt is expanded with the following keywords (in
+ %(keyword)s format):
+
+ type : Full MIME type of the non-text part
+ maintype : Main MIME type of the non-text part
+ subtype : Sub-MIME type of the non-text part
+ filename : Filename of the non-text part
+ description: Description associated with the non-text part
+ encoding : Content transfer encoding of the non-text part
+
+ The default value for fmt is None, meaning
+
+ [Non-text (%(type)s) part of message omitted, filename %(filename)s]
+ """
+ Generator.__init__(self, outfp, mangle_from_, maxheaderlen,
+ policy=policy)
+ if fmt is None:
+ self._fmt = _FMT
+ else:
+ self._fmt = fmt
+
+ def _dispatch(self, msg):
+ for part in msg.walk():
+ maintype = part.get_content_maintype()
+ if maintype == 'text':
+ print(part.get_payload(decode=False), file=self)
+ elif maintype == 'multipart':
+ # Just skip this
+ pass
+ else:
+ print(self._fmt % {
+ 'type' : part.get_content_type(),
+ 'maintype' : part.get_content_maintype(),
+ 'subtype' : part.get_content_subtype(),
+ 'filename' : part.get_filename('[no filename]'),
+ 'description': part.get('Content-Description',
+ '[no description]'),
+ 'encoding' : part.get('Content-Transfer-Encoding',
+ '[no encoding]'),
+ }, file=self)
+
+
+
+# Helper used by Generator._make_boundary
+_width = len(repr(sys.maxsize-1))
+_fmt = '%%0%dd' % _width
+
+# Backward compatibility
+_make_boundary = Generator._make_boundary
diff --git a/modules/language/python/module/email/headerregistry.py b/modules/language/python/module/email/headerregistry.py
new file mode 100644
index 0000000..6b8615d
--- /dev/null
+++ b/modules/language/python/module/email/headerregistry.py
@@ -0,0 +1,592 @@
+module(email,headerregistry)
+"""Representing and manipulating email headers via custom objects.
+
+This module provides an implementation of the HeaderRegistry API.
+The implementation is designed to flexibly follow RFC5322 rules.
+
+Eventually HeaderRegistry will be a public API, but it isn't yet,
+and will probably change some before that happens.
+
+"""
+
+__all__=['HeaderRegistry']
+
+from types import MappingProxyType
+import email.utils as utils
+import email.errors as errors
+import email._header_value_parser as parser
+
+class Address:
+
+ def __init__(self, display_name='', username='', domain='', addr_spec=None):
+ """Create an object representing a full email address.
+
+ An address can have a 'display_name', a 'username', and a 'domain'. In
+ addition to specifying the username and domain separately, they may be
+ specified together by using the addr_spec keyword *instead of* the
+ username and domain keywords. If an addr_spec string is specified it
+ must be properly quoted according to RFC 5322 rules; an error will be
+ raised if it is not.
+
+ An Address object has display_name, username, domain, and addr_spec
+ attributes, all of which are read-only. The addr_spec and the string
+ value of the object are both quoted according to RFC5322 rules, but
+ without any Content Transfer Encoding.
+
+ """
+ # This clause with its potential 'raise' may only happen when an
+ # application program creates an Address object using an addr_spec
+ # keyword. The email library code itself must always supply username
+ # and domain.
+ if addr_spec is not None:
+ if username or domain:
+ raise TypeError("addrspec specified when username and/or "
+ "domain also specified")
+ a_s, rest = parser.get_addr_spec(addr_spec)
+ if rest:
+ raise ValueError("Invalid addr_spec; only '{}' "
+ "could be parsed from '{}'".format(
+ a_s, addr_spec))
+ if a_s.all_defects:
+ raise a_s.all_defects[0]
+ username = a_s.local_part
+ domain = a_s.domain
+ self._display_name = display_name
+ self._username = username
+ self._domain = domain
+
+ @property
+ def display_name(self):
+ return self._display_name
+
+ @property
+ def username(self):
+ return self._username
+
+ @property
+ def domain(self):
+ return self._domain
+
+ @property
+ def addr_spec(self):
+ """The addr_spec (username@domain) portion of the address, quoted
+ according to RFC 5322 rules, but with no Content Transfer Encoding.
+ """
+ nameset = set(self.username)
+ if len(nameset) > len(nameset-parser.DOT_ATOM_ENDS):
+ lp = parser.quote_string(self.username)
+ else:
+ lp = self.username
+ if self.domain:
+ return lp + '@' + self.domain
+ if not lp:
+ return '<>'
+ return lp
+
+ def __repr__(self):
+ return "{}(display_name={!r}, username={!r}, domain={!r})".format(
+ self.__class__.__name__,
+ self.display_name, self.username, self.domain)
+
+ def __str__(self):
+ nameset = set(self.display_name)
+ if len(nameset) > len(nameset-parser.SPECIALS):
+ disp = parser.quote_string(self.display_name)
+ else:
+ disp = self.display_name
+ if disp:
+ addr_spec = '' if self.addr_spec=='<>' else self.addr_spec
+ return "{} <{}>".format(disp, addr_spec)
+ return self.addr_spec
+
+ def __eq__(self, other):
+ if type(other) != type(self):
+ return False
+ return (self.display_name == other.display_name and
+ self.username == other.username and
+ self.domain == other.domain)
+
+
+class Group:
+
+ def __init__(self, display_name=None, addresses=None):
+ """Create an object representing an address group.
+
+ An address group consists of a display_name followed by colon and a
+ list of addresses (see Address) terminated by a semi-colon. The Group
+ is created by specifying a display_name and a possibly empty list of
+ Address objects. A Group can also be used to represent a single
+ address that is not in a group, which is convenient when manipulating
+ lists that are a combination of Groups and individual Addresses. In
+ this case the display_name should be set to None. In particular, the
+ string representation of a Group whose display_name is None is the same
+ as the Address object, if there is one and only one Address object in
+ the addresses list.
+
+ """
+ self._display_name = display_name
+ self._addresses = tuple(addresses) if addresses else tuple()
+
+ @property
+ def display_name(self):
+ return self._display_name
+
+ @property
+ def addresses(self):
+ return self._addresses
+
+ def __repr__(self):
+ return "{}(display_name={!r}, addresses={!r}".format(
+ self.__class__.__name__,
+ self.display_name, self.addresses)
+
+ def __str__(self):
+ if self.display_name is None and len(self.addresses)==1:
+ return str(self.addresses[0])
+ disp = self.display_name
+ if disp is not None:
+ nameset = set(disp)
+ if len(nameset) > len(nameset-parser.SPECIALS):
+ disp = parser.quote_string(disp)
+ adrstr = ", ".join(str(x) for x in self.addresses)
+ adrstr = ' ' + adrstr if adrstr else adrstr
+ return "{}:{};".format(disp, adrstr)
+
+ def __eq__(self, other):
+ if type(other) != type(self):
+ return False
+ return (self.display_name == other.display_name and
+ self.addresses == other.addresses)
+
+
+# Header Classes #
+
+class BaseHeader(str):
+
+ """Base class for message headers.
+
+ Implements generic behavior and provides tools for subclasses.
+
+ A subclass must define a classmethod named 'parse' that takes an unfolded
+ value string and a dictionary as its arguments. The dictionary will
+ contain one key, 'defects', initialized to an empty list. After the call
+ the dictionary must contain two additional keys: parse_tree, set to the
+ parse tree obtained from parsing the header, and 'decoded', set to the
+ string value of the idealized representation of the data from the value.
+ (That is, encoded words are decoded, and values that have canonical
+ representations are so represented.)
+
+ The defects key is intended to collect parsing defects, which the message
+ parser will subsequently dispose of as appropriate. The parser should not,
+ insofar as practical, raise any errors. Defects should be added to the
+ list instead. The standard header parsers register defects for RFC
+ compliance issues, for obsolete RFC syntax, and for unrecoverable parsing
+ errors.
+
+ The parse method may add additional keys to the dictionary. In this case
+ the subclass must define an 'init' method, which will be passed the
+ dictionary as its keyword arguments. The method should use (usually by
+ setting them as the value of similarly named attributes) and remove all the
+ extra keys added by its parse method, and then use super to call its parent
+ class with the remaining arguments and keywords.
+
+ The subclass should also make sure that a 'max_count' attribute is defined
+ that is either None or 1. XXX: need to better define this API.
+
+ """
+
+ def __new__(cls, name, value):
+ kwds = {'defects': []}
+ cls.parse(value, kwds)
+ if utils._has_surrogates(kwds['decoded']):
+ kwds['decoded'] = utils._sanitize(kwds['decoded'])
+ self = str.__new__(cls, kwds['decoded'])
+ del kwds['decoded']
+ self.init(name, **kwds)
+ return self
+
+ def init(self, name, *, parse_tree, defects):
+ self._name = name
+ self._parse_tree = parse_tree
+ self._defects = defects
+
+ @property
+ def name(self):
+ return self._name
+
+ @property
+ def defects(self):
+ return tuple(self._defects)
+
+ def __reduce__(self):
+ return (
+ _reconstruct_header,
+ (
+ self.__class__.__name__,
+ self.__class__.__bases__,
+ str(self),
+ ),
+ self.__dict__)
+
+ @classmethod
+ def _reconstruct(cls, value):
+ return str.__new__(cls, value)
+
+ def fold(self, *, policy):
+ """Fold header according to policy.
+
+ The parsed representation of the header is folded according to
+ RFC5322 rules, as modified by the policy. If the parse tree
+ contains surrogateescaped bytes, the bytes are CTE encoded using
+ the charset 'unknown-8bit".
+
+ Any non-ASCII characters in the parse tree are CTE encoded using
+ charset utf-8. XXX: make this a policy setting.
+
+ The returned value is an ASCII-only string possibly containing linesep
+ characters, and ending with a linesep character. The string includes
+ the header name and the ': ' separator.
+
+ """
+ # At some point we need to put fws here iif it was in the source.
+ header = parser.Header([
+ parser.HeaderLabel([
+ parser.ValueTerminal(self.name, 'header-name'),
+ parser.ValueTerminal(':', 'header-sep')]),
+ ])
+ if self._parse_tree:
+ header.append(
+ parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
+ header.append(self._parse_tree)
+ return header.fold(policy=policy)
+
+
+def _reconstruct_header(cls_name, bases, value):
+ return type(cls_name, bases, {})._reconstruct(value)
+
+
+class UnstructuredHeader:
+
+ max_count = None
+ value_parser = staticmethod(parser.get_unstructured)
+
+ @classmethod
+ def parse(cls, value, kwds):
+ kwds['parse_tree'] = cls.value_parser(value)
+ kwds['decoded'] = str(kwds['parse_tree'])
+
+
+class UniqueUnstructuredHeader(UnstructuredHeader):
+
+ max_count = 1
+
+
+class DateHeader:
+
+ """Header whose value consists of a single timestamp.
+
+ Provides an additional attribute, datetime, which is either an aware
+ datetime using a timezone, or a naive datetime if the timezone
+ in the input string is -0000. Also accepts a datetime as input.
+ The 'value' attribute is the normalized form of the timestamp,
+ which means it is the output of format_datetime on the datetime.
+ """
+
+ max_count = None
+
+ # This is used only for folding, not for creating 'decoded'.
+ value_parser = staticmethod(parser.get_unstructured)
+
+ @classmethod
+ def parse(cls, value, kwds):
+ if not value:
+ kwds['defects'].append(errors.HeaderMissingRequiredValue())
+ kwds['datetime'] = None
+ kwds['decoded'] = ''
+ kwds['parse_tree'] = parser.TokenList()
+ return
+ if isinstance(value, str):
+ value = utils.parsedate_to_datetime(value)
+ kwds['datetime'] = value
+ kwds['decoded'] = utils.format_datetime(kwds['datetime'])
+ kwds['parse_tree'] = cls.value_parser(kwds['decoded'])
+
+ def init(self, *args, **kw):
+ self._datetime = kw.pop('datetime')
+ super().init(*args, **kw)
+
+ @property
+ def datetime(self):
+ return self._datetime
+
+
+class UniqueDateHeader(DateHeader):
+
+ max_count = 1
+
+
+class AddressHeader:
+
+ max_count = None
+
+ @staticmethod
+ def value_parser(value):
+ address_list, value = parser.get_address_list(value)
+ assert not value, 'this should not happen'
+ return address_list
+
+ @classmethod
+ def parse(cls, value, kwds):
+ if isinstance(value, str):
+ # We are translating here from the RFC language (address/mailbox)
+ # to our API language (group/address).
+ kwds['parse_tree'] = address_list = cls.value_parser(value)
+ groups = []
+ for addr in address_list.addresses:
+ groups.append(Group(addr.display_name,
+ [Address(mb.display_name or '',
+ mb.local_part or '',
+ mb.domain or '')
+ for mb in addr.all_mailboxes]))
+ defects = list(address_list.all_defects)
+ else:
+ # Assume it is Address/Group stuff
+ if not hasattr(value, '__iter__'):
+ value = [value]
+ groups = [Group(None, [item]) if not hasattr(item, 'addresses')
+ else item
+ for item in value]
+ defects = []
+ kwds['groups'] = groups
+ kwds['defects'] = defects
+ kwds['decoded'] = ', '.join([str(item) for item in groups])
+ if 'parse_tree' not in kwds:
+ kwds['parse_tree'] = cls.value_parser(kwds['decoded'])
+
+ def init(self, *args, **kw):
+ self._groups = tuple(kw.pop('groups'))
+ self._addresses = None
+ super().init(*args, **kw)
+
+ @property
+ def groups(self):
+ return self._groups
+
+ @property
+ def addresses(self):
+ if self._addresses is None:
+ self._addresses = tuple([address for group in self._groups
+ for address in group.addresses])
+ return self._addresses
+
+
+class UniqueAddressHeader(AddressHeader):
+
+ max_count = 1
+
+
+class SingleAddressHeader(AddressHeader):
+
+ @property
+ def address(self):
+ if len(self.addresses)!=1:
+ raise ValueError(("value of single address header {} is not "
+ "a single address").format(self.name))
+ return self.addresses[0]
+
+
+class UniqueSingleAddressHeader(SingleAddressHeader):
+
+ max_count = 1
+
+
+class MIMEVersionHeader:
+
+ max_count = 1
+
+ value_parser = staticmethod(parser.parse_mime_version)
+
+ @classmethod
+ def parse(cls, value, kwds):
+ kwds['parse_tree'] = parse_tree = cls.value_parser(value)
+ kwds['decoded'] = str(parse_tree)
+ kwds['defects'].extend(parse_tree.all_defects)
+ kwds['major'] = None if parse_tree.minor is None else parse_tree.major
+ kwds['minor'] = parse_tree.minor
+ if parse_tree.minor is not None:
+ kwds['version'] = '{}.{}'.format(kwds['major'], kwds['minor'])
+ else:
+ kwds['version'] = None
+
+ def init(self, *args, **kw):
+ self._version = kw.pop('version')
+ self._major = kw.pop('major')
+ self._minor = kw.pop('minor')
+ super().init(*args, **kw)
+
+ @property
+ def major(self):
+ return self._major
+
+ @property
+ def minor(self):
+ return self._minor
+
+ @property
+ def version(self):
+ return self._version
+
+
+class ParameterizedMIMEHeader:
+
+ # Mixin that handles the params dict. Must be subclassed and
+ # a property value_parser for the specific header provided.
+
+ max_count = 1
+
+ @classmethod
+ def parse(cls, value, kwds):
+ kwds['parse_tree'] = parse_tree = cls.value_parser(value)
+ kwds['decoded'] = str(parse_tree)
+ kwds['defects'].extend(parse_tree.all_defects)
+ if parse_tree.params is None:
+ kwds['params'] = {}
+ else:
+ # The MIME RFCs specify that parameter ordering is arbitrary.
+ kwds['params'] = {utils._sanitize(name).lower():
+ utils._sanitize(value)
+ for name, value in parse_tree.params}
+
+ def init(self, *args, **kw):
+ self._params = kw.pop('params')
+ super().init(*args, **kw)
+
+ @property
+ def params(self):
+ return MappingProxyType(self._params)
+
+
+class ContentTypeHeader(ParameterizedMIMEHeader):
+
+ value_parser = staticmethod(parser.parse_content_type_header)
+
+ def init(self, *args, **kw):
+ super().init(*args, **kw)
+ self._maintype = utils._sanitize(self._parse_tree.maintype)
+ self._subtype = utils._sanitize(self._parse_tree.subtype)
+
+ @property
+ def maintype(self):
+ return self._maintype
+
+ @property
+ def subtype(self):
+ return self._subtype
+
+ @property
+ def content_type(self):
+ return self.maintype + '/' + self.subtype
+
+
+class ContentDispositionHeader(ParameterizedMIMEHeader):
+
+ value_parser = staticmethod(parser.parse_content_disposition_header)
+
+ def init(self, *args, **kw):
+ super().init(*args, **kw)
+ cd = self._parse_tree.content_disposition
+ self._content_disposition = cd if cd is None else utils._sanitize(cd)
+
+ @property
+ def content_disposition(self):
+ return self._content_disposition
+
+
+class ContentTransferEncodingHeader:
+
+ max_count = 1
+
+ value_parser = staticmethod(parser.parse_content_transfer_encoding_header)
+
+ @classmethod
+ def parse(cls, value, kwds):
+ kwds['parse_tree'] = parse_tree = cls.value_parser(value)
+ kwds['decoded'] = str(parse_tree)
+ kwds['defects'].extend(parse_tree.all_defects)
+
+ def init(self, *args, **kw):
+ super().init(*args, **kw)
+ self._cte = utils._sanitize(self._parse_tree.cte)
+
+ @property
+ def cte(self):
+ return self._cte
+
+
+# The header factory #
+
+_default_header_map = {
+ 'subject': UniqueUnstructuredHeader,
+ 'date': UniqueDateHeader,
+ 'resent-date': DateHeader,
+ 'orig-date': UniqueDateHeader,
+ 'sender': UniqueSingleAddressHeader,
+ 'resent-sender': SingleAddressHeader,
+ 'to': UniqueAddressHeader,
+ 'resent-to': AddressHeader,
+ 'cc': UniqueAddressHeader,
+ 'resent-cc': AddressHeader,
+ 'bcc': UniqueAddressHeader,
+ 'resent-bcc': AddressHeader,
+ 'from': UniqueAddressHeader,
+ 'resent-from': AddressHeader,
+ 'reply-to': UniqueAddressHeader,
+ 'mime-version': MIMEVersionHeader,
+ 'content-type': ContentTypeHeader,
+ 'content-disposition': ContentDispositionHeader,
+ 'content-transfer-encoding': ContentTransferEncodingHeader,
+ }
+
+class HeaderRegistry:
+
+ """A header_factory and header registry."""
+
+ def __init__(self, base_class=BaseHeader, default_class=UnstructuredHeader,
+ use_default_map=True):
+ """Create a header_factory that works with the Policy API.
+
+ base_class is the class that will be the last class in the created
+ header class's __bases__ list. default_class is the class that will be
+ used if "name" (see __call__) does not appear in the registry.
+ use_default_map controls whether or not the default mapping of names to
+ specialized classes is copied in to the registry when the factory is
+ created. The default is True.
+
+ """
+ self.registry = {}
+ self.base_class = base_class
+ self.default_class = default_class
+ if use_default_map:
+ self.registry.update(_default_header_map)
+
+ def map_to_type(self, name, cls):
+ """Register cls as the specialized class for handling "name" headers.
+
+ """
+ self.registry[name.lower()] = cls
+
+ def __getitem__(self, name):
+ cls = self.registry.get(name.lower(), self.default_class)
+ return type('_'+cls.__name__, (cls, self.base_class), {})
+
+ def __call__(self, name, value):
+ """Create a header instance for header 'name' from 'value'.
+
+ Creates a header instance by creating a specialized class for parsing
+ and representing the specified header by combining the factory
+ base_class with a specialized class from the registry or the
+ default_class, and passing the name and value to the constructed
+ class's constructor.
+
+ """
+ return self[name](name, value)
diff --git a/modules/language/python/module/email/iterators.py b/modules/language/python/module/email/iterators.py
new file mode 100644
index 0000000..0773bdf
--- /dev/null
+++ b/modules/language/python/module/email/iterators.py
@@ -0,0 +1,72 @@
+module(email,iterators)
+# Copyright (C) 2001-2006 Python Software Foundation
+# Author: Barry Warsaw
+# Contact: email-sig@python.org
+
+"""Various types of useful iterators and generators."""
+
+__all__ = [
+ 'body_line_iterator',
+ 'typed_subpart_iterator',
+ 'walk',
+ # Do not include _structure() since it's part of the debugging API.
+ ]
+
+import sys
+from io import StringIO
+
+
+
+# This function will become a method of the Message class
+def walk(self):
+ """Walk over the message tree, yielding each subpart.
+
+ The walk is performed in depth-first order. This method is a
+ generator.
+ """
+ yield self
+ if self.is_multipart():
+ for subpart in self.get_payload():
+ yield from subpart.walk()
+
+
+
+# These two functions are imported into the Iterators.py interface module.
+def body_line_iterator(msg, decode=False):
+ """Iterate over the parts, returning string payloads line-by-line.
+
+ Optional decode (default False) is passed through to .get_payload().
+ """
+ for subpart in msg.walk():
+ payload = subpart.get_payload(decode=decode)
+ if isinstance(payload, str):
+ yield from StringIO(payload)
+
+
+def typed_subpart_iterator(msg, maintype='text', subtype=None):
+ """Iterate over the subparts with a given MIME type.
+
+ Use `maintype' as the main MIME type to match against; this defaults to
+ "text". Optional `subtype' is the MIME subtype to match against; if
+ omitted, only the main type is matched.
+ """
+ for subpart in msg.walk():
+ if subpart.get_content_maintype() == maintype:
+ if subtype is None or subpart.get_content_subtype() == subtype:
+ yield subpart
+
+
+
+def _structure(msg, fp=None, level=0, include_default=False):
+ """A handy debugging aid"""
+ if fp is None:
+ fp = sys.stdout
+ tab = ' ' * (level * 4)
+ print(tab + msg.get_content_type(), end='', file=fp)
+ if include_default:
+ print(' [%s]' % msg.get_default_type(), file=fp)
+ else:
+ print(file=fp)
+ if msg.is_multipart():
+ for subpart in msg.get_payload():
+ _structure(subpart, fp, level+1, include_default)
diff --git a/modules/language/python/module/email/message.py b/modules/language/python/module/email/message.py
new file mode 100644
index 0000000..c73aa3c
--- /dev/null
+++ b/modules/language/python/module/email/message.py
@@ -0,0 +1,1164 @@
+module(email,message)
+# Copyright (C) 2001-2007 Python Software Foundation
+# Author: Barry Warsaw
+# Contact: email-sig@python.org
+
+"""Basic message object for the email package object model."""
+
+__all__ = ['Message', 'EmailMessage']
+
+import re
+import uu
+import quopri
+from io import BytesIO, StringIO
+
+# Intrapackage imports
+import email.utils as utils
+import email.errors as errors
+from email._policybase import Policy, compat32
+import email.charset as _charset
+from email._encoded_words import decode_b
+Charset = _charset.Charset
+
+SEMISPACE = '; '
+
+# Regular expression that matches `special' characters in parameters, the
+# existence of which force quoting of the parameter value.
+tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
+
+
+def _splitparam(param):
+ # Split header parameters. BAW: this may be too simple. It isn't
+ # strictly RFC 2045 (section 5.1) compliant, but it catches most headers
+ # found in the wild. We may eventually need a full fledged parser.
+ # RDM: we might have a Header here; for now just stringify it.
+ a, sep, b = str(param).partition(';')
+ if not sep:
+ return a.strip(), None
+ return a.strip(), b.strip()
+
+def _formatparam(param, value=None, quote=True):
+ """Convenience function to format and return a key=value pair.
+
+ This will quote the value if needed or if quote is true. If value is a
+ three tuple (charset, language, value), it will be encoded according
+ to RFC2231 rules. If it contains non-ascii characters it will likewise
+ be encoded according to RFC2231 rules, using the utf-8 charset and
+ a null language.
+ """
+ if value is not None and len(value) > 0:
+ # A tuple is used for RFC 2231 encoded parameter values where items
+ # are (charset, language, value). charset is a string, not a Charset
+ # instance. RFC 2231 encoded values are never quoted, per RFC.
+ if isinstance(value, tuple):
+ # Encode as per RFC 2231
+ param += '*'
+ value = utils.encode_rfc2231(value[2], value[0], value[1])
+ return '%s=%s' % (param, value)
+ else:
+ try:
+ value.encode('ascii')
+ except UnicodeEncodeError:
+ param += '*'
+ value = utils.encode_rfc2231(value, 'utf-8', '')
+ return '%s=%s' % (param, value)
+ # BAW: Please check this. I think that if quote is set it should
+ # force quoting even if not necessary.
+ if quote or tspecials.search(value):
+ return '%s="%s"' % (param, utils.quote(value))
+ else:
+ return '%s=%s' % (param, value)
+ else:
+ return param
+
+def _parseparam(s):
+ # RDM This might be a Header, so for now stringify it.
+ s = ';' + str(s)
+ plist = []
+ while s[:1] == ';':
+ s = s[1:]
+ end = s.find(';')
+ while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
+ end = s.find(';', end + 1)
+ if end < 0:
+ end = len(s)
+ f = s[:end]
+ if '=' in f:
+ i = f.index('=')
+ f = f[:i].strip().lower() + '=' + f[i+1:].strip()
+ plist.append(f.strip())
+ s = s[end:]
+ return plist
+
+
+def _unquotevalue(value):
+ # This is different than utils.collapse_rfc2231_value() because it doesn't
+ # try to convert the value to a unicode. Message.get_param() and
+ # Message.get_params() are both currently defined to return the tuple in
+ # the face of RFC 2231 parameters.
+ if isinstance(value, tuple):
+ return value[0], value[1], utils.unquote(value[2])
+ else:
+ return utils.unquote(value)
+
+
+class Message:
+ """Basic message object.
+
+ A message object is defined as something that has a bunch of RFC 2822
+ headers and a payload. It may optionally have an envelope header
+ (a.k.a. Unix-From or From_ header). If the message is a container (i.e. a
+ multipart or a message/rfc822), then the payload is a list of Message
+ objects, otherwise it is a string.
+
+ Message objects implement part of the `mapping' interface, which assumes
+ there is exactly one occurrence of the header per message. Some headers
+ do in fact appear multiple times (e.g. Received) and for those headers,
+ you must use the explicit API to set or get all the headers. Not all of
+ the mapping methods are implemented.
+ """
+ def __init__(self, policy=compat32):
+ self.policy = policy
+ self._headers = []
+ self._unixfrom = None
+ self._payload = None
+ self._charset = None
+ # Defaults for multipart messages
+ self.preamble = self.epilogue = None
+ self.defects = []
+ # Default content type
+ self._default_type = 'text/plain'
+
+ def __str__(self):
+ """Return the entire formatted message as a string.
+ """
+ return self.as_string()
+
+ def as_string(self, unixfrom=False, maxheaderlen=0, policy=None):
+ """Return the entire formatted message as a string.
+
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. For backward compatibility reasons, if maxheaderlen is
+ not specified it defaults to 0, so you must override it explicitly
+ if you want a different maxheaderlen. 'policy' is passed to the
+ Generator instance used to serialize the mesasge; if it is not
+ specified the policy associated with the message instance is used.
+
+ If the message object contains binary data that is not encoded
+ according to RFC standards, the non-compliant data will be replaced by
+ unicode "unknown character" code points.
+ """
+ from email.generator import Generator
+ policy = self.policy if policy is None else policy
+ fp = StringIO()
+ g = Generator(fp,
+ mangle_from_=False,
+ maxheaderlen=maxheaderlen,
+ policy=policy)
+ g.flatten(self, unixfrom=unixfrom)
+ return fp.getvalue()
+
+ def __bytes__(self):
+ """Return the entire formatted message as a bytes object.
+ """
+ return self.as_bytes()
+
+ def as_bytes(self, unixfrom=False, policy=None):
+ """Return the entire formatted message as a bytes object.
+
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. 'policy' is passed to the BytesGenerator instance used to
+ serialize the message; if not specified the policy associated with
+ the message instance is used.
+ """
+ from email.generator import BytesGenerator
+ policy = self.policy if policy is None else policy
+ fp = BytesIO()
+ g = BytesGenerator(fp, mangle_from_=False, policy=policy)
+ g.flatten(self, unixfrom=unixfrom)
+ return fp.getvalue()
+
+ def is_multipart(self):
+ """Return True if the message consists of multiple parts."""
+ return isinstance(self._payload, list)
+
+ #
+ # Unix From_ line
+ #
+ def set_unixfrom(self, unixfrom):
+ self._unixfrom = unixfrom
+
+ def get_unixfrom(self):
+ return self._unixfrom
+
+ #
+ # Payload manipulation.
+ #
+ def attach(self, payload):
+ """Add the given payload to the current payload.
+
+ The current payload will always be a list of objects after this method
+ is called. If you want to set the payload to a scalar object, use
+ set_payload() instead.
+ """
+ if self._payload is None:
+ self._payload = [payload]
+ else:
+ try:
+ self._payload.append(payload)
+ except AttributeError:
+ raise TypeError("Attach is not valid on a message with a"
+ " non-multipart payload")
+
+ def get_payload(self, i=None, decode=False):
+ """Return a reference to the payload.
+
+ The payload will either be a list object or a string. If you mutate
+ the list object, you modify the message's payload in place. Optional
+ i returns that index into the payload.
+
+ Optional decode is a flag indicating whether the payload should be
+ decoded or not, according to the Content-Transfer-Encoding header
+ (default is False).
+
+ When True and the message is not a multipart, the payload will be
+ decoded if this header's value is `quoted-printable' or `base64'. If
+ some other encoding is used, or the header is missing, or if the
+ payload has bogus data (i.e. bogus base64 or uuencoded data), the
+ payload is returned as-is.
+
+ If the message is a multipart and the decode flag is True, then None
+ is returned.
+ """
+ # Here is the logic table for this code, based on the email5.0.0 code:
+ # i decode is_multipart result
+ # ------ ------ ------------ ------------------------------
+ # None True True None
+ # i True True None
+ # None False True _payload (a list)
+ # i False True _payload element i (a Message)
+ # i False False error (not a list)
+ # i True False error (not a list)
+ # None False False _payload
+ # None True False _payload decoded (bytes)
+ # Note that Barry planned to factor out the 'decode' case, but that
+ # isn't so easy now that we handle the 8 bit data, which needs to be
+ # converted in both the decode and non-decode path.
+ if self.is_multipart():
+ if decode:
+ return None
+ if i is None:
+ return self._payload
+ else:
+ return self._payload[i]
+ # For backward compatibility, Use isinstance and this error message
+ # instead of the more logical is_multipart test.
+ if i is not None and not isinstance(self._payload, list):
+ raise TypeError('Expected list, got %s' % type(self._payload))
+ payload = self._payload
+ # cte might be a Header, so for now stringify it.
+ cte = str(self.get('content-transfer-encoding', '')).lower()
+ # payload may be bytes here.
+ if isinstance(payload, str):
+ if utils._has_surrogates(payload):
+ bpayload = payload.encode('ascii', 'surrogateescape')
+ if not decode:
+ try:
+ payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+ except LookupError:
+ payload = bpayload.decode('ascii', 'replace')
+ elif decode:
+ try:
+ bpayload = payload.encode('ascii')
+ except UnicodeError:
+ # This won't happen for RFC compliant messages (messages
+ # containing only ASCII code points in the unicode input).
+ # If it does happen, turn the string into bytes in a way
+ # guaranteed not to fail.
+ bpayload = payload.encode('raw-unicode-escape')
+ if not decode:
+ return payload
+ if cte == 'quoted-printable':
+ return quopri.decodestring(bpayload)
+ elif cte == 'base64':
+ # XXX: this is a bit of a hack; decode_b should probably be factored
+ # out somewhere, but I haven't figured out where yet.
+ value, defects = decode_b(b''.join(bpayload.splitlines()))
+ for defect in defects:
+ self.policy.handle_defect(self, defect)
+ return value
+ elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
+ in_file = BytesIO(bpayload)
+ out_file = BytesIO()
+ try:
+ uu.decode(in_file, out_file, quiet=True)
+ return out_file.getvalue()
+ except uu.Error:
+ # Some decoding problem
+ return bpayload
+ if isinstance(payload, str):
+ return bpayload
+ return payload
+
+ def set_payload(self, payload, charset=None):
+ """Set the payload to the given value.
+
+ Optional charset sets the message's default character set. See
+ set_charset() for details.
+ """
+ if hasattr(payload, 'encode'):
+ if charset is None:
+ self._payload = payload
+ return
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
+ payload = payload.encode(charset.output_charset)
+ if hasattr(payload, 'decode'):
+ self._payload = payload.decode('ascii', 'surrogateescape')
+ else:
+ self._payload = payload
+ if charset is not None:
+ self.set_charset(charset)
+
+ def set_charset(self, charset):
+ """Set the charset of the payload to a given character set.
+
+ charset can be a Charset instance, a string naming a character set, or
+ None. If it is a string it will be converted to a Charset instance.
+ If charset is None, the charset parameter will be removed from the
+ Content-Type field. Anything else will generate a TypeError.
+
+ The message will be assumed to be of type text/* encoded with
+ charset.input_charset. It will be converted to charset.output_charset
+ and encoded properly, if needed, when generating the plain text
+ representation of the message. MIME headers (MIME-Version,
+ Content-Type, Content-Transfer-Encoding) will be added as needed.
+ """
+ if charset is None:
+ self.del_param('charset')
+ self._charset = None
+ return
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
+ self._charset = charset
+ if 'MIME-Version' not in self:
+ self.add_header('MIME-Version', '1.0')
+ if 'Content-Type' not in self:
+ self.add_header('Content-Type', 'text/plain',
+ charset=charset.get_output_charset())
+ else:
+ self.set_param('charset', charset.get_output_charset())
+ if charset != charset.get_output_charset():
+ self._payload = charset.body_encode(self._payload)
+ if 'Content-Transfer-Encoding' not in self:
+ cte = charset.get_body_encoding()
+ try:
+ cte(self)
+ except TypeError:
+ # This 'if' is for backward compatibility, it allows unicode
+ # through even though that won't work correctly if the
+ # message is serialized.
+ payload = self._payload
+ if payload:
+ try:
+ payload = payload.encode('ascii', 'surrogateescape')
+ except UnicodeError:
+ payload = payload.encode(charset.output_charset)
+ self._payload = charset.body_encode(payload)
+ self.add_header('Content-Transfer-Encoding', cte)
+
+ def get_charset(self):
+ """Return the Charset instance associated with the message's payload.
+ """
+ return self._charset
+
+ #
+ # MAPPING INTERFACE (partial)
+ #
+ def __len__(self):
+ """Return the total number of headers, including duplicates."""
+ return len(self._headers)
+
+ def __getitem__(self, name):
+ """Get a header value.
+
+ Return None if the header is missing instead of raising an exception.
+
+ Note that if the header appeared multiple times, exactly which
+ occurrence gets returned is undefined. Use get_all() to get all
+ the values matching a header field name.
+ """
+ return self.get(name)
+
+ def __setitem__(self, name, val):
+ """Set the value of a header.
+
+ Note: this does not overwrite an existing header with the same field
+ name. Use __delitem__() first to delete any existing headers.
+ """
+ max_count = self.policy.header_max_count(name)
+ if max_count:
+ lname = name.lower()
+ found = 0
+ for k, v in self._headers:
+ if k.lower() == lname:
+ found += 1
+ if found >= max_count:
+ raise ValueError("There may be at most {} {} headers "
+ "in a message".format(max_count, name))
+ self._headers.append(self.policy.header_store_parse(name, val))
+
+ def __delitem__(self, name):
+ """Delete all occurrences of a header, if present.
+
+ Does not raise an exception if the header is missing.
+ """
+ name = name.lower()
+ newheaders = []
+ for k, v in self._headers:
+ if k.lower() != name:
+ newheaders.append((k, v))
+ self._headers = newheaders
+
+ def __contains__(self, name):
+ return name.lower() in [k.lower() for k, v in self._headers]
+
+ def __iter__(self):
+ for field, value in self._headers:
+ yield field
+
+ def keys(self):
+ """Return a list of all the message's header field names.
+
+ These will be sorted in the order they appeared in the original
+ message, or were added to the message, and may contain duplicates.
+ Any fields deleted and re-inserted are always appended to the header
+ list.
+ """
+ return [k for k, v in self._headers]
+
+ def values(self):
+ """Return a list of all the message's header values.
+
+ These will be sorted in the order they appeared in the original
+ message, or were added to the message, and may contain duplicates.
+ Any fields deleted and re-inserted are always appended to the header
+ list.
+ """
+ return [self.policy.header_fetch_parse(k, v)
+ for k, v in self._headers]
+
+ def items(self):
+ """Get all the message's header fields and values.
+
+ These will be sorted in the order they appeared in the original
+ message, or were added to the message, and may contain duplicates.
+ Any fields deleted and re-inserted are always appended to the header
+ list.
+ """
+ return [(k, self.policy.header_fetch_parse(k, v))
+ for k, v in self._headers]
+
+ def get(self, name, failobj=None):
+ """Get a header value.
+
+ Like __getitem__() but return failobj instead of None when the field
+ is missing.
+ """
+ name = name.lower()
+ for k, v in self._headers:
+ if k.lower() == name:
+ return self.policy.header_fetch_parse(k, v)
+ return failobj
+
+ #
+ # "Internal" methods (public API, but only intended for use by a parser
+ # or generator, not normal application code.
+ #
+
+ def set_raw(self, name, value):
+ """Store name and value in the model without modification.
+
+ This is an "internal" API, intended only for use by a parser.
+ """
+ self._headers.append((name, value))
+
+ def raw_items(self):
+ """Return the (name, value) header pairs without modification.
+
+ This is an "internal" API, intended only for use by a generator.
+ """
+ return iter(self._headers.copy())
+
+ #
+ # Additional useful stuff
+ #
+
+ def get_all(self, name, failobj=None):
+ """Return a list of all the values for the named field.
+
+ These will be sorted in the order they appeared in the original
+ message, and may contain duplicates. Any fields deleted and
+ re-inserted are always appended to the header list.
+
+ If no such fields exist, failobj is returned (defaults to None).
+ """
+ values = []
+ name = name.lower()
+ for k, v in self._headers:
+ if k.lower() == name:
+ values.append(self.policy.header_fetch_parse(k, v))
+ if not values:
+ return failobj
+ return values
+
+ def add_header(self, _name, _value, **_params):
+ """Extended header setting.
+
+ name is the header field to add. keyword arguments can be used to set
+ additional parameters for the header field, with underscores converted
+ to dashes. Normally the parameter will be added as key="value" unless
+ value is None, in which case only the key will be added. If a
+ parameter value contains non-ASCII characters it can be specified as a
+ three-tuple of (charset, language, value), in which case it will be
+ encoded according to RFC2231 rules. Otherwise it will be encoded using
+ the utf-8 charset and a language of ''.
+
+ Examples:
+
+ msg.add_header('content-disposition', 'attachment', filename='bud.gif')
+ msg.add_header('content-disposition', 'attachment',
+ filename=('utf-8', '', Fußballer.ppt'))
+ msg.add_header('content-disposition', 'attachment',
+ filename='Fußballer.ppt'))
+ """
+ parts = []
+ for k, v in _params.items():
+ if v is None:
+ parts.append(k.replace('_', '-'))
+ else:
+ parts.append(_formatparam(k.replace('_', '-'), v))
+ if _value is not None:
+ parts.insert(0, _value)
+ self[_name] = SEMISPACE.join(parts)
+
+ def replace_header(self, _name, _value):
+ """Replace a header.
+
+ Replace the first matching header found in the message, retaining
+ header order and case. If no matching header was found, a KeyError is
+ raised.
+ """
+ _name = _name.lower()
+ for i, (k, v) in zip(range(len(self._headers)), self._headers):
+ if k.lower() == _name:
+ self._headers[i] = self.policy.header_store_parse(k, _value)
+ break
+ else:
+ raise KeyError(_name)
+
+ #
+ # Use these three methods instead of the three above.
+ #
+
+ def get_content_type(self):
+ """Return the message's content type.
+
+ The returned string is coerced to lower case of the form
+ `maintype/subtype'. If there was no Content-Type header in the
+ message, the default type as given by get_default_type() will be
+ returned. Since according to RFC 2045, messages always have a default
+ type this will always return a value.
+
+ RFC 2045 defines a message's default type to be text/plain unless it
+ appears inside a multipart/digest container, in which case it would be
+ message/rfc822.
+ """
+ missing = object()
+ value = self.get('content-type', missing)
+ if value is missing:
+ # This should have no parameters
+ return self.get_default_type()
+ ctype = _splitparam(value)[0].lower()
+ # RFC 2045, section 5.2 says if its invalid, use text/plain
+ if ctype.count('/') != 1:
+ return 'text/plain'
+ return ctype
+
+ def get_content_maintype(self):
+ """Return the message's main content type.
+
+ This is the `maintype' part of the string returned by
+ get_content_type().
+ """
+ ctype = self.get_content_type()
+ return ctype.split('/')[0]
+
+ def get_content_subtype(self):
+ """Returns the message's sub-content type.
+
+ This is the `subtype' part of the string returned by
+ get_content_type().
+ """
+ ctype = self.get_content_type()
+ return ctype.split('/')[1]
+
+ def get_default_type(self):
+ """Return the `default' content type.
+
+ Most messages have a default content type of text/plain, except for
+ messages that are subparts of multipart/digest containers. Such
+ subparts have a default content type of message/rfc822.
+ """
+ return self._default_type
+
+ def set_default_type(self, ctype):
+ """Set the `default' content type.
+
+ ctype should be either "text/plain" or "message/rfc822", although this
+ is not enforced. The default content type is not stored in the
+ Content-Type header.
+ """
+ self._default_type = ctype
+
+ def _get_params_preserve(self, failobj, header):
+ # Like get_params() but preserves the quoting of values. BAW:
+ # should this be part of the public interface?
+ missing = object()
+ value = self.get(header, missing)
+ if value is missing:
+ return failobj
+ params = []
+ for p in _parseparam(value):
+ try:
+ name, val = p.split('=', 1)
+ name = name.strip()
+ val = val.strip()
+ except ValueError:
+ # Must have been a bare attribute
+ name = p.strip()
+ val = ''
+ params.append((name, val))
+ params = utils.decode_params(params)
+ return params
+
+ def get_params(self, failobj=None, header='content-type', unquote=True):
+ """Return the message's Content-Type parameters, as a list.
+
+ The elements of the returned list are 2-tuples of key/value pairs, as
+ split on the `=' sign. The left hand side of the `=' is the key,
+ while the right hand side is the value. If there is no `=' sign in
+ the parameter the value is the empty string. The value is as
+ described in the get_param() method.
+
+ Optional failobj is the object to return if there is no Content-Type
+ header. Optional header is the header to search instead of
+ Content-Type. If unquote is True, the value is unquoted.
+ """
+ missing = object()
+ params = self._get_params_preserve(missing, header)
+ if params is missing:
+ return failobj
+ if unquote:
+ return [(k, _unquotevalue(v)) for k, v in params]
+ else:
+ return params
+
+ def get_param(self, param, failobj=None, header='content-type',
+ unquote=True):
+ """Return the parameter value if found in the Content-Type header.
+
+ Optional failobj is the object to return if there is no Content-Type
+ header, or the Content-Type header has no such parameter. Optional
+ header is the header to search instead of Content-Type.
+
+ Parameter keys are always compared case insensitively. The return
+ value can either be a string, or a 3-tuple if the parameter was RFC
+ 2231 encoded. When it's a 3-tuple, the elements of the value are of
+ the form (CHARSET, LANGUAGE, VALUE). Note that both CHARSET and
+ LANGUAGE can be None, in which case you should consider VALUE to be
+ encoded in the us-ascii charset. You can usually ignore LANGUAGE.
+ The parameter value (either the returned string, or the VALUE item in
+ the 3-tuple) is always unquoted, unless unquote is set to False.
+
+ If your application doesn't care whether the parameter was RFC 2231
+ encoded, it can turn the return value into a string as follows:
+
+ rawparam = msg.get_param('foo')
+ param = email.utils.collapse_rfc2231_value(rawparam)
+
+ """
+ if header not in self:
+ return failobj
+ for k, v in self._get_params_preserve(failobj, header):
+ if k.lower() == param.lower():
+ if unquote:
+ return _unquotevalue(v)
+ else:
+ return v
+ return failobj
+
+ def set_param(self, param, value, header='Content-Type', requote=True,
+ charset=None, language='', replace=False):
+ """Set a parameter in the Content-Type header.
+
+ If the parameter already exists in the header, its value will be
+ replaced with the new value.
+
+ If header is Content-Type and has not yet been defined for this
+ message, it will be set to "text/plain" and the new parameter and
+ value will be appended as per RFC 2045.
+
+ An alternate header can be specified in the header argument, and all
+ parameters will be quoted as necessary unless requote is False.
+
+ If charset is specified, the parameter will be encoded according to RFC
+ 2231. Optional language specifies the RFC 2231 language, defaulting
+ to the empty string. Both charset and language should be strings.
+ """
+ if not isinstance(value, tuple) and charset:
+ value = (charset, language, value)
+
+ if header not in self and header.lower() == 'content-type':
+ ctype = 'text/plain'
+ else:
+ ctype = self.get(header)
+ if not self.get_param(param, header=header):
+ if not ctype:
+ ctype = _formatparam(param, value, requote)
+ else:
+ ctype = SEMISPACE.join(
+ [ctype, _formatparam(param, value, requote)])
+ else:
+ ctype = ''
+ for old_param, old_value in self.get_params(header=header,
+ unquote=requote):
+ append_param = ''
+ if old_param.lower() == param.lower():
+ append_param = _formatparam(param, value, requote)
+ else:
+ append_param = _formatparam(old_param, old_value, requote)
+ if not ctype:
+ ctype = append_param
+ else:
+ ctype = SEMISPACE.join([ctype, append_param])
+ if ctype != self.get(header):
+ if replace:
+ self.replace_header(header, ctype)
+ else:
+ del self[header]
+ self[header] = ctype
+
+ def del_param(self, param, header='content-type', requote=True):
+ """Remove the given parameter completely from the Content-Type header.
+
+ The header will be re-written in place without the parameter or its
+ value. All values will be quoted as necessary unless requote is
+ False. Optional header specifies an alternative to the Content-Type
+ header.
+ """
+ if header not in self:
+ return
+ new_ctype = ''
+ for p, v in self.get_params(header=header, unquote=requote):
+ if p.lower() != param.lower():
+ if not new_ctype:
+ new_ctype = _formatparam(p, v, requote)
+ else:
+ new_ctype = SEMISPACE.join([new_ctype,
+ _formatparam(p, v, requote)])
+ if new_ctype != self.get(header):
+ del self[header]
+ self[header] = new_ctype
+
+ def set_type(self, type, header='Content-Type', requote=True):
+ """Set the main type and subtype for the Content-Type header.
+
+ type must be a string in the form "maintype/subtype", otherwise a
+ ValueError is raised.
+
+ This method replaces the Content-Type header, keeping all the
+ parameters in place. If requote is False, this leaves the existing
+ header's quoting as is. Otherwise, the parameters will be quoted (the
+ default).
+
+ An alternative header can be specified in the header argument. When
+ the Content-Type header is set, we'll always also add a MIME-Version
+ header.
+ """
+ # BAW: should we be strict?
+ if not type.count('/') == 1:
+ raise ValueError
+ # Set the Content-Type, you get a MIME-Version
+ if header.lower() == 'content-type':
+ del self['mime-version']
+ self['MIME-Version'] = '1.0'
+ if header not in self:
+ self[header] = type
+ return
+ params = self.get_params(header=header, unquote=requote)
+ del self[header]
+ self[header] = type
+ # Skip the first param; it's the old type.
+ for p, v in params[1:]:
+ self.set_param(p, v, header, requote)
+
+ def get_filename(self, failobj=None):
+ """Return the filename associated with the payload if present.
+
+ The filename is extracted from the Content-Disposition header's
+ `filename' parameter, and it is unquoted. If that header is missing
+ the `filename' parameter, this method falls back to looking for the
+ `name' parameter.
+ """
+ missing = object()
+ filename = self.get_param('filename', missing, 'content-disposition')
+ if filename is missing:
+ filename = self.get_param('name', missing, 'content-type')
+ if filename is missing:
+ return failobj
+ return utils.collapse_rfc2231_value(filename).strip()
+
+ def get_boundary(self, failobj=None):
+ """Return the boundary associated with the payload if present.
+
+ The boundary is extracted from the Content-Type header's `boundary'
+ parameter, and it is unquoted.
+ """
+ missing = object()
+ boundary = self.get_param('boundary', missing)
+ if boundary is missing:
+ return failobj
+ # RFC 2046 says that boundaries may begin but not end in w/s
+ return utils.collapse_rfc2231_value(boundary).rstrip()
+
+ def set_boundary(self, boundary):
+ """Set the boundary parameter in Content-Type to 'boundary'.
+
+ This is subtly different than deleting the Content-Type header and
+ adding a new one with a new boundary parameter via add_header(). The
+ main difference is that using the set_boundary() method preserves the
+ order of the Content-Type header in the original message.
+
+ HeaderParseError is raised if the message has no Content-Type header.
+ """
+ missing = object()
+ params = self._get_params_preserve(missing, 'content-type')
+ if params is missing:
+ # There was no Content-Type header, and we don't know what type
+ # to set it to, so raise an exception.
+ raise errors.HeaderParseError('No Content-Type header found')
+ newparams = []
+ foundp = False
+ for pk, pv in params:
+ if pk.lower() == 'boundary':
+ newparams.append(('boundary', '"%s"' % boundary))
+ foundp = True
+ else:
+ newparams.append((pk, pv))
+ if not foundp:
+ # The original Content-Type header had no boundary attribute.
+ # Tack one on the end. BAW: should we raise an exception
+ # instead???
+ newparams.append(('boundary', '"%s"' % boundary))
+ # Replace the existing Content-Type header with the new value
+ newheaders = []
+ for h, v in self._headers:
+ if h.lower() == 'content-type':
+ parts = []
+ for k, v in newparams:
+ if v == '':
+ parts.append(k)
+ else:
+ parts.append('%s=%s' % (k, v))
+ val = SEMISPACE.join(parts)
+ newheaders.append(self.policy.header_store_parse(h, val))
+
+ else:
+ newheaders.append((h, v))
+ self._headers = newheaders
+
+ def get_content_charset(self, failobj=None):
+ """Return the charset parameter of the Content-Type header.
+
+ The returned string is always coerced to lower case. If there is no
+ Content-Type header, or if that header has no charset parameter,
+ failobj is returned.
+ """
+ missing = object()
+ charset = self.get_param('charset', missing)
+ if charset is missing:
+ return failobj
+ if isinstance(charset, tuple):
+ # RFC 2231 encoded, so decode it, and it better end up as ascii.
+ pcharset = charset[0] or 'us-ascii'
+ try:
+ # LookupError will be raised if the charset isn't known to
+ # Python. UnicodeError will be raised if the encoded text
+ # contains a character not in the charset.
+ as_bytes = charset[2].encode('raw-unicode-escape')
+ charset = str(as_bytes, pcharset)
+ except (LookupError, UnicodeError):
+ charset = charset[2]
+ # charset characters must be in us-ascii range
+ try:
+ charset.encode('us-ascii')
+ except UnicodeError:
+ return failobj
+ # RFC 2046, $4.1.2 says charsets are not case sensitive
+ return charset.lower()
+
+ def get_charsets(self, failobj=None):
+ """Return a list containing the charset(s) used in this message.
+
+ The returned list of items describes the Content-Type headers'
+ charset parameter for this message and all the subparts in its
+ payload.
+
+ Each item will either be a string (the value of the charset parameter
+ in the Content-Type header of that part) or the value of the
+ 'failobj' parameter (defaults to None), if the part does not have a
+ main MIME type of "text", or the charset is not defined.
+
+ The list will contain one string for each part of the message, plus
+ one for the container message (i.e. self), so that a non-multipart
+ message will still return a list of length 1.
+ """
+ return [part.get_content_charset(failobj) for part in self.walk()]
+
+ def get_content_disposition(self):
+ """Return the message's content-disposition if it exists, or None.
+
+ The return values can be either 'inline', 'attachment' or None
+ according to the rfc2183.
+ """
+ value = self.get('content-disposition')
+ if value is None:
+ return None
+ c_d = _splitparam(value)[0].lower()
+ return c_d
+
+ # I.e. def walk(self): ...
+ from email.iterators import walk
+
+
+class MIMEPart(Message):
+
+ def __init__(self, policy=None):
+ if policy is None:
+ from email.policy import default
+ policy = default
+ Message.__init__(self, policy)
+
+
+ def as_string(self, unixfrom=False, maxheaderlen=None, policy=None):
+ """Return the entire formatted message as a string.
+
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. maxheaderlen is retained for backward compatibility with the
+ base Message class, but defaults to None, meaning that the policy value
+ for max_line_length controls the header maximum length. 'policy' is
+ passed to the Generator instance used to serialize the mesasge; if it
+ is not specified the policy associated with the message instance is
+ used.
+ """
+ policy = self.policy if policy is None else policy
+ if maxheaderlen is None:
+ maxheaderlen = policy.max_line_length
+ return super().as_string(maxheaderlen=maxheaderlen, policy=policy)
+
+ def __str__(self):
+ return self.as_string(policy=self.policy.clone(utf8=True))
+
+ def is_attachment(self):
+ c_d = self.get('content-disposition')
+ return False if c_d is None else c_d.content_disposition == 'attachment'
+
+ def _find_body(self, part, preferencelist):
+ if part.is_attachment():
+ return
+ maintype, subtype = part.get_content_type().split('/')
+ if maintype == 'text':
+ if subtype in preferencelist:
+ yield (preferencelist.index(subtype), part)
+ return
+ if maintype != 'multipart':
+ return
+ if subtype != 'related':
+ for subpart in part.iter_parts():
+ yield from self._find_body(subpart, preferencelist)
+ return
+ if 'related' in preferencelist:
+ yield (preferencelist.index('related'), part)
+ candidate = None
+ start = part.get_param('start')
+ if start:
+ for subpart in part.iter_parts():
+ if subpart['content-id'] == start:
+ candidate = subpart
+ break
+ if candidate is None:
+ subparts = part.get_payload()
+ candidate = subparts[0] if subparts else None
+ if candidate is not None:
+ yield from self._find_body(candidate, preferencelist)
+
+ def get_body(self, preferencelist=('related', 'html', 'plain')):
+ """Return best candidate mime part for display as 'body' of message.
+
+ Do a depth first search, starting with self, looking for the first part
+ matching each of the items in preferencelist, and return the part
+ corresponding to the first item that has a match, or None if no items
+ have a match. If 'related' is not included in preferencelist, consider
+ the root part of any multipart/related encountered as a candidate
+ match. Ignore parts with 'Content-Disposition: attachment'.
+ """
+ best_prio = len(preferencelist)
+ body = None
+ for prio, part in self._find_body(self, preferencelist):
+ if prio < best_prio:
+ best_prio = prio
+ body = part
+ if prio == 0:
+ break
+ return body
+
+ _body_types = {('text', 'plain'),
+ ('text', 'html'),
+ ('multipart', 'related'),
+ ('multipart', 'alternative')}
+ def iter_attachments(self):
+ """Return an iterator over the non-main parts of a multipart.
+
+ Skip the first of each occurrence of text/plain, text/html,
+ multipart/related, or multipart/alternative in the multipart (unless
+ they have a 'Content-Disposition: attachment' header) and include all
+ remaining subparts in the returned iterator. When applied to a
+ multipart/related, return all parts except the root part. Return an
+ empty iterator when applied to a multipart/alternative or a
+ non-multipart.
+ """
+ maintype, subtype = self.get_content_type().split('/')
+ if maintype != 'multipart' or subtype == 'alternative':
+ return
+ parts = self.get_payload().copy()
+ if maintype == 'multipart' and subtype == 'related':
+ # For related, we treat everything but the root as an attachment.
+ # The root may be indicated by 'start'; if there's no start or we
+ # can't find the named start, treat the first subpart as the root.
+ start = self.get_param('start')
+ if start:
+ found = False
+ attachments = []
+ for part in parts:
+ if part.get('content-id') == start:
+ found = True
+ else:
+ attachments.append(part)
+ if found:
+ yield from attachments
+ return
+ parts.pop(0)
+ yield from parts
+ return
+ # Otherwise we more or less invert the remaining logic in get_body.
+ # This only really works in edge cases (ex: non-text related or
+ # alternatives) if the sending agent sets content-disposition.
+ seen = [] # Only skip the first example of each candidate type.
+ for part in parts:
+ maintype, subtype = part.get_content_type().split('/')
+ if ((maintype, subtype) in self._body_types and
+ not part.is_attachment() and subtype not in seen):
+ seen.append(subtype)
+ continue
+ yield part
+
+ def iter_parts(self):
+ """Return an iterator over all immediate subparts of a multipart.
+
+ Return an empty iterator for a non-multipart.
+ """
+ if self.get_content_maintype() == 'multipart':
+ yield from self.get_payload()
+
+ def get_content(self, *args, content_manager=None, **kw):
+ if content_manager is None:
+ content_manager = self.policy.content_manager
+ return content_manager.get_content(self, *args, **kw)
+
+ def set_content(self, *args, content_manager=None, **kw):
+ if content_manager is None:
+ content_manager = self.policy.content_manager
+ content_manager.set_content(self, *args, **kw)
+
+ def _make_multipart(self, subtype, disallowed_subtypes, boundary):
+ if self.get_content_maintype() == 'multipart':
+ existing_subtype = self.get_content_subtype()
+ disallowed_subtypes = disallowed_subtypes + (subtype,)
+ if existing_subtype in disallowed_subtypes:
+ raise ValueError("Cannot convert {} to {}".format(
+ existing_subtype, subtype))
+ keep_headers = []
+ part_headers = []
+ for name, value in self._headers:
+ if name.lower().startswith('content-'):
+ part_headers.append((name, value))
+ else:
+ keep_headers.append((name, value))
+ if part_headers:
+ # There is existing content, move it to the first subpart.
+ part = type(self)(policy=self.policy)
+ part._headers = part_headers
+ part._payload = self._payload
+ self._payload = [part]
+ else:
+ self._payload = []
+ self._headers = keep_headers
+ self['Content-Type'] = 'multipart/' + subtype
+ if boundary is not None:
+ self.set_param('boundary', boundary)
+
+ def make_related(self, boundary=None):
+ self._make_multipart('related', ('alternative', 'mixed'), boundary)
+
+ def make_alternative(self, boundary=None):
+ self._make_multipart('alternative', ('mixed',), boundary)
+
+ def make_mixed(self, boundary=None):
+ self._make_multipart('mixed', (), boundary)
+
+ def _add_multipart(self, _subtype, *args, _disp=None, **kw):
+ if (self.get_content_maintype() != 'multipart' or
+ self.get_content_subtype() != _subtype):
+ getattr(self, 'make_' + _subtype)()
+ part = type(self)(policy=self.policy)
+ part.set_content(*args, **kw)
+ if _disp and 'content-disposition' not in part:
+ part['Content-Disposition'] = _disp
+ self.attach(part)
+
+ def add_related(self, *args, **kw):
+ self._add_multipart('related', *args, _disp='inline', **kw)
+
+ def add_alternative(self, *args, **kw):
+ self._add_multipart('alternative', *args, **kw)
+
+ def add_attachment(self, *args, **kw):
+ self._add_multipart('mixed', *args, _disp='attachment', **kw)
+
+ def clear(self):
+ self._headers = []
+ self._payload = None
+
+ def clear_content(self):
+ self._headers = [(n, v) for n, v in self._headers
+ if not n.lower().startswith('content-')]
+ self._payload = None
+
+
+class EmailMessage(MIMEPart):
+
+ def set_content(self, *args, **kw):
+ super().set_content(*args, **kw)
+ if 'MIME-Version' not in self:
+ self['MIME-Version'] = '1.0'
diff --git a/modules/language/python/module/email/parser.py b/modules/language/python/module/email/parser.py
index 5ec164e..962f9dc 100644
--- a/modules/language/python/module/email/parser.py
+++ b/modules/language/python/module/email/parser.py
@@ -9,7 +9,6 @@ __all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser',
'FeedParser', 'BytesFeedParser']
from io import StringIO, TextIOWrapper
-
from email.feedparser import FeedParser, BytesFeedParser
from email._policybase import compat32
diff --git a/modules/language/python/module/email/policy.py b/modules/language/python/module/email/policy.py
new file mode 100644
index 0000000..3f8551c
--- /dev/null
+++ b/modules/language/python/module/email/policy.py
@@ -0,0 +1,231 @@
+module(email,policy)
+"""This will be the home for the policy that hooks in the new
+code that adds all the email6 features.
+"""
+
+import re
+from email._policybase import Policy, Compat32, compat32, _extend_docstrings
+from email.utils import _has_surrogates
+from email.headerregistry import HeaderRegistry as HeaderRegistry
+from email.contentmanager import raw_data_manager
+from email.message import EmailMessage
+
+__all__ = [
+ 'Compat32',
+ 'compat32',
+ 'Policy',
+ 'EmailPolicy',
+ 'default',
+ 'strict',
+ 'SMTP',
+ 'HTTP',
+ ]
+
+linesep_splitter = re.compile(r'\n|\r')
+
+@_extend_docstrings
+class EmailPolicy(Policy):
+
+ """+
+ PROVISIONAL
+
+ The API extensions enabled by this policy are currently provisional.
+ Refer to the documentation for details.
+
+ This policy adds new header parsing and folding algorithms. Instead of
+ simple strings, headers are custom objects with custom attributes
+ depending on the type of the field. The folding algorithm fully
+ implements RFCs 2047 and 5322.
+
+ In addition to the settable attributes listed above that apply to
+ all Policies, this policy adds the following additional attributes:
+
+ utf8 -- if False (the default) message headers will be
+ serialized as ASCII, using encoded words to encode
+ any non-ASCII characters in the source strings. If
+ True, the message headers will be serialized using
+ utf8 and will not contain encoded words (see RFC
+ 6532 for more on this serialization format).
+
+ refold_source -- if the value for a header in the Message object
+ came from the parsing of some source, this attribute
+ indicates whether or not a generator should refold
+ that value when transforming the message back into
+ stream form. The possible values are:
+
+ none -- all source values use original folding
+ long -- source values that have any line that is
+ longer than max_line_length will be
+ refolded
+ all -- all values are refolded.
+
+ The default is 'long'.
+
+ header_factory -- a callable that takes two arguments, 'name' and
+ 'value', where 'name' is a header field name and
+ 'value' is an unfolded header field value, and
+ returns a string-like object that represents that
+ header. A default header_factory is provided that
+ understands some of the RFC5322 header field types.
+ (Currently address fields and date fields have
+ special treatment, while all other fields are
+ treated as unstructured. This list will be
+ completed before the extension is marked stable.)
+
+ content_manager -- an object with at least two methods: get_content
+ and set_content. When the get_content or
+ set_content method of a Message object is called,
+ it calls the corresponding method of this object,
+ passing it the message object as its first argument,
+ and any arguments or keywords that were passed to
+ it as additional arguments. The default
+ content_manager is
+ :data:`~email.contentmanager.raw_data_manager`.
+
+ """
+
+ message_factory = EmailMessage
+ utf8 = False
+ refold_source = 'long'
+ header_factory = HeaderRegistry()
+ content_manager = raw_data_manager
+
+ def __init__(self, **kw):
+ # Ensure that each new instance gets a unique header factory
+ # (as opposed to clones, which share the factory).
+
+ if 'header_factory' not in kw:
+ object.__setattr__(self, 'header_factory', HeaderRegistry())
+
+ super().__init__(**kw)
+
+
+ def header_max_count(self, name):
+ """+
+ The implementation for this class returns the max_count attribute from
+ the specialized header class that would be used to construct a header
+ of type 'name'.
+ """
+ return self.header_factory[name].max_count
+
+ # The logic of the next three methods is chosen such that it is possible to
+ # switch a Message object between a Compat32 policy and a policy derived
+ # from this class and have the results stay consistent. This allows a
+ # Message object constructed with this policy to be passed to a library
+ # that only handles Compat32 objects, or to receive such an object and
+ # convert it to use the newer style by just changing its policy. It is
+ # also chosen because it postpones the relatively expensive full rfc5322
+ # parse until as late as possible when parsing from source, since in many
+ # applications only a few headers will actually be inspected.
+
+ def header_source_parse(self, sourcelines):
+ """+
+ The name is parsed as everything up to the ':' and returned unmodified.
+ The value is determined by stripping leading whitespace off the
+ remainder of the first line, joining all subsequent lines together, and
+ stripping any trailing carriage return or linefeed characters. (This
+ is the same as Compat32).
+
+ """
+ name, value = sourcelines[0].split(':', 1)
+ value = value.lstrip(' \t') + ''.join(sourcelines[1:])
+ return (name, value.rstrip('\r\n'))
+
+ def header_store_parse(self, name, value):
+ """+
+ The name is returned unchanged. If the input value has a 'name'
+ attribute and it matches the name ignoring case, the value is returned
+ unchanged. Otherwise the name and value are passed to header_factory
+ method, and the resulting custom header object is returned as the
+ value. In this case a ValueError is raised if the input value contains
+ CR or LF characters.
+
+ """
+ if hasattr(value, 'name') and value.name.lower() == name.lower():
+ return (name, value)
+ if isinstance(value, str) and len(value.splitlines())>1:
+ # XXX this error message isn't quite right when we use splitlines
+ # (see issue 22233), but I'm not sure what should happen here.
+ raise ValueError("Header values may not contain linefeed "
+ "or carriage return characters")
+ return (name, self.header_factory(name, value))
+
+ def header_fetch_parse(self, name, value):
+ """+
+ If the value has a 'name' attribute, it is returned to unmodified.
+ Otherwise the name and the value with any linesep characters removed
+ are passed to the header_factory method, and the resulting custom
+ header object is returned. Any surrogateescaped bytes get turned
+ into the unicode unknown-character glyph.
+
+ """
+ if hasattr(value, 'name'):
+ return value
+ # We can't use splitlines here because it splits on more than \r and \n.
+ value = ''.join(linesep_splitter.split(value))
+ return self.header_factory(name, value)
+
+ def fold(self, name, value):
+ """+
+ Header folding is controlled by the refold_source policy setting. A
+ value is considered to be a 'source value' if and only if it does not
+ have a 'name' attribute (having a 'name' attribute means it is a header
+ object of some sort). If a source value needs to be refolded according
+ to the policy, it is converted into a custom header object by passing
+ the name and the value with any linesep characters removed to the
+ header_factory method. Folding of a custom header object is done by
+ calling its fold method with the current policy.
+
+ Source values are split into lines using splitlines. If the value is
+ not to be refolded, the lines are rejoined using the linesep from the
+ policy and returned. The exception is lines containing non-ascii
+ binary data. In that case the value is refolded regardless of the
+ refold_source setting, which causes the binary data to be CTE encoded
+ using the unknown-8bit charset.
+
+ """
+ return self._fold(name, value, refold_binary=True)
+
+ def fold_binary(self, name, value):
+ """+
+ The same as fold if cte_type is 7bit, except that the returned value is
+ bytes.
+
+ If cte_type is 8bit, non-ASCII binary data is converted back into
+ bytes. Headers with binary data are not refolded, regardless of the
+ refold_header setting, since there is no way to know whether the binary
+ data consists of single byte characters or multibyte characters.
+
+ If utf8 is true, headers are encoded to utf8, otherwise to ascii with
+ non-ASCII unicode rendered as encoded words.
+
+ """
+ folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
+ charset = 'utf8' if self.utf8 else 'ascii'
+ return folded.encode(charset, 'surrogateescape')
+
+ def _fold(self, name, value, refold_binary=False):
+ if hasattr(value, 'name'):
+ return value.fold(policy=self)
+ maxlen = self.max_line_length if self.max_line_length else float('inf')
+ lines = value.splitlines()
+ refold = (self.refold_source == 'all' or
+ self.refold_source == 'long' and
+ (lines and len(lines[0])+len(name)+2 > maxlen or
+ any(len(x) > maxlen for x in lines[1:])))
+ if refold or refold_binary and _has_surrogates(value):
+ return self.header_factory(name, ''.join(lines)).fold(policy=self)
+ return name + ': ' + self.linesep.join(lines) + self.linesep
+
+
+default = EmailPolicy()
+
+# Make the default policy use the class default header_factory
+del default.header_factory
+strict = default.clone(raise_on_defect=True)
+
+SMTP = default.clone(linesep='\r\n')
+
+HTTP = default.clone(linesep='\r\n', max_line_length=None)
+
+SMTPUTF8 = SMTP.clone(utf8=True)
diff --git a/modules/language/python/module/email/utils.py b/modules/language/python/module/email/utils.py
index abe3895..861f1ad 100644
--- a/modules/language/python/module/email/utils.py
+++ b/modules/language/python/module/email/utils.py
@@ -24,6 +24,7 @@ __all__ = [
'_has_surrogates'
]
+
import os
import re
import time