diff options
Diffstat (limited to 'modules/language/python/module/mimetypes.py')
-rw-r--r-- | modules/language/python/module/mimetypes.py | 553 |
1 files changed, 553 insertions, 0 deletions
diff --git a/modules/language/python/module/mimetypes.py b/modules/language/python/module/mimetypes.py new file mode 100644 index 0000000..76a5e87 --- /dev/null +++ b/modules/language/python/module/mimetypes.py @@ -0,0 +1,553 @@ +module(mimetypes) +"""Guess the MIME type of a file. + +This module defines two useful functions: + +guess_type(url, strict=True) -- guess the MIME type and encoding of a URL. + +guess_extension(type, strict=True) -- guess the extension for a given MIME type. + +It also contains the following, for tuning the behavior: + +Data: + +knownfiles -- list of files to parse +inited -- flag set when init() has been called +suffix_map -- dictionary mapping suffixes to suffixes +encodings_map -- dictionary mapping suffixes to encodings +types_map -- dictionary mapping suffixes to types + +Functions: + +init([files]) -- parse a list of files, default knownfiles (on Windows, the + default values are taken from the registry) +read_mime_types(file) -- parse one file, return a dictionary or None +""" + +import os +import sys +import posixpath +import urllib.parse +try: + import winreg as _winreg +except ImportError: + _winreg = None + +__all__ = [ + "knownfiles", "inited", "MimeTypes", + "guess_type", "guess_all_extensions", "guess_extension", + "add_type", "init", "read_mime_types", + "suffix_map", "encodings_map", "types_map", "common_types" +] + +knownfiles = [ + "/etc/mime.types", + "/etc/httpd/mime.types", # Mac OS X + "/etc/httpd/conf/mime.types", # Apache + "/etc/apache/mime.types", # Apache 1 + "/etc/apache2/mime.types", # Apache 2 + "/usr/local/etc/httpd/conf/mime.types", + "/usr/local/lib/netscape/mime.types", + "/usr/local/etc/httpd/conf/mime.types", # Apache 1.2 + "/usr/local/etc/mime.types", # Apache 1.3 + ] + +inited = False +_db = None + + +class MimeTypes: + """MIME-types datastore. + + This datastore can handle information from mime.types-style files + and supports basic determination of MIME type from a filename or + URL, and can guess a reasonable extension given a MIME type. + """ + + def __init__(self, filenames=(), strict=True): + if not inited: + init() + self.encodings_map = encodings_map.copy() + self.suffix_map = suffix_map.copy() + self.types_map = ({}, {}) # dict for (non-strict, strict) + self.types_map_inv = ({}, {}) + for (ext, type) in types_map.items(): + self.add_type(type, ext, True) + for (ext, type) in common_types.items(): + self.add_type(type, ext, False) + for name in filenames: + self.read(name, strict) + + def add_type(self, type, ext, strict=True): + """Add a mapping between a type and an extension. + + When the extension is already known, the new + type will replace the old one. When the type + is already known the extension will be added + to the list of known extensions. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + self.types_map[strict][ext] = type + exts = self.types_map_inv[strict].setdefault(type, []) + if ext not in exts: + exts.append(ext) + + def guess_type(self, url, strict=True): + """Guess the type of a file based on its URL. + + Return value is a tuple (type, encoding) where type is None if + the type can't be guessed (no or unknown suffix) or a string + of the form type/subtype, usable for a MIME Content-type + header; and encoding is None for no encoding or the name of + the program used to encode (e.g. compress or gzip). The + mappings are table driven. Encoding suffixes are case + sensitive; type suffixes are first tried case sensitive, then + case insensitive. + + The suffixes .tgz, .taz and .tz (case sensitive!) are all + mapped to '.tar.gz'. (This is table-driven too, using the + dictionary suffix_map.) + + Optional `strict' argument when False adds a bunch of commonly found, + but non-standard types. + """ + scheme, url = urllib.parse.splittype(url) + if scheme == 'data': + # syntax of data URLs: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + # type/subtype defaults to "text/plain" + comma = url.find(',') + if comma < 0: + # bad data URL + return None, None + semi = url.find(';', 0, comma) + if semi >= 0: + type = url[:semi] + else: + type = url[:comma] + if '=' in type or '/' not in type: + type = 'text/plain' + return type, None # never compressed, so encoding is None + base, ext = posixpath.splitext(url) + while ext in self.suffix_map: + base, ext = posixpath.splitext(base + self.suffix_map[ext]) + if ext in self.encodings_map: + encoding = self.encodings_map[ext] + base, ext = posixpath.splitext(base) + else: + encoding = None + types_map = self.types_map[True] + if ext in types_map: + return types_map[ext], encoding + elif ext.lower() in types_map: + return types_map[ext.lower()], encoding + elif strict: + return None, encoding + types_map = self.types_map[False] + if ext in types_map: + return types_map[ext], encoding + elif ext.lower() in types_map: + return types_map[ext.lower()], encoding + else: + return None, encoding + + def guess_all_extensions(self, type, strict=True): + """Guess the extensions for a file based on its MIME type. + + Return value is a list of strings giving the possible filename + extensions, including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data stream, + but would be mapped to the MIME type `type' by guess_type(). + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + type = type.lower() + extensions = self.types_map_inv[True].get(type, []) + if not strict: + for ext in self.types_map_inv[False].get(type, []): + if ext not in extensions: + extensions.append(ext) + return extensions + + def guess_extension(self, type, strict=True): + """Guess the extension for a file based on its MIME type. + + Return value is a string giving a filename extension, + including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data + stream, but would be mapped to the MIME type `type' by + guess_type(). If no extension can be guessed for `type', None + is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + extensions = self.guess_all_extensions(type, strict) + if not extensions: + return None + return extensions[0] + + def read(self, filename, strict=True): + """ + Read a single mime.types-format file, specified by pathname. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + with open(filename, encoding='utf-8') as fp: + self.readfp(fp, strict) + + def readfp(self, fp, strict=True): + """ + Read a single mime.types-format file. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + while 1: + line = fp.readline() + if not line: + break + words = line.split() + for i in range(len(words)): + if words[i][0] == '#': + del words[i:] + break + if not words: + continue + type, suffixes = words[0], words[1:] + for suff in suffixes: + self.add_type(type, '.' + suff, strict) + + def read_windows_registry(self, strict=True): + """ + Load the MIME types database from Windows registry. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + + # Windows only + if not _winreg: + return + + def enum_types(mimedb): + i = 0 + while True: + try: + ctype = _winreg.EnumKey(mimedb, i) + except EnvironmentError: + break + else: + if '\0' not in ctype: + yield ctype + i += 1 + + with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr: + for subkeyname in enum_types(hkcr): + try: + with _winreg.OpenKey(hkcr, subkeyname) as subkey: + # Only check file extensions + if not subkeyname.startswith("."): + continue + # raises EnvironmentError if no 'Content Type' value + mimetype, datatype = _winreg.QueryValueEx( + subkey, 'Content Type') + if datatype != _winreg.REG_SZ: + continue + self.add_type(mimetype, subkeyname, strict) + except EnvironmentError: + continue + +def guess_type(url, strict=True): + """Guess the type of a file based on its URL. + + Return value is a tuple (type, encoding) where type is None if the + type can't be guessed (no or unknown suffix) or a string of the + form type/subtype, usable for a MIME Content-type header; and + encoding is None for no encoding or the name of the program used + to encode (e.g. compress or gzip). The mappings are table + driven. Encoding suffixes are case sensitive; type suffixes are + first tried case sensitive, then case insensitive. + + The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped + to ".tar.gz". (This is table-driven too, using the dictionary + suffix_map). + + Optional `strict' argument when false adds a bunch of commonly found, but + non-standard types. + """ + if _db is None: + init() + return _db.guess_type(url, strict) + + +def guess_all_extensions(type, strict=True): + """Guess the extensions for a file based on its MIME type. + + Return value is a list of strings giving the possible filename + extensions, including the leading dot ('.'). The extension is not + guaranteed to have been associated with any particular data + stream, but would be mapped to the MIME type `type' by + guess_type(). If no extension can be guessed for `type', None + is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + if _db is None: + init() + return _db.guess_all_extensions(type, strict) + +def guess_extension(type, strict=True): + """Guess the extension for a file based on its MIME type. + + Return value is a string giving a filename extension, including the + leading dot ('.'). The extension is not guaranteed to have been + associated with any particular data stream, but would be mapped to the + MIME type `type' by guess_type(). If no extension can be guessed for + `type', None is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. + """ + if _db is None: + init() + return _db.guess_extension(type, strict) + +def add_type(type, ext, strict=True): + """Add a mapping between a type and an extension. + + When the extension is already known, the new + type will replace the old one. When the type + is already known the extension will be added + to the list of known extensions. + + If strict is true, information will be added to + list of standard types, else to the list of non-standard + types. + """ + if _db is None: + init() + return _db.add_type(type, ext, strict) + + +def init(files=None): + global suffix_map, types_map, encodings_map, common_types + global inited, _db + inited = True # so that MimeTypes.__init__() doesn't call us again + db = MimeTypes() + if files is None: + if _winreg: + db.read_windows_registry() + files = knownfiles + for file in files: + if os.path.isfile(file): + db.read(file) + encodings_map = db.encodings_map + suffix_map = db.suffix_map + types_map = db.types_map[True] + common_types = db.types_map[False] + # Make the DB a global variable now that it is fully initialized + _db = db + + +def read_mime_types(file): + try: + f = open(file) + except OSError: + return None + with f: + db = MimeTypes() + db.readfp(f, True) + return db.types_map[True] + + +def _default_mime_types(): + global suffix_map + global encodings_map + global types_map + global common_types + + suffix_map = { + '.svgz': '.svg.gz', + '.tgz': '.tar.gz', + '.taz': '.tar.gz', + '.tz': '.tar.gz', + '.tbz2': '.tar.bz2', + '.txz': '.tar.xz', + } + + encodings_map = { + '.gz': 'gzip', + '.Z': 'compress', + '.bz2': 'bzip2', + '.xz': 'xz', + } + + # Before adding new types, make sure they are either registered with IANA, + # at http://www.iana.org/assignments/media-types + # or extensions, i.e. using the x- prefix + + # If you add to these, please keep them sorted! + types_map = { + '.a' : 'application/octet-stream', + '.ai' : 'application/postscript', + '.aif' : 'audio/x-aiff', + '.aifc' : 'audio/x-aiff', + '.aiff' : 'audio/x-aiff', + '.au' : 'audio/basic', + '.avi' : 'video/x-msvideo', + '.bat' : 'text/plain', + '.bcpio' : 'application/x-bcpio', + '.bin' : 'application/octet-stream', + '.bmp' : 'image/x-ms-bmp', + '.c' : 'text/plain', + # Duplicates :( + '.cdf' : 'application/x-cdf', + '.cdf' : 'application/x-netcdf', + '.cpio' : 'application/x-cpio', + '.csh' : 'application/x-csh', + '.css' : 'text/css', + '.csv' : 'text/csv', + '.dll' : 'application/octet-stream', + '.doc' : 'application/msword', + '.dot' : 'application/msword', + '.dvi' : 'application/x-dvi', + '.eml' : 'message/rfc822', + '.eps' : 'application/postscript', + '.etx' : 'text/x-setext', + '.exe' : 'application/octet-stream', + '.gif' : 'image/gif', + '.gtar' : 'application/x-gtar', + '.h' : 'text/plain', + '.hdf' : 'application/x-hdf', + '.htm' : 'text/html', + '.html' : 'text/html', + '.ico' : 'image/vnd.microsoft.icon', + '.ief' : 'image/ief', + '.jpe' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.jpg' : 'image/jpeg', + '.js' : 'application/javascript', + '.json' : 'application/json', + '.ksh' : 'text/plain', + '.latex' : 'application/x-latex', + '.m1v' : 'video/mpeg', + '.m3u' : 'application/vnd.apple.mpegurl', + '.m3u8' : 'application/vnd.apple.mpegurl', + '.man' : 'application/x-troff-man', + '.me' : 'application/x-troff-me', + '.mht' : 'message/rfc822', + '.mhtml' : 'message/rfc822', + '.mif' : 'application/x-mif', + '.mov' : 'video/quicktime', + '.movie' : 'video/x-sgi-movie', + '.mp2' : 'audio/mpeg', + '.mp3' : 'audio/mpeg', + '.mp4' : 'video/mp4', + '.mpa' : 'video/mpeg', + '.mpe' : 'video/mpeg', + '.mpeg' : 'video/mpeg', + '.mpg' : 'video/mpeg', + '.ms' : 'application/x-troff-ms', + '.nc' : 'application/x-netcdf', + '.nws' : 'message/rfc822', + '.o' : 'application/octet-stream', + '.obj' : 'application/octet-stream', + '.oda' : 'application/oda', + '.p12' : 'application/x-pkcs12', + '.p7c' : 'application/pkcs7-mime', + '.pbm' : 'image/x-portable-bitmap', + '.pdf' : 'application/pdf', + '.pfx' : 'application/x-pkcs12', + '.pgm' : 'image/x-portable-graymap', + '.pl' : 'text/plain', + '.png' : 'image/png', + '.pnm' : 'image/x-portable-anymap', + '.pot' : 'application/vnd.ms-powerpoint', + '.ppa' : 'application/vnd.ms-powerpoint', + '.ppm' : 'image/x-portable-pixmap', + '.pps' : 'application/vnd.ms-powerpoint', + '.ppt' : 'application/vnd.ms-powerpoint', + '.ps' : 'application/postscript', + '.pwz' : 'application/vnd.ms-powerpoint', + '.py' : 'text/x-python', + '.pyc' : 'application/x-python-code', + '.pyo' : 'application/x-python-code', + '.qt' : 'video/quicktime', + '.ra' : 'audio/x-pn-realaudio', + '.ram' : 'application/x-pn-realaudio', + '.ras' : 'image/x-cmu-raster', + '.rdf' : 'application/xml', + '.rgb' : 'image/x-rgb', + '.roff' : 'application/x-troff', + '.rtx' : 'text/richtext', + '.sgm' : 'text/x-sgml', + '.sgml' : 'text/x-sgml', + '.sh' : 'application/x-sh', + '.shar' : 'application/x-shar', + '.snd' : 'audio/basic', + '.so' : 'application/octet-stream', + '.src' : 'application/x-wais-source', + '.sv4cpio': 'application/x-sv4cpio', + '.sv4crc' : 'application/x-sv4crc', + '.svg' : 'image/svg+xml', + '.swf' : 'application/x-shockwave-flash', + '.t' : 'application/x-troff', + '.tar' : 'application/x-tar', + '.tcl' : 'application/x-tcl', + '.tex' : 'application/x-tex', + '.texi' : 'application/x-texinfo', + '.texinfo': 'application/x-texinfo', + '.tif' : 'image/tiff', + '.tiff' : 'image/tiff', + '.tr' : 'application/x-troff', + '.tsv' : 'text/tab-separated-values', + '.txt' : 'text/plain', + '.ustar' : 'application/x-ustar', + '.vcf' : 'text/x-vcard', + '.wav' : 'audio/x-wav', + '.webm' : 'video/webm', + '.wiz' : 'application/msword', + '.wsdl' : 'application/xml', + '.xbm' : 'image/x-xbitmap', + '.xlb' : 'application/vnd.ms-excel', + # Duplicates :( + '.xls' : 'application/excel', + '.xls' : 'application/vnd.ms-excel', + '.xml' : 'text/xml', + '.xpdl' : 'application/xml', + '.xpm' : 'image/x-xpixmap', + '.xsl' : 'application/xml', + '.xwd' : 'image/x-xwindowdump', + '.zip' : 'application/zip', + } + + # These are non-standard types, commonly found in the wild. They will + # only match if strict=0 flag is given to the API methods. + + # Please sort these too + common_types = { + '.jpg' : 'image/jpg', + '.mid' : 'audio/midi', + '.midi': 'audio/midi', + '.pct' : 'image/pict', + '.pic' : 'image/pict', + '.pict': 'image/pict', + '.rtf' : 'application/rtf', + '.xul' : 'text/xul' + } + + +_default_mime_types() |