author     Stefan Israelsson Tampe <stefan.itampe@gmail.com>  2018-08-22 21:35:43 +0200
committer  Stefan Israelsson Tampe <stefan.itampe@gmail.com>  2018-08-22 21:35:43 +0200
commit     43792510bfeb15e8416a5782ab64126ee8950950 (patch)
tree       7de7499a1a5a3f841d24ad2e0ea50964fa0e9e84
parent     a41eeb67b1aa32199501db6d013e259ccb7484e6 (diff)

csv.py

-rw-r--r--  modules/language/python/module/_csv.py  413
-rw-r--r--  modules/language/python/module/csv.py    449
2 files changed, 862 insertions, 0 deletions
diff --git a/modules/language/python/module/_csv.py b/modules/language/python/module/_csv.py
new file mode 100644
index 0000000..6db2d05
--- /dev/null
+++ b/modules/language/python/module/_csv.py
@@ -0,0 +1,413 @@
+(define-module (language python module _csv)
+ #:use-module (oop pf-objects)
+ #:use-module (language python list)
+ #:use-module (language python def)
+ #:use-module (language python yield)
+ #:use-module (language python for)
+ #:use-module (language python exceptions)
+ #:export (QUOTE_ALL QUOTE_MINIMAL QUOTE_NONNUMERIC QUOTE_NONE
+ reader writer Error field_size_limit
+ get_dialect register_dialect unregister_dialect
+ list_dialects __doc__ Dialect))
+
+(define-python-class Error (Exception))
+
+(define-python-class Dialect ()
+ (define __init__
+ (lambda (self . x)
+ #f)))
+
+(define *field-size* (make-fluid 131072))
+(define field_size_limit
+ (case-lambda
+ (() (fluid-ref *field-size*))
+ ((x) (fluid-set! *field-size* x))))
+
+(define *dialects* (make-hash-table))
+(def (register_dialect nm (= val None) (** keyw))
+ (let ((newval (Dialect)))
+ (define-syntax-rule (set- x y z key default)
+ (set x 'key (hash-ref z (symbol->string 'key)
+ (if (eq? y None)
+ default
+ (ref y 'key default)))))
+ (define-syntax-rule (setter x y z ((k def) ...))
+ (begin (set- x y z k def) ...))
+
+ (setter newval val keyw
+ ((delimiter ",")
+ (doublequote #t)
+ (escapechar None)
+ (lineterminator "\r\n")
+ (quotechar "\"")
+ (quoting 'minimal)
+ (skipinitialspace #f)
+ (strict #f)))
+
+ (hash-set! *dialects* nm newval)))
+
+(define (get_dialect nm)
+ (hash-ref *dialects* nm None))
+(define (unregister_dialect nm)
+ (hash-delete! *dialects* nm))
+
+(define (list_dialects)
+ (let ((ret '()))
+ (hash-for-each
+ (lambda (k v)
+ (set! ret (cons k ret)))
+ *dialects*)
+ (py-list ret)))
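
A minimal sketch of how these registry helpers are meant to be exercised from Python, assuming the module imports as _csv and mirrors CPython's registration API (keyword overrides as accepted by register_dialect above):

    import _csv

    # Register a dialect by name, overriding a couple of settings via keywords.
    _csv.register_dialect("unix-like", delimiter=",", lineterminator="\n")

    print(_csv.list_dialects())           # the new name should appear here
    d = _csv.get_dialect("unix-like")     # the stored Dialect object
    _csv.unregister_dialect("unix-like")  # and remove it again
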
+
+(define __doc__
+"CSV parsing and writing.
+
+This module provides classes that assist in the reading and writing
+of Comma Separated Value (CSV) files, and implements the interface
+described by PEP 305. Although many CSV files are simple to parse,
+the format is not formally defined by a stable specification and
+is subtle enough that parsing lines of a CSV file with something
+like line.split(\",\") is bound to fail. The module supports three
+basic APIs: reading, writing, and registration of dialects.
+
+
+DIALECT REGISTRATION:
+
+Readers and writers support a dialect argument, which is a convenient
+handle on a group of settings. When the dialect argument is a string,
+it identifies one of the dialects previously registered with the module.
+If it is a class or instance, the attributes of the argument are used as
+the settings for the reader or writer:
+
+ class excel:
+ delimiter = ','
+ quotechar = '\"'
+ escapechar = None
+ doublequote = True
+ skipinitialspace = False
+ lineterminator = '\\r\\n'
+ quoting = QUOTE_MINIMAL
+
+SETTINGS:
+
+ * quotechar - specifies a one-character string to use as the
+ quoting character. It defaults to '\"'.
+ * delimiter - specifies a one-character string to use as the
+ field separator. It defaults to ','.
+ * skipinitialspace - specifies how to interpret whitespace which
+ immediately follows a delimiter. It defaults to False, which
+ means that whitespace immediately following a delimiter is part
+ of the following field.
+ * lineterminator - specifies the character sequence which should
+ terminate rows.
+ * quoting - controls when quotes should be generated by the writer.
+ It can take on any of the following module constants:
+
+ csv.QUOTE_MINIMAL means only when required, for example, when a
+ field contains either the quotechar or the delimiter
+ csv.QUOTE_ALL means that quotes are always placed around fields.
+ csv.QUOTE_NONNUMERIC means that quotes are always placed around
+ fields which do not parse as integers or floating point
+ numbers.
+ csv.QUOTE_NONE means that quotes are never placed around fields.
+ * escapechar - specifies a one-character string used to escape
+ the delimiter when quoting is set to QUOTE_NONE.
+ * doublequote - controls the handling of quotes inside fields. When
+ True, two consecutive quotes are interpreted as one during read,
+ and when writing, each quote character embedded in the data is
+ written as two quotes")
+
+(define QUOTE_ALL 'all)
+(define QUOTE_MINIMAL 'minimal)
+(define QUOTE_NONNUMERIC 'nonnumeric)
+(define QUOTE_NONE 'none)
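
The quoting constants above correspond to the SETTINGS section of __doc__. A hedged sketch of their effect on output, written against the csv wrapper added later in this commit and assuming CPython-compatible writer semantics:

    import io
    import csv

    buf = io.StringIO()
    w = csv.writer(buf, quoting=csv.QUOTE_NONNUMERIC)
    w.writerow(["name", 3.5, 'say "hi"'])
    # Non-numeric fields are quoted and embedded quotes are doubled:
    # "name",3.5,"say ""hi"""
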
+
+(def (reader csvfile (= dialect "excel") (** fmtparams))
+ (let*
+ ((dialect (if (string? dialect) (get_dialect dialect) dialect))
+
+ (delimiter (chr (py-get fmtparams "delimiter" e)
+ (ref dialect 'delimiter e)
+ ","))
+
+ (doublequote (oor (py-get fmtparams "doublequote" e)
+ (ref dialect 'doublequote e)
+ #t))
+
+ (escapechar (chr (py-get fmtparams "escapechar" e)
+ (ref dialect 'escapechar e)
+ None))
+
+ (lineterminator (str (py-get fmtparams "lineterminator" e)
+ (ref dialect 'lineterminator e)
+ "\r\n"))
+
+ (quotechar (chr (py-get fmtparams "quotechar" e)
+ (ref dialect 'quotechar e)
+ "\""))
+
+ (quoting (oor (py-get fmtparams "quoting" e)
+ (ref dialect 'quoting e)
+ QUOTE_MINIMAL))
+
+ (skipispace (oor (py-get fmtparams "skipinitialspace" e)
+ (ref dialect 'skipinitialspace e)
+ #f))
+
+ (strict (oor (py-get fmtparams "strict" e)
+ (ref dialect 'strict e)
+ #f)))
+ (make-generator ()
+ (lambda (yield)
+ (for ((s : csvfile)) ()
+ (let ((n (len s)))
+ (let lp ((i 0) (state #f) (l '()))
+ (let lp2 ((j i) (r '()))
+ (define-syntax-rule (raise- s)
+ (if strict
+ (raise s)
+ (lp2 (+ j 1) r)))
+ (define (end j ch)
+ (if (and (eq? state 'start)
+ (eq? ch #\newline))
+ (yield (py-list (reverse l)))
+ (let* ((x (list->string (reverse r)))
+ (x (if (eq? state 'numeric)
+ (string->number x)
+ x)))
+ (if (eq? ch #\newline)
+ (yield (py-list (reverse (cons x l))))
+ (lp (+ j 1) 'start (cons x l))))))
+
+ (define (do-quotechar)
+ (cond
+ ((eq? state 'quote)
+ (if doublequote
+ (if (and (< (+ i 1) n)
+ (equal? quotechar
+ (string-ref s (+ i 1))))
+ (lp2 (+ j 2) (cons quotechar r))
+ (end (+ j 1)))
+ (end (+ j 1))))
+
+ ((eq? state 'start)
+ (if (or (eq? quoting 'minimal)
+ (eq? quoting 'all)
+ (eq? quoting 'nonnumeric))
+ (lp (+ j 1) 'quote l)
+ (raise- (Error "QOUTE_NONE supports no quoteing"))))
+
+ (else
+ (raise- (Error "wrong quoting found")))))
+
+ (define (do-whitespace ch)
+ (cond
+ ((eq? state 'start)
+ (if skipispace
+ (lp2 (+ i 1) r)
+ (if (or (eq? quoting 'minimal)
+ (eq? quoting 'none))
+ (lp i 'normal l)
+ (raise- (Error "whitespace outside quote")))))
+
+ ((or (eq? state 'normal)
+ (eq? state 'quote))
+ (lp2 (+ i 1) (cons ch r)))
+
+ ((eq? state 'numeric)
+ (raise- (Error "whitespace in numeric field")))
+
+ ((eq? state 'end)
+ (raise- (Error "whitespace after quote")))))
+
+ (define (do-esc-quote)
+ (if (< (+ j 1) n)
+ (let ((ch2 (string-ref s (+ j 1))))
+ (cond
+ ((and (eq? state 'quote)
+ (eq? ch2 quotechar))
+ (lp2 (+ j 2)
+ (cons quotechar r)))
+
+ ((eq? ch2 delimiter)
+ (lp2 (+ j 2)
+ (cons delimiter r)))
+
+ ((eq? ch2 escapechar)
+ (lp2 (+ j 2)
+ (cons escapechar r)))
+
+ (else
+ (lp2 (+ j 2) r))))
+ (raise- (Error "single escape ends line"))))
+
+ (define (do-escape)
+ (cond
+ ((eq? state 'start)
+ (if (eq? quoting 'none)
+ (lp j 'normal l)
+ (raise- (Error "escapecharacter in nonquote"))))
+
+ ((eq? state 'normal)
+ (if (eq? quoting 'none)
+ (do-esc-quote)
+ (raise- (Error "escapecharacter in nonequote"))))
+
+ ((eq? state 'numeric)
+ (raise- (Error "escacpechar in numeric field")))
+
+ ((eq? state 'quote)
+ (do-esc-quote))
+
+ ((eq? state 'end)
+ (raise- (Error "escapechar after quote")))))
+
+ (define (do-delim ch)
+ (cond
+ ((or (eq? state 'start)
+ (eq? state 'end))
+ (end j ch))
+
+ ((eq? state 'quote)
+ (if (eq? quoting 'minimal)
+ (raise-
+ (Error "minimal quoting must quote delimiter"))
+ (end j ch)))
+
+ ((eq? state 'normal)
+ (end j ch))
+
+ ((eq? state 'numeric)
+ (end j ch))))
+
+ (if (< j n)
+ (let ((ch (string-ref s j)))
+ (cond
+ ((or (eq? ch #\newline)
+ (eq? ch #\return))
+ (if (eq? state 'quote)
+ (raise- (Error "missing end quote character"))
+ (end #\newline)))
+
+ ((or (eq? ch #\space) (eq? ch #\tab))
+ (do-whitespace ch))
+
+ ((eq? ch quotechar)
+ (do-quotechar))
+
+ ((eq? ch escapechar)
+ (do-escape))
+
+ ((eq? ch delimiter)
+ (do-delim ch))
+
+ ((eq? state 'numeric)
+ (if (or (eq? ch #\.)
+ (eq? ch #\-)
+ (eq? ch #\e)
+ (eq? ch #\E)
+ (char-numeric? ch))
+ (lp2 (+ j 1) (cons ch r))
+ (raise- (Error "nonumeric in numeric field"))))
+
+ ((eq? state 'start)
+ (cond
+ ((eq? quoting 'all)
+ (raise-
+ (Error
+ "nonquoted field when all should be quoted")))
+ ((eq? quoting 'nonnumeric)
+ (lp j 'numeric l))
+ (else
+ (lp j 'normal l))))
+
+ ((or (eq? state 'quote) (eq? state 'normal))
+ (lp2 (+ j 1) (cons ch r)))
+
+ ((eq? state 'end)
+ (raise-
+ (Error
+ "non delimeter after qouted field")))))
+ (do-delim #\newline))))))))))
+
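reader is implemented as a generator over the rows of csvfile. A small usage sketch, assuming the behaviour matches CPython's csv.reader for simple input:

    import csv

    lines = ['a,b,"c,d"\n', '1,2,3\n']
    for row in csv.reader(lines):
        print(row)
    # Expected with CPython semantics:
    # ['a', 'b', 'c,d']
    # ['1', '2', '3']
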
+(define-python-class writer ()
+ (define __init__
+ (lam (self csvfile (= dialect "excel") (** fmt))
+ (set! dialect (if (string? dialect)
+ (get_dialect dialect)
+ dialect))
+ (set self 'csvfile csvfile)
+ (set self 'fmtparams fmt)
+ (set self 'dialect dialect)))
+
+ (define writerow
+ (lambda (self l)
+ (let*
+ ((dialect (ref self 'dialect))
+ (fmtparams (ref self 'fmtparams))
+
+ (delimiter (chr (py-get fmtparams "delimiter" e)
+ (ref dialect 'delimiter e)
+ ","))
+
+ (doublequote (oor (py-get fmtparams "doublequote" e)
+ (ref dialect 'doublequote e)
+ #t))
+
+ (escapechar (chr (py-get fmtparams "escapechar" e)
+ (ref dialect 'escapechar e)
+ None))
+
+ (lineterminator (str (py-get fmtparams "lineterminator" e)
+ (ref dialect 'lineterminator e)
+ "\r\n"))
+
+ (quotechar (chr (py-get fmtparams "quotechar" e)
+ (ref dialect 'quotechar e)
+ "\""))
+
+ (quoting (oor (py-get fmtparams "quoting" e)
+ (ref dialect 'quoting e)
+ QUOTE_MINIMAL))
+
+ (skipispace (oor (py-get fmtparams "skipinitialspace" e)
+ (ref dialect 'skipinitialspace e)
+ #f))
+
+ (strict (oor (py-get fmtparams "strict" e)
+ (ref dialect 'strict e)
+ #f)))
+
+ (for ((x : l)) (r '())
+ (let/ec ret
+ (cons
+ (cond
+ ((eq? quoting 'none)
+ (let ((x (if (string? x) x (str x))))
+ (if (has-escape-1 x)
+ (if strict
+ (raise (Error "None quoting and nonspecial chars"))
+ (ret r))
+ x)))
+
+ ((eq? quoting 'nonnumeric)
+ (let ((x (if (string? x) x (str x))))
+ (if (is-numeric x)
+ x
+ (quote-it x))))
+
+ ((eq? quoting 'all)
+ (quote-it (if (string? x)
+ x
+ (str x))))
+
+ ((eq? quoting 'minimal)
+ (let ((x (if (string? x) x (str x))))
+ (if (has-escape-2 x) (quote-it x) x))))
+ r)
+ #:final
+ (write
+ (string-append
+ (string-join (reverse r) delimiter)
+ lineterminator))))))))
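
The writer class buffers a row, joins it with the dialect's delimiter and appends the line terminator. An illustrative sketch of the intended call pattern (output assumes CPython-compatible behaviour):

    import io
    import csv

    out = io.StringIO()
    w = csv.writer(out, delimiter=";", lineterminator="\n")
    w.writerow(["x", "y;z", 1])
    # With QUOTE_MINIMAL only the field containing the delimiter needs quoting:
    # x;"y;z";1
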
diff --git a/modules/language/python/module/csv.py b/modules/language/python/module/csv.py
new file mode 100644
index 0000000..0349e0b
--- /dev/null
+++ b/modules/language/python/module/csv.py
@@ -0,0 +1,449 @@
+
+"""
+csv.py - read/write/investigate CSV files
+"""
+
+import re
+from _csv import Error, __version__, writer, reader, register_dialect, \
+ unregister_dialect, get_dialect, list_dialects, \
+ field_size_limit, \
+ QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
+ __doc__
+from _csv import Dialect as _Dialect
+
+from collections import OrderedDict
+from io import StringIO
+
+__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
+ "Error", "Dialect", "__doc__", "excel", "excel_tab",
+ "field_size_limit", "reader", "writer",
+ "register_dialect", "get_dialect", "list_dialects", "Sniffer",
+ "unregister_dialect", "__version__", "DictReader", "DictWriter",
+ "unix_dialect"]
+
+class Dialect:
+ """Describe a CSV dialect.
+
+ This must be subclassed (see csv.excel). Valid attributes are:
+ delimiter, quotechar, escapechar, doublequote, skipinitialspace,
+ lineterminator, quoting.
+
+ """
+ _name = ""
+ _valid = False
+ # placeholders
+ delimiter = None
+ quotechar = None
+ escapechar = None
+ doublequote = None
+ skipinitialspace = None
+ lineterminator = None
+ quoting = None
+
+ def __init__(self):
+ if self.__class__ != Dialect:
+ self._valid = True
+ self._validate()
+
+ def _validate(self):
+ try:
+ _Dialect(self)
+ except TypeError as e:
+ # We do this for compatibility with py2.3
+ raise Error(str(e))
+
+class excel(Dialect):
+ """Describe the usual properties of Excel-generated CSV files."""
+ delimiter = ','
+ quotechar = '"'
+ doublequote = True
+ skipinitialspace = False
+ lineterminator = '\r\n'
+ quoting = QUOTE_MINIMAL
+register_dialect("excel", excel)
+
+class excel_tab(excel):
+ """Describe the usual properties of Excel-generated TAB-delimited files."""
+ delimiter = '\t'
+register_dialect("excel-tab", excel_tab)
+
+class unix_dialect(Dialect):
+ """Describe the usual properties of Unix-generated CSV files."""
+ delimiter = ','
+ quotechar = '"'
+ doublequote = True
+ skipinitialspace = False
+ lineterminator = '\n'
+ quoting = QUOTE_ALL
+register_dialect("unix", unix_dialect)
+
+
+class DictReader:
+ def __init__(self, f, fieldnames=None, restkey=None, restval=None,
+ dialect="excel", *args, **kwds):
+ self._fieldnames = fieldnames # list of keys for the dict
+ self.restkey = restkey # key to catch long rows
+ self.restval = restval # default value for short rows
+ self.reader = reader(f, dialect, *args, **kwds)
+ self.dialect = dialect
+ self.line_num = 0
+
+ def __iter__(self):
+ return self
+
+ @property
+ def fieldnames(self):
+ if self._fieldnames is None:
+ try:
+ self._fieldnames = next(self.reader)
+ except StopIteration:
+ pass
+ self.line_num = self.reader.line_num
+ return self._fieldnames
+
+ @fieldnames.setter
+ def fieldnames(self, value):
+ self._fieldnames = value
+
+ def __next__(self):
+ if self.line_num == 0:
+ # Used only for its side effect.
+ self.fieldnames
+ row = next(self.reader)
+ self.line_num = self.reader.line_num
+
+ # unlike the basic reader, we prefer not to return blanks,
+ # because we will typically wind up with a dict full of None
+ # values
+ while row == []:
+ row = next(self.reader)
+ d = OrderedDict(zip(self.fieldnames, row))
+ lf = len(self.fieldnames)
+ lr = len(row)
+ if lf < lr:
+ d[self.restkey] = row[lf:]
+ elif lf > lr:
+ for key in self.fieldnames[lr:]:
+ d[key] = self.restval
+ return d
+
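DictReader maps each row onto the field names, padding short rows with restval and collecting overflow under restkey. A short usage sketch (illustrative only):

    import io
    import csv

    f = io.StringIO("name,age\nada,36\nalan,41\n")
    for row in csv.DictReader(f):
        print(row["name"], row["age"])
    # ada 36
    # alan 41
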
+
+class DictWriter:
+ def __init__(self, f, fieldnames, restval="", extrasaction="raise",
+ dialect="excel", *args, **kwds):
+ self.fieldnames = fieldnames # list of keys for the dict
+ self.restval = restval # for writing short dicts
+ if extrasaction.lower() not in ("raise", "ignore"):
+ raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
+ % extrasaction)
+ self.extrasaction = extrasaction
+ self.writer = writer(f, dialect, *args, **kwds)
+
+ def writeheader(self):
+ header = dict(zip(self.fieldnames, self.fieldnames))
+ self.writerow(header)
+
+ def _dict_to_list(self, rowdict):
+ if self.extrasaction == "raise":
+ wrong_fields = rowdict.keys() - self.fieldnames
+ if wrong_fields:
+ raise ValueError("dict contains fields not in fieldnames: "
+ + ", ".join([repr(x) for x in wrong_fields]))
+ return (rowdict.get(key, self.restval) for key in self.fieldnames)
+
+ def writerow(self, rowdict):
+ return self.writer.writerow(self._dict_to_list(rowdict))
+
+ def writerows(self, rowdicts):
+ return self.writer.writerows(map(self._dict_to_list, rowdicts))
+
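DictWriter is the inverse: each dict is flattened back to a row in fieldnames order, with restval filling missing keys. A sketch of the intended usage:

    import io
    import csv

    out = io.StringIO()
    w = csv.DictWriter(out, fieldnames=["name", "age"], restval="?")
    w.writeheader()
    w.writerow({"name": "ada", "age": 36})
    w.writerow({"name": "alan"})   # missing 'age' is filled with restval
    print(out.getvalue())
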
+# Guard Sniffer's type checking against builds that exclude complex()
+try:
+ complex
+except NameError:
+ complex = float
+
+class Sniffer:
+ '''
+ "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
+ Returns a Dialect object.
+ '''
+ def __init__(self):
+ # in case there is more than one possible delimiter
+ self.preferred = [',', '\t', ';', ' ', ':']
+
+
+ def sniff(self, sample, delimiters=None):
+ """
+ Returns a dialect (or None) corresponding to the sample
+ """
+
+ quotechar, doublequote, delimiter, skipinitialspace = \
+ self._guess_quote_and_delimiter(sample, delimiters)
+ if not delimiter:
+ delimiter, skipinitialspace = self._guess_delimiter(sample,
+ delimiters)
+
+ if not delimiter:
+ raise Error("Could not determine delimiter")
+
+ class dialect(Dialect):
+ _name = "sniffed"
+ lineterminator = '\r\n'
+ quoting = QUOTE_MINIMAL
+ # escapechar = ''
+
+ dialect.doublequote = doublequote
+ dialect.delimiter = delimiter
+ # _csv.reader won't accept a quotechar of ''
+ dialect.quotechar = quotechar or '"'
+ dialect.skipinitialspace = skipinitialspace
+
+ return dialect
+
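sniff combines the two guessing passes below into a throwaway Dialect subclass. A hedged example of the intended call, assuming a regular semicolon-separated sample:

    import csv

    sample = "a;b;c\n1;2;3\n4;5;6\n"
    dialect = csv.Sniffer().sniff(sample, delimiters=";,")
    print(dialect.delimiter)   # expected: ';'
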
+
+ def _guess_quote_and_delimiter(self, data, delimiters):
+ """
+ Looks for text enclosed between two identical quotes
+ (the probable quotechar) which are preceded and followed
+ by the same character (the probable delimiter).
+ For example:
+ ,'some text',
+ The quote with the most wins, same with the delimiter.
+ If there is no quotechar the delimiter can't be determined
+ this way.
+ """
+
+ matches = []
+ for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+ r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
+ r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
+ r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
+ regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+ matches = regexp.findall(data)
+ if matches:
+ break
+
+ if not matches:
+ # (quotechar, doublequote, delimiter, skipinitialspace)
+ return ('', False, None, 0)
+ quotes = {}
+ delims = {}
+ spaces = 0
+ groupindex = regexp.groupindex
+ for m in matches:
+ n = groupindex['quote'] - 1
+ key = m[n]
+ if key:
+ quotes[key] = quotes.get(key, 0) + 1
+ try:
+ n = groupindex['delim'] - 1
+ key = m[n]
+ except KeyError:
+ continue
+ if key and (delimiters is None or key in delimiters):
+ delims[key] = delims.get(key, 0) + 1
+ try:
+ n = groupindex['space'] - 1
+ except KeyError:
+ continue
+ if m[n]:
+ spaces += 1
+
+ quotechar = max(quotes, key=quotes.get)
+
+ if delims:
+ delim = max(delims, key=delims.get)
+ skipinitialspace = delims[delim] == spaces
+ if delim == '\n': # most likely a file with a single column
+ delim = ''
+ else:
+ # there is *no* delimiter, it's a single column of quoted data
+ delim = ''
+ skipinitialspace = 0
+
+ # if we see an extra quote between delimiters, we've got a
+ # double quoted format
+ dq_regexp = re.compile(
+ r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+ {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+
+
+
+ if dq_regexp.search(data):
+ doublequote = True
+ else:
+ doublequote = False
+
+ return (quotechar, doublequote, delim, skipinitialspace)
+
+
+ def _guess_delimiter(self, data, delimiters):
+ """
+ The delimiter /should/ occur the same number of times on
+ each row. However, due to malformed data, it may not. We don't want
+ an all or nothing approach, so we allow for small variations in this
+ number.
+ 1) build a table of the frequency of each character on every line.
+ 2) build a table of frequencies of this frequency (meta-frequency?),
+ e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
+ 7 times in 2 rows'
+ 3) use the mode of the meta-frequency to determine the /expected/
+ frequency for that character
+ 4) find out how often the character actually meets that goal
+ 5) the character that best meets its goal is the delimiter
+ For performance reasons, the data is evaluated in chunks, so it can
+ try and evaluate the smallest portion of the data possible, evaluating
+ additional chunks as necessary.
+ """
+
+ data = list(filter(None, data.split('\n')))
+
+ ascii = [chr(c) for c in range(127)] # 7-bit ASCII
+
+ # build frequency tables
+ chunkLength = min(10, len(data))
+ iteration = 0
+ charFrequency = {}
+ modes = {}
+ delims = {}
+ start, end = 0, min(chunkLength, len(data))
+ while start < len(data):
+ iteration += 1
+ for line in data[start:end]:
+ for char in ascii:
+ metaFrequency = charFrequency.get(char, {})
+ # must count even if frequency is 0
+ freq = line.count(char)
+ # value is the mode
+ metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
+ charFrequency[char] = metaFrequency
+
+ for char in charFrequency.keys():
+ items = list(charFrequency[char].items())
+ if len(items) == 1 and items[0][0] == 0:
+ continue
+ # get the mode of the frequencies
+ if len(items) > 1:
+ modes[char] = max(items, key=lambda x: x[1])
+ # adjust the mode - subtract the sum of all
+ # other frequencies
+ items.remove(modes[char])
+ modes[char] = (modes[char][0], modes[char][1]
+ - sum(item[1] for item in items))
+ else:
+ modes[char] = items[0]
+
+ # build a list of possible delimiters
+ modeList = modes.items()
+ total = float(chunkLength * iteration)
+ # (rows of consistent data) / (number of rows) = 100%
+ consistency = 1.0
+ # minimum consistency threshold
+ threshold = 0.9
+ while len(delims) == 0 and consistency >= threshold:
+ for k, v in modeList:
+ if v[0] > 0 and v[1] > 0:
+ if ((v[1]/total) >= consistency and
+ (delimiters is None or k in delimiters)):
+ delims[k] = v
+ consistency -= 0.01
+
+ if len(delims) == 1:
+ delim = list(delims.keys())[0]
+ skipinitialspace = (data[0].count(delim) ==
+ data[0].count("%c " % delim))
+ return (delim, skipinitialspace)
+
+ # analyze another chunkLength lines
+ start = end
+ end += chunkLength
+
+ if not delims:
+ return ('', 0)
+
+ # if there's more than one, fall back to a 'preferred' list
+ if len(delims) > 1:
+ for d in self.preferred:
+ if d in delims.keys():
+ skipinitialspace = (data[0].count(d) ==
+ data[0].count("%c " % d))
+ return (d, skipinitialspace)
+
+ # nothing else indicates a preference, pick the character that
+ # dominates(?)
+ items = [(v,k) for (k,v) in delims.items()]
+ items.sort()
+ delim = items[-1][1]
+
+ skipinitialspace = (data[0].count(delim) ==
+ data[0].count("%c " % delim))
+ return (delim, skipinitialspace)
+
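The steps listed in the docstring boil down to counting a candidate character per line and taking the mode of those counts. A standalone illustration of steps 1-3 (not the method's actual code path):

    # Per-line counts of a candidate delimiter, then the frequency of those counts.
    data = ["a,b,c", "1,2,3", "4,5,6,7"]
    counts = [line.count(",") for line in data]        # [2, 2, 3]
    meta = {}
    for c in counts:
        meta[c] = meta.get(c, 0) + 1                   # {2: 2, 3: 1}
    mode = max(meta.items(), key=lambda kv: kv[1])     # (2, 2): ',' usually occurs twice
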
+
+ def has_header(self, sample):
+ # Creates a dictionary of types of data in each column. If any
+ # column is of a single type (say, integers), *except* for the first
+ # row, then the first row is presumed to be labels. If the type
+ # can't be determined, it is assumed to be a string in which case
+ # the length of the string is the determining factor: if all of the
+ # rows except for the first are the same length, it's a header.
+ # Finally, a 'vote' is taken at the end for each column, adding or
+ # subtracting from the likelihood of the first row being a header.
+
+ rdr = reader(StringIO(sample), self.sniff(sample))
+
+ header = next(rdr) # assume first row is header
+
+ columns = len(header)
+ columnTypes = {}
+ for i in range(columns): columnTypes[i] = None
+
+ checked = 0
+ for row in rdr:
+ # arbitrary number of rows to check, to keep it sane
+ if checked > 20:
+ break
+ checked += 1
+
+ if len(row) != columns:
+ continue # skip rows that have irregular number of columns
+
+ for col in list(columnTypes.keys()):
+
+ for thisType in [int, float, complex]:
+ try:
+ thisType(row[col])
+ break
+ except (ValueError, OverflowError):
+ pass
+ else:
+ # fallback to length of string
+ thisType = len(row[col])
+
+ if thisType != columnTypes[col]:
+ if columnTypes[col] is None: # add new column type
+ columnTypes[col] = thisType
+ else:
+ # type is inconsistent, remove column from
+ # consideration
+ del columnTypes[col]
+
+ # finally, compare results against first row and "vote"
+ # on whether it's a header
+ hasHeader = 0
+ for col, colType in columnTypes.items():
+ if type(colType) == type(0): # it's a length
+ if len(header[col]) != colType:
+ hasHeader += 1
+ else:
+ hasHeader -= 1
+ else: # attempt typecast
+ try:
+ colType(header[col])
+ except (ValueError, TypeError):
+ hasHeader += 1
+ else:
+ hasHeader -= 1
+
+ return hasHeader > 0
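
has_header votes column by column on whether the first row looks different from the rest. A short usage sketch, assuming the sniffed dialect parses the sample cleanly:

    import csv

    sample = "name,age\nada,36\nalan,41\n"
    print(csv.Sniffer().has_header(sample))   # expected: True, 'age' is numeric except in row 1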