diff options
author | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-08-22 21:35:43 +0200 |
---|---|---|
committer | Stefan Israelsson Tampe <stefan.itampe@gmail.com> | 2018-08-22 21:35:43 +0200 |
commit | 43792510bfeb15e8416a5782ab64126ee8950950 (patch) | |
tree | 7de7499a1a5a3f841d24ad2e0ea50964fa0e9e84 | |
parent | a41eeb67b1aa32199501db6d013e259ccb7484e6 (diff) |
csv.py
-rw-r--r-- | modules/language/python/module/_csv.py | 413 | ||||
-rw-r--r-- | modules/language/python/module/csv.py | 449 |
2 files changed, 862 insertions, 0 deletions
;; diff --git a/modules/language/python/module/_csv.py b/modules/language/python/module/_csv.py
;; new file mode 100644 index 0000000..6db2d05
;; --- /dev/null
;; +++ b/modules/language/python/module/_csv.py
;; @@ -0,0 +1,413 @@
;;
;; Guile implementation of Python's low-level `_csv` module: the dialect
;; registry, the field size limit and the module docstring.
(define-module (language python module _csv)
  #:use-module (oop pf-objects)
  ;; The module path is (language python ...), matching this module's own
  ;; name; it was previously misspelled "langauge" in every clause below.
  #:use-module (language python list)
  #:use-module (language python def)
  #:use-module (language python yield)
  #:use-module (language python for)
  #:use-module (language python exceptions)
  #:export (QUOTE_ALL QUOTE_MINIMAL QUOTE_NONNUMERIC QUOTE_NONE
            reader writer Error field_size_limit
            get_dialect register_dialect unregister_dialect
            list_dialects __doc__ Dialect))

;; Exception type raised by this module.
(define-python-class Error (Exception))

;; Plain attribute container for dialect settings; the constructor accepts
;; and ignores any arguments.
(define-python-class Dialect ()
  (define __init__
    (lambda (self . x)
      #f)))

;; Maximum field size, initially 131072.
(define *field-size* (make-fluid 131072))

;; (field_size_limit)     => current limit
;; (field_size_limit new) => installs NEW and returns the previous limit,
;;                           as Python's csv.field_size_limit does.
(define field_size_limit
  (case-lambda
    (() (fluid-ref *field-size*))
    ((x)
     (let ((old (fluid-ref *field-size*)))
       (fluid-set! *field-size* x)
       old))))

;; Registry mapping dialect name -> Dialect instance.
(define *dialects* (make-hash-table))

;; Register a dialect under NM.  Each setting is taken from the keyword
;; arguments first, then from the attributes of VAL (when VAL is not None),
;; and finally from the built-in default listed below.
(def (register_dialect nm (= val None) (** keyw))
  (let ((newval (Dialect)))
    (define-syntax-rule (set- x y z key default)
      (set x 'key (hash-ref z (symbol->string 'key)
                            (if (eq? y None)
                                default
                                (ref y 'key default)))))
    (define-syntax-rule (setter x y z ((k def) ...))
      (begin (set- x y z k def) ...))

    (setter newval val keyw
            ((delimiter        ",")
             (doublequote      #t)
             (escapechar       None)
             (lineterminator   "\r\n")
             (quotechar        "\"")
             (quoting          'minimal)
             (skipinitialspace #f)
             (strict           #f)))

    (hash-set! *dialects* nm newval)))

;; NOTE: a stray top-level `(hash-set! *dialects* nm val))` used to follow
;; register_dialect; it referenced unbound names and unbalanced the
;; parentheses of the file, so it has been removed.

;; Look up the dialect registered under NM.  VAL is an optional fallback
;; returned when NM is unknown (None by default).  hash-ref accepts a single
;; default argument, and callers in this file pass only the name.
(define* (get_dialect nm #:optional (val None))
  (hash-ref *dialects* nm val))

;; Remove the dialect registered under NM.
(define (unregister_dialect nm)
  (hash-remove! *dialects* nm))

;; Return the registered dialect names as a Python list.
(define (list_dialects)
  (let ((ret '()))
    (hash-for-each
     (lambda (k v)
       (set! ret (cons k ret)))
     *dialects*)
    (py-list ret)))

(define __doc__
"CSV parsing and writing.
+ +This module provides classes that assist in the reading and writing +of Comma Separated Value (CSV) files, and implements the interface +described by PEP 305. Although many CSV files are simple to parse, +the format is not formally defined by a stable specification and +is subtle enough that parsing lines of a CSV file with something +like line.split(\",\") is bound to fail. The module supports three\n +basic APIs: reading, writing, and registration of dialects. + + +DIALECT REGISTRATION: + +Readers and writers support a dialect argument, which is a convenient +handle on a group of settings. When the dialect argument is a string, +it identifies one of the dialects previously registered with the module. +If it is a class or instance, the attributes of the argument are used as +the settings for the reader or writer: + + class excel: + delimiter = ',' + quotechar = '\"' + escapechar = None + doublequote = True + skipinitialspace = False + lineterminator = '\\r\\n' + quoting = QUOTE_MINIMAL + +SETTINGS: + + * quotechar - specifies a one-character string to use as the + quoting character. It defaults to '\"'. + * delimiter - specifies a one-character string to use as the + field separator. It defaults to ','. + * skipinitialspace - specifies how to interpret whitespace which + immediately follows a delimiter. It defaults to False, which + means that whitespace immediately following a delimiter is part + of the following field. + * lineterminator - specifies the character sequence which should + terminate rows. + * quoting - controls when quotes should be generated by the writer. + It can take on any of the following module constants: + + csv.QUOTE_MINIMAL means only when required, for example, when a + field contains either the quotechar or the delimiter + csv.QUOTE_ALL means that quotes are always placed around fields. + csv.QUOTE_NONNUMERIC means that quotes are always placed around + fields which do not parse as integers or floating point + numbers. 
+ csv.QUOTE_NONE means that quotes are never placed around fields. + * escapechar - specifies a one-character string used to escape + the delimiter when quoting is set to QUOTE_NONE. + * doublequote - controls the handling of quotes inside fields. When + True, two consecutive quotes are interpreted as one during read, + and when writing, each quote character embedded in the data is + written as two quotes") + +(define QUOTE_ALL 'all) +(define QUOTE_MINIMAL 'minimal) +(define QUOTE_NONNUMERIC 'nonumeric) +(define QUOTE_NONE 'none) + +(def (reader csvfile (= dialect "excel") (** fmtparams)) + (let* + ((dialect (get-dialect dialect)) + + (delimiter (chr (py-get fmtparams "delimiter" e) + (ref dialect 'Delimiter e) + ",")) + + (doublequote (oor (py-get fmtparams "doublequote" e) + (ref dialect 'doublequote e) + #t)) + + (escapechar (chr (py-get fmtparams "escapechar" e) + (ref dialect 'escapechar e) + None)) + + (lineterminator (str (py-get fmtparams "lineterminator" e) + (ref dialect 'lineterminator e) + "\r\n")) + + (quotechar (chr (py-get fmtparams "quotechar" e) + (ref dialect 'quotechar e) + "\"")) + + (quoting (oor (py-get fmtparams "quoting" e) + (ref dialect 'quoting e) + QUOTE_MINIMAL)) + + (skipispace (oor (py-get fmtparams "skipinitialspace" e) + (ref dialect 'skipinitialspace e) + #t)) + + (strict (oor (py-get fmtparams "strict" e) + (ref dialect 'strict e) + #f))) + (make-generator () + (lambda (yield) + (for ((s : cvsfile)) () + (let ((n (len s))) + (let lp ((i 0) (state #f) (l '())) + (let lp2 ((j i) (r '())) + (define-syntax-rule (raise- s) + (if strict + (raise s) + (lp (+ j 1) r))) + (define (end j ch) + (if (and (eq? state 'start) + (eq? ch #\newline)) + (yield (py-list (reverse l))) + (let* ((x (list->string (reverse r))) + (x (if (eq? state 'numeric) + (string->number x) + x))) + (if (eq? ch #\newline) + (yield (py-list (reverse (cons x l)))) + (lp (+ j 1) 'start (cons x l)))))) + + (define (do-quotechar) + (cond + ((eq? 
state 'quote) + (if doublequote + (if (and (< (+ i 1) n) + (equal? quotechar + (string-ref s (+ i 1)))) + (lp2 (+ j 2) (cons quotechar r)) + (end (+ j 1))) + (end (+ j 1)))) + + ((eq? state 'start) + (if (or (eq? quoting 'minimal) + (eq? quoting 'all) + (eq? quoting 'nonnumeric)) + (lp (+ j 1) 'quote l) + (raise- (Error "QOUTE_NONE supports no quoteing")))) + + (else + (raise- (Error "wrong quoting found"))))) + + (define (do-whitespace ch) + (cond + ((eq? state 'start) + (if skipispace + (lp2 (+ i 1) r) + (if (or (eq? quoting 'minimal) + (eq? quoting 'none)) + (lp i 'normal l) + (raise- (Error "whitespace outside quote"))))) + + ((or (eq? state 'normal) + (eq? state 'quote)) + (lp2 (+ i 1) (cons ch r))) + + ((eq? state 'nnumeric) + (raise- (Error "whitespace in numeric field"))) + + ((eq? state 'end) + (raise- (Error "whitespace after quote"))))) + + (define (do-esc-qupote) + (if (< (+ j 1) n) + (let ((ch2 (string-ref s (+ j 1)))) + (cond + ((and (eq? state 'quoting) + (eq? ch2 quotechar)) + (lp2 (+ j 2) + (cons quotechar r))) + + ((eq? ch2 delimiter) + (lp2 (+ j 2) + (cons delimiter r))) + + ((eq? ch2 escapechar) + (lp2 (+ j 2) + (cons escapechar r))) + + (else + (lp2 (+ j 2) r)))) + (raise- (Error "single escape ends line")))) + + (define (do-escape) + (cond + ((eq? state 'start) + (if (eq? quoting 'none) + (lp j 'normal l) + (raise- (Error "escapecharacter in nonquote")))) + + ((eq? state 'normal) + (if (eq? quoting 'none) + (do-esc-quote) + (raise- (Error "escapecharacter in nonequote")))) + + ((eq? state 'numeric) + (raise- (Error "escacpechar in numeric field"))) + + ((eq? state 'quote) + (do-esc-quote)) + + ((eq? state 'end) + (raise- (Error "escapechar after quote"))))) + + (define (do-delim ch) + (cond + ((or (eq? state 'start) + (eq? state 'end)) + (end ch)) + + ((eq? state 'quote) + (if (eq? quoteing 'minimal) + (raise- + (Error "minimal quoting must quote delimiter")) + (end ch))) + + ((eq? state 'normal) + (end ch)) + + ((eq? 
state 'numeric) + (end ch)))) + + (if (< j n) + (let ((ch (string-ref s i))) + (cond + ((or (eq? ch #\newline) + (eq? ch #\return)) + (if (eq? state 'quote) + (raise- (Error "missing end quote character")) + (end #\newline))) + + ((or (eq? ch #\space) (eq? ch #\tab)) + (do-whitespace ch)) + + ((eq? ch quotechar) + (do-quotechar)) + + ((eq? ch escapechar) + (do-escape)) + + ((eq? ch delimiter) + (do-delim ch)) + + ((eq? state 'numeric) + (if (or (eq? ch #\.) + (eq? ch #\-) + (eq? ch #\e) + (eq? ch #\E) + (char-numeric? ch)) + (lp2 (+ j 1) (cons ch r)) + (raise- (Error "nonumeric in numeric field")))) + + ((eq? state 'start) + (cond + ((eq? quoting 'all) + (raise- + (Error + "nonquoted field when all should be quoted"))) + ((eq? quoting 'nonnumeric) + (lp j 'numeric l)) + (else + (lp j 'normal l)))) + + ((or (eq? state 'quote) (eq? state 'normal)) + (lp2 (+ j 1) (cons ch r))) + + ((eq? state 'end) + (raise- + (Error + "non delimeter after qouted field"))))) + (do-delim #\newline)))))))))) + +(define-python-class writer () + (define __init__ + (lam (csvfile (= dialect "excel") (** fmt)) + (set! dialect (if (string? 
dialect) + (get_dialect dialect) + dialect)) + (set self 'csvfile csvfile) + (set self 'dialect dialect))) + + (define writerow + (lambda (self l) + (let* + ((dialect (ref self 'dialect)) + + (delimiter (chr (py-get fmtparams "delimiter" e) + (ref dialect 'Delimiter e) + ",")) + + (doublequote (oor (py-get fmtparams "doublequote" e) + (ref dialect 'doublequote e) + #t)) + + (escapechar (chr (py-get fmtparams "escapechar" e) + (ref dialect 'escapechar e) + None)) + + (lineterminator (str (py-get fmtparams "lineterminator" e) + (ref dialect 'lineterminator e) + "\r\n")) + + (quotechar (chr (py-get fmtparams "quotechar" e) + (ref dialect 'quotechar e) + "\"")) + + (quoting (oor (py-get fmtparams "quoting" e) + (ref dialect 'quoting e) + QUOTE_MINIMAL)) + + (skipispace (oor (py-get fmtparams "skipinitialspace" e) + (ref dialect 'skipinitialspace e) + #t)) + + (strict (oor (py-get fmtparams "strict" e) + (ref dialect 'strict e) + #f))) + + (for ((x : l)) (r '()) + (let/ec ret + (cons + (cond + ((eq? quoting 'none) + (let ((x (if (string? x) x (str x)))) + (if (has-escape-1 x) + (if strict + (raise (Error "None quoting and nonspecial chars")) + (ret r)) + x))) + + ((eq? quoting 'nonnumeric) + (let ((x (if (string? x) x (str x)))) + (if (is-numeric x) + (number->string x) + (quote-it x)))) + + ((eq? quoting 'none) + (if (string? x) + x + (str x))) + + ((eq? quoting 'minimal) + (let ((x (if (string? 
x) x (str x)))) + (if (has-escape-2 x) (quote-it x) x)))) + r) + #:final + (write + (string-join + (reverse + (cons lineterminator r)) delim)))))))) diff --git a/modules/language/python/module/csv.py b/modules/language/python/module/csv.py new file mode 100644 index 0000000..0349e0b --- /dev/null +++ b/modules/language/python/module/csv.py @@ -0,0 +1,449 @@ + +""" +csv.py - read/write/investigate CSV files +""" + +import re +from _csv import Error, __version__, writer, reader, register_dialect, \ + unregister_dialect, get_dialect, list_dialects, \ + field_size_limit, \ + QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ + __doc__ +from _csv import Dialect as _Dialect + +from collections import OrderedDict +from io import StringIO + +__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", + "Error", "Dialect", "__doc__", "excel", "excel_tab", + "field_size_limit", "reader", "writer", + "register_dialect", "get_dialect", "list_dialects", "Sniffer", + "unregister_dialect", "__version__", "DictReader", "DictWriter", + "unix_dialect"] + +class Dialect: + """Describe a CSV dialect. + + This must be subclassed (see csv.excel). Valid attributes are: + delimiter, quotechar, escapechar, doublequote, skipinitialspace, + lineterminator, quoting. 
+ + """ + _name = "" + _valid = False + # placeholders + delimiter = None + quotechar = None + escapechar = None + doublequote = None + skipinitialspace = None + lineterminator = None + quoting = None + + def __init__(self): + if self.__class__ != Dialect: + self._valid = True + self._validate() + + def _validate(self): + try: + _Dialect(self) + except TypeError as e: + # We do this for compatibility with py2.3 + raise Error(str(e)) + +class excel(Dialect): + """Describe the usual properties of Excel-generated CSV files.""" + delimiter = ',' + quotechar = '"' + doublequote = True + skipinitialspace = False + lineterminator = '\r\n' + quoting = QUOTE_MINIMAL +register_dialect("excel", excel) + +class excel_tab(excel): + """Describe the usual properties of Excel-generated TAB-delimited files.""" + delimiter = '\t' +register_dialect("excel-tab", excel_tab) + +class unix_dialect(Dialect): + """Describe the usual properties of Unix-generated CSV files.""" + delimiter = ',' + quotechar = '"' + doublequote = True + skipinitialspace = False + lineterminator = '\n' + quoting = QUOTE_ALL +register_dialect("unix", unix_dialect) + + +class DictReader: + def __init__(self, f, fieldnames=None, restkey=None, restval=None, + dialect="excel", *args, **kwds): + self._fieldnames = fieldnames # list of keys for the dict + self.restkey = restkey # key to catch long rows + self.restval = restval # default value for short rows + self.reader = reader(f, dialect, *args, **kwds) + self.dialect = dialect + self.line_num = 0 + + def __iter__(self): + return self + + @property + def fieldnames(self): + if self._fieldnames is None: + try: + self._fieldnames = next(self.reader) + except StopIteration: + pass + self.line_num = self.reader.line_num + return self._fieldnames + + @fieldnames.setter + def fieldnames(self, value): + self._fieldnames = value + + def __next__(self): + if self.line_num == 0: + # Used only for its side effect. 
+ self.fieldnames + row = next(self.reader) + self.line_num = self.reader.line_num + + # unlike the basic reader, we prefer not to return blanks, + # because we will typically wind up with a dict full of None + # values + while row == []: + row = next(self.reader) + d = OrderedDict(zip(self.fieldnames, row)) + lf = len(self.fieldnames) + lr = len(row) + if lf < lr: + d[self.restkey] = row[lf:] + elif lf > lr: + for key in self.fieldnames[lr:]: + d[key] = self.restval + return d + + +class DictWriter: + def __init__(self, f, fieldnames, restval="", extrasaction="raise", + dialect="excel", *args, **kwds): + self.fieldnames = fieldnames # list of keys for the dict + self.restval = restval # for writing short dicts + if extrasaction.lower() not in ("raise", "ignore"): + raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" + % extrasaction) + self.extrasaction = extrasaction + self.writer = writer(f, dialect, *args, **kwds) + + def writeheader(self): + header = dict(zip(self.fieldnames, self.fieldnames)) + self.writerow(header) + + def _dict_to_list(self, rowdict): + if self.extrasaction == "raise": + wrong_fields = rowdict.keys() - self.fieldnames + if wrong_fields: + raise ValueError("dict contains fields not in fieldnames: " + + ", ".join([repr(x) for x in wrong_fields])) + return (rowdict.get(key, self.restval) for key in self.fieldnames) + + def writerow(self, rowdict): + return self.writer.writerow(self._dict_to_list(rowdict)) + + def writerows(self, rowdicts): + return self.writer.writerows(map(self._dict_to_list, rowdicts)) + +# Guard Sniffer's type checking against builds that exclude complex() +try: + complex +except NameError: + complex = float + +class Sniffer: + ''' + "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) + Returns a Dialect object. 
+ ''' + def __init__(self): + # in case there is more than one possible delimiter + self.preferred = [',', '\t', ';', ' ', ':'] + + + def sniff(self, sample, delimiters=None): + """ + Returns a dialect (or None) corresponding to the sample + """ + + quotechar, doublequote, delimiter, skipinitialspace = \ + self._guess_quote_and_delimiter(sample, delimiters) + if not delimiter: + delimiter, skipinitialspace = self._guess_delimiter(sample, + delimiters) + + if not delimiter: + raise Error("Could not determine delimiter") + + class dialect(Dialect): + _name = "sniffed" + lineterminator = '\r\n' + quoting = QUOTE_MINIMAL + # escapechar = '' + + dialect.doublequote = doublequote + dialect.delimiter = delimiter + # _csv.reader won't accept a quotechar of '' + dialect.quotechar = quotechar or '"' + dialect.skipinitialspace = skipinitialspace + + return dialect + + + def _guess_quote_and_delimiter(self, data, delimiters): + """ + Looks for text enclosed between two identical quotes + (the probable quotechar) which are preceded and followed + by the same character (the probable delimiter). + For example: + ,'some text', + The quote with the most wins, same with the delimiter. + If there is no quotechar the delimiter can't be determined + this way. + """ + + matches = [] + for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", + r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", + r'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" + r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" 
(no delim, no space) + regexp = re.compile(restr, re.DOTALL | re.MULTILINE) + matches = regexp.findall(data) + if matches: + break + + if not matches: + # (quotechar, doublequote, delimiter, skipinitialspace) + return ('', False, None, 0) + quotes = {} + delims = {} + spaces = 0 + groupindex = regexp.groupindex + for m in matches: + n = groupindex['quote'] - 1 + key = m[n] + if key: + quotes[key] = quotes.get(key, 0) + 1 + try: + n = groupindex['delim'] - 1 + key = m[n] + except KeyError: + continue + if key and (delimiters is None or key in delimiters): + delims[key] = delims.get(key, 0) + 1 + try: + n = groupindex['space'] - 1 + except KeyError: + continue + if m[n]: + spaces += 1 + + quotechar = max(quotes, key=quotes.get) + + if delims: + delim = max(delims, key=delims.get) + skipinitialspace = delims[delim] == spaces + if delim == '\n': # most likely a file with a single column + delim = '' + else: + # there is *no* delimiter, it's a single column of quoted data + delim = '' + skipinitialspace = 0 + + # if we see an extra quote between delimiters, we've got a + # double quoted format + dq_regexp = re.compile( + r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \ + {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE) + + + + if dq_regexp.search(data): + doublequote = True + else: + doublequote = False + + return (quotechar, doublequote, delim, skipinitialspace) + + + def _guess_delimiter(self, data, delimiters): + """ + The delimiter /should/ occur the same number of times on + each row. However, due to malformed data, it may not. We don't want + an all or nothing approach, so we allow for small variations in this + number. + 1) build a table of the frequency of each character on every line. + 2) build a table of frequencies of this frequency (meta-frequency?), + e.g. 
'x occurred 5 times in 10 rows, 6 times in 1000 rows, + 7 times in 2 rows' + 3) use the mode of the meta-frequency to determine the /expected/ + frequency for that character + 4) find out how often the character actually meets that goal + 5) the character that best meets its goal is the delimiter + For performance reasons, the data is evaluated in chunks, so it can + try and evaluate the smallest portion of the data possible, evaluating + additional chunks as necessary. + """ + + data = list(filter(None, data.split('\n'))) + + ascii = [chr(c) for c in range(127)] # 7-bit ASCII + + # build frequency tables + chunkLength = min(10, len(data)) + iteration = 0 + charFrequency = {} + modes = {} + delims = {} + start, end = 0, min(chunkLength, len(data)) + while start < len(data): + iteration += 1 + for line in data[start:end]: + for char in ascii: + metaFrequency = charFrequency.get(char, {}) + # must count even if frequency is 0 + freq = line.count(char) + # value is the mode + metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 + charFrequency[char] = metaFrequency + + for char in charFrequency.keys(): + items = list(charFrequency[char].items()) + if len(items) == 1 and items[0][0] == 0: + continue + # get the mode of the frequencies + if len(items) > 1: + modes[char] = max(items, key=lambda x: x[1]) + # adjust the mode - subtract the sum of all + # other frequencies + items.remove(modes[char]) + modes[char] = (modes[char][0], modes[char][1] + - sum(item[1] for item in items)) + else: + modes[char] = items[0] + + # build a list of possible delimiters + modeList = modes.items() + total = float(chunkLength * iteration) + # (rows of consistent data) / (number of rows) = 100% + consistency = 1.0 + # minimum consistency threshold + threshold = 0.9 + while len(delims) == 0 and consistency >= threshold: + for k, v in modeList: + if v[0] > 0 and v[1] > 0: + if ((v[1]/total) >= consistency and + (delimiters is None or k in delimiters)): + delims[k] = v + consistency -= 0.01 + 
+ if len(delims) == 1: + delim = list(delims.keys())[0] + skipinitialspace = (data[0].count(delim) == + data[0].count("%c " % delim)) + return (delim, skipinitialspace) + + # analyze another chunkLength lines + start = end + end += chunkLength + + if not delims: + return ('', 0) + + # if there's more than one, fall back to a 'preferred' list + if len(delims) > 1: + for d in self.preferred: + if d in delims.keys(): + skipinitialspace = (data[0].count(d) == + data[0].count("%c " % d)) + return (d, skipinitialspace) + + # nothing else indicates a preference, pick the character that + # dominates(?) + items = [(v,k) for (k,v) in delims.items()] + items.sort() + delim = items[-1][1] + + skipinitialspace = (data[0].count(delim) == + data[0].count("%c " % delim)) + return (delim, skipinitialspace) + + + def has_header(self, sample): + # Creates a dictionary of types of data in each column. If any + # column is of a single type (say, integers), *except* for the first + # row, then the first row is presumed to be labels. If the type + # can't be determined, it is assumed to be a string in which case + # the length of the string is the determining factor: if all of the + # rows except for the first are the same length, it's a header. + # Finally, a 'vote' is taken at the end for each column, adding or + # subtracting from the likelihood of the first row being a header. 
+ + rdr = reader(StringIO(sample), self.sniff(sample)) + + header = next(rdr) # assume first row is header + + columns = len(header) + columnTypes = {} + for i in range(columns): columnTypes[i] = None + + checked = 0 + for row in rdr: + # arbitrary number of rows to check, to keep it sane + if checked > 20: + break + checked += 1 + + if len(row) != columns: + continue # skip rows that have irregular number of columns + + for col in list(columnTypes.keys()): + + for thisType in [int, float, complex]: + try: + thisType(row[col]) + break + except (ValueError, OverflowError): + pass + else: + # fallback to length of string + thisType = len(row[col]) + + if thisType != columnTypes[col]: + if columnTypes[col] is None: # add new column type + columnTypes[col] = thisType + else: + # type is inconsistent, remove column from + # consideration + del columnTypes[col] + + # finally, compare results against first row and "vote" + # on whether it's a header + hasHeader = 0 + for col, colType in columnTypes.items(): + if type(colType) == type(0): # it's a length + if len(header[col]) != colType: + hasHeader += 1 + else: + hasHeader -= 1 + else: # attempt typecast + try: + colType(header[col]) + except (ValueError, TypeError): + hasHeader += 1 + else: + hasHeader -= 1 + + return hasHeader > 0 |