diff --git a/.gitignore b/.gitignore index e6fac63..33f6a3f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,12 @@ *.swp *.egg-info *.pyc +*.eggs *.DS_Store */_build/* *.py~ *.~lock.*# +.coverage +dist/* +.tox/* +pyenv3 diff --git a/.travis.yml b/.travis.yml index bd19ad7..e6af8c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: python python: - - "2.6" - "2.7" - "3.4" + - "3.5" install: # Fix for html5lib, probably can be removed after the version after # 0.999999999/1.0b10 is released. diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b682622..0000000 --- a/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM ubuntu:14.04 - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && \ - apt-get install -y \ - python-pip \ - python-dev - -RUN apt-get install -y python-numpy python-lxml -RUN apt-get install -y python3 python3-pip python3-lxml python3-nose -# chardet version is out of date; old version doesn't detect UTF8 w/ BOM -RUN pip3 install --upgrade chardet -RUN apt-get install -y python-nose -RUN locale-gen en_GB.UTF-8 - -RUN mkdir /home/messytables && \ - chown nobody /home/messytables -USER nobody -ENV HOME=/home/messytables \ - PATH=/home/messytables/.local/bin:$PATH \ - LANG=en_GB.UTF-8 -# LANG needed for httpretty install on Py3 -WORKDIR /home/messytables - -COPY ./requirements-test.txt /home/messytables/ -RUN pip install --user -r /home/messytables/requirements-test.txt -RUN pip3 install --user -r /home/messytables/requirements-test.txt -RUN pip install --user pdftables -COPY . /home/messytables/ diff --git a/Makefile b/Makefile index c5cf657..d22fbb6 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,4 @@ -run: build - @docker run \ - --rm \ - -ti \ - messytables +test: + nosetests --with-coverage --cover-package=messytables --cover-erase -build: - @docker build -t messytables . - -.PHONY: run build +.PHONY: run build test diff --git a/README.md b/README.md index 5d83641..f160196 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,4 @@ -# Parsing for messy tables - -[![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) -[![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) -[![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/) +# Parsing for messy tables [![Build Status](https://travis-ci.org/okfn/messytables.png?branch=master)](https://travis-ci.org/okfn/messytables) [![Coverage Status](https://coveralls.io/repos/okfn/messytables/badge.png?branch=master)](https://coveralls.io/r/okfn/messytables?branch=master) [![Latest Version](https://img.shields.io/pypi/v/messytables.svg)](https://pypi.python.org/pypi/messytables/) A library for dealing with messy tabular data in several formats, guessing types and detecting headers. @@ -10,6 +6,6 @@ See the documentation at: https://messytables.readthedocs.io Find the package at: https://pypi.python.org/pypi/messytables -See CONTRIBUTING.md for how to send patches, run tests. +See ``CONTRIBUTING.md`` for how to send patches, run tests. **Contact**: Open Knowledge Labs - http://okfnlabs.org/contact/. We especially recommend the forum: http://discuss.okfn.org/category/open-knowledge-labs/ diff --git a/doc/index.rst b/doc/index.rst index 176e3f6..bb4c8f3 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -182,15 +182,8 @@ of a given column into all types and searching for the best match. .. automethod:: messytables.types.type_guess -The supported types include: - -.. autoclass:: messytables.types.StringType -.. autoclass:: messytables.types.IntegerType -.. autoclass:: messytables.types.FloatType -.. autoclass:: messytables.types.DecimalType -.. autoclass:: messytables.types.BoolType -.. autoclass:: messytables.types.DateType -.. autoclass:: messytables.types.DateUtilType +The supported types are detailed in +`typecast `_ Headers detection ----------------- diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..c1ca1ba 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -1,25 +1,21 @@ from messytables.util import offset_processor, null_processor -from messytables.headers import headers_guess, headers_processor, headers_make_unique +from messytables.headers import headers_guess, headers_processor +from messytables.headers import headers_make_unique from messytables.types import type_guess, types_processor -from messytables.types import StringType, IntegerType, FloatType, \ - DecimalType, DateType, DateUtilType, BoolType from messytables.error import ReadError -from messytables.core import Cell, TableSet, RowSet, seekable_stream -from messytables.commas import CSVTableSet, CSVRowSet +from messytables.buffered import seekable_stream +from messytables.core import Cell, TableSet, RowSet +from messytables.commas import CSVTableSet, CSVRowSet, TSVTableSet from messytables.ods import ODSTableSet, ODSRowSet from messytables.excel import XLSTableSet, XLSRowSet - -# XLSXTableSet has been deprecated and its functionality is now provided by -# XLSTableSet. This is to retain backwards compatibility with anyone -# constructing XLSXTableSet directly (rather than using any_tableset) -XLSXTableSet = XLSTableSet -XLSXRowSet = XLSRowSet - from messytables.zip import ZIPTableSet from messytables.html import HTMLTableSet, HTMLRowSet from messytables.pdf import PDFTableSet, PDFRowSet -from messytables.any import any_tableset, AnyTableSet +from messytables.any import any_tableset from messytables.jts import rowset_as_jts, headers_and_typed_as_jts + +import warnings +warnings.filterwarnings('ignore', "Coercing non-XML name") diff --git a/messytables/any.py b/messytables/any.py index fd9dfc5..477e725 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,8 +1,10 @@ -from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, - HTMLTableSet, ODSTableSet) -import messytables import re +from messytables import ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet +from messytables import HTMLTableSet, ODSTableSet, TSVTableSet +from messytables.buffered import seekable_stream +from messytables.error import ReadError + MIMELOOKUP = {'application/x-zip-compressed': 'ZIP', 'application/zip': 'ZIP', @@ -30,10 +32,8 @@ 'application/x-vnd.oasis.opendocument.spreadsheet': 'ODS', } -def TABTableSet(fileobj): - return CSVTableSet(fileobj, delimiter='\t') -parsers = {'TAB': TABTableSet, +parsers = {'TAB': TSVTableSet, 'ZIP': ZIPTableSet, 'XLS': XLSTableSet, 'HTML': HTMLTableSet, @@ -63,7 +63,7 @@ def get_mime(fileobj): import magic # Since we need to peek the start of the stream, make sure we can # seek back later. If not, slurp in the contents into a StringIO. - fileobj = messytables.seekable_stream(fileobj) + fileobj = seekable_stream(fileobj) header = fileobj.read(4096) mimetype = magic.from_buffer(header, mime=True) fileobj.seek(0) @@ -161,13 +161,6 @@ def any_tableset(fileobj, mimetype=None, extension='', auto_detect=True, **kw): mimetype=magic_mime)) if error: - raise messytables.ReadError('any: \n'.join(error)) + raise ReadError('any: \n'.join(error)) else: - raise messytables.ReadError("any: Did not attempt any detection.") - - -class AnyTableSet: - '''Deprecated - use any_tableset instead.''' - @staticmethod - def from_fileobj(fileobj, mimetype=None, extension=None): - return any_tableset(fileobj, mimetype=mimetype, extension=extension) + raise ReadError("any: Did not attempt any detection.") diff --git a/messytables/buffered.py b/messytables/buffered.py new file mode 100644 index 0000000..dd4daf8 --- /dev/null +++ b/messytables/buffered.py @@ -0,0 +1,89 @@ +import io + +BUFFER_SIZE = 4096 + + +def seekable_stream(fileobj): + try: + fileobj.seek(0) + # if we got here, the stream is seekable + return fileobj + except: + # otherwise seek failed, so slurp in stream and wrap + # it in a BytesIO + return BufferedFile(fileobj) + + +class BufferedFile(object): + """A buffered file that preserves the beginning of a stream.""" + + def __init__(self, fp, buffer_size=BUFFER_SIZE + 2): + self.data = io.BytesIO() + self.fp = fp + self.offset = 0 + self.len = 0 + self.fp_offset = 0 + self.buffer_size = buffer_size + + def _next_line(self): + try: + return self.fp.readline() + except AttributeError: + return next(self.fp) + + def _read(self, n): + return self.fp.read(n) + + @property + def _buffer_full(self): + return self.len >= self.buffer_size + + def readline(self): + if self.len < self.offset < self.fp_offset: + raise BufferError('Line is not available anymore') + if self.offset >= self.len: + line = self._next_line() + self.fp_offset += len(line) + + self.offset += len(line) + + if not self._buffer_full: + self.data.write(line) + self.len += len(line) + else: + line = self.data.readline() + self.offset += len(line) + return line + + def read(self, n=-1): + if n == -1: + # if the request is to do a complete read, then do a complete + # read. + self.data.seek(self.offset) + return self.data.read(-1) + self.fp.read(-1) + + if self.len < self.offset < self.fp_offset: + raise BufferError('Data is not available anymore') + if self.offset >= self.len: + byte = self._read(n) + self.fp_offset += len(byte) + + self.offset += len(byte) + + if not self._buffer_full: + self.data.write(byte) + self.len += len(byte) + else: + byte = self.data.read(n) + self.offset += len(byte) + return byte + + def tell(self): + return self.offset + + def seek(self, offset): + if self.len < offset < self.fp_offset: + raise BufferError('Cannot seek because data is not buffered here') + self.offset = offset + if offset < self.len: + self.data.seek(offset) diff --git a/messytables/commas.py b/messytables/commas.py index 65dd999..1a75613 100644 --- a/messytables/commas.py +++ b/messytables/commas.py @@ -1,194 +1,140 @@ +import re import csv -import codecs -import chardet +import six +import logging +from messytables.buffered import BUFFER_SIZE +from messytables.text import analyze_stream from messytables.core import RowSet, TableSet, Cell -import messytables -from messytables.compat23 import unicode_string, byte_string, native_string, PY2 +from messytables.error import ReadError +DELIMITERS = ['\t', ',', ';', '|'] +LINE_SEPARATOR = ['\r\n', '\r', '\n', '\0'] -class UTF8Recoder: - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ - - # maps between chardet encoding and codecs bom keys - BOM_MAPPING = { - 'utf-16le': 'BOM_UTF16_LE', - 'utf-16be': 'BOM_UTF16_BE', - 'utf-32le': 'BOM_UTF32_LE', - 'utf-32be': 'BOM_UTF32_BE', - 'utf-8': 'BOM_UTF8', - 'utf-8-sig': 'BOM_UTF8', - - } - - def __init__(self, f, encoding): - sample = f.read(2000) - if not encoding: - results = chardet.detect(sample) - encoding = results['encoding'] - if not encoding: - # Don't break, just try and load the data with - # a semi-sane encoding - encoding = 'utf-8' - f.seek(0) - self.reader = codecs.getreader(encoding)(f, 'ignore') - - # The reader only skips a BOM if the encoding isn't explicit about its - # endianness (i.e. if encoding is UTF-16 a BOM is handled properly - # and taken out, but if encoding is UTF-16LE a BOM is ignored). - # However, if chardet sees a BOM it returns an encoding with the - # endianness explicit, which results in the codecs stream leaving the - # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} - # encodings, check for a BOM and remove it if it's there. - if encoding.lower() in self.BOM_MAPPING: - bom = getattr(codecs, self.BOM_MAPPING[encoding.lower()], None) - if bom: - # Try to read the BOM, which is a byte sequence, from - # the underlying stream. If all characters match, then - # go on. Otherwise when a character doesn't match, seek - # the stream back to the beginning and go on. - for c in bom: - if f.read(1) != c: - f.seek(0) - break - - def __iter__(self): - return self - - def __next__(self): - line = self.reader.readline() - if not line or line == '\0': - raise StopIteration - result = line.encode("utf-8") - return result - - next = __next__ - - -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, byte_string): - obj = unicode_string(obj, encoding) - return obj +# Fix the maximum field size to something a little larger +csv.field_size_limit(256000) +log = logging.getLogger(__name__) class CSVTableSet(TableSet): - """ A CSV table set. Since CSV is always just a single table, - this is just a pass-through for the row set. """ + """A CSV table set. + + Since CSV is always just a single table, this is just a pass-through for + the row set. + """ def __init__(self, fileobj, delimiter=None, quotechar=None, name=None, - encoding=None, window=None, doublequote=None, - lineterminator=None, skipinitialspace=None, **kw): - self.fileobj = messytables.seekable_stream(fileobj) - self.name = name or 'table' - self.delimiter = delimiter - self.quotechar = quotechar - self.encoding = encoding - self.window = window - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + self._tables = [CSVRowSet(name or 'table', fileobj, + delimiter=delimiter, + quotechar=quotechar, + encoding=encoding, + window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace)] def make_tables(self): - """ Return the actual CSV table. """ - return [CSVRowSet(self.name, self.fileobj, - delimiter=self.delimiter, - quotechar=self.quotechar, - encoding=self.encoding, - window=self.window, - doublequote=self.doublequote, - lineterminator=self.lineterminator, - skipinitialspace=self.skipinitialspace)] + """Return the actual CSV table.""" + return self._tables + + +class TSVTableSet(CSVTableSet): + """A TSV table set. + + This is a slightly specialised version of the CSVTableSet that will always + generate a tab-based table parser. + """ + + def __init__(self, fileobj, quotechar=None, name=None, + encoding=None, window=1000, doublequote=True, + skipinitialspace=None, **kw): + super(TSVTableSet, self).__init__(fileobj, delimiter='\t', + quotechar=quotechar, name=name, + encoding=encoding, window=window, + doublequote=doublequote, + skipinitialspace=skipinitialspace, + **kw) class CSVRowSet(RowSet): - """ A CSV row set is an iterator on a CSV file-like object + """A CSV row set is an iterator on a CSV file-like object. + (which can potentially be infinetly large). When loading, a sample is read and cached so you can run analysis on the - fragment. """ + fragment. + """ def __init__(self, name, fileobj, delimiter=None, quotechar=None, - encoding='utf-8', window=None, doublequote=None, - lineterminator=None, skipinitialspace=None): + encoding=None, window=1000, doublequote=None, + skipinitialspace=None): self.name = name - seekable_fileobj = messytables.seekable_stream(fileobj) - self.fileobj = UTF8Recoder(seekable_fileobj, encoding) + self.encoding, self.buf = analyze_stream(fileobj, encoding=encoding) + self.fileobj = fileobj + + # For line breaking, use the (detected) encoding of the file: + linesep = [t.encode(self.encoding) for t in LINE_SEPARATOR] + linesep = b'(' + b'|'.join(linesep) + b')' + self.linesep = re.compile(linesep) - def fake_ilines(fobj): - for row in fobj: - yield row.decode('utf-8') - self.lines = fake_ilines(self.fileobj) self._sample = [] - self.delimiter = delimiter - self.quotechar = quotechar - self.window = window or 1000 - self.doublequote = doublequote - self.lineterminator = lineterminator - self.skipinitialspace = skipinitialspace - try: - for i in range(self.window): - self._sample.append(next(self.lines)) - except StopIteration: - pass - super(CSVRowSet, self).__init__() + self.window = window - @property - def _dialect(self): - delim = '\n' # NATIVE - sample = delim.join(self._sample) try: - dialect = csv.Sniffer().sniff(sample, - delimiters=['\t', ',', ';', '|']) # NATIVE - dialect.delimiter = native_string(dialect.delimiter) - dialect.quotechar = native_string(dialect.quotechar) - dialect.lineterminator = delim - dialect.doublequote = True - return dialect + sample = self.buf.decode(self.encoding) + if six.PY2: + sample = sample.encode('utf-8') + self.dialect = csv.Sniffer().sniff(sample, delimiters=DELIMITERS) except csv.Error: - return csv.excel - - @property - def _overrides(self): - # some variables in the dialect can be overridden - d = {} - if self.delimiter: - d['delimiter'] = self.delimiter - if self.quotechar: - d['quotechar'] = self.quotechar - if self.doublequote: - d['doublequote'] = self.doublequote - if self.lineterminator: - d['lineterminator'] = self.lineterminator - if self.skipinitialspace is not None: - d['skipinitialspace'] = self.skipinitialspace - return d + self.dialect = csv.excel + # override detected dialect with constructor values. + self.dialect.delimiter = delimiter or str(self.dialect.delimiter) + self.dialect.quotechar = quotechar or str(self.dialect.quotechar) + if skipinitialspace is not None: + self.dialect.skipinitialspace = skipinitialspace + if doublequote is not None: + self.dialect.doublequote = doublequote + super(CSVRowSet, self).__init__() - def raw(self, sample=False): - def rows(): - for line in self._sample: - if PY2: - yield line.encode('utf-8') + def get_lines(self, sample=False): + for line in self._sample: + yield line + + while True: + if self.buf is None: + break + if sample and len(self._sample) >= self.window: + break + match = self.linesep.search(self.buf) + if match is not None: + line = self.buf[:match.end(0)] + self.buf = self.buf[match.end(0):] + else: + buf = self.fileobj.read(BUFFER_SIZE) + if len(buf): + self.buf += buf + continue else: - yield line - if not sample: - for line in self.lines: - if PY2: - yield line.encode('utf-8') - else: - yield line + line, self.buf = self.buf, None + + line = line.decode(self.encoding) + if six.PY2: + line = line.encode('utf-8') - # Fix the maximum field size to something a little larger - csv.field_size_limit(256000) + if line in LINE_SEPARATOR or not len(line): + continue + if self.window >= len(self._sample): + self._sample.append(line) + yield line + + def raw(self, sample=False): try: - for row in csv.reader(rows(), - dialect=self._dialect, **self._overrides): - yield [Cell(to_unicode_or_bust(c)) for c in row] + for row in csv.reader(self.get_lines(sample=sample), + dialect=self.dialect): + if six.PY2: + row = [c.decode('utf-8') for c in row] + yield [Cell(c) for c in row] except csv.Error as err: - if u'newline inside string' in unicode_string(err) and sample: - pass - elif u'line contains NULL byte' in unicode_string(err): - pass - else: - raise messytables.ReadError('Error reading CSV: %r', err) + if 'new-line character' not in repr(err): + raise ReadError('Error reading CSV: %r', err) diff --git a/messytables/compat23.py b/messytables/compat23.py deleted file mode 100644 index 7970666..0000000 --- a/messytables/compat23.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -PY2 = sys.version_info[0] == 2 -if PY2: - import urllib2 - from itertools import izip_longest - unicode_string = unicode - native_string = str - byte_string = str - string_types = (str, unicode) - urlopen = urllib2.urlopen -else: # i.e. PY3 - import urllib.request - from itertools import zip_longest as izip_longest - unicode_string = str - native_string = str - byte_string = bytes - - string_types = (str,) - urlopen = urllib.request.urlopen diff --git a/messytables/core.py b/messytables/core.py index 28ad7eb..7adc9df 100644 --- a/messytables/core.py +++ b/messytables/core.py @@ -1,94 +1,15 @@ -from messytables.util import OrderedDict from collections import Mapping -from messytables.error import TableError, NoSuchPropertyError -import io -from messytables.compat23 import * - -def seekable_stream(fileobj): - try: - fileobj.seek(0) - # if we got here, the stream is seekable - except: - # otherwise seek failed, so slurp in stream and wrap - # it in a BytesIO - fileobj = BufferedFile(fileobj) - return fileobj - - -class BufferedFile(object): - ''' A buffered file that preserves the beginning of - a stream up to buffer_size - ''' - def __init__(self, fp, buffer_size=2048): - self.data = io.BytesIO() - self.fp = fp - self.offset = 0 - self.len = 0 - self.fp_offset = 0 - self.buffer_size = buffer_size - - def _next_line(self): - try: - return self.fp.readline() - except AttributeError: - return next(self.fp) - - def _read(self, n): - return self.fp.read(n) - - @property - def _buffer_full(self): - return self.len >= self.buffer_size - - def readline(self): - if self.len < self.offset < self.fp_offset: - raise BufferError('Line is not available anymore') - if self.offset >= self.len: - line = self._next_line() - self.fp_offset += len(line) - - self.offset += len(line) +try: + # python 2.7: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict # noqa - if not self._buffer_full: - self.data.write(line) - self.len += len(line) - else: - line = self.data.readline() - self.offset += len(line) - return line - - def read(self, n=-1): - if n == -1: - # if the request is to do a complete read, then do a complete - # read. - self.data.seek(self.offset) - return self.data.read(-1) + self.fp.read(-1) - - if self.len < self.offset < self.fp_offset: - raise BufferError('Data is not available anymore') - if self.offset >= self.len: - byte = self._read(n) - self.fp_offset += len(byte) - - self.offset += len(byte) - - if not self._buffer_full: - self.data.write(byte) - self.len += len(byte) - else: - byte = self.data.read(n) - self.offset += len(byte) - return byte - def tell(self): - return self.offset +from six import text_type, string_types +from typecast import String - def seek(self, offset): - if self.len < offset < self.fp_offset: - raise BufferError('Cannot seek because data is not buffered here') - self.offset = offset - if offset < self.len: - self.data.seek(offset) +from messytables.error import TableError, NoSuchPropertyError class CoreProperties(Mapping): @@ -108,15 +29,16 @@ def __len__(self): class Cell(object): - """ A cell is the basic value type. It always has a ``value`` (that - may be ``None`` and may optionally also have a type and column name - associated with it. If no ``type`` is set, the String type is set - but no type conversion is set. """ + """A cell is the basic value type. + + It always has a ``value`` (that may be ``None`` and may optionally + also have a type and column name associated with it. If no ``type`` + is set, the String type is set but no type conversion is set. + """ def __init__(self, value, column=None, type=None): if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.value = value self.column = column self.column_autogenerated = False @@ -130,38 +52,38 @@ def __repr__(self): @property def empty(self): - """ Stringify the value and check that it has a length. """ + """Stringify the value and check that it has a length.""" if self.value is None: return True value = self.value if not isinstance(value, string_types): - value = unicode_string(value) + value = text_type(value) if len(value.strip()): return False return True @property def properties(self): - """ Source-specific information. Only a placeholder here. """ + """Source-specific information. Only a placeholder here.""" return CoreProperties() @property def topleft(self): - """ - Is the cell the top-left of a span? Non-spanning cells are the top left. - - This is used for example in HTML generation where the top left cell - is the only one which is written into the output representation. + """Non-spanning cells are the top left. + This is used for example in HTML generation where the top left + cell is the only one which is written into the output representation. In absense of other knowledge, we assume that all cells are top left. """ + # This seems oddly over-specific, can we solve it otherwise? return True class TableSet(object): - """ A table set is used for data formats in which multiple tabular - objects are bundled. This might include relational databases and - workbooks used in spreadsheet software (Excel, LibreOffice). + """A table set bundles multiple tabular objects. + + This might include relational databases and workbooks used in spreadsheet + software (Excel, LibreOffice). For each format, we derive from this abstract base class, providing a constructor that takes a file object and tables() that returns each table. @@ -170,14 +92,14 @@ class TableSet(object): On any fatal errors, it should raise messytables.ReadError """ + def __init__(self, fileobj): - """ Store the fileobj, and perhaps all or part of the file. """ + """Store the fileobj, and perhaps all or part of the file.""" pass @property def tables(self): - """ Return a listing of tables (i.e. RowSets) in the ``TableSet``. - Each table has a name. """ + """Get a listing of ``RowSets``.""" if getattr(self, "_tables", None) is None: self._tables = self.make_tables() return self._tables @@ -185,8 +107,9 @@ def tables(self): def make_tables(self): raise NotImplementedError("make_tables() not implemented on {0}" .format(type(self))) + def __getitem__(self, name): - """ Return a RowSet based on the name given """ + """Return a RowSet based on the name given.""" matching = [table for table in self.tables if table.name == name] if not matching: raise TableError("No table called %r" % name) @@ -196,16 +119,18 @@ def __getitem__(self, name): @classmethod def from_fileobj(cls, fileobj, *args, **kwargs): - """ Deprecated, only for compatibility reasons """ + """Deprecated, only for compatibility reasons.""" return cls(fileobj, *args, **kwargs) class RowSet(object): - """ A row set (aka: table) is a simple wrapper for an iterator of - rows (which in turn is a list of ``Cell`` objects). The main table - iterable can only be traversed once, so on order to allow analytics - like type and header guessing on the data, a sample of ``window`` - rows is read, cached, and made available. + """A single table, which allows iterating over individual rows. + + A row set (aka: table) is a simple wrapper for an iterator of rows + (which in turn is a list of ``Cell`` objects). The main table iterable + can only be traversed once, so on order to allow analytics like type and + header guessing on the data, a sample of ``window`` rows is read, cached, + and made available. On any fatal errors, it should raise messytables.ReadError """ @@ -225,14 +150,15 @@ def get_types(self): types = property(get_types, set_types) def register_processor(self, processor): - """ Register a stream processor to be used on each row. A - processor is a function called with the ``RowSet`` as its - first argument and the row to be processed as the second - argument. """ + """Register a stream processor to be used on each row. + + A processor is a function called with the ``RowSet`` as its first + argument and the row to be processed as the second argument. + """ self._processors.append(processor) def __iter__(self, sample=False): - """ Apply processors to the row data. """ + """Apply processors to the row data.""" for row in self.raw(sample=sample): for processor in self._processors: row = processor(self, row) @@ -249,10 +175,11 @@ def sample(self): return self.__iter__(sample=True) def dicts(self, sample=False): - """ Return a representation of the data as an iterator of - ordered dictionaries. This is less specific than the cell - format returned by the generic iterator but only gives a - subset of the information. """ + """Return the table data as an iterator of ordered dictionaries. + + This is less specific than the cell format returned by the generic + iterator but only gives a subset of the information. + """ generator = self.sample if sample else self for row in generator: yield OrderedDict([(c.column, c.value) for c in row]) diff --git a/messytables/dateparser.py b/messytables/dateparser.py deleted file mode 100644 index 05d7c93..0000000 --- a/messytables/dateparser.py +++ /dev/null @@ -1,66 +0,0 @@ -import re - -date_regex = re.compile(r'''^\d{1,4}[-\/\.\s]\S+[-\/\.\s]\S+''') - - -def is_date(value): - return len(value) != 1 and date_regex.match(value) - - -def create_date_formats(day_first=True): - """generate combinations of time and date - formats with different delimeters - """ - - if day_first: - date_formats = ['dd/mm/yyyy', 'dd/mm/yy', 'yyyy/mm/dd'] - python_date_formats = ['%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d'] - else: - date_formats = ['mm/dd/yyyy', 'mm/dd/yy', 'yyyy/mm/dd'] - python_date_formats = ['%m/%d/%Y', '%m/%d/%y', '%Y/%m/%d'] - - date_formats += [ - # Things with words in - 'dd/bb/yyyy', 'dd/bbb/yyyy' - ] - python_date_formats += [ - # Things with words in - '%d/%b/%Y', '%d/%B/%Y' - ] - - both_date_formats = list(zip(date_formats, python_date_formats)) - - #time_formats = "hh:mmz hh:mm:ssz hh:mmtzd hh:mm:sstzd".split() - time_formats = "hh:mm:ssz hh:mm:ss hh:mm:sstzd".split() - python_time_formats = "%H:%M%Z %H:%M:%S %H:%M:%S%Z %H:%M%z %H:%M:%S%z".split() - both_time_formats = list(zip(time_formats, python_time_formats)) - - #date_separators = ["-","."," ","","/","\\"] - date_separators = ["-", ".", "/", " "] - - all_date_formats = [] - - for separator in date_separators: - for date_format, python_date_format in both_date_formats: - all_date_formats.append( - (date_format.replace("/", separator), - python_date_format.replace("/", separator)) - ) - - all_formats = {} - - for date_format, python_date_format in all_date_formats: - all_formats[date_format] = python_date_format - for time_format, python_time_format in both_time_formats: - - all_formats[date_format + time_format] = \ - python_date_format + python_time_format - - all_formats[date_format + "T" + time_format] =\ - python_date_format + "T" + python_time_format - - all_formats[date_format + " " + time_format] =\ - python_date_format + " " + python_time_format - return list(all_formats.values()) - -DATE_FORMATS = create_date_formats() diff --git a/messytables/error.py b/messytables/error.py index a65429c..4996bbd 100644 --- a/messytables/error.py +++ b/messytables/error.py @@ -1,16 +1,19 @@ + class MessytablesError(Exception): - """A generic error to inherit from""" + """A generic error to inherit from.""" class ReadError(MessytablesError): - '''Error reading the file/stream in terms of the expected format.''' - pass + """Error reading the file/stream in terms of the expected format.""" class TableError(MessytablesError, LookupError): """Couldn't identify correct table.""" - pass + class NoSuchPropertyError(MessytablesError, KeyError): - """The requested property doesn't exist""" - pass + """The requested property doesn't exist.""" + + +class InvalidDateError(Exception): + """Invalid date in structured data sources.""" diff --git a/messytables/excel.py b/messytables/excel.py index 9d30131..93c8004 100644 --- a/messytables/excel.py +++ b/messytables/excel.py @@ -1,69 +1,54 @@ -import sys from datetime import datetime, time -import xlrd from xlrd.biffh import XLRDError +from xlrd import open_workbook, xldate_as_tuple +from typecast import String, Integer, Date, Float from messytables.core import RowSet, TableSet, Cell, CoreProperties -from messytables.types import (StringType, IntegerType, - DateType, FloatType) -from messytables.error import ReadError -from messytables.compat23 import PY2 - -class InvalidDateError(Exception): - pass +from messytables.error import ReadError, InvalidDateError XLS_TYPES = { - 1: StringType(), + 1: String, # NB: Excel does not distinguish floats from integers so we use floats # We could try actual type detection between floats and ints later # or use the excel format string info - see # https://groups.google.com/forum/?fromgroups=#!topic/ # python-excel/cAQ1ndsCVxk - 2: FloatType(), - 3: DateType(None), + 2: Float, + 3: Date, # this is actually boolean but we do not have a boolean type yet - 4: IntegerType() + 4: Integer } class XLSTableSet(TableSet): - """An excel workbook wrapper object. - """ + """An excel workbook wrapper object.""" def __init__(self, fileobj=None, filename=None, window=None, encoding=None, with_formatting_info=True, **kw): - '''Initialize the tableset. + """Initilize the tableset. :param encoding: passed on to xlrd.open_workbook function as encoding_override - :param with_formatting_info: passed to xlrd to get font details of cells - ''' + :param with_formatting_info: whether xlrd should provide details + of the cells contents (e.g. colour, borders, etc. + Not sure what the behaviour of properties is with this turned off. + Turning this on apparently may have memory implications in xlrd. + + The convoluted "try it with with_formatting_info, then try it without" + is necessary because xlrd doesn't currently support getting this + information from XLSX files. Workarounds include converting the XLSX + document in LibreOffice. + """ def get_workbook(): try: - return xlrd.open_workbook( + return open_workbook( filename=filename, file_contents=read_obj, encoding_override=encoding, formatting_info=with_formatting_info) - except XLRDError as e: - _, value, traceback = sys.exc_info() - if PY2: - raise ReadError("Can't read Excel file: %r" % value, traceback) - else: - raise ReadError("Can't read Excel file: %r" % value).with_traceback(traceback) - '''Initilize the tableset. + except XLRDError as xlrdexc: + raise ReadError("Can't read Excel file: %r" % xlrdexc) - :param encoding: passed on to xlrd.open_workbook function - as encoding_override - :param with_formatting_info: whether xlrd should provide details - of the cells contents (e.g. colour, borders, etc. - Not sure what the behaviour of properties is with this turned off. - Turning this on apparently may have memory implications in xlrd. - - The convoluted "try it with with_formatting_info, then try it without" is - necessary because xlrd doesn't currently support getting this information - from XLSX files. Workarounds include converting the XLSX document in LibreOffice. - ''' self.window = window if not filename and not fileobj: @@ -76,23 +61,24 @@ def get_workbook(): try: self.workbook = get_workbook() - except NotImplementedError as e: + except NotImplementedError: if not with_formatting_info: raise else: - with_formatting_info=False + with_formatting_info = False self.workbook = get_workbook() - def make_tables(self): - """ Return the sheets in the workbook. """ + """Return the sheets in the workbook.""" return [XLSRowSet(name, self.workbook.sheet_by_name(name), self.window) for name in self.workbook.sheet_names()] class XLSRowSet(RowSet): - """ Excel support for a single sheet in the excel workbook. Unlike - the CSV row set this is not a streaming operation. """ + """Excel support for a single sheet in the excel workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, name, sheet, window=None): self.name = name @@ -101,38 +87,47 @@ def __init__(self, name, sheet, window=None): super(XLSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. Types are automatically - converted according to the excel data types specified, including - conversion of excel dates, which are notoriously buggy. """ + """Iterate over all rows in this sheet. + + Types are automatically converted according to the excel data types + specified, including conversion of excel dates, which are notoriously + buggy. + """ num_rows = self.sheet.nrows - for rownum in range(min(self.window, num_rows) if sample else num_rows): + num_rows = min(self.window, num_rows) if sample else num_rows + for rownum in range(num_rows): row = [] for colnum, cell in enumerate(self.sheet.row(rownum)): try: - row.append(XLSCell.from_xlrdcell(cell, self.sheet, colnum, rownum)) + row.append(XLSCell.from_xlrdcell(cell, self.sheet, + colnum, rownum)) except InvalidDateError: - raise ValueError("Invalid date at '%s':%d,%d" % ( - self.sheet.name, colnum+1, rownum+1)) + raise ValueError("Invalid date at '%s':%d,%d" % + (self.sheet.name, colnum + 1, rownum + 1)) yield row + class XLSCell(Cell): - @staticmethod - def from_xlrdcell(xlrd_cell, sheet, col, row): + + @classmethod + def get_xl_date(cls, sheet, value): + if value == 0: + return None + date = xldate_as_tuple(value, sheet.book.datemode) + year, month, day, hour, minute, second = date + return datetime(year, month, day, hour, minute, second) + + @classmethod + def from_xlrdcell(cls, xlrd_cell, sheet, col, row): value = xlrd_cell.value - cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType()) - if cell_type == DateType(None): - if value == 0: - raise InvalidDateError - year, month, day, hour, minute, second = \ - xlrd.xldate_as_tuple(value, sheet.book.datemode) - if (year, month, day) == (0, 0, 0): - value = time(hour, minute, second) - else: - value = datetime(year, month, day, hour, minute, second) - messy_cell = XLSCell(value, type=cell_type) + cell_type = XLS_TYPES.get(xlrd_cell.ctype, String) + if cell_type == Date: + value = cls.get_xl_date(sheet, value) + messy_cell = XLSCell(value, type=cell_type()) messy_cell.sheet = sheet messy_cell.xlrd_cell = xlrd_cell - messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y) + # necessary for properties, note not (x,y) + messy_cell.xlrd_pos = (row, col) return messy_cell @property @@ -143,10 +138,13 @@ def topleft(self): def properties(self): return XLSProperties(self) + class XLSProperties(CoreProperties): + KEYS = ['bold', 'size', 'italic', 'font_name', 'strikeout', 'underline', 'font_colour', 'background_colour', 'any_border', 'all_border', 'richtext', 'blank', 'a_date', 'formatting_string'] + def __init__(self, cell): self.cell = cell self.merged = {} @@ -165,13 +163,19 @@ def formatting(self): @property def rich(self): - """returns a tuple of character position, font number which starts at that position - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute""" - return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, None) + """Return a tuple of character position, font number. + + Starts at that position: + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.rich_text_runlist_map-attribute + """ + return self.cell.sheet.rich_text_runlist_map.get(self.cell.xlrd_pos, + None) def raw_span(self, always=False): - """return the bounding box of the cells it's part of. - https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute""" + """Return the bounding box of the cells it's part of. + + https://secure.simplistix.co.uk/svn/xlrd/trunk/xlrd/doc/xlrd.html?p=4966#sheet.Sheet.merged_cells-attribute + """ row, col = self.cell.xlrd_pos for box in self.cell.sheet.merged_cells: rlo, rhi, clo, chi = box @@ -207,7 +211,7 @@ def get_bold(self): return self.font.weight > 500 def get_size(self): - """in pixels""" + """In pixels.""" return self.font.height / 20.0 def get_italic(self): @@ -224,15 +228,18 @@ def get_underline(self): def get_font_colour(self): # TODO - return self.font.color_index ## more lookup required + return self.font.color_index # more lookup required def get_blank(self): """Note that cells might not exist at all. - Behaviour for spanned cells might be complicated: hence this function""" + + Behaviour for spanned cells might be complicated: hence this function + """ return self.cell.value == '' def get_background_colour(self): - return self.xf.background.background_color_index ## more lookup required + # more lookup required: + return self.xf.background.background_color_index def get_any_border(self): b = self.xf.border @@ -243,4 +250,3 @@ def get_all_border(self): b = self.xf.border return b.top_line_style > 0 and b.bottom_line_style > 0 and \ b.left_line_style > 0 and b.right_line_style > 0 - diff --git a/messytables/headers.py b/messytables/headers.py index a50ebc7..cd53d39 100644 --- a/messytables/headers.py +++ b/messytables/headers.py @@ -1,12 +1,15 @@ +import six from collections import defaultdict -from messytables.compat23 import izip_longest +from itertools import islice + from messytables.core import Cell def column_count_modal(rows): - """ Return the modal value of columns in the row_set's - sample. This can be assumed to be the number of columns - of the table. """ + """Return the modal value of columns in the row_set's sample. + + This can be assumed to be the number of columns of the table. + """ counts = defaultdict(int) for row in rows: length = len([c for c in row if not c.empty]) @@ -18,14 +21,15 @@ def column_count_modal(rows): def headers_guess(rows, tolerance=1): - """ Guess the offset and names of the headers of the row set. + """Guess the offset and names of the headers of the row set. + This will attempt to locate the first row within ``tolerance`` of the mode of the number of columns in the row set sample. The return value is a tuple of the offset of the header row and the names of the columns. """ - rows = list(rows) + rows = list(islice(rows, 1000)) modal = column_count_modal(rows) for i, row in enumerate(rows): length = len([c for c in row if not c.empty]) @@ -38,12 +42,13 @@ def headers_guess(rows, tolerance=1): def headers_processor(headers): - """ Add column names to the cells in a row_set. If no header is - defined, use an autogenerated name. """ + """Add column names to the cells in a row_set. + If no header is defined, use an autogenerated name. + """ def apply_headers(row_set, row): _row = [] - pairs = izip_longest(row, headers) + pairs = six.moves.zip_longest(row, headers) for i, (cell, header) in enumerate(pairs): if cell is None: cell = Cell(None) @@ -57,11 +62,12 @@ def apply_headers(row_set, row): def headers_make_unique(headers, max_length=None): - """Make sure the header names are unique. For non-unique - columns, append 1, 2, 3, ... after the name. If max_length - is set, truncate the original string so that the headers are - unique up to that length.""" + """Make sure the header names are unique. + For non-unique columns, append 1, 2, 3, ... after the name. If max_length + is set, truncate the original string so that the headers are unique up to + that length. + """ headers = [h.strip() for h in headers] new_digits_length = 0 diff --git a/messytables/html.py b/messytables/html.py index 2214363..62c59d8 100644 --- a/messytables/html.py +++ b/messytables/html.py @@ -1,9 +1,12 @@ -from messytables.core import RowSet, TableSet, Cell, CoreProperties -import lxml.html from collections import defaultdict -import html5lib import xml.etree.ElementTree as etree +import html5lib +import lxml.html +from typecast import String + +from messytables.core import RowSet, TableSet, Cell, CoreProperties + def fromstring(s): tb = html5lib.getTreeBuilder("lxml", implementation=etree) @@ -12,9 +15,8 @@ def fromstring(s): class HTMLTableSet(TableSet): - """ - A TableSet from a HTML document. - """ + """A TableSet from a HTML document.""" + def __init__(self, fileobj=None, filename=None, window=None, **kw): if filename is not None: @@ -42,9 +44,7 @@ def __init__(self, fileobj=None, filename=None, window=None, **kw): "other tables. This is a bug." # avoid infinite loops def make_tables(self): - """ - Return a listing of tables (as HTMLRowSets) in the table set. - """ + """Return a listing of tables (as HTMLRowSets) in the table set.""" def rowset_name(rowset, table_index): return "Table {0} of {1}".format(table_index + 1, len(self.htmltables)) @@ -68,9 +68,8 @@ def insert_blank_cells(row, blanks): class HTMLRowSet(RowSet): - """ - A RowSet representing a HTML table. - """ + """A RowSet representing a HTML table.""" + def __init__(self, name, sheet, window=None): self.name = name self.sheet = sheet @@ -78,11 +77,8 @@ def __init__(self, name, sheet, window=None): super(HTMLRowSet, self).__init__() def in_table(self, els): - """ - takes a list of xpath elements and returns only those - whose parent table is this one - """ - + # Accept a list of xpath elements and returns only those + # whose parent table is this one return [e for e in els if self.sheet in e.xpath("./ancestor::table[1]")] @@ -134,17 +130,14 @@ def identify_anatomy(tag): class FakeHTMLCell(Cell): + """FakeHTMLCells are not present because of column or row spanning.""" + def __init__(self): super(FakeHTMLCell, self).__init__("") @property def topleft(self): - """ - FakeHTMLCells are those which are not physically present in the HTML - because of column or row spannning. - - See also: HTMLCell.topleft - """ + """See also: HTMLCell.topleft.""" return False @@ -152,27 +145,21 @@ class HTMLCell(Cell): """ The Cell __init__ signature is: def __init__(self, value=None, column=None, type=None): where 'value' is the primary input, 'column' is a column name, and - type is messytables.types.StringType() or better.""" + type is messytables.types.String() or better.""" def __init__(self, value=None, column=None, type=None, source=None): assert value is None assert isinstance(source, lxml.etree._Element) self._lxml = source if type is None: - from messytables.types import StringType - type = StringType() + type = String() self.type = type self.column = column self.column_autogenerated = False @property def topleft(self): - """ - HTMLCells are those which are physically present in the HTML. They are - always the top-left in their span. - - See also: FakeHTMLCell.topleft - """ + """See also: FakeHTMLCell.topleft.""" return True @property @@ -196,7 +183,7 @@ def text_from_element(elem): """ builder = [] for x in elem.iter(): - #print x.tag, x.attrib, x.text, x.tail + # print x.tag, x.attrib, x.text, x.tail if is_invisible_text(x): cell_str = x.tail or '' # handle None values. else: @@ -214,7 +201,6 @@ def is_invisible_text(elem): if 'style' in elem.attrib: if 'display:none' in elem.attrib['style']: flag = True - return flag diff --git a/messytables/jts.py b/messytables/jts.py index 031528f..1bafb68 100644 --- a/messytables/jts.py +++ b/messytables/jts.py @@ -1,45 +1,30 @@ -''' -Convert a rowset to the json table schema -(http://www.dataprotocols.org/en/latest/json-table-schema.html) -''' +"""Convert a rowset to the json table schema. -import messytables +(http://www.dataprotocols.org/en/latest/json-table-schema.html) +""" import jsontableschema -MESSYTABLES_TO_JTS_MAPPING = { - messytables.StringType: 'string', - messytables.IntegerType: 'integer', - messytables.FloatType: 'number', - messytables.DecimalType: 'number', - messytables.DateType: 'date', - messytables.DateUtilType: 'date', - messytables.BoolType: 'boolean' -} - - -def celltype_as_string(celltype): - return MESSYTABLES_TO_JTS_MAPPING[celltype.__class__] +from messytables.headers import headers_guess +from messytables.types import type_guess def rowset_as_jts(rowset, headers=None, types=None): - ''' Create a json table schema from a rowset - ''' - _, headers = messytables.headers_guess(rowset.sample) - types = list(map(celltype_as_string, messytables.type_guess(rowset.sample))) - + """Create a json table schema from a rowset.""" + _, headers = headers_guess(rowset.sample) + types = type_guess(rowset.sample) + types = [t.jts_name for t in types] return headers_and_typed_as_jts(headers, types) def headers_and_typed_as_jts(headers, types): - ''' Create a json table schema from headers and types as - returned from :meth:`~messytables.headers.headers_guess` - and :meth:`~messytables.types.type_guess`. - ''' - j = jsontableschema.JSONTableSchema() + """Create a json table schema from headers and types. + Those specs are returned from :meth:`~messytables.headers.headers_guess` + and :meth:`~messytables.types.type_guess`. + """ + jts = jsontableschema.JSONTableSchema() for field_id, field_type in zip(headers, types): - j.add_field(field_id=field_id, - label=field_id, - field_type=field_type) - - return j + jts.add_field(field_id=field_id, + label=field_id, + field_type=field_type) + return jts diff --git a/messytables/ods.py b/messytables/ods.py index ea7c86e..140c2c6 100644 --- a/messytables/ods.py +++ b/messytables/ods.py @@ -3,11 +3,10 @@ import zipfile from lxml import etree +from typecast import String, Decimal, Date +# TODO: do we add CurrencyType, BoolType, PercentagePage, TimeType to typecast? from messytables.core import RowSet, TableSet, Cell -from messytables.types import (StringType, DecimalType, - DateType, BoolType, CurrencyType, - TimeType, PercentageType) ODS_NAMESPACES_TAG_MATCH = re.compile( @@ -38,8 +37,8 @@ } ODS_TYPES = { - 'float': DecimalType(), - 'date': DateType('%Y-%m-%d'), + 'float': Decimal(), + 'date': Date(), 'boolean': BoolType(), 'percentage': PercentageType(), 'time': TimeType() @@ -47,15 +46,15 @@ class ODSTableSet(TableSet): - """ - A wrapper around ODS files. Because they are zipped and the info we want - is in the zipped file as content.xml we must ensure that we either have - a seekable object (local file) or that we retrieve all of the content from - the remote URL. + """A wrapper around ODS files. + + Because they are zipped and the info we want is in the zipped file as + content.xml we must ensure that we either have a seekable object (local + file) or that we retrieve all of the content from the remote URL. """ def __init__(self, fileobj, window=None, **kw): - '''Initialize the object. + """Initialize the object. :param fileobj: may be a file path or a file-like object. Note the file-like object *must* be in binary mode and must be seekable (it will @@ -67,7 +66,7 @@ def __init__(self, fileobj, window=None, **kw): To get a seekable file you *cannot* use messytables.core.seekable_stream as it does not support the full seek functionality. - ''' + """ if hasattr(fileobj, 'read'): # wrap in a StringIO so we do not have hassle with seeks and # binary etc (see notes to __init__ above) @@ -81,13 +80,12 @@ def __init__(self, fileobj, window=None, **kw): zf.close() def make_tables(self): - """ - Return the sheets in the workbook. + """Return the sheets in the workbook. - A regex is used for this to avoid having to: + A regex is used for this to avoid having to: - 1. load large the entire file into memory, or - 2. SAX parse the file more than once + 1. load large the entire file into memory, or + 2. SAX parse the file more than once """ namespace_tags = self._get_namespace_tags() sheets = [m.groups(0)[0] @@ -104,8 +102,10 @@ def _get_namespace_tags(self): class ODSRowSet(RowSet): - """ ODS support for a single sheet in the ODS workbook. Unlike - the CSV row set this is not a streaming operation. """ + """ODS support for a single sheet in the ODS workbook. + + Unlike the CSV row set this is not a streaming operation. + """ def __init__(self, sheet, window=None, namespace_tags=None): self.sheet = sheet @@ -146,7 +146,7 @@ def __init__(self, sheet, window=None, namespace_tags=None): super(ODSRowSet, self).__init__(typed=True) def raw(self, sample=False): - """ Iterate over all rows in this sheet. """ + """Iterate over all rows in this sheet.""" rows = ODS_ROW_MATCH.findall(self.sheet) for row in rows: @@ -192,9 +192,9 @@ def _read_cell(element): cell = Cell(value + ' ' + currency, type=CurrencyType()) elif cell_type is not None: value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) - cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) + cell = Cell(value, type=ODS_TYPES.get(cell_type, String())) else: - cell = Cell(EMPTY_CELL_VALUE, type=StringType()) + cell = Cell(EMPTY_CELL_VALUE, type=String()) return cell @@ -211,7 +211,7 @@ def _read_text_cell(element): cell_value = '\n'.join(text_content) else: cell_value = EMPTY_CELL_VALUE - return Cell(cell_value, type=StringType()) + return Cell(cell_value, type=String()) def _tag(namespace, tag): diff --git a/messytables/pdf.py b/messytables/pdf.py index 4f9052e..1998ac8 100644 --- a/messytables/pdf.py +++ b/messytables/pdf.py @@ -1,6 +1,6 @@ -from messytables.core import RowSet, TableSet, Cell +from typecast import String -from messytables.types import StringType +from messytables.core import RowSet, TableSet, Cell try: from pdftables import get_tables @@ -30,7 +30,7 @@ def __init__(self, pdftables_cell): self.column = None self.column_autogenerated = False - self.type = StringType() + self.type = String() @property def topleft(self): @@ -42,9 +42,8 @@ def properties(self): class PDFTableSet(TableSet): - """ - A TableSet from a PDF document. - """ + """A TableSet from a PDF document.""" + def __init__(self, fileobj=None, filename=None, **kw): if get_tables is None: raise ImportError("pdftables is not installed") @@ -57,9 +56,7 @@ def __init__(self, fileobj=None, filename=None, **kw): self.raw_tables = get_tables(self.fh) def make_tables(self): - """ - Return a listing of tables (as PDFRowSets) in the table set. - """ + """Return a listing of tables in the table set.""" def table_name(table): return "Table {0} of {1} on page {2} of {3}".format( table.table_number_on_page, @@ -71,9 +68,8 @@ def table_name(table): class PDFRowSet(RowSet): - """ - A RowSet representing a PDF table. - """ + """A RowSet representing a PDF table.""" + def __init__(self, name, table): if get_tables is None: raise ImportError("pdftables is not installed") @@ -85,9 +81,7 @@ def __init__(self, name, table): ) def raw(self, sample=False): - """ - Yield one row of cells at a time - """ + """Yield one row of cells at a time.""" if hasattr(self.table, "cell_data"): # New style of cell data. for row in self.table.cell_data: diff --git a/messytables/text.py b/messytables/text.py new file mode 100644 index 0000000..ee71179 --- /dev/null +++ b/messytables/text.py @@ -0,0 +1,36 @@ +import codecs +try: + import cchardet as chardet +except ImportError: + import chardet + +from messytables.buffered import BUFFER_SIZE + +# maps between chardet encoding and codecs bom keys +BOM_MAPPING = { + 'utf-16le': 'BOM_UTF16_LE', + 'utf-16be': 'BOM_UTF16_BE', + 'utf-32le': 'BOM_UTF32_LE', + 'utf-32be': 'BOM_UTF32_BE', + 'utf-8': 'BOM_UTF8', + 'utf-8-sig': 'BOM_UTF8', +} + + +def analyze_stream(stream, encoding=None): + sample = stream.read(BUFFER_SIZE) + if encoding is None: + encoding = chardet.detect(sample).get('encoding') or 'utf-8' + encoding = encoding.lower() + # The reader only skips a BOM if the encoding isn't explicit about its + # endianness (i.e. if encoding is UTF-16 a BOM is handled properly + # and taken out, but if encoding is UTF-16LE a BOM is ignored). + # However, if chardet sees a BOM it returns an encoding with the + # endianness explicit, which results in the codecs stream leaving the + # BOM in the stream. This is ridiculously dumb. For UTF-{16,32}{LE,BE} + # encodings, check for a BOM and remove it if it's there. + if encoding in BOM_MAPPING: + bom = getattr(codecs, BOM_MAPPING[encoding], None) + if sample[:len(bom)] == bom: + return encoding, sample[len(bom):] + return encoding, sample diff --git a/messytables/types.py b/messytables/types.py index 589409c..815d846 100644 --- a/messytables/types.py +++ b/messytables/types.py @@ -1,320 +1,37 @@ -import decimal -import datetime -from collections import defaultdict -from messytables.compat23 import izip_longest, unicode_string, string_types -import locale -import sys +import six +from typecast import guesser, GUESS_TYPES -import dateutil.parser as parser -from messytables.dateparser import DATE_FORMATS, is_date +def type_guess(rows, types=GUESS_TYPES, strict=False): + """Guess the best type for a given row set. + The type guesser aggregates the number of successful conversions of each + column to each type, weights them by a fixed type priority and select the + most probable type for each column based on that figure. It returns a list + of ``CellType``. Empty cells are ignored. -class CellType(object): - """ A cell type maintains information about the format - of the cell, providing methods to check if a type is - applicable to a given value and to convert a value to the - type. """ - - guessing_weight = 1 - # the type that the result will have - result_type = None - - def test(self, value): - """ Test if the value is of the given type. The - default implementation calls ``cast`` and checks if - that throws an exception. True or False""" - if isinstance(value, self.result_type): - return True - try: - self.cast(value) - return True - except: - return False - - @classmethod - def instances(cls): - return [cls()] - - def cast(self, value): - """ Convert the value to the type. This may throw - a quasi-random exception if conversion fails. """ - return value - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __hash__(self): - return hash(self.__class__) - - def __repr__(self): - return self.__class__.__name__.rsplit('Type', 1)[0] - - -class StringType(CellType): - """ A string or other unconverted type. """ - result_type = unicode_string - - def cast(self, value): - if value is None: - return None - if isinstance(value, self.result_type): - return value - try: - return unicode_string(value) - except UnicodeEncodeError: - return str(value) - - -class IntegerType(CellType): - """ An integer field. """ - guessing_weight = 6 - result_type = int - - def cast(self, value): - if value in ('', None): - return None - - try: - value = float(value) - except: - return locale.atoi(value) - - if value.is_integer(): - return int(value) - else: - raise ValueError('Invalid integer: %s' % value) - - -class DecimalType(CellType): - """ Decimal number, ``decimal.Decimal`` or float numbers. """ - guessing_weight = 4 - result_type = decimal.Decimal - - def cast(self, value): - if value in ('', None): - return None - try: - return decimal.Decimal(value) - except: - value = locale.atof(value) - if sys.version_info < (2, 7): - value = str(value) - return decimal.Decimal(value) - - -class PercentageType(DecimalType): - """ Decimal number, ``decimal.Decimal`` or float numbers. """ - guessing_weight = 0 - - def cast(self, value): - result = DecimalType.cast(self, value) - if result: - result = result/decimal.Decimal('100') - return result - - -class CurrencyType(DecimalType): - guessing_weight = 0 - result_type = decimal.Decimal - - def cast(self, value): - value_without_currency = value.split(' ')[0] - return DecimalType.cast(self, - value_without_currency) - - -class FloatType(DecimalType): - """ FloatType is deprecated """ - pass - - -class BoolType(CellType): - """ A boolean field. Matches true/false, yes/no and 0/1 by default, - but a custom set of values can be optionally provided. + Strict means that a type will not be guessed if parsing fails for a single + cell in the column. """ - guessing_weight = 7 - result_type = bool - true_values = ('yes', 'true', '0') - false_values = ('no', 'false', '1') - - def __init__(self, true_values=None, false_values=None): - if true_values is not None: - self.true_values = true_values - if false_values is not None: - self.false_values = false_values - - def cast(self, value): - s = value.strip().lower() - if value in ('', None): - return None - if s in self.true_values: - return True - if s in self.false_values: - return False - raise ValueError - - -class TimeType(CellType): - result_type = datetime.time - - def cast(self, value): - if isinstance(value, self.result_type): - return value - if value in ('', None): - return None - hour = int(value[2:4]) - minute = int(value[5:7]) - second = int(value[8:10]) - if hour < 24: - return datetime.time(hour, minute, second) - else: - return datetime.timedelta(hours=hour, - minutes=minute, - seconds=second) - - -class DateType(CellType): - """ The date type is special in that it also includes a specific - date format that is used to parse the date, additionally to the - basic type information. """ - guessing_weight = 3 - formats = DATE_FORMATS - result_type = datetime.datetime - - def __init__(self, format): - self.format = format - - @classmethod - def instances(cls): - return [cls(v) for v in cls.formats] - - def test(self, value): - if isinstance(value, string_types) and not is_date(value): - return False - return CellType.test(self, value) - - def cast(self, value): - if isinstance(value, self.result_type): - return value - if value in ('', None): - return None - if self.format is None: - return value - return datetime.datetime.strptime(value, self.format) - - def __eq__(self, other): - return (isinstance(other, DateType) and - self.format == other.format) - - def __repr__(self): - return "Date(%s)" % self.format - - def __hash__(self): - return hash(self.__class__) + hash(self.format) - - -class DateUtilType(CellType): - """ The date util type uses the dateutil library to - parse the dates. The advantage of this type over - DateType is the speed and better date detection. However, - it does not offer format detection. - - Do not use this together with the DateType""" - guessing_weight = 3 - result_type = datetime.datetime - - def test(self, value): - if not( - isinstance(value, datetime.datetime) or - (isinstance(value, string_types) and is_date(value)) - ): - return False - return CellType.test(self, value) - - def cast(self, value): - if value in ('', None): - return None - return parser.parse(value) - - -TYPES = [StringType, DecimalType, IntegerType, DateType, BoolType, - TimeType, CurrencyType, PercentageType] - - -def type_guess(rows, types=TYPES, strict=False): - """ The type guesser aggregates the number of successful - conversions of each column to each type, weights them by a - fixed type priority and select the most probable type for - each column based on that figure. It returns a list of - ``CellType``. Empty cells are ignored. - - Strict means that a type will not be guessed - if parsing fails for a single cell in the column.""" - guesses = [] - type_instances = [i for t in types for i in t.instances()] - if strict: - at_least_one_value = [] - for ri, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - typesdict = {} - for type in type_instances: - typesdict[type] = 0 - guesses.append(typesdict) - at_least_one_value.append(False) - for ci, cell in enumerate(row): - if not cell.value: - continue - at_least_one_value[ci] = True - for type in list(guesses[ci].keys()): - if not type.test(cell.value): - guesses[ci].pop(type) - # no need to set guessing weights before this - # because we only accept a type if it never fails - for i, guess in enumerate(guesses): - for type in guess: - guesses[i][type] = type.guessing_weight - # in case there were no values at all in the column, - # we just set the guessed type to string - for i, v in enumerate(at_least_one_value): - if not v: - guesses[i] = {StringType(): 0} - else: - for i, row in enumerate(rows): - diff = len(row) - len(guesses) - for _ in range(diff): - guesses.append(defaultdict(int)) - for i, cell in enumerate(row): - # add string guess so that we have at least one guess - guesses[i][StringType()] = guesses[i].get(StringType(), 0) - if not cell.value: - continue - for type in type_instances: - if type.test(cell.value): - guesses[i][type] += type.guessing_weight - _columns = [] - _columns = [] - for guess in guesses: - # this first creates an array of tuples because we want the types to be - # sorted. Even though it is not specified, python chooses the first - # element in case of a tie - # See: http://stackoverflow.com/a/6783101/214950 - guesses_tuples = [(t, guess[t]) for t in type_instances if t in guess] - _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0]) - return _columns + guessers = [] + for i, row in enumerate(rows): + for _ in range(len(row) - len(guessers)): + guessers.append(guesser(types=types, strict=strict)) + for j, cell in enumerate(row): + # add string guess so that we have at least one guess + guessers[j].add(cell.value) + return [g.best for g in guessers] def types_processor(types, strict=False): - """ Apply the column types set on the instance to the - current row, attempting to cast each cell to the specified - type. + """Apply the column types to the each row. - Strict means that casting errors are not ignored""" + Strict means that casting errors are not ignored. + """ def apply_types(row_set, row): if types is None: return row - for cell, type in izip_longest(row, types): + for cell, type in six.moves.zip_longest(row, types): try: cell.value = type.cast(cell.value) cell.type = type diff --git a/messytables/util.py b/messytables/util.py index 04dd160..a83d456 100644 --- a/messytables/util.py +++ b/messytables/util.py @@ -1,82 +1,8 @@ -try: - # python 2.7: - from collections import OrderedDict -except ImportError: - ## {{{ http://code.activestate.com/recipes/576669/ (r18) - ## Raymond Hettingers proporsal to go in 2.7 - from collections import MutableMapping - - class OrderedDict(dict, MutableMapping): - - # Methods with direct access to underlying attributes - - def __init__(self, *args, **kwds): - if len(args) > 1: - raise TypeError('expected at 1 argument, got %d', len(args)) - if not hasattr(self, '_keys'): - self._keys = [] - self.update(*args, **kwds) - - def clear(self): - del self._keys[:] - dict.clear(self) - - def __setitem__(self, key, value): - if key not in self: - self._keys.append(key) - dict.__setitem__(self, key, value) - - def __delitem__(self, key): - dict.__delitem__(self, key) - self._keys.remove(key) - - def __iter__(self): - return iter(self._keys) - - def __reversed__(self): - return reversed(self._keys) - - def popitem(self): - if not self: - raise KeyError - key = self._keys.pop() - value = dict.pop(self, key) - return key, value - - def __reduce__(self): - items = [[k, self[k]] for k in self] - inst_dict = vars(self).copy() - inst_dict.pop('_keys', None) - return (self.__class__, (items,), inst_dict) - - # Methods with indirect access via the above methods - - setdefault = MutableMapping.setdefault - update = MutableMapping.update - pop = MutableMapping.pop - keys = MutableMapping.keys - values = MutableMapping.values - items = MutableMapping.items - - def __repr__(self): - pairs = ', '.join(map('%r: %r'.__mod__, self.items())) - return '%s({%s})' % (self.__class__.__name__, pairs) - - def copy(self): - return self.__class__(self) - - @classmethod - def fromkeys(cls, iterable, value=None): - d = cls() - for key in iterable: - d[key] = value - return d - ## end of http://code.activestate.com/recipes/576669/ }}} - def offset_processor(offset): - """ Skip ``offset`` from the given iterator. This can - be used in combination with the ``headers_processor`` to + """Skip ``offset`` from the given iterator. + + This can be used in combination with the ``headers_processor`` to apply the result of a header scan to the table. :param offset: Offset to be skipped @@ -92,7 +18,7 @@ def apply_offset(row_set, row): def null_processor(nulls): - """ Replaces every occurrence of items from `nulls` with None. + """Replace every occurrence of items from `nulls` with None. :param nulls: List of items to be replaced :type nulls: list diff --git a/messytables/zip.py b/messytables/zip.py index 4707d47..a15c90f 100644 --- a/messytables/zip.py +++ b/messytables/zip.py @@ -1,15 +1,15 @@ import zipfile -import messytables +from messytables.core import TableSet +from messytables.error import ReadError -class ZIPTableSet(messytables.TableSet): - """ Reads TableSets from inside a ZIP file """ +class ZIPTableSet(TableSet): + """Reads TableSets from inside a ZIP file.""" def __init__(self, fileobj, **kw): - """ - On error it will raise messytables.ReadError. - """ + """On error it will raise ReadError.""" + from messytables.any import any_tableset tables = [] found = [] z = zipfile.ZipFile(fileobj, 'r') @@ -25,8 +25,7 @@ def __init__(self, fileobj, **kw): ext = f.filename[f.filename.rindex(".") + 1:] try: - filetables = messytables.any.any_tableset( - z.open(f), extension=ext, **kw) + filetables = any_tableset(z.open(f), extension=ext, **kw) except ValueError as e: found.append(f.filename + ": " + e.message) continue @@ -34,8 +33,8 @@ def __init__(self, fileobj, **kw): tables.extend(filetables.tables) if len(tables) == 0: - raise messytables.ReadError('''ZIP file has no recognized - tables (%s).''' % ', '.join(found)) + raise ReadError('''ZIP file has no recognized tables (%s).''' + % ', '.join(found)) finally: z.close() diff --git a/setup.py b/setup.py index 4f8f8ed..8418bba 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='messytables', - version='0.15.2', + version='1.99.0', description="Parse messy tabular data in various formats", long_description=long_desc, classifiers=[ @@ -42,16 +42,20 @@ 'xlrd>=0.8.0', 'python-magic>=0.4.12', # used for type guessing 'chardet>=2.3.0', - 'python-dateutil>=1.5.0', + 'cchardet', 'lxml>=3.2', - 'requests', - 'six>=1.9', # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301 - 'html5lib', - 'json-table-schema>=0.2, <=0.2.1' + 'requests>=2.0', + 'html5lib', + 'json-table-schema>=0.2, <=0.2.1', + 'typecast>=0.3.3', + 'six', + 'ordereddict', ], extras_require={'pdf': ['pdftables>=0.0.4']}, - tests_require=[], - entry_points=\ - """ - """, + tests_require=[ + 'nose', + 'httpretty', + 'coverage' + ], + entry_points={} ) diff --git a/test/__init__.py b/test/__init__.py index 060bb3e..e69de29 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,6 +0,0 @@ -import os - - -def horror_fobj(name): - fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) - return open(fn, 'rb') diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..ce39b1c 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, diff --git a/test/test_guessing.py b/test/test_guessing.py index b843c4e..2150340 100644 --- a/test/test_guessing.py +++ b/test/test_guessing.py @@ -1,17 +1,30 @@ # -*- coding: utf-8 -*- import unittest import io +# import cProfile +# from pstats import Stats -from . import horror_fobj +from util import horror_fobj from nose.plugins.attrib import attr +from nose.plugins.skip import SkipTest from nose.tools import assert_equal -from messytables import (CSVTableSet, type_guess, headers_guess, - offset_processor, DateType, StringType, - DecimalType, IntegerType, - DateUtilType, BoolType) +from typecast import Date, String, Decimal, Integer, Boolean +from messytables import CSVTableSet, type_guess, headers_guess +from messytables import offset_processor class TypeGuessTest(unittest.TestCase): + + # def setUp(self): + # self.pr = cProfile.Profile() + # self.pr.enable() + + # def tearDown(self): + # p = Stats(self.pr) + # p.strip_dirs() + # p.sort_stats('cumtime') + # p.print_stats() + @attr("slow") def test_type_guess(self): csv_file = io.BytesIO(b''' @@ -25,12 +38,15 @@ def test_type_guess(self): guessed_types = type_guess(rows.sample) assert_equal(guessed_types, [ - DecimalType(), DateType('%Y/%m/%d'), IntegerType(), - DateType('%d %B %Y'), BoolType(), BoolType()]) + Decimal(), Date('%Y/%m/%d'), Integer(), + Date('%d %b %Y'), Boolean(), Integer()]) def test_type_guess_strict(self): - import locale - locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + try: + import locale + locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') + except: + raise SkipTest("Locale en_GB.UTF-8 not available.") csv_file = io.BytesIO(b''' 1, 2012/2/12, 2, 2,02 October 2011,"100.234354" 2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12" @@ -40,9 +56,9 @@ def test_type_guess_strict(self): rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample, strict=True) assert_equal(guessed_types, [ - StringType(), StringType(), - DecimalType(), IntegerType(), DateType('%d %B %Y'), - DecimalType()]) + String(), String(), + Decimal(), Integer(), Date('%d %b %Y'), + Decimal()]) def test_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -53,7 +69,7 @@ def test_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=True) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [StringType(), StringType(), DecimalType()]) + [String(), String(), Decimal()]) def test_non_strict_guessing_handles_padding(self): csv_file = io.BytesIO(b''' @@ -64,82 +80,81 @@ def test_non_strict_guessing_handles_padding(self): guessed_types = type_guess(rows.sample, strict=False) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, - [IntegerType(), StringType(), DecimalType()]) + [Integer(), String(), Decimal()]) def test_guessing_uses_first_in_case_of_tie(self): csv_file = io.BytesIO(b''' 2 1.1 + 2.1 1500''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess( - rows.sample, types=[DecimalType, IntegerType], strict=False) - assert_equal(guessed_types, [DecimalType()]) + rows.sample, types=[Decimal, Integer], strict=False) + assert_equal(guessed_types, [Decimal()]) guessed_types = type_guess( - rows.sample, types=[IntegerType, DecimalType], strict=False) - assert_equal(guessed_types, [IntegerType()]) + rows.sample, types=[Integer, Decimal], strict=False) + assert_equal(guessed_types, [Integer()]) @attr("slow") def test_strict_type_guessing_with_large_file(self): fh = horror_fobj('211.csv') - rows = CSVTableSet(fh).tables[0] + rows = CSVTableSet(fh, encoding='iso-8859-2').tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] - guessed_types = type_guess(rows.sample, types, True) + types = [String, Integer, Decimal, Date] + guessed_types = type_guess(rows.sample, types, False) assert_equal(len(guessed_types), 96) - assert_equal(guessed_types, [ - IntegerType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), IntegerType(), StringType(), DecimalType(), - DecimalType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - IntegerType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), DateUtilType(), - DateUtilType(), DateUtilType(), DateUtilType(), StringType(), - StringType(), StringType()]) + assumed_types = [Integer(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), Integer(), String(), String(), + String(), String(), String(), String(), Integer(), String(), + String(), String(), String(), String(), String(), Integer(), + String(), Decimal(), Decimal(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), Integer(), String(), Integer(), + String(), String(), String(), String(), String(), String(), + String(), String(), Integer(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + Integer(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), String(), + String(), String(), String(), String(), String(), Integer(), + String(), Date('%d/%m/%y'), Date('%d/%m/%y'), Date('%d/%m/%y'), + Date('%d/%m/%y'), String(), String(), String()] + # for (ta, tb) in zip(guessed_types, assumed_types): + # print (ta, tb) + assert_equal(guessed_types, assumed_types) def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) - types = [StringType, IntegerType, DecimalType, DateUtilType] + types = [String, Integer, Decimal, Date] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) - print(guessed_types) + # print(guessed_types) assert_equal(guessed_types, [ - IntegerType(), IntegerType(), - IntegerType(), IntegerType(), IntegerType(), IntegerType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), StringType(), StringType(), - StringType(), StringType(), IntegerType(), StringType(), - StringType()]) + Integer(), Integer(), + Integer(), Integer(), Integer(), Integer(), + String(), String(), String(), String(), + String(), String(), String(), String(), + String(), String(), Integer(), String(), + String()]) def test_integer_and_float_detection(self): def helper(value): - return any(i.test(value) for i in IntegerType.instances()) + return any(i.test(value) == 1 for i in Integer.instances()) assert_equal(helper(123), True) assert_equal(helper('123'), True) assert_equal(helper(123.0), True) - assert_equal(helper('123.0'), True) + assert_equal(helper('123.0'), False) assert_equal(helper(123.1), False) assert_equal(helper('123.1'), False) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_properties.py b/test/test_properties.py index 5ec3f6d..0a7ca09 100644 --- a/test/test_properties.py +++ b/test/test_properties.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.error import NoSuchPropertyError from nose.tools import ( diff --git a/test/test_read.py b/test/test_read.py index ec4dbdc..8e7e8e6 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- import unittest +from util import horror_fobj from decimal import Decimal -from . import horror_fobj from nose.plugins.attrib import attr from nose.tools import assert_equal from nose.plugins.skip import SkipTest @@ -13,11 +13,11 @@ except ImportError: from .shim26 import assert_is_instance, assert_greater_equal -from messytables import (CSVTableSet, StringType, HTMLTableSet, - ZIPTableSet, XLSTableSet, XLSXTableSet, PDFTableSet, +from typecast import Date, Float, Integer, String +from messytables import (CSVTableSet, HTMLTableSet, + ZIPTableSet, XLSTableSet, PDFTableSet, ODSTableSet, headers_guess, headers_processor, - offset_processor, DateType, FloatType, - IntegerType, BoolType, rowset_as_jts, + offset_processor, rowset_as_jts, types_processor, type_guess, ReadError, null_processor) import datetime @@ -25,6 +25,7 @@ class ReadCsvTest(unittest.TestCase): + def test_utf8bom_lost(self): fh = horror_fobj('utf8bom.csv') table_set = CSVTableSet(fh) @@ -43,7 +44,7 @@ def test_read_simple_csv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) def test_read_complex_csv(self): fh = horror_fobj('complex.csv') @@ -58,7 +59,7 @@ def test_read_complex_csv(self): for row in list(row_set): assert_equal(4, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) def test_overriding_sniffed(self): # semicolon separated values @@ -102,13 +103,13 @@ def test_read_type_guess_simple(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample) - expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()] + expected_types = [Date("%Y-%m-%d"), Integer(), String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) data = list(row_set) header_types = [c.type for c in data[0]] - assert_equal(header_types, [StringType()] * 3) + assert_equal(header_types, [String()] * 3) row_types = [c.type for c in data[2]] assert_equal(expected_types, row_types) @@ -117,8 +118,8 @@ def test_apply_null_values(self): table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), StringType(), BoolType(), - StringType()] + expected_types = [Integer(), String(), Integer(), + String()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -147,8 +148,8 @@ def test_null_process(self): assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) - expected_types = [IntegerType(), BoolType(), BoolType(), - BoolType()] + expected_types = [Integer(), Integer(), Integer(), + Integer()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) @@ -212,7 +213,7 @@ def test_guess_headers(self): row_set.register_processor(headers_processor(['foo', 'bar'])) data = list(row_set) assert 'foo' in data[12][0].column, data[12][0] - assert 'Chirurgie' in data[12][0].value, data[12][0].value + assert 'Chirurgie' in data[10][0].value, data[10][0].value def test_read_encoded_characters_csv(self): fh = horror_fobj('characters.csv') @@ -239,7 +240,7 @@ def test_read_simple_zip(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadTsvTest(unittest.TestCase): @@ -253,7 +254,7 @@ def test_read_simple_tsv(self): assert_equal(row[1].value, 'expr1_0_imp') for row in list(row_set): assert_equal(17, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadSsvTest(unittest.TestCase): @@ -269,7 +270,7 @@ def test_read_simple_ssv(self): for row in list(row_set): assert_equal(3, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadPsvTest(unittest.TestCase): @@ -285,7 +286,7 @@ def test_read_simple_psv(self): for row in list(row_set): assert_equal(6, len(row)) - assert_equal(row[0].type, StringType()) + assert_equal(row[0].type, String()) class ReadODSTest(unittest.TestCase): @@ -452,7 +453,7 @@ def test_that_xlsx_is_handled_by_xls_table_set(self): Should emit a DeprecationWarning. """ fh = horror_fobj('simple.xlsx') - assert_is_instance(XLSXTableSet(fh), XLSTableSet) + assert_is_instance(XLSTableSet(fh), XLSTableSet) class ReadXlsTest(unittest.TestCase): @@ -575,7 +576,7 @@ def test_read_type_know_simple(self): row_set = table_set.tables[0] row = list(row_set.sample)[1] types = [c.type for c in row] - assert_equal(types, [DateType(None), FloatType(), StringType()]) + assert_equal(types, [Date(None), Float(), String()]) def test_bad_first_sheet(self): # First sheet appears to have no cells diff --git a/test/test_rowset.py b/test/test_rowset.py index 4b47e7c..52e3928 100644 --- a/test/test_rowset.py +++ b/test/test_rowset.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset diff --git a/test/test_stream.py b/test/test_stream.py index 1d677d5..2ed6efd 100644 --- a/test/test_stream.py +++ b/test/test_stream.py @@ -1,15 +1,16 @@ # -*- coding: utf-8 -*- +import io import unittest -from messytables.compat23 import urlopen import requests -import io +import six.moves.urllib as urllib -from . import horror_fobj -from nose.tools import assert_equal +from util import horror_fobj import httpretty +from nose.tools import assert_equal from messytables import CSVTableSet, XLSTableSet + class StreamInputTest(unittest.TestCase): @httpretty.activate def test_http_csv(self): @@ -18,7 +19,7 @@ def test_http_csv(self): httpretty.GET, url, body=horror_fobj('long.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -46,7 +47,7 @@ def test_http_csv_encoding(self): httpretty.GET, url, body=horror_fobj('utf-16le_encoded.csv').read(), content_type="application/csv") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = CSVTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -59,7 +60,7 @@ def test_http_xls(self): httpretty.GET, url, body=horror_fobj('simple.xls').read(), content_type="application/ms-excel") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) @@ -72,7 +73,7 @@ def test_http_xlsx(self): httpretty.GET, url, body=horror_fobj('simple.xlsx').read(), content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") - fh = urlopen(url) + fh = urllib.request.urlopen(url) table_set = XLSTableSet(fh) row_set = table_set.tables[0] data = list(row_set) diff --git a/test/test_tableset.py b/test/test_tableset.py index 4c2148c..d03de88 100644 --- a/test/test_tableset.py +++ b/test/test_tableset.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import unittest -from . import horror_fobj +from util import horror_fobj from messytables.any import any_tableset from messytables.core import RowSet from messytables.error import TableError diff --git a/test/test_unit.py b/test/test_unit.py index 27c63aa..696604d 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -1,19 +1,7 @@ # -*- coding: utf-8 -*- import unittest -from messytables import dateparser, Cell - - -class DateParserTest(unittest.TestCase): - def test_date_regex(self): - assert dateparser.is_date('2012 12 22') - assert dateparser.is_date('2012/12/22') - assert dateparser.is_date('2012-12-22') - assert dateparser.is_date('22.12.2012') - assert dateparser.is_date('12 12 22') - assert dateparser.is_date('22 Dec 2012') - assert dateparser.is_date('2012 12 22 13:17') - assert dateparser.is_date('2012 12 22 T 13:17') +from messytables import Cell class CellReprTest(unittest.TestCase): diff --git a/test/util.py b/test/util.py new file mode 100644 index 0000000..060bb3e --- /dev/null +++ b/test/util.py @@ -0,0 +1,6 @@ +import os + + +def horror_fobj(name): + fn = os.path.join(os.path.dirname(__file__), '..', 'horror', name) + return open(fn, 'rb')