From 7b97d395e519eeee91d4be11bd0b76ec1bc32b03 Mon Sep 17 00:00:00 2001 From: Jonathan Demeyer Date: Tue, 25 Jul 2017 08:49:17 +0200 Subject: [PATCH 1/4] Copy ods.py to fods.py to develop support for flat ods (.fods) --- messytables/fods.py | 218 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 messytables/fods.py diff --git a/messytables/fods.py b/messytables/fods.py new file mode 100644 index 0000000..ea7c86e --- /dev/null +++ b/messytables/fods.py @@ -0,0 +1,218 @@ +import io +import re +import zipfile + +from lxml import etree + +from messytables.core import RowSet, TableSet, Cell +from messytables.types import (StringType, DecimalType, + DateType, BoolType, CurrencyType, + TimeType, PercentageType) + + +ODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) +ODS_TABLE_MATCH = re.compile( + b".*?().*?", re.MULTILINE) +ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') +ODS_ROW_MATCH = re.compile( + b".*?().*?", re.MULTILINE) + +NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" +NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" +NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0" +NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0" + +TABLE_CELL = 'table-cell' +VALUE_TYPE = 'value-type' +COLUMN_REPEAT = 'number-columns-repeated' +EMPTY_CELL_VALUE = '' + +ODS_VALUE_TOKEN = { + "float": "value", + "date": "date-value", + "time": "time-value", + "boolean": "boolean-value", + "percentage": "value", + "currency": "value" +} + +ODS_TYPES = { + 'float': DecimalType(), + 'date': DateType('%Y-%m-%d'), + 'boolean': BoolType(), + 'percentage': PercentageType(), + 'time': TimeType() +} + + +class ODSTableSet(TableSet): + """ + A wrapper around ODS files. Because they are zipped and the info we want + is in the zipped file as content.xml we must ensure that we either have + a seekable object (local file) or that we retrieve all of the content from + the remote URL. + """ + + def __init__(self, fileobj, window=None, **kw): + '''Initialize the object. + + :param fileobj: may be a file path or a file-like object. Note the + file-like object *must* be in binary mode and must be seekable (it will + get passed to zipfile). + + As a specific tip: urllib2.urlopen returns a file-like object that is + not in file-like mode while urllib.urlopen *does*! + + To get a seekable file you *cannot* use + messytables.core.seekable_stream as it does not support the full seek + functionality. + ''' + if hasattr(fileobj, 'read'): + # wrap in a StringIO so we do not have hassle with seeks and + # binary etc (see notes to __init__ above) + # TODO: rather wasteful if in fact fileobj comes from disk + fileobj = io.BytesIO(fileobj.read()) + + self.window = window + + zf = zipfile.ZipFile(fileobj).open("content.xml") + self.content = zf.read() + zf.close() + + def make_tables(self): + """ + Return the sheets in the workbook. + + A regex is used for this to avoid having to: + + 1. load large the entire file into memory, or + 2. SAX parse the file more than once + """ + namespace_tags = self._get_namespace_tags() + sheets = [m.groups(0)[0] + for m in ODS_TABLE_MATCH.finditer(self.content)] + return [ODSRowSet(sheet, self.window, namespace_tags) + for sheet in sheets] + + def _get_namespace_tags(self): + match = re.search(ODS_NAMESPACES_TAG_MATCH, self.content) + assert match + tag_open = match.groups()[0] + tag_close = b'' + return tag_open, tag_close + + +class ODSRowSet(RowSet): + """ ODS support for a single sheet in the ODS workbook. Unlike + the CSV row set this is not a streaming operation. """ + + def __init__(self, sheet, window=None, namespace_tags=None): + self.sheet = sheet + + self.name = "Unknown" + m = ODS_TABLE_NAME.match(self.sheet) + if m: + self.name = m.groups(0)[0] + + self.window = window or 1000 + + # We must wrap the XML fragments in a valid header otherwise iterparse + # will explode with certain (undefined) versions of libxml2. The + # namespaces are in the ODS file, and change with the libreoffice + # version saving it, so get them from the ODS file if possible. The + # default namespaces are an option to preserve backwards compatibility + # of ODSRowSet. + if namespace_tags: + self.namespace_tags = namespace_tags + else: + namespaces = { + "dc": u"http://purl.org/dc/elements/1.1/", + "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0", + "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0", + "office": NS_OPENDOCUMENT_PTTN % u"office:1.0", + "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0", + "table": NS_OPENDOCUMENT_PTTN % u"table:1.0", + "text": NS_OPENDOCUMENT_PTTN % u"text:1.0", + "calcext": NS_CAL_PTTN % u"calcext:1.0", + } + + ods_header = u""\ + .format(" ".join('xmlns:{0}="{1}"'.format(k, v) + for k, v in namespaces.iteritems())).encode('utf-8') + ods_footer = u"".encode('utf-8') + self.namespace_tags = (ods_header, ods_footer) + + super(ODSRowSet, self).__init__(typed=True) + + def raw(self, sample=False): + """ Iterate over all rows in this sheet. """ + rows = ODS_ROW_MATCH.findall(self.sheet) + + for row in rows: + row_data = [] + + block = self.namespace_tags[0] + row + self.namespace_tags[1] + partial = io.BytesIO(block) + empty_row = True + + for action, element in etree.iterparse(partial, ('end',)): + if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): + continue + + cell = _read_cell(element) + if empty_row is True and cell.value != EMPTY_CELL_VALUE: + empty_row = False + + repeat = element.attrib.get( + _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) + if repeat: + number_of_repeat = int(repeat) + row_data += [cell] * number_of_repeat + else: + row_data.append(cell) + + if empty_row: + # ignore blank lines + continue + + del partial + yield row_data + del rows + + +def _read_cell(element): + cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) + else: + cell = Cell(EMPTY_CELL_VALUE, type=StringType()) + + return cell + + +def _read_text_cell(element): + children = element.getchildren() + text_content = [] + for child in children: + if child.text: + text_content.append(child.text) + else: + text_content.append(EMPTY_CELL_VALUE) + if len(text_content) > 0: + cell_value = '\n'.join(text_content) + else: + cell_value = EMPTY_CELL_VALUE + return Cell(cell_value, type=StringType()) + + +def _tag(namespace, tag): + return '{%s}%s' % (namespace, tag) From a2c4ab500abec70c7a18ed7867998c1d045d7c31 Mon Sep 17 00:00:00 2001 From: Jonathan Demeyer Date: Tue, 25 Jul 2017 09:04:17 +0200 Subject: [PATCH 2/4] Adaptations to support .fods --- messytables/__init__.py | 1 + messytables/fods.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..f1f4137 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -9,6 +9,7 @@ from messytables.core import Cell, TableSet, RowSet, seekable_stream from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet +from messytables.fods import FODSTableSet, FODSRowSet from messytables.excel import XLSTableSet, XLSRowSet # XLSXTableSet has been deprecated and its functionality is now provided by diff --git a/messytables/fods.py b/messytables/fods.py index ea7c86e..bf568cf 100644 --- a/messytables/fods.py +++ b/messytables/fods.py @@ -1,6 +1,5 @@ import io import re -import zipfile from lxml import etree @@ -10,13 +9,13 @@ TimeType, PercentageType) -ODS_NAMESPACES_TAG_MATCH = re.compile( - b"(]*>)", re.MULTILINE) +FODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) ODS_TABLE_MATCH = re.compile( - b".*?().*?", re.MULTILINE) + b".*?().*?", re.DOTALL) ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') ODS_ROW_MATCH = re.compile( - b".*?().*?", re.MULTILINE) + b".*?().*?", re.DOTALL) NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" @@ -46,7 +45,7 @@ } -class ODSTableSet(TableSet): +class FODSTableSet(TableSet): """ A wrapper around ODS files. Because they are zipped and the info we want is in the zipped file as content.xml we must ensure that we either have @@ -76,9 +75,7 @@ def __init__(self, fileobj, window=None, **kw): self.window = window - zf = zipfile.ZipFile(fileobj).open("content.xml") - self.content = zf.read() - zf.close() + self.content = fileobj.read() def make_tables(self): """ @@ -92,18 +89,18 @@ def make_tables(self): namespace_tags = self._get_namespace_tags() sheets = [m.groups(0)[0] for m in ODS_TABLE_MATCH.finditer(self.content)] - return [ODSRowSet(sheet, self.window, namespace_tags) + return [FODSRowSet(sheet, self.window, namespace_tags) for sheet in sheets] def _get_namespace_tags(self): - match = re.search(ODS_NAMESPACES_TAG_MATCH, self.content) + match = re.search(FODS_NAMESPACES_TAG_MATCH, self.content) assert match tag_open = match.groups()[0] - tag_close = b'' + tag_close = b'' return tag_open, tag_close -class ODSRowSet(RowSet): +class FODSRowSet(RowSet): """ ODS support for a single sheet in the ODS workbook. Unlike the CSV row set this is not a streaming operation. """ @@ -143,7 +140,7 @@ def __init__(self, sheet, window=None, namespace_tags=None): ods_footer = u"".encode('utf-8') self.namespace_tags = (ods_header, ods_footer) - super(ODSRowSet, self).__init__(typed=True) + super(FODSRowSet, self).__init__(typed=True) def raw(self, sample=False): """ Iterate over all rows in this sheet. """ From 1e400f0e6bf28fd84ebffc45ef7b5ab873f52793 Mon Sep 17 00:00:00 2001 From: Jonathan Demeyer Date: Tue, 25 Jul 2017 10:53:41 +0200 Subject: [PATCH 3/4] Add test for ".fods" with file "simple.fods" --- horror/simple.fods | 273 +++++++++++++++++++++++++++++++++++++++++++++ messytables/any.py | 6 +- test/test_any.py | 3 +- 3 files changed, 279 insertions(+), 3 deletions(-) create mode 100644 horror/simple.fods diff --git a/horror/simple.fods b/horror/simple.fods new file mode 100644 index 0000000..0aed0bf --- /dev/null +++ b/horror/simple.fods @@ -0,0 +1,273 @@ + + + + 2013-07-08T13:31:432013-07-19T10:14:48P0D3LibreOffice/5.3.4.2$Windows_x86 LibreOffice_project/f82d347ccc0be322489bf7da61d7e4ad13fe2ff3 + + + 0 + 0 + 6321 + 1781 + + + view1 + + + 0 + 4 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 64 + 60 + true + false + + + Sheet1 + 1856 + 0 + 64 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + true + 12632256 + true + false + true + 3 + true + false + false + 1000 + 1000 + 1 + 1 + true + true + true + Microsoft Print to PDF + FRb+/01pY3Jvc29mdCBQcmludCB0byBQREYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATWljcm9zb2Z0IFByaW50IFRvIFBERgAAAAAAAAAAAAAWAAEAMhUAAAAAAAAIAFZUAAAkbQAAM1ROVwYATQBpAGMAcgBvAHMAbwBmAHQAIABQAHIAaQBuAHQAIAB0AG8AIABQAEQARgAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAwbcAFAUAy8BAAEAAQDqCm8IZAABAA8AWAICAAEAWAIDAAEATABlAHQAdABlAHIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAABAAAAAgAAAAEAAAD/////R0lTNAAAAAAAAAAAAAAAAERJTlUiAMgAJAMsET9de34AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQAAAAAACQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADIAAAAU01USgAAAAAQALgAewAwADgANABGADAAMQBGAEEALQBFADYAMwA0AC0ANABEADcANwAtADgAMwBFAEUALQAwADcANAA4ADEANwBDADAAMwA1ADgAMQB9AAAAUkVTRExMAFVuaXJlc0RMTABQYXBlclNpemUATEVUVEVSAE9yaWVudGF0aW9uAFBPUlRSQUlUAFJlc29sdXRpb24AUmVzT3B0aW9uMQBDb2xvck1vZGUAQ29sb3IAAAAAAAAAAAAAAAAAACwRAABWNERNAQAAAAAAAACcCnAiHAAAAOwAAAADAAAA+gFPCDTmd02D7gdIF8A1gdAAAABMAAAAAwAAAAAIAAAAAAAAAAAAAAMAAAAACAAAKgAAAAAIAAADAAAAQAAAAFYAAAAAEAAARABvAGMAdQBtAGUAbgB0AFUAcwBlAHIAUABhAHMAcwB3AG8AcgBkAAAARABvAGMAdQBtAGUAbgB0AE8AdwBuAGUAcgBQAGEAcwBzAHcAbwByAGQAAABEAG8AYwB1AG0AZQBuAHQAQwByAHkAcAB0AFMAZQBjAHUAcgBpAHQAeQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASAENPTVBBVF9EVVBMRVhfTU9ERRMARHVwbGV4TW9kZTo6VW5rbm93bg== + 0 + false + true + true + false + false + false + 7 + + + + + + + + + + + + + + + + + + + + + + + + £ + + + + + - + £ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + / + + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ??? (???) + + + 00/00/0000, 00:00:00 + + + + + Page 1 / 99 + + + + + + + + + + + + Name + + + Age + + + When + + + + + Bob + + + 20 + + + 10/10/10 + + + + + Jane + + + 23 + + + 01/01/12 + + + + + Ian + + + 34 + + + 11/11/15 + + + + + + + + + + + + + \ No newline at end of file diff --git a/messytables/any.py b/messytables/any.py index fd9dfc5..1c9290b 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,5 +1,5 @@ from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, - HTMLTableSet, ODSTableSet) + HTMLTableSet, ODSTableSet, FODSTableSet) import messytables import re @@ -39,6 +39,7 @@ def TABTableSet(fileobj): 'HTML': HTMLTableSet, 'CSV': CSVTableSet, 'ODS': ODSTableSet, + 'FODS': FODSTableSet, 'PDF': PDFTableSet} @@ -107,7 +108,8 @@ def guess_ext(ext): 'xlsm': 'XLS', 'xltx': 'XLS', 'xltm': 'XLS', - 'ods': 'ODS'} + 'ods': 'ODS', + 'fods': 'FODS'} if ext in lookup: return lookup.get(ext, None) diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..74a0f59 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -5,7 +5,7 @@ from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, - CSVTableSet, ODSTableSet, + CSVTableSet, ODSTableSet, FODSTableSet, ReadError) suite = [{'filename': 'simple.csv', 'tableset': CSVTableSet}, @@ -13,6 +13,7 @@ {'filename': 'simple.xlsx', 'tableset': XLSTableSet}, {'filename': 'simple.zip', 'tableset': ZIPTableSet}, {'filename': 'simple.ods', 'tableset': ODSTableSet}, + {'filename': 'simple.fods', 'tableset': FODSTableSet}, {'filename': 'bian-anal-mca-2005-dols-eng-1011-0312-tab3.xlsm', 'tableset': XLSTableSet}, ] From 3e37ef96b2399686aeef7918d7b9a119de462f6e Mon Sep 17 00:00:00 2001 From: jonadem Date: Tue, 25 Jul 2017 17:36:44 +0200 Subject: [PATCH 4/4] Add problematic file "cell_starting_with_a_space.ods" --- horror/cell_starting_with_a_space.ods | Bin 0 -> 9090 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 horror/cell_starting_with_a_space.ods diff --git a/horror/cell_starting_with_a_space.ods b/horror/cell_starting_with_a_space.ods new file mode 100644 index 0000000000000000000000000000000000000000..a0cd3646350d2d4c4a53a46b0161760d6ac2070d GIT binary patch literal 9090 zcmeHNbySpHw;yQ)0qJf*LOO>=Is{~B5fFxFx#z2|+@-q`MnwL4iBG z-{)&web>78uY1?}_N?cbGqHcqIeR~I&OX22s45~N;R66@005uUnS!sCU?3X+0JuKj zZvm{$t-(-dJFt|||)2vAi-y^a(B;rax( zKM&$}oiVktHG+cwX$Vws%wI!hZfyhsJFrWdL#>VM9R6Ya2P>v-E4*?_J7-Ot}e{3aa00qSf8cDN4eijKL>G#~!uoH`m84!t;U zgz2I{hEqFQ0^zMNB#9Wr0o*Ja$pC~86JIOVHu78|L?6=1ZfVOWWCvM~6P`7=FJ!Eq zuZF2Z{A#HSwQCnx?I<3-XQh#@wiomG+~&GDTj*J1YOe5v{0{5vYE4*jm~5ag;(K5J z{NCvL73ue3)U?d1A=30q{?RoYQfhb<=4u}UP5Gi9dA}aPrCJ6(@wFaIpW2Avt&vZ% zt4vWSES(=_aPG>+r9wJ>xZ1|b9!^hj4=V~kMRQlCi|gRx!b5C|-82~DQS*q+y}^QJwTu^vT!B*@}J82~M=;J}_VgF%_4 z$#G(NdlF1%6Tf*8a>Z@K;Sm>G5$(p2De7$e)Wez1#E{thN#22A#t+o^BRZc|t5tbsL*_4}3u2Q_ zuZG4_O|O88I4F%RKpJ*?i~v~3LSQHY%P z{em)FCFLreNR$bV)s+@HWO@>o3=~+wfv24>h6%tkh>vRE<%*d&QT=|^&_!8h@mRsefLZ)JzO}gsH)&s$LLHSeXZ8s1 zr{M@KNsT9L=t>DMHI(d;$nbpvqlLuI(xu436u4z>uM539!DKW z6TO#AkTaBsVc z?RnzIM9KX$GOh06U0b=>f*F@HnGOeP)vYayzyiNo%_HW}x9HPYop~wt57)C3Zs<#L z(b9m@2zi=^5B%RGw$3aG&@(6Y&!MyjQe+G`kIb{35r#4i_7mg)nW7vKUqfL0^(2s~ zhjXtg&E~o^&ng@XpP!f6j&nAFM?L{p)Ls+;UMsSDIF5OKJ*i(AH!7zb|4eRMXB<#+ z7|#_q%Ra>8RZ<*2TN-}FzNXdKt!9o#)&a#-V)>3Kn0pF` z5rDtyjFm$kRw|Bj6j(dE8l?!Kd<@8x+e+S`>L$#A2@z%IP0>^B3#_0+8KD#M zQue!Vt6TeOq@R5}7eRVH820{U3Q8+$px|_7**i2Dc`k&BNGbjC0nPH;Qu>CvxtDWQ zz-`VCd>q!2F?Uk763>X7b9S`dOV(F6HsZE}E(z$4%hyZLd*U+;>TwVv36c%i z#+P4wA(82d9}vE2XAqd%e2R%We}Fd}O=b+cMVx|W6%!^DMj`rq!-&vFj*{#<4ooL~o*e0~YVM3&<^rsypz$ z@wg8+HDfyRO-`IubKUUtfUY8m_a_)ac3-42g^%ELM+|Jh{yq12x`6Mi)P_zLC&+P( zeBSKcN4+n=y!_`&%pdFAP#P&aeEEg^yTk{kXTcw`xwmP^&8h4;Zec#?!%j zn?st)74`878Oudcd6tK&g%;i#kmrEZQiK4rAZEBffBy8VQ;Ox>#}ci6C&$^tLcud` zm=BY$=rKXhOl7&_ueodfF`1(g#i=1s;>=I$O30B3$j;|20AB(ht;cpNz3ekE zoX_WsXcqVI?JhEm@%#25MdB}EP6WtCm1q!^7HR!5YT6;O(e<<=%c;{D`=R~U;_l(v z?6Jy*RVT0P$EbD*FF*OEZ;9QwUb7WT9Is}fN}$-c@XktMu6t&~ENBqpdygN} zs=p%ru2aIHu*zH7cOM0w%3-!whM=KSzLGRQUUnlD>{~=P)l!3QrG`15Qi}x50P|AI zZyHrUZ1Ah54u3BG3SFto2kK2;_dv0YJLXVL@Lk%@VuxYFuy?w!;U|`m{_$CV%+n^cnhhb}006uZ|0#Tbm6Bkn(e>i~DzdkDm;=Z6R9M<%92jE!#=MaJnV%05 zLI7=yR-Opc%aLNMwd}VdXZIQgq$|4P_ni8a5*Ih@Ei#Dg9_y*ok}}>!peq_$)lRql za_Z!loX_uTO?pq_E=xULoR`le8b}#TR`O7oQ5_~7{nmc1uL>`V{yAYgR`NhxL^K7SD5-^sEuD^K z$txK*ERib3NT0|v_pHzdc_*^~#O}_pQY>DN!GR{}L9WZ*7Ije36ooOU4POSB+bTZ) z4)dd(Y6xD)hNDWJVcvXw+zN87cb}sw5YqsspijAKv)kh^V9kK2+jC`%?M$zFp5+p| zp_AJc3DJ9^ZZktv;Ipu@I3H(_b7JF{Vn2e@%d_$+8UEw7iV86al%eg1ZB7hXN?6T9 zQF{mXBlBDoHBjDaM)8L6-O|JTYOQ&fDJFUVlab^BziUE(&&|ewH>@hzIn*qCehrWH zs9rj|4{#+MsZ!}S4>>YK`KReb4OADj6ZKNIj~66Rmy=IB+i6Q=+{%q7FVP7Qjcb=V z;$(HlG!8X#d*4}rA-#L?_9NAG_RZ;g!aL#FKmR-X--^OJR?Hz4$N<3gab5R<%wX0} zY>dpU9N3}XnrwD9kO0*OvbS!MUe|xOKYpxTUuH|p-@*>S05jr;NalM$jHRR#EguLf`WpIii-OB z`qtLg?(Xj4;o<4&>7}Km?d|Qu!^6wV%j>ILUBS=)vHwKi|9%7t&R&?qH<>o_QW7Ai z$(_+D`%G$l@j-f-T{HyRPghhxsMdi^eq9cbdkVJ;HKf>kdbZuRan_^OLrd(Y9edVZ zig^^5OYsu1H_(0Q(0$2srFaS6urUiI#C_EU{pDi7~4^`Ks<*sQck+>&O`K zY5512V#~RUEUzq~vZ}V^lC;x{kvdcBFIj*heT{aX_P~J4rXkP%-AfU*#+>GOmx2}c zExo4L*<$6XeP^Q6g&{NRRU8QL!ZC&SedUb9#!i7+lZd;!QDw(gQMh#6>HYE3!}3`Y zHD7dHl&lpXH`8lT%>s~aF!@rLX|kB(ox^GiAu`4i>1dfF*{!*?*(JY|Db=x~Zsiim z(<~WFQ1{d_kZxvmG5I3ywZYUy4`<&iBgOksxnF_(v_V80gXb%qeOM&ZVPjt!K@`=r z4~Nc+sah<;t(63)k7g+P2wP$_iIvy>pWuJaN@{f3r2p z*f~0r41c{XL~yTHIK=cNNRtY?7VRXJC$JYzJ zA)<0eTa_WQO+5(RZWa|%D@<0X)>g__QCR^UNMJ-L^te3cYV8M+Be6G@v?g(arOfrm zDqI93nKKT*d~w|@Y?!apg^ihhreG;WyC2XUA*a^SGa7Y#bTL!i-!%qz&-QwmY3 z0;S%5F;(R&*c;wYGBPIm`jP$@EBfC3kh~AD)#lcRIt;^#;P}teiSaX?hYcB_W>Bh7 z?3}uXXCvB#LUI~~PJ+&_Z%wW#GP&l(4Di%P*4Nb5X7$=)>l^rv1(K;!3OC*ijrac& z-2L_85DtHiKXvJwJ9GTu%084dtY*f$?|>w*i=F)i#kbyxNzGq73|la)wV!qcmtz}2 zbDGqVvq0G@Nuz5mWW)lQJP(8qXSUeDAH z-#ZRH*oNr3wCZIHm!;c`>3})H7C4`JoLf9gK`bguJbkAdZR5XKB@J@E?Wp7#9yR_o z6%&;rlWUUMaP9-49aidnvbw|Z7R%&umJYwEiN*Woi5@C_yw*X_MjW|41NeAj`0s3S z8`UfpSSUF@MHQY}$V_B6efEacbatA2$Gw&IKYzk1wMT|X-2NIJalYT zek_M6i9Q0c?$*!z-nj)%ke8^Wk`$O#OBJ?pC3YIYv z0MG`fvHv_bUGLx=z)+~U4dgG5aYs+>SvnBMqp2yqV@yO}t$t?HydI*eTHi*9RJWt& z9I6u7{qOtw|X5@&kECfNV>F?ox)T(7QBc za&~Bm#Y?u7$0Lo8u3+*Ec>V1Ze&gm|_gxYU6-X5ATVDFf=d3s8XFVOKxw{|nz#kzI znma>t8%wJdXj>~(TbjNyaoRGWVXa_3k~dbt`X#PPG-AAne{#oBWKX|rl0MOCw>bUg z?wfQpnvxfvy(HrwX{+P(ZAv#dmV*vnnI}|jMN??L@pP}nSH;hvY`gKM_d|Jnfun{o zt&jsYq(rm8n)Gyucvw-my938KGcvb&Gc(8^=0a1Xj}d!(O5KF)7Q|RdzKq(RHmp1V zNlAd9W?+}DJ>a*PC3KV#yQK=<^u_XM@2jIvwDyoXBVK*9|Aff>DIat+O3ADz*>fks z_5RWuo|RNfmrehPC9S|q0Tbc2I6gDU+?F<13PbC5Giyk4{8f zS9bc$PAb_(S%SrR*rAHNo0sQOoQtE*y5(dECbZ}5O!Er&)4S|h^0jr+oA8@`;&7q( zNW{KJ_npNQZP)Xfj#o1*pm0@%7jy_>w{h1oS8^$PC6t$QwsNJ9a89aj*mH&!iAjsi z;kgOl(GfctZLOB}4^`8y&wd(snn<438yrAEAvP>jog4R=zP^pEoM_5KIV{EMc6Hjm zwZ-IU(ps4Q+S6qH>dX?_SLEbt+i{;8eHV)geB0Y{4!hE@VP=^ zCo^0w)mLSbJdkUVR7PKKU1rO&TJ!G+YNqwzYTrV?yenxROI5n!-5-m?Dy%>gny7cN z;ibc;)J_u!O;7V#gp~kadLhKksF#f~E5CS&xB6)68DUFf8I;0f>10v$^HU4K*=fa) zx127hjXG!U`Z->H5Oxywab0?bwA1n##j81tq?Kw9?)xt`c1x@SU-HOIzxG(WP_8X2 zT6WWRWG*?_`Eqtf&_P2_d2?@gAe(8jR8Z^z)){vBVi8eaaHj;mt*7$Pxi=AmMtTwM zPD+ZIxo1`832ame8@)ey4G>#Y}&n;tmN4gn{Xn2Cl>ZR&nHy~jW{l2&T-3__*)pS0z zv=%XU4Y(rRLA+6iuos&OX}ycsbrv0btcU{Nwdx2plN`~HvY*%|Vv-7`FjqU&a;uMJ zLXb_krRQwi^ak(=)kivP;gm=Q8Y;?qgn3+qAWB~EVs<>xT}K>y2j66W_ES}v#b=aG zE31kmfw{^~i{$6Gh5MA zUf~29_G*I>&QWM&Hbbi)5q5wPw+sC%Q2f;?PjwpFmV~+yxySh))BzZc&%8d8q}+55 z2z7n_y!Wsj;rtQMH3V~w6OU|wXx=rY zq&dr(Lh9-LpCo<~I|%hybPZk|XKe($#i$pR?-IyYONQ!tl6X z$ad3uJJxR7k6O01lP0aH->Aues%ZV5*e-bm#GngZkEyhjE;7tU1O)A*Ig^$fJQNU{ zOsJSMgU*Pl%+uu_v9bQ zu>Z|T!ilvakfKd1HO|Z$KH*f?fT>2BfDULAeF#xi*fH#rR~${CtpPbxO1=XAo0;Cz z27b)aNpci}9hz{AaetySR%UTBT|aj^(dxb9g7HG4f_JG~UfLV$(HuJ9z34JRVSIjiu!91glb{b- z*JbQ?|%dKvv&Sxq@M?O`5UC~TKb<+e!iP{;OY1^L_ccl|HAo} zzaMptYj*X!S;29>Ywh1s^v&-_LjSsE{%)A)KWiYrdj3cT|MicEzg-pK9f1Fj