diff --git a/horror/cell_starting_with_a_space.ods b/horror/cell_starting_with_a_space.ods new file mode 100644 index 0000000..a0cd364 Binary files /dev/null and b/horror/cell_starting_with_a_space.ods differ diff --git a/horror/simple.fods b/horror/simple.fods new file mode 100644 index 0000000..0aed0bf --- /dev/null +++ b/horror/simple.fods @@ -0,0 +1,273 @@ + + + + 2013-07-08T13:31:432013-07-19T10:14:48P0D3LibreOffice/5.3.4.2$Windows_x86 LibreOffice_project/f82d347ccc0be322489bf7da61d7e4ad13fe2ff3 + + + 0 + 0 + 6321 + 1781 + + + view1 + + + 0 + 4 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 64 + 60 + true + false + + + Sheet1 + 1856 + 0 + 64 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + true + 12632256 + true + false + true + 3 + true + false + false + 1000 + 1000 + 1 + 1 + true + true + true + Microsoft Print to PDF + FRb+/01pY3Jvc29mdCBQcmludCB0byBQREYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATWljcm9zb2Z0IFByaW50IFRvIFBERgAAAAAAAAAAAAAWAAEAMhUAAAAAAAAIAFZUAAAkbQAAM1ROVwYATQBpAGMAcgBvAHMAbwBmAHQAIABQAHIAaQBuAHQAIAB0AG8AIABQAEQARgAAAAAAAAAAAAAAAAAAAAAAAAAAAAEEAwbcAFAUAy8BAAEAAQDqCm8IZAABAA8AWAICAAEAWAIDAAEATABlAHQAdABlAHIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAABAAAAAgAAAAEAAAD/////R0lTNAAAAAAAAAAAAAAAAERJTlUiAMgAJAMsET9de34AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQAAAAAACQABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADIAAAAU01USgAAAAAQALgAewAwADgANABGADAAMQBGAEEALQBFADYAMwA0AC0ANABEADcANwAtADgAMwBFAEUALQAwADcANAA4ADEANwBDADAAMwA1ADgAMQB9AAAAUkVTRExMAFVuaXJlc0RMTABQYXBlclNpemUATEVUVEVSAE9yaWVudGF0aW9uAFBPUlRSQUlUAFJlc29sdXRpb24AUmVzT3B0aW9uMQBDb2xvck1vZGUAQ29sb3IAAAAAAAAAAAAAAAAAACwRAABWNERNAQAAAAAAAACcCnAiHAAAAOwAAAADAAAA+gFPCDTmd02D7gdIF8A1gdAAAABMAAAAAwAAAAAIAAAAAAAAAAAAAAMAAAAACAAAKgAAAAAIAAADAAAAQAAAAFYAAAAAEAAARABvAGMAdQBtAGUAbgB0AFUAcwBlAHIAUABhAHMAcwB3AG8AcgBkAAAARABvAGMAdQBtAGUAbgB0AE8AdwBuAGUAcgBQAGEAcwBzAHcAbwByAGQAAABEAG8AYwB1AG0AZQBuAHQAQwByAHkAcAB0AFMAZQBjAHUAcgBpAHQAeQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASAENPTVBBVF9EVVBMRVhfTU9ERRMARHVwbGV4TW9kZTo6VW5rbm93bg== + 0 + false + true + true + false + false + false + 7 + + + + + + + + + + + + + + + + + + + + + + + + £ + + + + + - + £ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + / + + / + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ??? (???) + + + 00/00/0000, 00:00:00 + + + + + Page 1 / 99 + + + + + + + + + + + + Name + + + Age + + + When + + + + + Bob + + + 20 + + + 10/10/10 + + + + + Jane + + + 23 + + + 01/01/12 + + + + + Ian + + + 34 + + + 11/11/15 + + + + + + + + + + + + + \ No newline at end of file diff --git a/messytables/__init__.py b/messytables/__init__.py index e2c03b9..f1f4137 100644 --- a/messytables/__init__.py +++ b/messytables/__init__.py @@ -9,6 +9,7 @@ from messytables.core import Cell, TableSet, RowSet, seekable_stream from messytables.commas import CSVTableSet, CSVRowSet from messytables.ods import ODSTableSet, ODSRowSet +from messytables.fods import FODSTableSet, FODSRowSet from messytables.excel import XLSTableSet, XLSRowSet # XLSXTableSet has been deprecated and its functionality is now provided by diff --git a/messytables/any.py b/messytables/any.py index fd9dfc5..1c9290b 100644 --- a/messytables/any.py +++ b/messytables/any.py @@ -1,5 +1,5 @@ from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet, - HTMLTableSet, ODSTableSet) + HTMLTableSet, ODSTableSet, FODSTableSet) import messytables import re @@ -39,6 +39,7 @@ def TABTableSet(fileobj): 'HTML': HTMLTableSet, 'CSV': CSVTableSet, 'ODS': ODSTableSet, + 'FODS': FODSTableSet, 'PDF': PDFTableSet} @@ -107,7 +108,8 @@ def guess_ext(ext): 'xlsm': 'XLS', 'xltx': 'XLS', 'xltm': 'XLS', - 'ods': 'ODS'} + 'ods': 'ODS', + 'fods': 'FODS'} if ext in lookup: return lookup.get(ext, None) diff --git a/messytables/fods.py b/messytables/fods.py new file mode 100644 index 0000000..bf568cf --- /dev/null +++ b/messytables/fods.py @@ -0,0 +1,215 @@ +import io +import re + +from lxml import etree + +from messytables.core import RowSet, TableSet, Cell +from messytables.types import (StringType, DecimalType, + DateType, BoolType, CurrencyType, + TimeType, PercentageType) + + +FODS_NAMESPACES_TAG_MATCH = re.compile( + b"(]*>)", re.MULTILINE) +ODS_TABLE_MATCH = re.compile( + b".*?().*?", re.DOTALL) +ODS_TABLE_NAME = re.compile(b'.*?table:name=\"(.*?)\".*?') +ODS_ROW_MATCH = re.compile( + b".*?().*?", re.DOTALL) + +NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s" +NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s" +NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0" +NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0" + +TABLE_CELL = 'table-cell' +VALUE_TYPE = 'value-type' +COLUMN_REPEAT = 'number-columns-repeated' +EMPTY_CELL_VALUE = '' + +ODS_VALUE_TOKEN = { + "float": "value", + "date": "date-value", + "time": "time-value", + "boolean": "boolean-value", + "percentage": "value", + "currency": "value" +} + +ODS_TYPES = { + 'float': DecimalType(), + 'date': DateType('%Y-%m-%d'), + 'boolean': BoolType(), + 'percentage': PercentageType(), + 'time': TimeType() +} + + +class FODSTableSet(TableSet): + """ + A wrapper around ODS files. Because they are zipped and the info we want + is in the zipped file as content.xml we must ensure that we either have + a seekable object (local file) or that we retrieve all of the content from + the remote URL. + """ + + def __init__(self, fileobj, window=None, **kw): + '''Initialize the object. + + :param fileobj: may be a file path or a file-like object. Note the + file-like object *must* be in binary mode and must be seekable (it will + get passed to zipfile). + + As a specific tip: urllib2.urlopen returns a file-like object that is + not in file-like mode while urllib.urlopen *does*! + + To get a seekable file you *cannot* use + messytables.core.seekable_stream as it does not support the full seek + functionality. + ''' + if hasattr(fileobj, 'read'): + # wrap in a StringIO so we do not have hassle with seeks and + # binary etc (see notes to __init__ above) + # TODO: rather wasteful if in fact fileobj comes from disk + fileobj = io.BytesIO(fileobj.read()) + + self.window = window + + self.content = fileobj.read() + + def make_tables(self): + """ + Return the sheets in the workbook. + + A regex is used for this to avoid having to: + + 1. load large the entire file into memory, or + 2. SAX parse the file more than once + """ + namespace_tags = self._get_namespace_tags() + sheets = [m.groups(0)[0] + for m in ODS_TABLE_MATCH.finditer(self.content)] + return [FODSRowSet(sheet, self.window, namespace_tags) + for sheet in sheets] + + def _get_namespace_tags(self): + match = re.search(FODS_NAMESPACES_TAG_MATCH, self.content) + assert match + tag_open = match.groups()[0] + tag_close = b'' + return tag_open, tag_close + + +class FODSRowSet(RowSet): + """ ODS support for a single sheet in the ODS workbook. Unlike + the CSV row set this is not a streaming operation. """ + + def __init__(self, sheet, window=None, namespace_tags=None): + self.sheet = sheet + + self.name = "Unknown" + m = ODS_TABLE_NAME.match(self.sheet) + if m: + self.name = m.groups(0)[0] + + self.window = window or 1000 + + # We must wrap the XML fragments in a valid header otherwise iterparse + # will explode with certain (undefined) versions of libxml2. The + # namespaces are in the ODS file, and change with the libreoffice + # version saving it, so get them from the ODS file if possible. The + # default namespaces are an option to preserve backwards compatibility + # of ODSRowSet. + if namespace_tags: + self.namespace_tags = namespace_tags + else: + namespaces = { + "dc": u"http://purl.org/dc/elements/1.1/", + "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0", + "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0", + "office": NS_OPENDOCUMENT_PTTN % u"office:1.0", + "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0", + "table": NS_OPENDOCUMENT_PTTN % u"table:1.0", + "text": NS_OPENDOCUMENT_PTTN % u"text:1.0", + "calcext": NS_CAL_PTTN % u"calcext:1.0", + } + + ods_header = u""\ + .format(" ".join('xmlns:{0}="{1}"'.format(k, v) + for k, v in namespaces.iteritems())).encode('utf-8') + ods_footer = u"".encode('utf-8') + self.namespace_tags = (ods_header, ods_footer) + + super(FODSRowSet, self).__init__(typed=True) + + def raw(self, sample=False): + """ Iterate over all rows in this sheet. """ + rows = ODS_ROW_MATCH.findall(self.sheet) + + for row in rows: + row_data = [] + + block = self.namespace_tags[0] + row + self.namespace_tags[1] + partial = io.BytesIO(block) + empty_row = True + + for action, element in etree.iterparse(partial, ('end',)): + if element.tag != _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL): + continue + + cell = _read_cell(element) + if empty_row is True and cell.value != EMPTY_CELL_VALUE: + empty_row = False + + repeat = element.attrib.get( + _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)) + if repeat: + number_of_repeat = int(repeat) + row_data += [cell] * number_of_repeat + else: + row_data.append(cell) + + if empty_row: + # ignore blank lines + continue + + del partial + yield row_data + del rows + + +def _read_cell(element): + cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE)) + value_token = ODS_VALUE_TOKEN.get(cell_type, 'value') + if cell_type == 'string': + cell = _read_text_cell(element) + elif cell_type == 'currency': + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency')) + cell = Cell(value + ' ' + currency, type=CurrencyType()) + elif cell_type is not None: + value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token)) + cell = Cell(value, type=ODS_TYPES.get(cell_type, StringType())) + else: + cell = Cell(EMPTY_CELL_VALUE, type=StringType()) + + return cell + + +def _read_text_cell(element): + children = element.getchildren() + text_content = [] + for child in children: + if child.text: + text_content.append(child.text) + else: + text_content.append(EMPTY_CELL_VALUE) + if len(text_content) > 0: + cell_value = '\n'.join(text_content) + else: + cell_value = EMPTY_CELL_VALUE + return Cell(cell_value, type=StringType()) + + +def _tag(namespace, tag): + return '{%s}%s' % (namespace, tag) diff --git a/test/test_any.py b/test/test_any.py index 1fbfe78..74a0f59 100644 --- a/test/test_any.py +++ b/test/test_any.py @@ -5,7 +5,7 @@ from nose.tools import assert_equal from nose.plugins.skip import SkipTest from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet, - CSVTableSet, ODSTableSet, + CSVTableSet, ODSTableSet, FODSTableSet, ReadError) suite = [{'filename': 'simple.csv', 'tableset': CSVTableSet}, @@ -13,6 +13,7 @@ {'filename': 'simple.xlsx', 'tableset': XLSTableSet}, {'filename': 'simple.zip', 'tableset': ZIPTableSet}, {'filename': 'simple.ods', 'tableset': ODSTableSet}, + {'filename': 'simple.fods', 'tableset': FODSTableSet}, {'filename': 'bian-anal-mca-2005-dols-eng-1011-0312-tab3.xlsm', 'tableset': XLSTableSet}, ]