New style reader (#6)

chfw · web-flow · commit 21d2903e25e1 · 2020-10-03T22:53:41.000+01:00
* 🎉 xlsxr in new style.

* This is an auto-commit, updating project metadata such as changelog.rst and contributors.rst

* 💄 update coding style. address long lines

* This is an auto-commit, updating project metadata such as changelog.rst and contributors.rst

* 💚 use pyexcel-io 0.6.0 for testing

Co-authored-by: chfw <chfw@users.noreply.github.com>
diff --git a/pyexcel-xlsxr.yml b/pyexcel-xlsxr.yml
@@ -1,8 +1,8 @@
 overrides: "pyexcel.yaml"
 name: "pyexcel-xlsxr"
 nick_name: "xlsxr"
-version: "0.5.3"
-current_version: "0.5.3"
+version: "0.6.0"
+current_version: "0.6.0"
 release: "0.5.3"
 file_type: xlsx
 nodocs: true
diff --git a/pyexcel_xlsxr/__init__.py b/pyexcel_xlsxr/__init__.py
@@ -2,19 +2,25 @@
     pyexcel_xlsxr
     ~~~~~~~~~~~~~~~~~~~
     The lower level xlsx file format handler using lxml
-    :copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
+    :copyright: (c) 2015-2020 by Onni Software Ltd & its contributors
     :license: New BSD License
 """
 from pyexcel_io.io import get_data as read_data
 from pyexcel_io.io import isstream
-from pyexcel_io.plugins import IOPluginInfoChain
+from pyexcel_io.plugins import IOPluginInfoChainV2
 
-from pyexcel_xlsxr._version import __author__, __version__  # flake8: noqa
+from pyexcel_xlsxr._version import __author__, __version__  # noqa
 
 __FILE_TYPE__ = "xlsx"
 
-IOPluginInfoChain(__name__).add_a_reader(
+IOPluginInfoChainV2(__name__).add_a_reader(
     relative_plugin_class_path="xlsxr.XLSXBook",
+    locations=["file", "memory"],
+    file_types=[__FILE_TYPE__],
+    stream_type="binary",
+).add_a_reader(
+    relative_plugin_class_path="xlsxr.XLSXBookInContent",
+    locations=["content"],
     file_types=[__FILE_TYPE__],
     stream_type="binary",
 )
diff --git a/pyexcel_xlsxr/_version.py b/pyexcel_xlsxr/_version.py
@@ -1,2 +1,2 @@
-__version__ = '0.5.3'
+__version__ = '0.6.0'
 __author__ = 'chfw'
diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py
@@ -11,15 +11,15 @@
 WORK_BOOK = "xl/workbook.xml"
 SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml"
 SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml"
-XLSX_ROW_MATCH = re.compile(b".*?(<row.*?<\/.*?row>).*?", re.MULTILINE)
+XLSX_ROW_MATCH = re.compile(rb".*?(<row.*?<\/.*?row>).*?", re.MULTILINE)
 NUMBER_FMT_MATCHER = re.compile(
-    b".*?(<numFmts.*?<\/.*?numFmts>).*?", re.MULTILINE
+    rb".*?(<numFmts.*?<\/.*?numFmts>).*?", re.MULTILINE
 )
 XFS_FMT_MATCHER = re.compile(
-    b".*?(<cellXfs.*?<\/.*?cellXfs>).*?", re.MULTILINE
+    rb".*?(<cellXfs.*?<\/.*?cellXfs>).*?", re.MULTILINE
 )
-SHEET_FMT_MATCHER = re.compile(b".*?(<sheet .*?\/>).*?", re.MULTILINE)
-DATE_1904_MATCHER = re.compile(b".*?(<workbookPr.*?\/>).*?", re.MULTILINE)
+SHEET_FMT_MATCHER = re.compile(rb".*?(<sheet .*?\/>).*?", re.MULTILINE)
+DATE_1904_MATCHER = re.compile(rb".*?(<workbookPr.*?\/>).*?", re.MULTILINE)
 # "xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"
 # But it not used for now
 X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"'
@@ -140,8 +140,9 @@ def __extract_book_properties(self):
         book_content = self.zip_file.open(WORK_BOOK).read()
         return parse_book_properties(book_content)
 
-    def __del__(self):
-        self.zip_file.close()
+    def close(self):
+        if self.zip_file:
+            self.zip_file.close()
 
     def make_tables(self):
         sheet_files = find_sheets(self.zip_file.namelist())
@@ -216,9 +217,9 @@ def parse_cell_type(cell):
     cell_type = None
     if cell.style_string:
         date_time_flag = (
-            re.match("^\d+(\.\d+)?$", cell.value)
+            re.match(r"^\d+(\.\d+)?$", cell.value)
             and re.match(".*[hsmdyY]", cell.style_string)
-            and not re.match(".*\[.*[dmhys].*\]", cell.style_string)
+            and not re.match(r".*\[.*[dmhys].*\]", cell.style_string)
         )
         if cell.style_string in FORMATS:
             cell_type = FORMATS[cell.style_string]
@@ -227,7 +228,7 @@ def parse_cell_type(cell):
                 cell_type = "time"
             else:
                 cell_type = "date"
-        elif re.match("^-?\d+(.\d+)?$", cell.value):
+        elif re.match(r"^-?\d+(.\d+)?$", cell.value):
             cell_type = "float"
     return cell_type
 
@@ -313,9 +314,12 @@ def parse_book_properties(book_content):
                     properties["date1904"] = value.lower().strip() == "true"
                 else:
                     properties["date1904"] = False
-    namespaces = {
-        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships"  # flake8: noqa
-    }
+
+    ns = (
+        "http://schemas.openxmlformats.org/"
+        + "officeDocument/2006/relationships"
+    )
+    namespaces = {"r": ns}
 
     xlsx_header = u"<wrapper {0}>".format(
         " ".join('xmlns:{0}="{1}"'.format(k, v) for k, v in namespaces.items())
diff --git a/pyexcel_xlsxr/xlsxr.py b/pyexcel_xlsxr/xlsxr.py
@@ -1,15 +1,14 @@
 from datetime import date, datetime, time
-from io import UnsupportedOperation
+from io import BytesIO
 
 import pyexcel_io.service as service
-from pyexcel_io._compact import BytesIO, OrderedDict
-from pyexcel_io.book import BookReader
-from pyexcel_io.sheet import SheetReader
+from pyexcel_io.plugin_api.abstract_reader import IReader
+from pyexcel_io.plugin_api.abstract_sheet import ISheet
 
 from pyexcel_xlsxr.messy_xlsx import XLSXBookSet
 
 
-class XLSXSheet(SheetReader):
+class XLSXSheet(ISheet):
     def __init__(
         self,
         sheet,
@@ -18,10 +17,11 @@ def __init__(
         auto_detect_datetime=True,
         **keywords
     ):
-        SheetReader.__init__(self, sheet, **keywords)
+        self._native_sheet = sheet
         self.__auto_detect_int = auto_detect_int
         self.__auto_detect_float = auto_detect_float
         self.__auto_detect_datetime = auto_detect_datetime
+        self._keywords = keywords
 
     @property
     def name(self):
@@ -55,60 +55,32 @@ def __convert_cell(self, cell):
         return ret
 
 
-class XLSXBook(BookReader):
-    def open(self, file_name, **keywords):
-        BookReader.open(self, file_name, **keywords)
-        self._load_from_file()
-
-    def open_stream(self, file_stream, **keywords):
-        if not hasattr(file_stream, "seek"):
-            # python 2
-            # Hei zipfile in odfpy would do a seek
-            # but stream from urlib cannot do seek
-            file_stream = BytesIO(file_stream.read())
-        try:
-            file_stream.seek(0)
-        except UnsupportedOperation:
-            # python 3
-            file_stream = BytesIO(file_stream.read())
-        BookReader.open_stream(self, file_stream, **keywords)
-        self._load_from_memory()
-
-    def read_sheet_by_name(self, sheet_name):
+class XLSXBook(IReader):
+    def __init__(self, file_alike_object, _, **keywords):
+        self._native_book = XLSXBookSet(file_alike_object)
+        self._keywords = keywords
         tables = self._native_book.make_tables()
-        rets = [table for table in tables if table.name == sheet_name]
-        if len(rets) == 0:
-            raise ValueError("%s cannot be found" % sheet_name)
-        else:
-            return self.read_sheet(rets[0])
+        self.content_array = [
+            NameObject(table.name, table) for table in tables
+        ]
 
-    def read_sheet_by_index(self, sheet_index):
+    def read_sheet(self, sheet_index):
         """read a sheet at a specified index"""
-        tables = self._native_book.make_tables()
-        length = len(tables)
-        if sheet_index < length:
-            return self.read_sheet(tables[sheet_index])
-        else:
-            raise IndexError(
-                "Index %d of out bound %d" % (sheet_index, length)
-            )
-
-    def read_all(self):
-        """read all sheets"""
-        result = OrderedDict()
-        for sheet in self._native_book.make_tables():
-            ods_sheet = XLSXSheet(sheet, **self._keywords)
-            result[ods_sheet.name] = ods_sheet.to_array()
-
-        return result
-
-    def read_sheet(self, native_sheet):
-        """read one native sheet"""
-        sheet = XLSXSheet(native_sheet, **self._keywords)
-        return {sheet.name: sheet.to_array()}
-
-    def _load_from_memory(self):
-        self._native_book = XLSXBookSet(self._file_stream)
-
-    def _load_from_file(self):
-        self._native_book = XLSXBookSet(self._file_name)
+        table = self.content_array[sheet_index].sheet
+        sheet = XLSXSheet(table, **self._keywords)
+        return sheet
+
+    def close(self):
+        self._native_book.close()
+
+
+class XLSXBookInContent(XLSXBook):
+    def __init__(self, file_content, file_type, **keywords):
+        file_stream = BytesIO(file_content)
+        super().__init__(file_stream, file_type, **keywords)
+
+
+class NameObject(object):
+    def __init__(self, name, sheet):
+        self.name = name
+        self.sheet = sheet
diff --git a/rnd_requirements.txt b/rnd_requirements.txt
@@ -0,0 +1 @@
+https://github.com/pyexcel/pyexcel-io/archive/dev.zip
diff --git a/setup.py b/setup.py
@@ -32,7 +32,7 @@
 
 NAME = "pyexcel-xlsxr"
 AUTHOR = "chfw"
-VERSION = "0.5.3"
+VERSION = "0.6.0"
 EMAIL = "info@pyexcel.org"
 LICENSE = "New BSD"
 DESCRIPTION = (
diff --git a/tests/test_messy_xlsx.py b/tests/test_messy_xlsx.py
@@ -114,7 +114,15 @@ def test_alternative_single_sheet():
 
 
 def test_parse_row():
-    xml_string = b'<row collapsed="false" customFormat="false" customHeight="false" hidden="false" ht="12.75" outlineLevel="0" r="4"><c r="A4" s="1" t="n"><v>42005</v></c><c r="B4" s="2" t="n"><v>0.550844907407407</v></c></row>'  # flake8: noqa
+    xml_string = b"""
+       <row collapsed="false" customFormat="false"
+         customHeight="false" hidden="false" ht="12.75" outlineLevel="0" r="4">
+          <c r="A4" s="1" t="n"><v>42005</v></c><c r="B4" s="2" t="n">
+          <v>0.550844907407407</v>
+          </c>
+        </row>""".replace(
+        b"\n", b" "
+    )
 
     class Book:
         def __init__(self):
@@ -133,30 +141,80 @@ def __init__(self):
 
 
 def test_parse_styles():
-    sample = b'<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><numFmts count="3"><numFmt formatCode="GENERAL" numFmtId="164"/><numFmt formatCode="DD/MM/YY" numFmtId="165"/><numFmt formatCode="H:MM:SS;@" numFmtId="166"/></numFmts><fonts count="4"><font><name val="Arial"/>'
+    sample = b"""
+     <styleSheet
+       xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+     <numFmts count="3"><numFmt formatCode="GENERAL" numFmtId="164"/>
+     <numFmt formatCode="DD/MM/YY" numFmtId="165"/>
+     <numFmt formatCode="H:MM:SS;@" numFmtId="166"/>
+     </numFmts><fonts count="4"><font>
+    <name val="Arial"/>""".replace(
+        b"\n", b" "
+    )
     styles = parse_styles(sample)
     eq_(list(styles.values()), ["general", "dd/mm/yy", "h:mm:ss;@"])
 
 
 def test_parse_properties():
-    sample = b'<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><fileVersion appName="Calc"/><workbookPr backupFile="false" showObjects="all" date1904="false"/><workbookProtection/>'
+    sample = b"""
+     <workbook
+       xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+    xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+         <fileVersion appName="Calc"/>
+         <workbookPr backupFile="false" showObjects="all" date1904="false"/>
+    <workbookProtection/>""".replace(
+        b"\n", b" "
+    )
     properties = parse_book_properties(sample)
     eq_(properties, {"date1904": False, "sheets": []})
 
 
 def test_parse_sheet_properties():
-    sample = b'</bookViews><sheets><sheet name="Sheet1" sheetId="1" state="visible" r:id="rId2"/><sheet name="Sheet2" sheetId="2" state="visible" r:id="rId3"/><sheet name="Sheet3" sheetId="3" state="visible" r:id="rId4"/></sheets><calcPr iterateCount="100" refMode="A1" iterate="false" iterateDelta="0.001"/>'
+    sample = b"""
+     </bookViews><sheets>
+        <sheet name="Sheet1" sheetId="1" state="visible" r:id="rId2"/>
+        <sheet name="Sheet2" sheetId="2" state="visible" r:id="rId3"/>
+        <sheet name="Sheet3" sheetId="3" state="visible" r:id="rId4"/>
+        </sheets>
+    <calcPr iterateCount="100" refMode="A1"
+    iterate="false" iterateDelta="0.001"/>""".replace(
+        b"\n", b" "
+    )
     properties = parse_book_properties(sample)
     eq_(properties, {"sheets": ["Sheet1", "Sheet2", "Sheet3"]})
 
 
 def test_parse_xfs_styles():
-    sample = b'<cellXfs count="3"><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="164" xfId="0"></xf><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="165" xfId="0"></xf><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="166" xfId="0"></xf></cellXfs><cellStyles count="6">'
+    sample = b"""
+    <cellXfs count="3">
+      <xf applyAlignment="false"
+          applyBorder="false" applyFont="false"
+          applyProtection="false"
+          borderId="0" fillId="0" fontId="0" numFmtId="164" xfId="0">
+      </xf>
+     <xf applyAlignment="false"
+         applyBorder="false" applyFont="false"
+         applyProtection="false"
+         borderId="0" fillId="0" fontId="0" numFmtId="165" xfId="0">
+     </xf>
+     <xf applyAlignment="false"
+         applyBorder="false" applyFont="false"
+         applyProtection="false"
+         borderId="0" fillId="0" fontId="0" numFmtId="166" xfId="0">
+     </xf>
+    </cellXfs><cellStyles count="6">""".replace(
+        b"\n", b" "
+    )
     xfs_styles = parse_xfs_styles(sample)
     eq_(xfs_styles, [164, 165, 166])
 
 
 def test_parse_shared_strings():
-    sample = b'<sst count="2" uniqueCount="2" xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><si><t>Date</t></si><si><t>Time</t></si></sst>'
+    sample = b"""
+    <sst count="2" uniqueCount="2"
+       xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+    <si><t>Date</t></si><si><t>Time</t></si></sst>""".replace(
+        b"\n", b" "
+    )
     content = parse_shared_strings(sample)
     eq_(list(content), ["Date", "Time"])

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-__version__ = '0.5.3'`
	`1`	`+__version__ = '0.6.0'`
`2`	`2`	`__author__ = 'chfw'`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+https://github.com/pyexcel/pyexcel-io/archive/dev.zip`