Skip to content

Commit 21d2903

Browse files
authored
New style reader (#6)
* 🎉 xlsxr in new style. * This is an auto-commit, updating project meta data, such as changelog.rst, contributors.rst * 💄 update coding style. address long lines * This is an auto-commit, updating project meta data, such as changelog.rst, contributors.rst * 💚 use pyexcel-io 0.6.0 for testing Co-authored-by: chfw <[email protected]>
1 parent 3ee118c commit 21d2903

File tree

8 files changed

+128
-87
lines changed

8 files changed

+128
-87
lines changed

pyexcel-xlsxr.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
overrides: "pyexcel.yaml"
22
name: "pyexcel-xlsxr"
33
nick_name: "xlsxr"
4-
version: "0.5.3"
5-
current_version: "0.5.3"
4+
version: "0.6.0"
5+
current_version: "0.6.0"
66
release: "0.5.3"
77
file_type: xlsx
88
nodocs: true

pyexcel_xlsxr/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,25 @@
22
pyexcel_xlsxr
33
~~~~~~~~~~~~~~~~~~~
44
The lower level xlsx file format handler using lxml
5-
:copyright: (c) 2015-2017 by Onni Software Ltd & its contributors
5+
:copyright: (c) 2015-2020 by Onni Software Ltd & its contributors
66
:license: New BSD License
77
"""
88
from pyexcel_io.io import get_data as read_data
99
from pyexcel_io.io import isstream
10-
from pyexcel_io.plugins import IOPluginInfoChain
10+
from pyexcel_io.plugins import IOPluginInfoChainV2
1111

12-
from pyexcel_xlsxr._version import __author__, __version__ # flake8: noqa
12+
from pyexcel_xlsxr._version import __author__, __version__ # noqa
1313

1414
__FILE_TYPE__ = "xlsx"
1515

16-
IOPluginInfoChain(__name__).add_a_reader(
16+
IOPluginInfoChainV2(__name__).add_a_reader(
1717
relative_plugin_class_path="xlsxr.XLSXBook",
18+
locations=["file", "memory"],
19+
file_types=[__FILE_TYPE__],
20+
stream_type="binary",
21+
).add_a_reader(
22+
relative_plugin_class_path="xlsxr.XLSXBookInContent",
23+
locations=["content"],
1824
file_types=[__FILE_TYPE__],
1925
stream_type="binary",
2026
)

pyexcel_xlsxr/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = '0.5.3'
1+
__version__ = '0.6.0'
22
__author__ = 'chfw'

pyexcel_xlsxr/messy_xlsx.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@
1111
WORK_BOOK = "xl/workbook.xml"
1212
SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml"
1313
SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml"
14-
XLSX_ROW_MATCH = re.compile(b".*?(<row.*?<\/.*?row>).*?", re.MULTILINE)
14+
XLSX_ROW_MATCH = re.compile(rb".*?(<row.*?<\/.*?row>).*?", re.MULTILINE)
1515
NUMBER_FMT_MATCHER = re.compile(
16-
b".*?(<numFmts.*?<\/.*?numFmts>).*?", re.MULTILINE
16+
rb".*?(<numFmts.*?<\/.*?numFmts>).*?", re.MULTILINE
1717
)
1818
XFS_FMT_MATCHER = re.compile(
19-
b".*?(<cellXfs.*?<\/.*?cellXfs>).*?", re.MULTILINE
19+
rb".*?(<cellXfs.*?<\/.*?cellXfs>).*?", re.MULTILINE
2020
)
21-
SHEET_FMT_MATCHER = re.compile(b".*?(<sheet .*?\/>).*?", re.MULTILINE)
22-
DATE_1904_MATCHER = re.compile(b".*?(<workbookPr.*?\/>).*?", re.MULTILINE)
21+
SHEET_FMT_MATCHER = re.compile(rb".*?(<sheet .*?\/>).*?", re.MULTILINE)
22+
DATE_1904_MATCHER = re.compile(rb".*?(<workbookPr.*?\/>).*?", re.MULTILINE)
2323
# "xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"
2424
# But it is not used for now
2525
X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"'
@@ -140,8 +140,9 @@ def __extract_book_properties(self):
140140
book_content = self.zip_file.open(WORK_BOOK).read()
141141
return parse_book_properties(book_content)
142142

143-
def __del__(self):
144-
self.zip_file.close()
143+
def close(self):
144+
if self.zip_file:
145+
self.zip_file.close()
145146

146147
def make_tables(self):
147148
sheet_files = find_sheets(self.zip_file.namelist())
@@ -216,9 +217,9 @@ def parse_cell_type(cell):
216217
cell_type = None
217218
if cell.style_string:
218219
date_time_flag = (
219-
re.match("^\d+(\.\d+)?$", cell.value)
220+
re.match(r"^\d+(\.\d+)?$", cell.value)
220221
and re.match(".*[hsmdyY]", cell.style_string)
221-
and not re.match(".*\[.*[dmhys].*\]", cell.style_string)
222+
and not re.match(r".*\[.*[dmhys].*\]", cell.style_string)
222223
)
223224
if cell.style_string in FORMATS:
224225
cell_type = FORMATS[cell.style_string]
@@ -227,7 +228,7 @@ def parse_cell_type(cell):
227228
cell_type = "time"
228229
else:
229230
cell_type = "date"
230-
elif re.match("^-?\d+(.\d+)?$", cell.value):
231+
elif re.match(r"^-?\d+(.\d+)?$", cell.value):
231232
cell_type = "float"
232233
return cell_type
233234

@@ -313,9 +314,12 @@ def parse_book_properties(book_content):
313314
properties["date1904"] = value.lower().strip() == "true"
314315
else:
315316
properties["date1904"] = False
316-
namespaces = {
317-
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships" # flake8: noqa
318-
}
317+
318+
ns = (
319+
"http://schemas.openxmlformats.org/"
320+
+ "officeDocument/2006/relationships"
321+
)
322+
namespaces = {"r": ns}
319323

320324
xlsx_header = u"<wrapper {0}>".format(
321325
" ".join('xmlns:{0}="{1}"'.format(k, v) for k, v in namespaces.items())

pyexcel_xlsxr/xlsxr.py

Lines changed: 32 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
from datetime import date, datetime, time
2-
from io import UnsupportedOperation
2+
from io import BytesIO
33

44
import pyexcel_io.service as service
5-
from pyexcel_io._compact import BytesIO, OrderedDict
6-
from pyexcel_io.book import BookReader
7-
from pyexcel_io.sheet import SheetReader
5+
from pyexcel_io.plugin_api.abstract_reader import IReader
6+
from pyexcel_io.plugin_api.abstract_sheet import ISheet
87

98
from pyexcel_xlsxr.messy_xlsx import XLSXBookSet
109

1110

12-
class XLSXSheet(SheetReader):
11+
class XLSXSheet(ISheet):
1312
def __init__(
1413
self,
1514
sheet,
@@ -18,10 +17,11 @@ def __init__(
1817
auto_detect_datetime=True,
1918
**keywords
2019
):
21-
SheetReader.__init__(self, sheet, **keywords)
20+
self._native_sheet = sheet
2221
self.__auto_detect_int = auto_detect_int
2322
self.__auto_detect_float = auto_detect_float
2423
self.__auto_detect_datetime = auto_detect_datetime
24+
self._keywords = keywords
2525

2626
@property
2727
def name(self):
@@ -55,60 +55,32 @@ def __convert_cell(self, cell):
5555
return ret
5656

5757

58-
class XLSXBook(BookReader):
59-
def open(self, file_name, **keywords):
60-
BookReader.open(self, file_name, **keywords)
61-
self._load_from_file()
62-
63-
def open_stream(self, file_stream, **keywords):
64-
if not hasattr(file_stream, "seek"):
65-
# python 2
66-
# Hei zipfile in odfpy would do a seek
67-
# but stream from urllib cannot do seek
68-
file_stream = BytesIO(file_stream.read())
69-
try:
70-
file_stream.seek(0)
71-
except UnsupportedOperation:
72-
# python 3
73-
file_stream = BytesIO(file_stream.read())
74-
BookReader.open_stream(self, file_stream, **keywords)
75-
self._load_from_memory()
76-
77-
def read_sheet_by_name(self, sheet_name):
58+
class XLSXBook(IReader):
59+
def __init__(self, file_alike_object, _, **keywords):
60+
self._native_book = XLSXBookSet(file_alike_object)
61+
self._keywords = keywords
7862
tables = self._native_book.make_tables()
79-
rets = [table for table in tables if table.name == sheet_name]
80-
if len(rets) == 0:
81-
raise ValueError("%s cannot be found" % sheet_name)
82-
else:
83-
return self.read_sheet(rets[0])
63+
self.content_array = [
64+
NameObject(table.name, table) for table in tables
65+
]
8466

85-
def read_sheet_by_index(self, sheet_index):
67+
def read_sheet(self, sheet_index):
8668
"""read a sheet at a specified index"""
87-
tables = self._native_book.make_tables()
88-
length = len(tables)
89-
if sheet_index < length:
90-
return self.read_sheet(tables[sheet_index])
91-
else:
92-
raise IndexError(
93-
"Index %d of out bound %d" % (sheet_index, length)
94-
)
95-
96-
def read_all(self):
97-
"""read all sheets"""
98-
result = OrderedDict()
99-
for sheet in self._native_book.make_tables():
100-
ods_sheet = XLSXSheet(sheet, **self._keywords)
101-
result[ods_sheet.name] = ods_sheet.to_array()
102-
103-
return result
104-
105-
def read_sheet(self, native_sheet):
106-
"""read one native sheet"""
107-
sheet = XLSXSheet(native_sheet, **self._keywords)
108-
return {sheet.name: sheet.to_array()}
109-
110-
def _load_from_memory(self):
111-
self._native_book = XLSXBookSet(self._file_stream)
112-
113-
def _load_from_file(self):
114-
self._native_book = XLSXBookSet(self._file_name)
69+
table = self.content_array[sheet_index].sheet
70+
sheet = XLSXSheet(table, **self._keywords)
71+
return sheet
72+
73+
def close(self):
74+
self._native_book.close()
75+
76+
77+
class XLSXBookInContent(XLSXBook):
78+
def __init__(self, file_content, file_type, **keywords):
79+
file_stream = BytesIO(file_content)
80+
super().__init__(file_stream, file_type, **keywords)
81+
82+
83+
class NameObject(object):
84+
def __init__(self, name, sheet):
85+
self.name = name
86+
self.sheet = sheet

rnd_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://github.com/pyexcel/pyexcel-io/archive/dev.zip

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
NAME = "pyexcel-xlsxr"
3434
AUTHOR = "chfw"
35-
VERSION = "0.5.3"
35+
VERSION = "0.6.0"
3636
3737
LICENSE = "New BSD"
3838
DESCRIPTION = (

tests/test_messy_xlsx.py

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,15 @@ def test_alternative_single_sheet():
114114

115115

116116
def test_parse_row():
117-
xml_string = b'<row collapsed="false" customFormat="false" customHeight="false" hidden="false" ht="12.75" outlineLevel="0" r="4"><c r="A4" s="1" t="n"><v>42005</v></c><c r="B4" s="2" t="n"><v>0.550844907407407</v></c></row>' # flake8: noqa
117+
xml_string = b"""
118+
<row collapsed="false" customFormat="false"
119+
customHeight="false" hidden="false" ht="12.75" outlineLevel="0" r="4">
120+
<c r="A4" s="1" t="n"><v>42005</v></c><c r="B4" s="2" t="n">
121+
<v>0.550844907407407</v>
122+
</c>
123+
</row>""".replace(
124+
b"\n", b" "
125+
)
118126

119127
class Book:
120128
def __init__(self):
@@ -133,30 +141,80 @@ def __init__(self):
133141

134142

135143
def test_parse_styles():
136-
sample = b'<styleSheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><numFmts count="3"><numFmt formatCode="GENERAL" numFmtId="164"/><numFmt formatCode="DD/MM/YY" numFmtId="165"/><numFmt formatCode="H:MM:SS;@" numFmtId="166"/></numFmts><fonts count="4"><font><name val="Arial"/>'
144+
sample = b"""
145+
<styleSheet
146+
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
147+
<numFmts count="3"><numFmt formatCode="GENERAL" numFmtId="164"/>
148+
<numFmt formatCode="DD/MM/YY" numFmtId="165"/>
149+
<numFmt formatCode="H:MM:SS;@" numFmtId="166"/>
150+
</numFmts><fonts count="4"><font>
151+
<name val="Arial"/>""".replace(
152+
b"\n", b" "
153+
)
137154
styles = parse_styles(sample)
138155
eq_(list(styles.values()), ["general", "dd/mm/yy", "h:mm:ss;@"])
139156

140157

141158
def test_parse_properties():
142-
sample = b'<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><fileVersion appName="Calc"/><workbookPr backupFile="false" showObjects="all" date1904="false"/><workbookProtection/>'
159+
sample = b"""
160+
<workbook
161+
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
162+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
163+
<fileVersion appName="Calc"/>
164+
<workbookPr backupFile="false" showObjects="all" date1904="false"/>
165+
<workbookProtection/>""".replace(
166+
b"\n", b" "
167+
)
143168
properties = parse_book_properties(sample)
144169
eq_(properties, {"date1904": False, "sheets": []})
145170

146171

147172
def test_parse_sheet_properties():
148-
sample = b'</bookViews><sheets><sheet name="Sheet1" sheetId="1" state="visible" r:id="rId2"/><sheet name="Sheet2" sheetId="2" state="visible" r:id="rId3"/><sheet name="Sheet3" sheetId="3" state="visible" r:id="rId4"/></sheets><calcPr iterateCount="100" refMode="A1" iterate="false" iterateDelta="0.001"/>'
173+
sample = b"""
174+
</bookViews><sheets>
175+
<sheet name="Sheet1" sheetId="1" state="visible" r:id="rId2"/>
176+
<sheet name="Sheet2" sheetId="2" state="visible" r:id="rId3"/>
177+
<sheet name="Sheet3" sheetId="3" state="visible" r:id="rId4"/>
178+
</sheets>
179+
<calcPr iterateCount="100" refMode="A1"
180+
iterate="false" iterateDelta="0.001"/>""".replace(
181+
b"\n", b" "
182+
)
149183
properties = parse_book_properties(sample)
150184
eq_(properties, {"sheets": ["Sheet1", "Sheet2", "Sheet3"]})
151185

152186

153187
def test_parse_xfs_styles():
154-
sample = b'<cellXfs count="3"><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="164" xfId="0"></xf><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="165" xfId="0"></xf><xf applyAlignment="false" applyBorder="false" applyFont="false" applyProtection="false" borderId="0" fillId="0" fontId="0" numFmtId="166" xfId="0"></xf></cellXfs><cellStyles count="6">'
188+
sample = b"""
189+
<cellXfs count="3">
190+
<xf applyAlignment="false"
191+
applyBorder="false" applyFont="false"
192+
applyProtection="false"
193+
borderId="0" fillId="0" fontId="0" numFmtId="164" xfId="0">
194+
</xf>
195+
<xf applyAlignment="false"
196+
applyBorder="false" applyFont="false"
197+
applyProtection="false"
198+
borderId="0" fillId="0" fontId="0" numFmtId="165" xfId="0">
199+
</xf>
200+
<xf applyAlignment="false"
201+
applyBorder="false" applyFont="false"
202+
applyProtection="false"
203+
borderId="0" fillId="0" fontId="0" numFmtId="166" xfId="0">
204+
</xf>
205+
</cellXfs><cellStyles count="6">""".replace(
206+
b"\n", b" "
207+
)
155208
xfs_styles = parse_xfs_styles(sample)
156209
eq_(xfs_styles, [164, 165, 166])
157210

158211

159212
def test_parse_shared_strings():
160-
sample = b'<sst count="2" uniqueCount="2" xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><si><t>Date</t></si><si><t>Time</t></si></sst>'
213+
sample = b"""
214+
<sst count="2" uniqueCount="2"
215+
xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
216+
<si><t>Date</t></si><si><t>Time</t></si></sst>""".replace(
217+
b"\n", b" "
218+
)
161219
content = parse_shared_strings(sample)
162220
eq_(list(content), ["Date", "Time"])

0 commit comments

Comments
 (0)