
Commit d4da5a8

[ENG-8475] Fix unable to render spreadsheet files (.xls and .xlsx) (#393)
1 parent ffaba86 commit d4da5a8

3 files changed: 120 additions, 85 deletions
Lines changed: 26 additions & 83 deletions
@@ -1,95 +1,38 @@
 import xlrd
 import zipfile
-from collections import OrderedDict
-from ..exceptions import TableTooBigError, MissingRequirementsError
 
-from ..utilities import header_population
-from mfr.extensions.tabular.compat import range, basestring
+from io import BytesIO
+from openpyxl import load_workbook
+from collections import OrderedDict
+from ..utilities import (
+    to_bytes,
+    parse_xls,
+    parse_xlsx
+)
 
 
 def xlsx_xlrd(fp):
-    """Read and convert a xlsx file to JSON format using the xlrd library
-    :param fp: File pointer object
-    :return: tuple of table headers and data
     """
-    MAX_SIZE = 10000
-
-    try:
-        wb = xlrd.open_workbook(fp.name)
-        using_xlrd = True
-    except xlrd.biffh.XLRDError:
-        using_xlrd = False
-        try:
-            from openpyxl import load_workbook
-        except ImportError:
-            raise MissingRequirementsError(
-                'openpyxl is required to read .xlsx files',
-                function_preference='openpyxl'
-            )
-        try:
-            wb = load_workbook(fp.name, data_only=True)
-        except zipfile.BadZipFile:
-            raise xlrd.biffh.XLRDError("Excel xlsx file; not supported")
+    • .xls → xlrd
+    • .xlsx → openpyxl (xlrd ≥2.0 dropped xlsx support)
 
+    `fp` is the stream returned by WaterButler/MFR. It may already have been
+    read, so we always rewind and copy to an in-memory buffer that openpyxl (and
+    ZipFile) can seek inside safely.
+    """
     sheets = OrderedDict()
 
-    if using_xlrd:
-        for sheet in wb.sheets():
-            if sheet.ncols > MAX_SIZE or sheet.nrows > MAX_SIZE:
-                raise TableTooBigError('Table is too large to render.', '.xlsx',
-                                       nbr_cols=sheet.ncols, nbr_rows=sheet.nrows)
-
-            if sheet.ncols < 1 or sheet.nrows < 1:
-                sheets[sheet.name] = ([], [])
-                continue
-
-            fields = sheet.row_values(0) if sheet.nrows else []
-
-            fields = [
-                str(value)
-                if not isinstance(value, basestring) and value is not None
-                else value or f'Unnamed: {index + 1}'
-                for index, value in enumerate(fields)
-            ]
-
-            data = []
-            for i in range(1, sheet.nrows):
-                row = []
-                for cell in sheet.row(i):
-                    if cell.ctype == xlrd.XL_CELL_DATE:
-                        value = xlrd.xldate.xldate_as_datetime(cell.value, wb.datemode).isoformat()
-                    else:
-                        value = cell.value
-                    row.append(value)
-                data.append(dict(zip(fields, row)))
-
-            header = header_population(fields)
-            sheets[sheet.name] = (header, data)
-
-    else:
-        for name in wb.sheetnames:
-            ws = wb[name]
-            nrows = ws.max_row
-            ncols = ws.max_column
-            if ncols > MAX_SIZE or nrows > MAX_SIZE:
-                raise TableTooBigError('Table is too large to render.', '.xlsx',
-                                       nbr_cols=ncols, nbr_rows=nrows)
-
-            if nrows < 1 or ncols < 1:
-                sheets[name] = ([], [])
-                continue
-
-            header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True))
-            fields = [
-                str(val) if val is not None else f'Unnamed: {i + 1}'
-                for i, val in enumerate(header_row)
-            ]
-
-            data = []
-            for row in ws.iter_rows(min_row=2, max_row=nrows, max_col=ncols, values_only=True):
-                data.append(dict(zip(fields, row)))
+    try:
+        wb = xlrd.open_workbook(file_contents=to_bytes(fp))
+        return parse_xls(wb, sheets)
+    except xlrd.biffh.XLRDError:
+        pass
 
-            header = header_population(fields)
-            sheets[name] = (header, data)
+    try:
+        wb = load_workbook(BytesIO(to_bytes(fp)), data_only=True, read_only=True)
+    except zipfile.BadZipFile as exc:
+        raise xlrd.biffh.XLRDError(
+            "Invalid xlsx file or corrupted ZIP structure"
+        ) from exc
 
-    return sheets
+    return parse_xlsx(wb, sheets)
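
The heart of the change is the fallback above: xlrd 2.x raises XLRDError when handed zip-based .xlsx content (the old code's own error text was "Excel xlsx file; not supported"), so the renderer now catches that and retries with openpyxl instead of failing. A minimal standalone sketch of that fallback, assuming xlrd >= 2.0 and openpyxl are installed and leaving out the MFR-specific parse_xls/parse_xlsx helpers:

from io import BytesIO

import xlrd
from openpyxl import load_workbook


def open_spreadsheet(data: bytes):
    # Legacy binary .xls workbooks are still xlrd's job.
    try:
        return 'xls', xlrd.open_workbook(file_contents=data)
    except xlrd.biffh.XLRDError:
        # xlrd >= 2.0 refuses .xlsx content, so hand the same bytes to openpyxl.
        return 'xlsx', load_workbook(BytesIO(data), data_only=True, read_only=True)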

mfr/extensions/tabular/utilities.py

Lines changed: 93 additions & 1 deletion
@@ -1,16 +1,20 @@
 import re
+import xlrd
+
 from http import HTTPStatus
 from subprocess import (check_call,
                         TimeoutExpired,
                         CalledProcessError)
 from tempfile import NamedTemporaryFile
 
 from mfr.extensions.tabular import compat
-from mfr.core.exceptions import SubprocessError
+from mfr.core.exceptions import SubprocessError, TooBigToRenderError
 from mfr.extensions.tabular.settings import (PSPP_CONVERT_BIN,
                                              PSPP_CONVERT_TIMEOUT)
 
 
+MAX_SIZE = 10_000
+
 def header_population(headers):
     """make column headers from a list
     :param headers: list of column headers
@@ -83,3 +87,91 @@ def sav_to_csv(fp):
             exporter_class='tabular'
         )
     return csv_file
+
+
+def to_bytes(fp):
+    """
+    Return *exactly* the original bytes of the Excel file and rewind *fp*.
+    Handles both binary and text wrappers that WaterButler may give us.
+    """
+    try:
+        fp.seek(0)
+    except Exception:
+        pass
+
+    raw = fp.read()
+    if isinstance(raw, bytes):
+        try:
+            fp.seek(0)
+        except Exception:
+            pass
+        return raw
+
+    if hasattr(fp, "buffer"):
+        buf = fp.buffer
+        try:
+            buf.seek(0)
+        except Exception:
+            pass
+        data = buf.read()
+        try:
+            buf.seek(0)
+        except Exception:
+            pass
+    else:
+        data = raw.encode("utf-8", "surrogateescape")
+
+    try:
+        fp.seek(0)
+    except Exception:
+        pass
+    return data
+
+
+def parse_xls(wb, sheets):
+    for sheet in wb.sheets():
+        verify_size(sheet.nrows, sheet.ncols, '.xls')
+        fields = fix_headers(sheet.row_values(0))
+        rows = [
+            dict(zip(fields, row_vals(sheet.row(r), wb.datemode)))
+            for r in range(1, sheet.nrows)
+        ]
+        sheets[sheet.name] = (header_population(fields), rows)
+    return sheets
+
+
+def parse_xlsx(wb, sheets):
+    for name in wb.sheetnames:
+        ws = wb[name]
+        verify_size(ws.max_row, ws.max_column, '.xlsx')
+        header_row = next(ws.iter_rows(max_row=1, values_only=True))
+        fields = fix_headers(header_row)
+        rows = [
+            dict(zip(fields, row))
+            for row in ws.iter_rows(min_row=2,
+                                    max_row=ws.max_row,
+                                    max_col=ws.max_column,
+                                    values_only=True)
+        ]
+        sheets[name] = (header_population(fields), rows)
+    return sheets
+
+
+def verify_size(rows, cols, ext):
+    if rows > MAX_SIZE or cols > MAX_SIZE:
+        raise TooBigToRenderError('Table is too large to render.', ext,
+                                  nbr_cols=cols, nbr_rows=rows)
+
+
+def fix_headers(raw):
+    return [str(v) if v not in (None, '') else f'Unnamed: {i + 1}' for i, v in enumerate(raw)]
+
+
+def row_vals(row, datemode):
+    out = []
+    for c in row:
+        if c.ctype == xlrd.XL_CELL_DATE:
+            out.append(xlrd.xldate.xldate_as_datetime(c.value, datemode).isoformat())
+        else:
+            out.append(c.value)
+    return out
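
The new helpers are small enough to exercise on their own. A quick illustration (not part of the commit) of the header normalisation and the shared size guard, importing from mfr.extensions.tabular.utilities as laid out above:

from mfr.extensions.tabular.utilities import MAX_SIZE, fix_headers, verify_size

# Blank or missing header cells get positional names, so duplicate empty
# headers no longer collapse into a single key in dict(zip(fields, row)).
print(fix_headers(['Name', None, '']))   # ['Name', 'Unnamed: 2', 'Unnamed: 3']

# verify_size() replaces the old inline TableTooBigError check and raises
# TooBigToRenderError once either dimension exceeds MAX_SIZE (10,000).
verify_size(5, 3, '.xlsx')               # fine, returns None
# verify_size(MAX_SIZE + 1, 3, '.xlsx')  # would raise TooBigToRenderError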

tests/extensions/tabular/test_xlsx_tools.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 class TestTabularPandaTools:
 
     def test_xlsx_xlrd(self):
-        with open(os.path.join(BASE, 'files', 'test.xlsx')) as fp:
+        with open(os.path.join(BASE, 'files', 'test.xlsx'), 'rb') as fp:
             sheets = xlrd_tools.xlsx_xlrd(fp)
 
         sheet = sheets.popitem()[1]
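
The test change matters because an .xlsx file is a ZIP archive: read through a text-mode handle it typically fails with a UnicodeDecodeError before the renderer ever sees the bytes, while 'rb' hands to_bytes() the raw archive directly. A small illustration, assuming a local copy of any .xlsx file at a hypothetical path:

path = 'files/test.xlsx'   # hypothetical; the test uses os.path.join(BASE, 'files', 'test.xlsx')

with open(path, 'rb') as fp:
    raw = fp.read()

assert raw[:2] == b'PK'    # ZIP magic bytes -- exactly what to_bytes() passes along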
