|
1 | 1 | import xlrd
|
2 | 2 | import zipfile
|
3 |
| -from collections import OrderedDict |
4 |
| -from ..exceptions import TableTooBigError, MissingRequirementsError |
5 | 3 |
|
6 |
| -from ..utilities import header_population |
7 |
| -from mfr.extensions.tabular.compat import range, basestring |
| 4 | +from io import BytesIO |
| 5 | +from openpyxl import load_workbook |
| 6 | +from collections import OrderedDict |
| 7 | +from ..utilities import ( |
| 8 | + to_bytes, |
| 9 | + parse_xls, |
| 10 | + parse_xlsx |
| 11 | +) |
8 | 12 |
|
9 | 13 |
|
10 | 14 | def xlsx_xlrd(fp):
|
11 |
| - """Read and convert a xlsx file to JSON format using the xlrd library |
12 |
| - :param fp: File pointer object |
13 |
| - :return: tuple of table headers and data |
14 | 15 | """
|
15 |
| - MAX_SIZE = 10000 |
16 |
| - |
17 |
| - try: |
18 |
| - wb = xlrd.open_workbook(fp.name) |
19 |
| - using_xlrd = True |
20 |
| - except xlrd.biffh.XLRDError: |
21 |
| - using_xlrd = False |
22 |
| - try: |
23 |
| - from openpyxl import load_workbook |
24 |
| - except ImportError: |
25 |
| - raise MissingRequirementsError( |
26 |
| - 'openpyxl is required to read .xlsx files', |
27 |
| - function_preference='openpyxl' |
28 |
| - ) |
29 |
| - try: |
30 |
| - wb = load_workbook(fp.name, data_only=True) |
31 |
| - except zipfile.BadZipFile: |
32 |
| - raise xlrd.biffh.XLRDError("Excel xlsx file; not supported") |
| 16 | + • .xls → xlrd |
| 17 | + • .xlsx → openpyxl (xlrd ≥2.0 dropped xlsx support) |
33 | 18 |
|
| 19 | + `fp` is the stream returned by WaterButler/MFR. It may already have been |
| 20 | + read, so we always rewind and copy to an in‑memory buffer that openpyxl (and |
| 21 | + ZipFile) can seek inside safely. |
| 22 | + """ |
34 | 23 | sheets = OrderedDict()
|
35 | 24 |
|
36 |
| - if using_xlrd: |
37 |
| - for sheet in wb.sheets(): |
38 |
| - if sheet.ncols > MAX_SIZE or sheet.nrows > MAX_SIZE: |
39 |
| - raise TableTooBigError('Table is too large to render.', '.xlsx', |
40 |
| - nbr_cols=sheet.ncols, nbr_rows=sheet.nrows) |
41 |
| - |
42 |
| - if sheet.ncols < 1 or sheet.nrows < 1: |
43 |
| - sheets[sheet.name] = ([], []) |
44 |
| - continue |
45 |
| - |
46 |
| - fields = sheet.row_values(0) if sheet.nrows else [] |
47 |
| - |
48 |
| - fields = [ |
49 |
| - str(value) |
50 |
| - if not isinstance(value, basestring) and value is not None |
51 |
| - else value or f'Unnamed: {index + 1}' |
52 |
| - for index, value in enumerate(fields) |
53 |
| - ] |
54 |
| - |
55 |
| - data = [] |
56 |
| - for i in range(1, sheet.nrows): |
57 |
| - row = [] |
58 |
| - for cell in sheet.row(i): |
59 |
| - if cell.ctype == xlrd.XL_CELL_DATE: |
60 |
| - value = xlrd.xldate.xldate_as_datetime(cell.value, wb.datemode).isoformat() |
61 |
| - else: |
62 |
| - value = cell.value |
63 |
| - row.append(value) |
64 |
| - data.append(dict(zip(fields, row))) |
65 |
| - |
66 |
| - header = header_population(fields) |
67 |
| - sheets[sheet.name] = (header, data) |
68 |
| - |
69 |
| - else: |
70 |
| - for name in wb.sheetnames: |
71 |
| - ws = wb[name] |
72 |
| - nrows = ws.max_row |
73 |
| - ncols = ws.max_column |
74 |
| - if ncols > MAX_SIZE or nrows > MAX_SIZE: |
75 |
| - raise TableTooBigError('Table is too large to render.', '.xlsx', |
76 |
| - nbr_cols=ncols, nbr_rows=nrows) |
77 |
| - |
78 |
| - if nrows < 1 or ncols < 1: |
79 |
| - sheets[name] = ([], []) |
80 |
| - continue |
81 |
| - |
82 |
| - header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) |
83 |
| - fields = [ |
84 |
| - str(val) if val is not None else f'Unnamed: {i + 1}' |
85 |
| - for i, val in enumerate(header_row) |
86 |
| - ] |
87 |
| - |
88 |
| - data = [] |
89 |
| - for row in ws.iter_rows(min_row=2, max_row=nrows, max_col=ncols, values_only=True): |
90 |
| - data.append(dict(zip(fields, row))) |
| 25 | + try: |
| 26 | + wb = xlrd.open_workbook(file_contents=to_bytes(fp)) |
| 27 | + return parse_xls(wb, sheets) |
| 28 | + except xlrd.biffh.XLRDError: |
| 29 | + pass |
91 | 30 |
|
92 |
| - header = header_population(fields) |
93 |
| - sheets[name] = (header, data) |
| 31 | + try: |
| 32 | + wb = load_workbook(BytesIO(to_bytes(fp)), data_only=True, read_only=True) |
| 33 | + except zipfile.BadZipFile as exc: |
| 34 | + raise xlrd.biffh.XLRDError( |
| 35 | + "Invalid xlsx file or corrupted ZIP structure" |
| 36 | + ) from exc |
94 | 37 |
|
95 |
| - return sheets |
| 38 | + return parse_xlsx(wb, sheets) |
0 commit comments