From db3b3cb79172412fbaef7f92e93d52c04cde8968 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Mon, 20 May 2024 22:49:56 +0400 Subject: [PATCH] feat(python): Allow `read_excel` to handle bytes/BytesIO directly, in conjunction with the "calamine" (fastexcel) engine --- py-polars/polars/io/spreadsheet/functions.py | 54 +++++++++++--------- py-polars/tests/unit/io/test_spreadsheet.py | 9 +++- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 2947129fd175..8d37402f824d 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -1,9 +1,8 @@ from __future__ import annotations import re -from contextlib import nullcontext from datetime import time -from io import BufferedReader, BytesIO, StringIO +from io import BufferedReader, BytesIO, StringIO, TextIOWrapper from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, Sequence, overload @@ -35,7 +34,6 @@ ) from polars.io._utils import looks_like_url, process_file_url from polars.io.csv.functions import read_csv -from polars.io.spreadsheet._utils import PortableTemporaryFile if TYPE_CHECKING: from typing import Literal @@ -444,9 +442,11 @@ def _identify_from_magic_bytes(data: IO[bytes] | bytes) -> str | None: return "xls" elif magic_bytes[:4] == xlsx_bytes: return "xlsx" - return None + except UnicodeDecodeError: + pass finally: data.seek(initial_position) + return None def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None: @@ -630,25 +630,33 @@ def _initialise_spreadsheet_parser( return _read_spreadsheet_openpyxl, parser, sheets elif engine == "calamine": - # note: can't read directly from bytes (yet) so - read_buffered = False - if read_bytesio := isinstance(source, BytesIO) or ( - read_buffered := isinstance(source, BufferedReader) - ): - temp_data = PortableTemporaryFile(delete=True) - - with temp_data if (read_bytesio or read_buffered) else nullcontext() as tmp: - if read_bytesio and tmp is not None: - tmp.write(source.read() if read_buffered else source.getvalue()) # type: ignore[union-attr] - source = tmp.name - tmp.close() - - fxl = import_optional("fastexcel", min_version="0.7.0") - parser = fxl.read_excel(source, **engine_options) - sheets = [ - {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) - ] - return _read_spreadsheet_calamine, parser, sheets + fastexcel = import_optional("fastexcel", min_version="0.7.0") + reading_bytesio, reading_bytes = ( + isinstance(source, BytesIO), + isinstance(source, bytes), + ) + if (reading_bytesio or reading_bytes) and parse_version( + module_version := fastexcel.__version__ + ) < (0, 10): + msg = f"`fastexcel` >= 0.10 is required to read bytes; found {module_version})" + raise ModuleUpgradeRequired(msg) + + if reading_bytesio: + source = source.getbuffer().tobytes() # type: ignore[union-attr] + elif isinstance(source, (BufferedReader, TextIOWrapper)): + if "b" not in source.mode: + msg = f"file {source.name!r} must be opened in binary mode" + raise OSError(msg) + elif (filename := source.name) and Path(filename).exists(): + source = filename + else: + source = source.read() + + parser = fastexcel.read_excel(source, **engine_options) + sheets = [ + {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) + ] + return _read_spreadsheet_calamine, parser, sheets elif engine == "pyxlsb": issue_deprecation_warning( diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 54af4e288587..abc365fd88fb 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -911,11 +911,16 @@ def test_identify_workbook( # identify from IO[bytes] with Path.open(spreadsheet_path, "rb") as f: assert _identify_workbook(f) == file_type + assert isinstance(pl.read_excel(f, engine="calamine"), pl.DataFrame) # identify from bytes with Path.open(spreadsheet_path, "rb") as f: - assert _identify_workbook(f.read()) == file_type + raw_data = f.read() + assert _identify_workbook(raw_data) == file_type + assert isinstance(pl.read_excel(raw_data, engine="calamine"), pl.DataFrame) # identify from BytesIO with Path.open(spreadsheet_path, "rb") as f: - assert _identify_workbook(BytesIO(f.read())) == file_type + bytesio_data = BytesIO(f.read()) + assert _identify_workbook(bytesio_data) == file_type + assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame)