diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 4186ec773..7f2b4144e 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -36,6 +36,9 @@ class XlsxConverter(DocumentConverter): """ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. + For files <= 100MB uses the standard pandas/openpyxl path. + For files > 100MB uses openpyxl read_only streaming to avoid loading the entire + workbook into memory, which caused timeouts on large files (see issue #2096). """ def __init__(self): @@ -80,17 +83,49 @@ def convert( _xlsx_dependency_exc_info[2] ) - sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += ( - self._html_converter.convert_string( - html_content, **kwargs - ).markdown.strip() - + "\n\n" - ) + # Measure file size without consuming the stream + file_stream.seek(0, 2) + size_mb = file_stream.tell() / (1024 * 1024) + file_stream.seek(0) + + if size_mb <= 100: + # Standard path for small files — preserves existing behavior + sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + html_content = sheets[s].to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + else: + # Streaming path for large files (>100MB) + # openpyxl read_only=True streams rows lazily instead of loading + # the entire workbook into memory. data_only=True skips formula + # objects and returns computed values only. Markdown table is built + # directly, avoiding the to_html() + HtmlConverter round-trip. + wb = openpyxl.load_workbook(file_stream, read_only=True, data_only=True) + md_content = "" + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + md_content += f"## {sheet_name}\n" + rows = ws.iter_rows(values_only=True) + header = next(rows, None) + if header is None: + continue + header_cells = [str(c) if c is not None else "" for c in header] + md_content += "| " + " | ".join(header_cells) + " |\n" + md_content += "| " + " | ".join(["---"] * len(header_cells)) + " |\n" + for row in rows: + cells = [str(c) if c is not None else "" for c in row] + cells = cells[:len(header_cells)] + cells += [""] * (len(header_cells) - len(cells)) + md_content += "| " + " | ".join(cells) + " |\n" + md_content += "\n" + wb.close() return DocumentConverterResult(markdown=md_content.strip()) @@ -154,4 +189,4 @@ def convert( + "\n\n" ) - return DocumentConverterResult(markdown=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip()) \ No newline at end of file