microsoft · sbidwaibing · Jun 11, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -36,6 +36,9 @@
 class XlsxConverter(DocumentConverter):
     """
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    Files of 100MB or less are converted via the standard pandas/openpyxl path.
+    Larger files are streamed with openpyxl in read_only mode, because loading the
+    entire workbook into memory caused timeouts on large files (see issue #2096).
     """
 
     def __init__(self):
@@ -80,17 +83,49 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
-        md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        # Measure the stream size by seeking to its end, then rewind to the start
+        file_stream.seek(0, 2)
+        size_mb = file_stream.tell() / (1024 * 1024)
+        file_stream.seek(0)
+
+        if size_mb <= 100:
+            # Standard path for small files — preserves existing behavior
+            sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+            md_content = ""
+            for s in sheets:
+                md_content += f"## {s}\n"
+                html_content = sheets[s].to_html(index=False)
+                md_content += (
+                    self._html_converter.convert_string(
+                        html_content, **kwargs
+                    ).markdown.strip()
+                    + "\n\n"
+                )
+        else:
+            # Streaming path for large files (>100MB).
+            # openpyxl read_only=True streams rows lazily instead of loading the
+            # entire workbook into memory. data_only=True yields each formula's
+            # cached value rather than the formula itself. The Markdown table is
+            # built directly, avoiding the to_html() + HtmlConverter round-trip.
+            wb = openpyxl.load_workbook(file_stream, read_only=True, data_only=True)
+            md_content = ""
+            for sheet_name in wb.sheetnames:
+                ws = wb[sheet_name]
+                md_content += f"## {sheet_name}\n"
+                rows = ws.iter_rows(values_only=True)
+                header = next(rows, None)
+                if header is None:
+                    continue
+                header_cells = [str(c) if c is not None else "" for c in header]
+                md_content += "| " + " | ".join(header_cells) + " |\n"
+                md_content += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
+                for row in rows:
+                    cells = [str(c) if c is not None else "" for c in row]
+                    cells = cells[:len(header_cells)]
+                    cells += [""] * (len(header_cells) - len(cells))
+                    md_content += "| " + " | ".join(cells) + " |\n"
+                md_content += "\n"
+            wb.close()
 
         return DocumentConverterResult(markdown=md_content.strip())
 
@@ -154,4 +189,4 @@ def convert(
                 + "\n\n"
             )
 
-        return DocumentConverterResult(markdown=md_content.strip())
+        return DocumentConverterResult(markdown=md_content.strip())