From 92e2da4bb3e89f12a5f7cfc15e7c9aaa96ee039d Mon Sep 17 00:00:00 2001 From: Wajih-Ur-Raza Asif <152525473+wajih-rathore@users.noreply.github.com> Date: Tue, 16 Jun 2026 22:51:03 +0500 Subject: [PATCH] fix(csv): skip leading blank lines so they don't wipe the whole table A CSV whose first line is blank converted to an all-empty markdown table. csv.reader yields [] for the blank line, so rows[0] had zero columns. That empty row became the header, and every data row was then truncated with row[:len(rows[0])] == row[:0], silently dropping all the data. Skip leading empty rows before choosing the header. Adds a regression test fixture with a leading blank first line. --- .../src/markitdown/converters/_csv_converter.py | 7 +++++++ packages/markitdown/tests/_test_vectors.py | 14 ++++++++++++++ .../tests/test_files/test_blank_first_line.csv | 4 ++++ 3 files changed, 25 insertions(+) create mode 100644 packages/markitdown/tests/test_files/test_blank_first_line.csv diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py index 7e9631e1b..3ff4d799a 100644 --- a/packages/markitdown/src/markitdown/converters/_csv_converter.py +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -51,6 +51,13 @@ def convert( reader = csv.reader(io.StringIO(content)) rows = list(reader) + # A leading blank line (or any empty rows before the header) parses as an + # empty list. If that becomes the header its column count is 0, and every + # data row then gets truncated to nothing -- silent total data loss. + # Skip leading empty rows so the first real row becomes the header. + while rows and len(rows[0]) == 0: + rows.pop(0) + if not rows: return DocumentConverterResult(markdown="") diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 74fa9bd0a..0adeced77 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -152,6 +152,20 @@ class FileTestVector(object): ], must_not_include=[], ), + FileTestVector( + filename="test_blank_first_line.csv", + mimetype="text/csv", + charset="utf-8", + url=None, + must_include=[ + "| name | age |", + "| bob | 3 |", + "| alice | 7 |", + ], + must_not_include=[ + "| |", + ], + ), FileTestVector( filename="test.json", mimetype="application/json", diff --git a/packages/markitdown/tests/test_files/test_blank_first_line.csv b/packages/markitdown/tests/test_files/test_blank_first_line.csv new file mode 100644 index 000000000..2bf2c831b --- /dev/null +++ b/packages/markitdown/tests/test_files/test_blank_first_line.csv @@ -0,0 +1,4 @@ + +name,age +bob,3 +alice,7