From ad6ed69cdaab7755b9cb3c4d95756597b9641189 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 21 Jun 2026 12:34:54 -0700 Subject: [PATCH] BUG: read_csv pyarrow engine ignored tuple names for MultiIndex columns Passing tuples in ``names`` to ``read_csv`` with ``engine="pyarrow"`` produced flat columns instead of MultiIndex columns like the other engines. Route the result columns through ``_maybe_make_multi_index_columns`` as the C/python engines do. Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/source/whatsnew/v3.1.0.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 4 +++ pandas/tests/io/parser/test_header.py | 36 +++++++++++++++++++---- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 10f24f01e7688..7bcc547afa1b3 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -350,6 +350,7 @@ MultiIndex I/O ^^^ - :func:`read_csv` with ``memory_map=True`` and an in-memory buffer (e.g. ``BytesIO``) now raises a clear ``ValueError`` instead of a cryptic ``UnsupportedOperation: fileno`` (:issue:`45630`) +- Fixed bug in :func:`read_csv` with ``engine="pyarrow"`` where passing tuples in ``names`` produced flat columns instead of :class:`MultiIndex` columns (:issue:`65862`) - Fixed bug in :func:`read_csv` with the ``c`` engine where an embedded ``\r`` followed by a space in an unquoted field could cause an infinite re-parsing loop, producing spurious rows or a buffer overflow (:issue:`51141`) - Fixed bug in :func:`read_excel` where usage of ``skiprows`` could lead to an infinite loop (:issue:`64027`) - Fixed bug where :func:`read_html` parsed nested tables incorrectly when using ``html5lib`` or ``bs4`` flavors (:issue:`64524`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 0ca0bd921c74a..d3bef60452396 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -252,6 +252,10 @@ def _finalize_pandas_output( frame = self._do_date_conversions(frame.columns, frame) frame = self._finalize_index(frame, multi_index_named) frame = self._finalize_dtype(frame) + # tuples passed via names imply MultiIndex columns, as with other engines + frame.columns = self._maybe_make_multi_index_columns( + list(frame.columns), self.col_names + ) return frame def _validate_usecols(self, usecols) -> None: diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index eed553f6d20f6..cfe0a6196f16f 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -199,7 +199,6 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("_TestTuple", ["first", "second"]) -@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -228,8 +227,13 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): }, ], ) -def test_header_multi_index_common_format1(all_parsers, kwargs): +def test_header_multi_index_common_format1(all_parsers, kwargs, request): parser = all_parsers + if parser.engine == "pyarrow" and "header" in kwargs: + # list-valued header is unsupported by the pyarrow engine + request.applymarker( + pytest.mark.xfail(reason="TypeError: an integer is required") + ) expected = DataFrame( [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], index=["one", "two"], @@ -247,7 +251,6 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -276,8 +279,13 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): }, ], ) -def test_header_multi_index_common_format2(all_parsers, kwargs): +def test_header_multi_index_common_format2(all_parsers, kwargs, request): parser = all_parsers + if parser.engine == "pyarrow" and "header" in kwargs: + # list-valued header is unsupported by the pyarrow engine + request.applymarker( + pytest.mark.xfail(reason="TypeError: an integer is required") + ) expected = DataFrame( [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], index=["one", "two"], @@ -294,7 +302,6 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -323,8 +330,13 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): }, ], ) -def test_header_multi_index_common_format3(all_parsers, kwargs): +def test_header_multi_index_common_format3(all_parsers, kwargs, request): parser = all_parsers + if parser.engine == "pyarrow" and "header" in kwargs: + # list-valued header is unsupported by the pyarrow engine + request.applymarker( + pytest.mark.xfail(reason="TypeError: an integer is required") + ) expected = DataFrame( [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], index=["one", "two"], @@ -469,6 +481,18 @@ def test_no_header(all_parsers, kwargs, names): tm.assert_frame_equal(result, expected) +def test_names_tuples_multi_index_columns(all_parsers): + # GH#65862 tuples passed via names produce MultiIndex columns for all engines + parser = all_parsers + data = "1,2\n3,4" + result = parser.read_csv(StringIO(data), names=[("a", "x"), ("b", "y")]) + expected = DataFrame( + [[1, 2], [3, 4]], + columns=MultiIndex.from_tuples([("a", "x"), ("b", "y")]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("header", [["a", "b"], "string_header"]) def test_non_int_header(all_parsers, header): # see gh-16338