diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 10f24f01e7688..b14e5dad9d8d6 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -280,6 +280,7 @@ Timezones Numeric ^^^^^^^ - Fixed bug in :func:`read_excel` where having a column with mixture of numeric and boolean values will typecast the values based on the first appearance data type since 1==True and 0==False (:issue:`60088`) +- Fixed bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with ``axis=1`` and ``skipna=False`` returning incorrect column labels for extension array dtypes (e.g. :class:`BooleanDtype`, nullable integer/float, :class:`ArrowDtype`) (:issue:`56903`) - Fixed bug in :meth:`Series.clip` where passing a scalar numpy array (e.g. ``np.array(0)``) would raise a ``TypeError`` (:issue:`59053`) - Fixed bug in :meth:`Series.mean` and :meth:`Series.sum` (and their :class:`DataFrame` counterparts) overflowing for ``float16`` dtypes instead of upcasting to ``float64`` (:issue:`43929`) - Fixed bug in :meth:`Series.skew` and :meth:`Series.kurt` (and their :class:`DataFrame` counterparts) returning ``0.0`` for degenerate distributions; these now return ``NaN`` (:issue:`62864`) @@ -404,6 +405,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.agg` would operate on the group as a whole when ``args`` or ``kwargs`` are supplied for the provided ``func``; now this method only operates on each Series of the group (:issue:`39169`) - Bug in :meth:`.DataFrameGroupBy.apply` with ``as_index=False`` where applying on an empty :class:`DataFrame` returned inconsistent index metadata compared to non-empty results (:issue:`48135`) - Bug in :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cummin`, and :meth:`.DataFrameGroupBy.cummax` (and Series variants) returning ``Float64`` instead of preserving the nullable integer dtype (e.g. ``Int64``) when the group key contains ``NA`` (:issue:`65550`) +- Bug in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (and :class:`Series` variants) with ``skipna=False`` returning incorrect results when the input contained no ``NA`` values (:issue:`56903`) - Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` returning ``NaN`` with ``float64`` dtype for unobserved categorical groups on NumPy ``bool`` data instead of the boolean identity value with ``bool`` dtype (:issue:`65100`) - Bug in :meth:`.Resampler.agg` raising ``ValueError`` with a dict of aggregations when applied to a :meth:`DataFrame.groupby` with ``as_index=False`` (:issue:`52397`) - Bug in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` computing incorrect results on degenerate windows (:issue:`24019`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f16dac834807c..2be6f4fa2e0df 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -2127,8 +2127,11 @@ def group_idxmin_idxmax( continue for j in range(K): - if not skipna and out[lab, j] == -1: - # Once we've hit NA there is no going back + if not skipna and seen[lab, j] and out[lab, j] == -1: + # Once we've hit an NA in this group there is no going back. + # seen[lab, j] distinguishes this locked state from the + # initial out[lab, j] == -1 sentinel that holds before any + # row of the group has been visited (GH#56903). continue val = values[i, j] @@ -2139,7 +2142,12 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna or not seen[lab, j]: + if not skipna: + # Lock the group to the NA sentinel; the guard above + # then keeps later non-NA values from overwriting it. + out[lab, j] = -1 + seen[lab, j] = True + elif not seen[lab, j]: out[lab, j] = -1 else: if not seen[lab, j]: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 250cb61e5cd10..d46e345a80097 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -2488,6 +2488,48 @@ def test_numeric_ea_axis_1( tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize( + "dtype", ["Int64", "Float64", "UInt32", "boolean", "int64[pyarrow]"] +) +def test_idxmax_idxmin_axis_1_ea_matches_numpy(how, skipna, dtype): + # GH#56903 the EA-backed axis=1 path must agree with the numpy-backed path, + # including skipna=False when there are no NA values (previously the cython + # kernel returned the all-NA sentinel for every row). + if "pyarrow" in dtype: + pytest.importorskip("pyarrow") + if dtype == "boolean": + values = [[True, False, True], [False, False, True], [True, True, False]] + np_dtype = bool + else: + values = [[1, 4, 0], [5, 2, 9], [2, 8, 1], [3, 3, 7]] + np_dtype = "float64" + df_ea = DataFrame(values, columns=["a", "b", "c"]).astype(dtype) + df_np = DataFrame(values, columns=["a", "b", "c"]).astype(np_dtype) + + result = getattr(df_ea, how)(axis=1, skipna=skipna) + expected = getattr(df_np, how)(axis=1, skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean", "int64[pyarrow]"]) +def test_idxmax_idxmin_axis_1_ea_all_na_row_raises(how, skipna, dtype): + # GH#56903 a fully-NA row must raise (matching the numpy-backed path) rather + # than leak the -1 cython sentinel as a column label. + if "pyarrow" in dtype: + pytest.importorskip("pyarrow") + if dtype == "boolean": + values = [[True, False], [pd.NA, pd.NA], [False, True]] + else: + values = [[1, 4], [pd.NA, pd.NA], [3, 9]] + df = DataFrame(values, columns=["a", "b"]).astype(dtype) + with pytest.raises(ValueError, match="[Ee]ncountered an?.*NA value"): + getattr(df, how)(axis=1, skipna=skipna) + + def test_mean_nullable_int_axis_1(): # GH##36585 df = DataFrame( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 028eb8bad4f50..97fa939bcccef 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -325,6 +325,22 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +@pytest.mark.parametrize("dtype", ["int64", "float64", "Int64", "Float64"]) +def test_idxmin_idxmax_skipna_false_no_na(how, dtype): + # GH#56903 with skipna=False and no NA values, the group_idxmin_idxmax + # kernel previously skipped every group and returned the all-NA sentinel. + df = DataFrame( + {"a": [1, 1, 2, 2, 2], "b": [3, 1, 4, 9, 2]}, + index=[10, 11, 12, 13, 14], + ).astype({"b": dtype}) + gb = df.groupby("a") + result = getattr(gb, how)(skipna=False) + # No NA is present, so skipna is irrelevant; result must match skipna=True. + expected = getattr(gb, how)(skipna=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [