Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ Timezones
Numeric
^^^^^^^
- Fixed bug in :func:`read_excel` where having a column with mixture of numeric and boolean values will typecast the values based on the first appearance data type since 1==True and 0==False (:issue:`60088`)
- Fixed bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with ``axis=1`` and ``skipna=False`` returning incorrect column labels for extension array dtypes (e.g. :class:`BooleanDtype`, nullable integer/float, :class:`ArrowDtype`) (:issue:`56903`)
- Fixed bug in :meth:`Series.clip` where passing a scalar numpy array (e.g. ``np.array(0)``) would raise a ``TypeError`` (:issue:`59053`)
- Fixed bug in :meth:`Series.mean` and :meth:`Series.sum` (and their :class:`DataFrame` counterparts) overflowing for ``float16`` dtypes instead of upcasting to ``float64`` (:issue:`43929`)
- Fixed bug in :meth:`Series.skew` and :meth:`Series.kurt` (and their :class:`DataFrame` counterparts) returning ``0.0`` for degenerate distributions; these now return ``NaN`` (:issue:`62864`)
Expand Down Expand Up @@ -404,6 +405,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.agg` would operate on the group as a whole when ``args`` or ``kwargs`` are supplied for the provided ``func``; now this method only operates on each Series of the group (:issue:`39169`)
- Bug in :meth:`.DataFrameGroupBy.apply` with ``as_index=False`` where applying on an empty :class:`DataFrame` returned inconsistent index metadata compared to non-empty results (:issue:`48135`)
- Bug in :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cummin`, and :meth:`.DataFrameGroupBy.cummax` (and Series variants) returning ``Float64`` instead of preserving the nullable integer dtype (e.g. ``Int64``) when the group key contains ``NA`` (:issue:`65550`)
- Bug in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (and :class:`Series` variants) with ``skipna=False`` returning incorrect results when the input contained no ``NA`` values (:issue:`56903`)
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` returning ``NaN`` with ``float64`` dtype for unobserved categorical groups on NumPy ``bool`` data instead of the boolean identity value with ``bool`` dtype (:issue:`65100`)
- Bug in :meth:`.Resampler.agg` raising ``ValueError`` with a dict of aggregations when applied to a :meth:`DataFrame.groupby` with ``as_index=False`` (:issue:`52397`)
- Bug in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` computing incorrect results on degenerate windows (:issue:`24019`)
Expand Down
14 changes: 11 additions & 3 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2127,8 +2127,11 @@ def group_idxmin_idxmax(
continue

for j in range(K):
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
if not skipna and seen[lab, j] and out[lab, j] == -1:
# Once we've hit an NA in this group there is no going back.
# seen[lab, j] distinguishes this locked state from the
# initial out[lab, j] == -1 sentinel that holds before any
# row of the group has been visited (GH#56903).
continue

val = values[i, j]
Expand All @@ -2139,7 +2142,12 @@ def group_idxmin_idxmax(
isna_entry = _treat_as_na(val, is_datetimelike)

if isna_entry:
if not skipna or not seen[lab, j]:
if not skipna:
# Lock the group to the NA sentinel; the guard above
# then keeps later non-NA values from overwriting it.
out[lab, j] = -1
seen[lab, j] = True
elif not seen[lab, j]:
out[lab, j] = -1
else:
if not seen[lab, j]:
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2488,6 +2488,48 @@ def test_numeric_ea_axis_1(
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmax", "idxmin"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
"dtype", ["Int64", "Float64", "UInt32", "boolean", "int64[pyarrow]"]
)
def test_idxmax_idxmin_axis_1_ea_matches_numpy(how, skipna, dtype):
# GH#56903 the EA-backed axis=1 path must agree with the numpy-backed path,
# including skipna=False when there are no NA values (previously the cython
# kernel returned the all-NA sentinel for every row).
if "pyarrow" in dtype:
pytest.importorskip("pyarrow")
if dtype == "boolean":
values = [[True, False, True], [False, False, True], [True, True, False]]
np_dtype = bool
else:
values = [[1, 4, 0], [5, 2, 9], [2, 8, 1], [3, 3, 7]]
np_dtype = "float64"
df_ea = DataFrame(values, columns=["a", "b", "c"]).astype(dtype)
df_np = DataFrame(values, columns=["a", "b", "c"]).astype(np_dtype)

result = getattr(df_ea, how)(axis=1, skipna=skipna)
expected = getattr(df_np, how)(axis=1, skipna=skipna)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmax", "idxmin"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean", "int64[pyarrow]"])
def test_idxmax_idxmin_axis_1_ea_all_na_row_raises(how, skipna, dtype):
# GH#56903 a fully-NA row must raise (matching the numpy-backed path) rather
# than leak the -1 cython sentinel as a column label.
if "pyarrow" in dtype:
pytest.importorskip("pyarrow")
if dtype == "boolean":
values = [[True, False], [pd.NA, pd.NA], [False, True]]
else:
values = [[1, 4], [pd.NA, pd.NA], [3, 9]]
df = DataFrame(values, columns=["a", "b"]).astype(dtype)
with pytest.raises(ValueError, match="[Ee]ncountered an?.*NA value"):
getattr(df, how)(axis=1, skipna=skipna)


def test_mean_nullable_int_axis_1():
# GH##36585
df = DataFrame(
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,22 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
@pytest.mark.parametrize("dtype", ["int64", "float64", "Int64", "Float64"])
def test_idxmin_idxmax_skipna_false_no_na(how, dtype):
# GH#56903 with skipna=False and no NA values, the group_idxmin_idxmax
# kernel previously skipped every group and returned the all-NA sentinel.
df = DataFrame(
{"a": [1, 1, 2, 2, 2], "b": [3, 1, 4, 9, 2]},
index=[10, 11, 12, 13, 14],
).astype({"b": dtype})
gb = df.groupby("a")
result = getattr(gb, how)(skipna=False)
# No NA is present, so skipna is irrelevant; result must match skipna=True.
expected = getattr(gb, how)(skipna=True)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"func, values",
[
Expand Down