initial commit
This commit is contained in:
25
venv/Lib/site-packages/pandas/tests/groupby/__init__.py
Normal file
25
venv/Lib/site-packages/pandas/tests/groupby/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
def get_groupby_method_args(name, obj):
    """
    Return the positional arguments needed to call a groupby method.

    Tests that are parametrized over groupby method names (e.g. "sum",
    "mean") use this to supply the arguments that a few methods require.

    Parameters
    ----------
    name : str
        Name of the method.
    obj : Series or DataFrame
        pandas object that is being grouped.

    Returns
    -------
    tuple
        Required arguments for the method; empty when none are needed.
    """
    if name == "corrwith":
        # The object being grouped is itself handed back as the argument.
        return (obj,)
    if name == "quantile":
        return (0.5,)
    return (0,) if name in ("nth", "take") else ()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,415 @@
|
||||
"""
|
||||
test cython .agg behavior
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_float_dtype,
|
||||
is_integer_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
bdate_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Check a groupby reduction against the same reduction applied per group."""
    df = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.default_rng(2).standard_normal(12),
        }
    )
    df.loc[2:10:2, "C"] = np.nan

    def op(x):
        # Call the reduction named by ``op_name`` with no arguments.
        return getattr(x, op_name)()

    # single column
    grouped = df.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for cat, group in grouped:
        per_group[cat] = op(group["C"])
    exp = DataFrame({"C": per_group})
    exp.index.name = "A"
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(["A", "B"])
    nested = {}
    for (cat1, cat2), group in grouped:
        nested.setdefault(cat1, {})[cat2] = op(group["C"])
    exp = DataFrame(nested).T.stack()
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(grouped)["C"]
    # Only these two reductions are compared for the multi-key case.
    if op_name in ("sum", "prod"):
        tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_cython_agg_boolean():
    """``mean`` of a boolean column equals aggregating it with ``np.mean``."""
    keys = np.random.default_rng(2).integers(0, 5, 50)
    flags = np.random.default_rng(2).integers(0, 2, 50).astype("bool")
    frame = DataFrame({"a": keys, "b": flags})

    result = frame.groupby("a")["b"].mean()
    # GH#53425
    expected = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg():
    """``mean(numeric_only=True)`` on string data: Series raises, frame is empty."""

    def make_frame():
        return DataFrame(
            {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
        )

    frame = make_frame()
    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = make_frame()
    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)

    # One row per distinct key, and no columns at all.
    group_keys = frame["a"].sort_values().drop_duplicates()
    expected = DataFrame([], index=group_keys, columns=Index([], dtype="str"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a datetime column raises ``TypeError``."""
    columns = {
        "a": np.random.default_rng(2).integers(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="min"),
    }
    frame = DataFrame(columns)

    msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("b").dates.mean(numeric_only=True)
|
||||
|
||||
|
||||
def test_cython_agg_return_dict():
    """An aggregation function may return a dict for each group."""
    # GH 16741
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )

    def counts_as_dict(values):
        return values.value_counts().to_dict()

    ts = df.groupby("A")["B"].agg(counts_as_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)
|
||||
|
||||
|
||||
def test_cython_fail_agg():
    """``sum`` on object-dtype strings matches ``agg(np.sum)`` cast to object."""
    dr = bdate_range("1/1/2000", periods=50)
    ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)

    def month_of(stamp):
        return stamp.month

    grouped = ts.groupby(month_of)
    summed = grouped.sum()
    expected = grouped.agg(np.sum).astype(object)
    tm.assert_series_equal(summed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` matches ``agg`` with the equivalent callable."""
    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    # Extra keyword arguments forwarded to the reference callable.
    kwargs = {}
    if op == "var":
        kwargs["ddof"] = 1
    if op not in ("first", "last"):
        kwargs["axis"] = 0

    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
    expected = df.groupby(labels).agg(targop, **kwargs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"op, targop",
|
||||
[
|
||||
("mean", np.mean),
|
||||
("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
|
||||
("var", lambda x: np.var(x, ddof=1)),
|
||||
("min", np.min),
|
||||
("max", np.max),
|
||||
],
|
||||
)
|
||||
def test_cython_agg_empty_buckets(op, targop, observed):
|
||||
df = DataFrame([11, 12, 13])
|
||||
grps = range(0, 55, 5)
|
||||
|
||||
# calling _cython_agg_general directly, instead of via the user API
|
||||
# which sets different values for min_count, so do that here.
|
||||
g = df.groupby(pd.cut(df[0], grps), observed=observed)
|
||||
result = g._cython_agg_general(op, alt=None, numeric_only=True)
|
||||
|
||||
g = df.groupby(pd.cut(df[0], grps), observed=observed)
|
||||
expected = g.agg(lambda x: targop(x))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_empty_buckets_nanops(observed):
|
||||
# GH-18869 can't call nanops on empty groups, so hardcode expected
|
||||
# for these
|
||||
df = DataFrame([11, 12, 13], columns=["a"])
|
||||
grps = np.arange(0, 25, 5, dtype=int)
|
||||
# add / sum
|
||||
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
|
||||
"sum", alt=None, numeric_only=True
|
||||
)
|
||||
intervals = pd.interval_range(0, 20, freq=5)
|
||||
expected = DataFrame(
|
||||
{"a": [0, 0, 36, 0]},
|
||||
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
|
||||
)
|
||||
if observed:
|
||||
expected = expected[expected.a != 0]
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# prod
|
||||
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
|
||||
"prod", alt=None, numeric_only=True
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"a": [1, 1, 1716, 1]},
|
||||
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
|
||||
)
|
||||
if observed:
|
||||
expected = expected[expected.a != 1]
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Single-row groups of datetime-like values and ``NaT`` come back unchanged."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame whose first block is read-only matches a copy of it."""
    # https://github.com/pandas-dev/pandas/issues/36014
    df = DataFrame(
        {
            "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
            "species": ["setosa"] * 5,
        }
    )
    # Mark the underlying values of the first block as non-writeable.
    df._mgr.blocks[0].values.flags.writeable = False

    spec = {"sepal_length": agg}
    result = df.groupby(["species"]).agg(spec)
    expected = df.copy().groupby(["species"]).agg(spec)

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"op_name",
|
||||
[
|
||||
"count",
|
||||
"sum",
|
||||
"std",
|
||||
"var",
|
||||
"sem",
|
||||
"mean",
|
||||
"median",
|
||||
"prod",
|
||||
"min",
|
||||
"max",
|
||||
],
|
||||
)
|
||||
def test_cython_agg_nullable_int(op_name):
|
||||
# ensure that the cython-based aggregations don't fail for nullable dtype
|
||||
# (eg https://github.com/pandas-dev/pandas/issues/37415)
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["A", "B"] * 5,
|
||||
"B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
|
||||
}
|
||||
)
|
||||
result = getattr(df.groupby("A")["B"], op_name)()
|
||||
df2 = df.assign(B=df["B"].astype("float64"))
|
||||
expected = getattr(df2.groupby("A")["B"], op_name)()
|
||||
if op_name in ("mean", "median"):
|
||||
convert_integer = False
|
||||
else:
|
||||
convert_integer = True
|
||||
expected = expected.convert_dtypes(convert_integer=convert_integer)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
||||
def test_count_masked_returns_masked_dtype(dtype):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 1],
|
||||
"B": pd.array([1, pd.NA], dtype=dtype),
|
||||
"C": pd.array([1, 1], dtype=dtype),
|
||||
}
|
||||
)
|
||||
result = df.groupby("A").count()
|
||||
expected = DataFrame(
|
||||
[[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """
    Check the result dtype of reductions on masked extension arrays.

    Parameters
    ----------
    data : ExtensionArray
        Four-element masked array used as the value column.
    op_name : str
        Name of the groupby reduction to call.
    action : str
        How the result dtype relates to the input dtype: "always_int",
        "large_int", "always_float" or "preserve".
    with_na : bool
        Whether the last element is replaced with ``pd.NA`` first.
    """
    # The parametrized arrays are built once and the same objects are handed
    # to every test case, so work on a copy to keep the NA inserted below
    # from leaking into the ``with_na=False`` cases.
    data = data.copy()
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # float and integer dtypes are preserved, anything else gives Int64
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype
    else:
        # Fail clearly rather than with an unbound ``expected_dtype`` below.
        raise ValueError(f"unknown action: {action!r}")

    # DataFrameGroupBy: direct method call and aggregate-by-name
    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    # SeriesGroupBy: direct method call and aggregate-by-name
    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
|
||||
@@ -0,0 +1,441 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_arm
|
||||
from pandas.errors import NumbaUtilError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NamedAgg,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.util.version import Version
|
||||
|
||||
# Skip the whole module when numba is not importable.
numba = pytest.importorskip("numba")

# Marks applied to every test in this module.
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    ),
]
|
||||
|
||||
|
||||
def test_correct_function_signature():
    """A UDF without the ``(values, index)`` leading parameters is rejected."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    # Same check for the frame groupby and for the selected column.
    for grouped in (data.groupby("key"), data.groupby("key")["data"]):
        with pytest.raises(NumbaUtilError, match="The first 2"):
            grouped.agg(incorrect_function, engine="numba")
|
||||
|
||||
|
||||
def test_check_nopython_kwargs():
    """Keyword arguments are bound to the UDF signature, then checked for numba."""
    pytest.importorskip("numba")

    def incorrect_function(values, index, *, a):
        return sum(values) * 2.7 + a

    def correct_function(values, index, a):
        return sum(values) * 2.7 + a

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    expected = data.groupby("key").sum() * 2.7

    def frame_groupby():
        return data.groupby("key")

    def series_groupby():
        return data.groupby("key")["data"]

    # py signature binding
    kwonly_msg = "missing a required (keyword-only argument|argument): 'a'"
    for make_groupby in (frame_groupby, series_groupby):
        with pytest.raises(TypeError, match=kwonly_msg):
            make_groupby().agg(incorrect_function, engine="numba", b=1)
        with pytest.raises(TypeError, match="missing a required argument: 'a'"):
            make_groupby().agg(correct_function, engine="numba", b=1)

    # numba signature check after binding
    checks = (
        (frame_groupby, expected + 1, tm.assert_frame_equal),
        (series_groupby, expected["data"] + 1, tm.assert_series_equal),
    )
    for make_groupby, wanted, assert_equal in checks:
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            make_groupby().agg(incorrect_function, engine="numba", a=1)
        actual = make_groupby().agg(correct_function, engine="numba", a=1)
        assert_equal(wanted, actual)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index):
    """A UDF run through the numba engine matches the cython engine."""
    pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    if jit:
        # Test accepted jitted functions
        import numba

        func_numba = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0, as_index=as_index)
    if frame_or_series is Series:
        grouped = grouped[1]

    result = grouped.agg(
        func_numba,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_cache(jit, frame_or_series, nogil, parallel, nopython):
    """Switching between two UDFs keeps returning the right result for each."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if frame_or_series is Series:
        grouped = grouped[1]

    def assert_matches_cython(numba_func, cython_func):
        result = grouped.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)
        expected = grouped.agg(cython_func, engine="cython")
        tm.assert_equal(result, expected)

    assert_matches_cython(func_1, lambda x: np.mean(x) - 3.4)

    # Add func_2 to the cache
    assert_matches_cython(func_2, lambda x: np.mean(x) * 2.7)

    # Retest func_1 which should use the cache
    assert_matches_cython(func_1, lambda x: np.mean(x) - 3.4)
|
||||
|
||||
|
||||
def test_use_global_config():
    """``compute.use_numba`` with ``engine=None`` matches ``engine="numba"``."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    labels = ["a", "a", "b", "b", "a"]
    values = [1.0, 2.0, 3.0, 4.0, 5.0]
    grouped = DataFrame({0: labels, 1: values}, columns=[0, 1]).groupby(0)

    expected = grouped.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        result = grouped.agg(func_1, engine=None)
    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    """List, dict and named aggregations agree between the numba and cython engines."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
    """With ``skipna=False`` the numba and cython engines give the same frame."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, np.nan, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(func, skipna=False, engine="numba")
    expected = grouped.agg(func, skipna=False, engine="cython")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    """UDFs run with the numba engine match the named cython reductions."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    grouped = DataFrame(columns, columns=[0, 1, 2]).groupby(0)

    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(expected_func, engine="cython")
    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    """
    Series aggregations agree between the numba and cython engines.

    Parameters
    ----------
    agg_kwargs : dict
        Keyword arguments forwarded to ``SeriesGroupBy.agg``.
    """
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)

    # Pass ``engine`` as its own keyword instead of writing it into the
    # parametrized dict, which would otherwise be modified in place.
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")

    # A list or named aggregation yields a frame, a single function a Series.
    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    """Passing ``engine_kwargs={"parallel": True}`` does not change the result."""
    pytest.importorskip("numba")
    grouped = data.groupby(["a", "a", "b", "b", "a"])

    result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
    expected = grouped.agg(**agg_kwargs, engine="numba")

    # Pick the comparison helper from the type of the expected object.
    if isinstance(expected, DataFrame):
        compare = tm.assert_frame_equal
    else:
        compare = tm.assert_series_equal
    compare(result, expected)
|
||||
|
||||
|
||||
def test_args_not_cached():
    """Calling the same UDF again with a different positional argument is honoured."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # (n passed to the UDF, value expected for every group)
    for n, per_group in ((1, 1.0), (2, 2.0)):
        result = grouped_x.agg(sum_last, n, engine="numba")
        expected = Series([per_group] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_data_correctly_passed():
    """The UDF's ``index`` argument carries each group's index values."""
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")

    group_index = Index(["A", "B"], name="group")
    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_engine_kwargs_not_cached():
    """Changing ``engine_kwargs`` between calls changes the aggregated result."""
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # Reads the enclosing flags, so the result reflects their current values.
        return nogil + parallel + nopython

    df = DataFrame({"value": [0, 0, 0]})

    def aggregate():
        engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
        return df.groupby(level=0).aggregate(
            func_kwargs, engine="numba", engine_kwargs=engine_kwargs
        )

    result = aggregate()
    expected = DataFrame({"value": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)

    nogil = False
    result = aggregate()
    expected = DataFrame({"value": [1.0, 1.0, 1.0]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a MultiIndexed frame by one level works with the numba engine."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    result = df.groupby("A").agg(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )
    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """A numba UDF with two grouping keys raises ``NotImplementedError``."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    grouped = df.groupby(["A", "B"])
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        grouped.agg(numba_func, engine="numba", engine_kwargs=engine_kwargs)
|
||||
|
||||
|
||||
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Two-key reductions agree between engines, via ``agg`` and called directly."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    gb = df.groupby(["A", "B"])

    res_agg = gb.agg(reduction, engine="numba", **kwargs)
    expected_agg = gb.agg(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(res_agg, expected_agg)

    # Test that calling the aggregation directly also works
    method = getattr(gb, reduction)
    direct_res = method(engine="numba", **kwargs)
    direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
    tm.assert_frame_equal(direct_res, direct_expected)
|
||||
|
||||
|
||||
def test_multilabel_udf_numba_vs_cython():
    """A ``min`` UDF over two grouping keys agrees between the two engines."""
    pytest.importorskip("numba")
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    result = gb.agg(lambda values, index: values.min(), engine="numba")
    expected = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,667 @@
|
||||
"""
|
||||
test all other .agg behavior
|
||||
"""
|
||||
|
||||
import datetime as dt
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import SpecificationError
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
def test_agg_partial_failure_raises():
    """A UDF that fails on the string column raises instead of dropping it."""
    # GH#43741

    df = DataFrame(
        {
            "data1": np.random.default_rng(2).standard_normal(5),
            "data2": np.random.default_rng(2).standard_normal(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # Both the list form and the bare callable must raise.
    for spec in ([peak_to_peak], peak_to_peak):
        with pytest.raises(TypeError, match="unsupported operand type"):
            grouped.agg(spec)
|
||||
|
||||
|
||||
def test_agg_datetimes_mixed():
    """Summing by a date column gives equally many groups for str and ``date`` keys."""
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    df1 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    # Same rows, with the date strings parsed into ``datetime.date`` objects;
    # a missing date stays ``None``.
    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]

    df2 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate("sum")

    # Weights come from df2's own values so each frame is self-contained.
    df2["weights"] = df2["value"] / df2["value"].sum()
    gb2 = df2.groupby("date").aggregate("sum")

    assert len(gb1) == len(gb2)
|
||||
|
||||
|
||||
def test_agg_period_index():
    """Grouping by a PeriodIndex level keeps a PeriodIndex; month grouping iterates."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=monthly)
    summed = frame.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    columns = {
        "s1": Series(np.random.default_rng(2).random(len(index)), index=index),
        "s2": Series(np.random.default_rng(2).random(len(index)), index=index),
    }
    frame = DataFrame.from_dict(columns)
    by_month = frame.groupby(frame.index.month)
    # Smoke check: materialising the groups must not raise.
    list(by_month)
|
||||
|
||||
|
||||
def test_agg_dict_parameter_cast_result_dtypes():
    """first/last/len/size/count on a datetime column with missing values."""
    # GH 12821
    frame = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    frame.loc[[0, 1, 2, 5], "time"] = None

    # `first` and `last` are checked the same way; only the expected row for
    # class "D" differs (row 6 versus row 7).
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = frame.loc[rows].set_index("class")
        grouped = frame.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    sizes = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), sizes)
    tm.assert_series_equal(grouped.time.size(), sizes)

    non_null = Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), non_null)
|
||||
|
||||
|
||||
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count()``."""
    # similar to GH12821
    # xref #11444
    month_starts = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    labels = list("aaabbbbbbccd")
    frame = DataFrame({"X": labels, "Y": month_starts})

    lengths = frame.groupby("X")["Y"].agg(len)
    counts = frame.groupby("X")["Y"].count()
    tm.assert_series_equal(lengths, counts)
|
||||
|
||||
|
||||
def test_aggregate_float64_no_int64():
    """Group means of integer columns match float expectations, for one and two columns."""
    # see gh-11199
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}
    )

    # Columns "a" and "c" hold the same values, so they share the expected means.
    for selected in (["a"], ["a", "c"]):
        expected = DataFrame(
            {name: [1, 2.5, 4, 5] for name in selected}, index=[1, 2, 4, 5]
        )
        expected.index.name = "b"

        result = frame.groupby("b")[selected].mean()
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_api_consistency():
|
||||
# GH 9052
|
||||
# make sure that the aggregates via dict
|
||||
# are consistent
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
||||
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
|
||||
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
|
||||
"D": np.arange(8),
|
||||
}
|
||||
)
|
||||
|
||||
grouped = df.groupby(["A", "B"])
|
||||
c_mean = grouped["C"].mean()
|
||||
c_sum = grouped["C"].sum()
|
||||
d_mean = grouped["D"].mean()
|
||||
d_sum = grouped["D"].sum()
|
||||
|
||||
result = grouped["D"].agg(["sum", "mean"])
|
||||
expected = pd.concat([d_sum, d_mean], axis=1)
|
||||
expected.columns = ["sum", "mean"]
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
result = grouped.agg(["sum", "mean"])
|
||||
expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
|
||||
expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
result = grouped[["D", "C"]].agg(["sum", "mean"])
|
||||
expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
|
||||
expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
result = grouped.agg({"C": "mean", "D": "sum"})
|
||||
expected = pd.concat([d_sum, c_mean], axis=1)
|
||||
tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
|
||||
expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
|
||||
expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
|
||||
|
||||
msg = r"Label\(s\) \['r', 'r2'\] do not exist"
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
|
||||
|
||||
|
||||
def test_agg_dict_renaming_deprecation():
|
||||
# 15931
|
||||
df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
|
||||
|
||||
msg = r"nested renamer is not supported"
|
||||
with pytest.raises(SpecificationError, match=msg):
|
||||
df.groupby("A").agg(
|
||||
{"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
|
||||
)
|
||||
|
||||
msg = r"Label\(s\) \['ma'\] do not exist"
|
||||
with pytest.raises(KeyError, match=msg):
|
||||
df.groupby("A")[["B", "C"]].agg({"ma": "max"})
|
||||
|
||||
msg = r"nested renamer is not supported"
|
||||
with pytest.raises(SpecificationError, match=msg):
|
||||
df.groupby("A").B.agg({"foo": "count"})
|
||||
|
||||
|
||||
def test_agg_compat():
    """Dict specs passed to a single selected column raise SpecificationError."""
    # GH 12334
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    for spec in ({"C": ["sum", "std"]}, {"C": "sum", "D": "std"}):
        with pytest.raises(SpecificationError, match=msg):
            by_ab["D"].agg(spec)
|
||||
|
||||
|
||||
def test_agg_nested_dicts():
    """Nested-dict aggregation specs raise SpecificationError."""
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])
    only_d = by_ab["D"]

    rejected = [
        (by_ab.aggregate, {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}),
        (by_ab.agg, {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}),
        # same name as the original column
        # GH9052
        (only_d.agg, {"result1": np.sum, "result2": np.mean}),
        (only_d.agg, {"D": np.sum, "result2": np.mean}),
    ]

    msg = r"nested renamer is not supported"
    for aggregate, spec in rejected:
        with pytest.raises(SpecificationError, match=msg):
            aggregate(spec)
|
||||
|
||||
|
||||
def test_agg_item_by_item_raise_typeerror():
    """A TypeError raised inside the aggregation function reaches the caller."""
    frame = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def always_raises(group):
        # Format the input before failing, as the original helper did.
        pprint_thing("----------------------------------------")
        pprint_thing(group.to_string())
        raise TypeError("test")

    by_first_col = frame.groupby(0)
    with pytest.raises(TypeError, match="test"):
        by_first_col.agg(always_raises)
|
||||
|
||||
|
||||
def test_series_agg_multikey():
    """``agg("sum")`` equals ``sum()`` for a Series grouped by year and month."""
    days = date_range("2020-01-01", periods=10)
    series = Series(np.arange(10, dtype=np.float64), index=days)
    by_year_month = series.groupby([lambda x: x.year, lambda x: x.month])

    via_agg = by_year_month.agg("sum")
    direct = by_year_month.sum()
    tm.assert_series_equal(via_agg, direct)
|
||||
|
||||
|
||||
def test_series_agg_multi_pure_python():
    """A UDF that inspects ``x.values.base`` aggregates like a constant lambda."""
    data = DataFrame(
        {
            "A": "foo foo foo foo bar bar bar bar foo foo foo".split(),
            "B": "one one one two one one one two two two one".split(),
            "C": (
                "dull dull shiny dull dull shiny shiny dull shiny shiny shiny"
            ).split(),
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        # For ndarray-backed input, require a non-empty base array.
        if isinstance(x.values, np.ndarray):
            assert len(x.values.base) > 0
        return "foo"

    keys = ["A", "B"]
    result = data.groupby(keys).agg(bad)
    expected = data.groupby(keys).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_consistency():
    """``agg([func])`` and ``agg(func)`` agree once the function level is dropped."""
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    first_day = dt.date(2013, 2, 10)
    second_day = dt.date(2013, 2, 11)
    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [first_day, first_day, second_day, second_day],
        }
    )
    by_date = frame.groupby("date")

    expected = by_date.agg([P1])
    # Keep only the outer (column-name) level of the resulting columns.
    expected.columns = expected.columns.levels[0]

    result = by_date.agg(P1)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_callables():
    """Several callable spellings of a sum all aggregate like ``agg("sum")``."""
    # GH 7929
    frame = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class CallableSum:
        def __call__(self, x):
            return sum(x)

    summing_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        CallableSum(),
    ]

    expected = frame.groupby("foo").agg("sum")
    for func in summing_callables:
        tm.assert_frame_equal(frame.groupby("foo").agg(func), expected)
|
||||
|
||||
|
||||
def test_agg_over_numpy_arrays():
    """Summing a column whose cells are ndarrays adds them element-wise per group."""
    # GH 3788
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    frame = DataFrame(rows, columns=["category", "arraydata"])
    by_category = frame.groupby("category")

    expected = DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    alt = by_category.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = by_category.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_period", [True, False])
|
||||
def test_agg_tzaware_non_datetime_result(as_period):
|
||||
# discussed in GH#29589, fixed in GH#29641, operating on tzaware values
|
||||
# with function that is not dtype-preserving
|
||||
dti = date_range("2012-01-01", periods=4, tz="UTC")
|
||||
if as_period:
|
||||
dti = dti.tz_localize(None).to_period("D")
|
||||
|
||||
df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
|
||||
gb = df.groupby("a")
|
||||
|
||||
# Case that _does_ preserve the dtype
|
||||
result = gb["b"].agg(lambda x: x.iloc[0])
|
||||
expected = Series(dti[::2], name="b")
|
||||
expected.index.name = "a"
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# Cases that do _not_ preserve the dtype
|
||||
result = gb["b"].agg(lambda x: x.iloc[0].year)
|
||||
expected = Series([2012, 2012], name="b")
|
||||
expected.index.name = "a"
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
|
||||
expected = Series(
|
||||
[pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b", dtype="m8[us]"
|
||||
)
|
||||
expected.index.name = "a"
|
||||
if as_period:
|
||||
expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
|
||||
expected.index.name = "a"
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_timezone_round_trip():
    """tz-aware timestamps compare equal after min/nth/head/first/last/apply."""
    # GH 15426
    start = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    frame = DataFrame(
        {"a": 1, "b": [start + dt.timedelta(minutes=nn) for nn in range(10)]}
    )

    minima = (
        frame.groupby("a")["b"].agg("min").iloc[0],
        frame.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0],
        frame.groupby("a")["b"].min().iloc[0],
    )
    for found in minima:
        assert found == start

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    frame = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    by_a = frame.groupby("A")

    first_a = frame["B"].iloc[0]
    assert first_a == by_a.nth(0)["B"].iloc[0]
    assert first_a == by_a.head(1)["B"].iloc[0]
    assert first_a == by_a.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert first_a == by_a.apply(lambda x: x.iloc[0])["B"].iloc[0]

    last_a = frame["B"].iloc[2]
    assert last_a == by_a.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert last_a == by_a.apply(lambda x: x.iloc[-1])["B"].iloc[0]
|
||||
|
||||
|
||||
def test_sum_uint64_overflow():
    """Group sums of object-dtype integers just above 2**63 - 1 stay exact."""
    # see gh-14758
    # Convert to uint64 and don't overflow
    offset = 2**63 - 1  # 9223372036854775807
    frame = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    frame = frame + offset

    index = Index([offset + step for step in (1, 3, 5)], dtype=np.uint64)
    expected = DataFrame(
        {1: [offset + step for step in (2, 4, 6)]},
        index=index,
        dtype=object,
    )
    expected.index.name = 0

    result = frame.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    numeric_only_result = frame.groupby(0).sum(numeric_only=True)
    tm.assert_frame_equal(numeric_only_result, expected[[]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, cast_as",
    [
        (tuple, tuple),
        (list, list),
        (lambda x: tuple(x), tuple),
        (lambda x: list(x), list),
    ],
)
def test_agg_structs_dataframe(structure, cast_as):
    """Aggregating a frame with a list/tuple constructor collects each group's values."""
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = frame.groupby(["A", "B"]).aggregate(structure)

    collected = {(1, 1): cast_as([1, 1, 1]), (3, 4): cast_as([3, 4, 4])}
    expected = DataFrame({"C": collected})
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, cast_as",
    [
        (tuple, tuple),
        (list, list),
        (lambda x: tuple(x), tuple),
        (lambda x: list(x), list),
    ],
)
def test_agg_structs_series(structure, cast_as):
    """Aggregating one column with a list/tuple constructor collects each group's values."""
    # Issue #18079
    frame = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = frame.groupby("A")["C"].aggregate(structure)

    collected = [cast_as([1, 1, 1]), cast_as([3, 4, 4])]
    expected = Series(collected, index=[1, 3], name="C")
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_category_nansum(observed):
|
||||
categories = ["a", "b", "c"]
|
||||
df = DataFrame(
|
||||
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
|
||||
)
|
||||
result = df.groupby("A", observed=observed).B.agg(np.nansum)
|
||||
expected = Series(
|
||||
[3, 3, 0],
|
||||
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
|
||||
name="B",
|
||||
)
|
||||
if observed:
|
||||
expected = expected[expected != 0]
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_list_like_func():
    """A list-returning UDF in a dict spec yields one list per group."""
    # GH 18473
    digits = [str(x) for x in range(3)]
    frame = DataFrame({"A": list(digits), "B": list(digits)})

    grouped = frame.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})

    expected = DataFrame({"A": list(digits), "B": [[digit] for digit in digits]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_lambda_with_timezone():
    """A ``head(1)`` lambda on a tz-aware column returns the first timestamp."""
    # GH 23683
    first_stamp = pd.Timestamp("2018-01-01", tz="UTC")
    second_stamp = pd.Timestamp("2018-01-02", tz="UTC")
    frame = DataFrame({"tag": [1, 1], "date": [first_stamp, second_stamp]})

    result = frame.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"err_cls",
|
||||
[
|
||||
NotImplementedError,
|
||||
RuntimeError,
|
||||
KeyError,
|
||||
IndexError,
|
||||
OSError,
|
||||
ValueError,
|
||||
ArithmeticError,
|
||||
AttributeError,
|
||||
],
|
||||
)
|
||||
def test_groupby_agg_err_catching(err_cls):
|
||||
# make sure we suppress anything other than TypeError or AssertionError
|
||||
# in _python_agg_general
|
||||
|
||||
# Use a non-standard EA to make sure we don't go down ndarray paths
|
||||
from pandas.tests.extension.decimal.array import (
|
||||
DecimalArray,
|
||||
make_data,
|
||||
to_decimal,
|
||||
)
|
||||
|
||||
data = make_data(5)
|
||||
df = DataFrame(
|
||||
{"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
|
||||
)
|
||||
|
||||
expected = Series(to_decimal([data[0], data[3]]))
|
||||
|
||||
def weird_func(x):
|
||||
# weird function that raise something other than TypeError or IndexError
|
||||
# in _python_agg_general
|
||||
if len(x) == 0:
|
||||
raise err_cls
|
||||
return x.iloc[0]
|
||||
|
||||
result = df["decimals"].groupby(df["id1"]).agg(weird_func)
|
||||
tm.assert_series_equal(result, expected, check_names=False)
|
||||
166
venv/Lib/site-packages/pandas/tests/groupby/conftest.py
Normal file
166
venv/Lib/site-packages/pandas/tests/groupby/conftest.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
from pandas.core.groupby.base import (
|
||||
reduction_kernels,
|
||||
transformation_kernels,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Eight-row frame with string key columns A and B and float columns C and D."""
    key_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    key_b = ["one", "one", "two", "three", "two", "two", "one", "three"]
    # C and D come from identically seeded generators.
    values_c = np.random.default_rng(2).standard_normal(8)
    values_d = np.random.default_rng(2).standard_normal(8)
    return DataFrame({"A": key_a, "B": key_b, "C": values_c, "D": values_d})
|
||||
|
||||
|
||||
@pytest.fixture
def ts():
    """Thirty seeded normal draws on a "B"-frequency index from 2000-01-01."""
    stamps = date_range("2000-01-01", periods=30, freq="B")
    draws = np.random.default_rng(2).standard_normal(30)
    return Series(draws, index=stamps)
|
||||
|
||||
|
||||
@pytest.fixture
def tsframe():
    """30x4 frame of seeded normal draws, object-dtype columns A-D, "B"-frequency index."""
    stamps = date_range("2000-01-01", periods=30, freq="B")
    labels = Index(list("ABCD"), dtype=object)
    draws = np.random.default_rng(2).standard_normal((30, 4))
    return DataFrame(draws, columns=labels, index=stamps)
|
||||
|
||||
|
||||
@pytest.fixture
def three_group():
    """Eleven-row frame with string columns A, B, C and float columns D, E, F."""
    return DataFrame(
        {
            "A": "foo foo foo foo bar bar bar bar foo foo foo".split(),
            "B": "one one one two one one one two two two one".split(),
            "C": (
                "dull dull shiny dull dull shiny shiny dull shiny shiny shiny"
            ).split(),
            # D, E and F come from identically seeded generators.
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def slice_test_df():
    """Eight labelled rows in groups a/b/c, indexed by an integer "Index" column."""
    rows = [
        [0, "a", "a0_at_0"],
        [1, "b", "b0_at_1"],
        [2, "a", "a1_at_2"],
        [3, "b", "b1_at_3"],
        [4, "c", "c0_at_4"],
        [5, "a", "a2_at_5"],
        [6, "a", "a3_at_6"],
        [7, "a", "a4_at_7"],
    ]
    return DataFrame(rows, columns=["Index", "Group", "Value"]).set_index("Index")
|
||||
|
||||
|
||||
@pytest.fixture
def slice_test_grouped(slice_test_df):
    """Group the ``slice_test_df`` frame by its "Group" column with ``as_index=False``."""
    grouped = slice_test_df.groupby("Group", as_index=False)
    return grouped
|
||||
|
||||
|
||||
@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
    """
    Parametrized over the sorted entries of ``reduction_kernels``; returns the
    current entry.
    """
    current = request.param
    return current
|
||||
|
||||
|
||||
@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
    """Parametrized over the sorted entries of ``transformation_kernels``."""
    current = request.param
    return current
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[*sorted(reduction_kernels), *sorted(transformation_kernels)]
)
def groupby_func(request):
    """Parametrized over the sorted reduction kernels, then the sorted transformation kernels."""
    current = request.param
    return current
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        pytest.param(("mean", {}), id="mean"),
        pytest.param(("var", {"ddof": 1}), id="var_1"),
        pytest.param(("var", {"ddof": 0}), id="var_0"),
        pytest.param(("std", {"ddof": 1}), id="std_1"),
        pytest.param(("std", {"ddof": 0}), id="std_0"),
        pytest.param(("sum", {}), id="sum"),
        pytest.param(("min", {}), id="min"),
        pytest.param(("max", {}), id="max"),
        pytest.param(("sum", {"min_count": 2}), id="sum-min_count"),
        pytest.param(("min", {"min_count": 2}), id="min-min_count"),
        pytest.param(("max", {"min_count": 2}), id="max-min_count"),
    ],
)
def numba_supported_reductions(request):
    """``(method name, kwargs)`` pairs for reductions supported with engine='numba'."""
    return request.param
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,271 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_apply_describe_bug(multiindex_dataframe_random_data):
|
||||
grouped = multiindex_dataframe_random_data.groupby(level="first")
|
||||
grouped.describe() # it works!
|
||||
|
||||
|
||||
def test_series_describe_multikey():
    """mean/std/min columns of ``describe`` match the corresponding reductions."""
    days = date_range("2020-01-01", periods=10)
    series = Series(np.arange(10, dtype=np.float64), index=days)
    by_year_month = series.groupby([lambda x: x.year, lambda x: x.month])

    described = by_year_month.describe()
    for stat in ("mean", "std", "min"):
        direct = getattr(by_year_month, stat)()
        tm.assert_series_equal(described[stat], direct, check_names=False)
|
||||
|
||||
|
||||
def test_series_describe_single():
    """Applying ``describe`` per group equals the stacked groupby ``describe``."""
    days = date_range("2020-01-01", periods=10)
    series = Series(np.arange(10, dtype=np.float64), index=days)
    by_month = series.groupby(lambda x: x.month)

    applied = by_month.apply(lambda x: x.describe())
    stacked = by_month.describe().stack()
    tm.assert_series_equal(applied, stacked)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
|
||||
def test_series_describe_as_index(as_index, keys):
|
||||
# GH#49256
|
||||
df = DataFrame(
|
||||
{
|
||||
"key1": ["one", "two", "two", "three", "two"],
|
||||
"key2": ["one", "two", "two", "three", "two"],
|
||||
"foo2": [1, 2, 4, 4, 6],
|
||||
}
|
||||
)
|
||||
gb = df.groupby(keys, as_index=as_index)["foo2"]
|
||||
result = gb.describe()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key1": ["one", "three", "two"],
|
||||
"count": [1.0, 1.0, 3.0],
|
||||
"mean": [1.0, 4.0, 4.0],
|
||||
"std": [np.nan, np.nan, 2.0],
|
||||
"min": [1.0, 4.0, 2.0],
|
||||
"25%": [1.0, 4.0, 3.0],
|
||||
"50%": [1.0, 4.0, 4.0],
|
||||
"75%": [1.0, 4.0, 5.0],
|
||||
"max": [1.0, 4.0, 6.0],
|
||||
}
|
||||
)
|
||||
if len(keys) == 2:
|
||||
expected.insert(1, "key2", expected["key1"])
|
||||
if as_index:
|
||||
expected = expected.set_index(keys)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_describe_multikey(tsframe):
|
||||
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
||||
result = grouped.describe()
|
||||
desc_groups = []
|
||||
for col in tsframe:
|
||||
group = grouped[col].describe()
|
||||
# GH 17464 - Remove duplicate MultiIndex levels
|
||||
group_col = MultiIndex(
|
||||
levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
|
||||
codes=[[0] * len(group.columns), range(len(group.columns))],
|
||||
)
|
||||
group = DataFrame(group.values, columns=group_col, index=group.index)
|
||||
desc_groups.append(group)
|
||||
expected = pd.concat(desc_groups, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_describe_tupleindex():
|
||||
# GH 14848 - regression from 0.19.0 to 0.19.1
|
||||
name = "k"
|
||||
df = DataFrame(
|
||||
{
|
||||
"x": [1, 2, 3, 4, 5] * 3,
|
||||
name: [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5,
|
||||
}
|
||||
)
|
||||
result = df.groupby(name).describe()
|
||||
expected = DataFrame(
|
||||
[[5.0, 3.0, 1.581139, 1.0, 2.0, 3.0, 4.0, 5.0]] * 3,
|
||||
index=Index([(0, 0, 1), (0, 1, 0), (1, 0, 0)], tupleize_cols=False, name=name),
|
||||
columns=MultiIndex.from_arrays(
|
||||
[["x"] * 8, ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_describe_unstacked_format():
    """SeriesGroupBy ``describe`` returns one row per group, one column per statistic."""
    # GH 4792
    stamps = [
        Timestamp("2011-01-06 10:59:05", tz=None),
        Timestamp("2011-01-06 12:43:33", tz=None),
        Timestamp("2011-01-06 12:54:09", tz=None),
    ]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    frame = DataFrame({"PRICE": prices, "VOLUME": volumes})

    result = frame.groupby("PRICE").VOLUME.describe()

    # Expected rows are the plain Series.describe() of each price's volumes.
    per_price = [
        frame[frame.PRICE == price].VOLUME.describe().values.tolist()
        for price in (24990, 25499)
    ]
    expected = DataFrame(
        per_price,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
|
||||
"ignore:"
|
||||
"indexing past lexsort depth may impact performance:"
|
||||
"pandas.errors.PerformanceWarning"
|
||||
)
|
||||
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
|
||||
def test_describe_with_duplicate_output_column_names(as_index, keys):
|
||||
# GH 35314
|
||||
df = DataFrame(
|
||||
{
|
||||
"a1": [99, 99, 99, 88, 88, 88],
|
||||
"a2": [99, 99, 99, 88, 88, 88],
|
||||
"b": [1, 2, 3, 4, 5, 6],
|
||||
"c": [10, 20, 30, 40, 50, 60],
|
||||
},
|
||||
columns=["a1", "a2", "b", "b"],
|
||||
copy=False,
|
||||
)
|
||||
if keys == ["a1"]:
|
||||
df = df.drop(columns="a2")
|
||||
|
||||
expected = (
|
||||
DataFrame.from_records(
|
||||
[
|
||||
("b", "count", 3.0, 3.0),
|
||||
("b", "mean", 5.0, 2.0),
|
||||
("b", "std", 1.0, 1.0),
|
||||
("b", "min", 4.0, 1.0),
|
||||
("b", "25%", 4.5, 1.5),
|
||||
("b", "50%", 5.0, 2.0),
|
||||
("b", "75%", 5.5, 2.5),
|
||||
("b", "max", 6.0, 3.0),
|
||||
("b", "count", 3.0, 3.0),
|
||||
("b", "mean", 5.0, 2.0),
|
||||
("b", "std", 1.0, 1.0),
|
||||
("b", "min", 4.0, 1.0),
|
||||
("b", "25%", 4.5, 1.5),
|
||||
("b", "50%", 5.0, 2.0),
|
||||
("b", "75%", 5.5, 2.5),
|
||||
("b", "max", 6.0, 3.0),
|
||||
],
|
||||
)
|
||||
.set_index([0, 1])
|
||||
.T
|
||||
)
|
||||
expected.columns.names = [None, None]
|
||||
if len(keys) == 2:
|
||||
expected.index = MultiIndex(
|
||||
levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
|
||||
)
|
||||
else:
|
||||
expected.index = Index([88, 99], name="a1")
|
||||
|
||||
if not as_index:
|
||||
expected = expected.reset_index()
|
||||
|
||||
result = df.groupby(keys, as_index=as_index).describe()
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_describe_duplicate_columns():
|
||||
# GH#50806
|
||||
df = DataFrame([[0, 1, 2, 3]])
|
||||
df.columns = [0, 1, 2, 0]
|
||||
gb = df.groupby(df[1])
|
||||
result = gb.describe(percentiles=[])
|
||||
|
||||
columns = ["count", "mean", "std", "min", "max"]
|
||||
frames = [
|
||||
DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns)
|
||||
for val in (0.0, 2.0, 3.0)
|
||||
]
|
||||
expected = pd.concat(frames, axis=1)
|
||||
expected.columns = MultiIndex(
|
||||
levels=[[0, 2], columns],
|
||||
codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))],
|
||||
)
|
||||
expected.index.names = [1]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_describe_non_cython_paths():
    """Check ``describe`` output with and without ``as_index``.

    GH#5610: non-cython calls should not include the grouper.
    Tests for code not expected to go through cython paths.
    """
    nan = np.nan
    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]

    frame = DataFrame(
        [[1, 2, "foo"], [1, nan, "bar"], [3, nan, "baz"]],
        columns=["A", "B", "C"],
    )
    expected = DataFrame(
        [
            [1.0, 2.0, nan] + [2.0] * 5,
            [0.0] + [nan] * 7,
        ],
        index=Index([1, 3], name="A"),
        columns=MultiIndex(
            levels=[["B"], stat_names],
            codes=[[0] * 8, list(range(8))],
        ),
    )

    # Grouping key as the index.
    tm.assert_frame_equal(frame.groupby("A").describe(), expected)

    # Grouping key as a regular column.
    unindexed = frame.groupby("A", as_index=False).describe()
    tm.assert_frame_equal(unindexed, expected.reset_index())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": include, "exclude": exclude}
        for include, exclude in (("all", None), (None, ["int"]), (["int"], None))
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    """Describing an empty grouped frame/series matches the trimmed non-empty result (GH#41575)."""
    populated = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    populated["B"] = populated["B"].astype(int)
    populated["C"] = populated["C"].astype(float)

    # DataFrameGroupBy.describe
    result = populated.iloc[:0].groupby("A").describe(**kwargs)
    described = populated.groupby("A").describe(**kwargs)
    expected = described.reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    # SeriesGroupBy.describe on column "B"
    result = populated.iloc[:0].groupby("A").B.describe(**kwargs)
    described = populated.groupby("A").B.describe(**kwargs)
    expected = described.reset_index(drop=True).iloc[:0]
    expected.index = Index([], dtype=populated.columns.dtype)
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,268 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_group_shift_with_null_key():
    """Shift within groups when part of the group key is missing.

    This test is designed to replicate the segfault in issue #13813.
    """
    n_rows = 1200

    # Build a moderately large frame with occasional missing values in
    # column `B`, then group by [`A`, `B`]. This should force `-1` in the
    # `labels` array of `g._grouper.group_info` exactly at those places
    # where the group-by key is partially missing.
    records = []
    shifted = []
    for i in range(n_rows):
        remainder = i % 3
        records.append((i % 12, remainder if remainder else np.nan, i))
        if remainder and i < n_rows - 12:
            shifted.append(i + 12)
        else:
            shifted.append(np.nan)

    frame = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    result = frame.groupby(["A", "B"]).shift(-1)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_with_fill_value():
    """Grouped ``shift(-1, fill_value=0)`` fills the vacated slots with 0 (GH #24128)."""
    n_rows = 24
    records = [(i % 12, i % 3, i) for i in range(n_rows)]
    frame = DataFrame(records, dtype=float, columns=["A", "B", "Z"], index=None)

    # The first 12 rows take the value 12 rows ahead; the remainder is filled.
    shifted = list(range(12, n_rows)) + [0] * 12
    expected = DataFrame(shifted, dtype=float, columns=["Z"], index=None)

    result = frame.groupby(["A", "B"]).shift(-1, fill_value=0)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_lose_timezone():
    """First row of a zero-period grouped shift equals the UTC timestamp put in (GH 30134)."""
    stamp = Timestamp.now("UTC").as_unit("ns")
    frame = DataFrame({"a": [1, 1], "date": stamp})

    first_row = frame.groupby("a").shift(0).iloc[0]

    expected = Series({"date": stamp}, name=first_row.name)
    tm.assert_series_equal(first_row, expected)
|
||||
|
||||
|
||||
def test_group_diff_real_series(any_real_numpy_dtype):
    """Grouped ``diff`` on one column of a frame built with a real numpy dtype."""
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
        dtype=any_real_numpy_dtype,
    )

    # These three input dtypes are expected to come back as float32.
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    else:
        exp_dtype = "float"
    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")

    result = frame.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_real_frame(any_real_numpy_dtype):
    """Grouped ``diff`` over all non-key columns of a frame built with a real numpy dtype."""
    nan = np.nan
    frame = DataFrame(
        {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 6]},
        dtype=any_real_numpy_dtype,
    )

    # These three input dtypes are expected to come back as float32.
    if any_real_numpy_dtype in ["int8", "int16", "float32"]:
        exp_dtype = "float32"
    else:
        exp_dtype = "float"
    expected = DataFrame(
        {"b": [nan, nan, nan, 1.0, 3.0], "c": [nan, nan, nan, 1.0, 4.0]},
        dtype=exp_dtype,
    )

    tm.assert_frame_equal(frame.groupby("a").diff(), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        [Timestamp("2013-01-01"), Timestamp("2013-01-02"), Timestamp("2013-01-03")],
        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
    ],
)
def test_group_diff_datetimelike(data, unit):
    """Grouped ``diff`` of datetime/timedelta values yields timedeltas in the same unit."""
    frame = DataFrame({"a": [1, 2, 2], "b": data})
    frame["b"] = frame["b"].dt.as_unit(unit)

    expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
    expected = expected.dt.as_unit(unit)

    result = frame.groupby("a")["b"].diff()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_bool():
    """Grouped ``diff`` on a boolean column."""
    frame = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")

    tm.assert_series_equal(frame.groupby("a")["b"].diff(), expected)
|
||||
|
||||
|
||||
def test_group_diff_object_raises(object_dtype):
    """Grouped ``diff`` on string data must raise ``TypeError``."""
    contents = {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}
    frame = DataFrame(contents, dtype=object_dtype)

    msg = r"unsupported operand type\(s\) for -"
    with pytest.raises(TypeError, match=msg):
        frame.groupby("a")["b"].diff()
|
||||
|
||||
|
||||
def test_empty_shift_with_fill():
    """Shifting an empty single-key groupby gives the same result with or without ``fill_value``.

    GH 41264, single-index check.
    """
    empty = DataFrame(columns=["a", "b", "c"])
    keys = ["a"]

    plain = empty.groupby(keys).shift(1)
    filled = empty.groupby(keys).shift(1, fill_value=0)

    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
|
||||
|
||||
def test_multindex_empty_shift_with_fill():
    """Shifting an empty two-key groupby gives the same result with or without ``fill_value``.

    GH 41264, multi-index check.
    """
    empty = DataFrame(columns=["a", "b", "c"])
    keys = ["a", "b"]

    plain = empty.groupby(keys).shift(1)
    filled = empty.groupby(keys).shift(1, fill_value=0)

    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
|
||||
|
||||
def test_shift_periods_freq():
    """Grouped shift with ``freq`` moves the index and leaves the values alone (GH 54093)."""
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    original_index = date_range(start="20100101", periods=6)
    shifted_index = date_range(start="2009-12-30", periods=6)
    frame = DataFrame(data, index=original_index)

    result = frame.groupby(frame.index).shift(periods=-2, freq="D")

    tm.assert_frame_equal(result, DataFrame(data, index=shifted_index))
|
||||
|
||||
|
||||
def test_shift_disallow_freq_and_fill_value():
    """Passing both ``freq`` and ``fill_value`` to a grouped shift raises (GH 53832)."""
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    frame = DataFrame(data, index=date_range(start="20100101", periods=6))

    with pytest.raises(
        ValueError, match="Passing a 'freq' together with a 'fill_value'"
    ):
        frame.groupby(frame.index).shift(periods=-2, freq="D", fill_value="1")
|
||||
|
||||
|
||||
def test_shift_disallow_suffix_if_periods_is_int():
    """A ``suffix`` combined with an integer ``periods`` raises ``ValueError`` (GH#44424)."""
    frame = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})

    with pytest.raises(
        ValueError, match="Cannot specify `suffix` if `periods` is an int."
    ):
        frame.groupby("b").shift(1, suffix="fails")
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods():
|
||||
# GH#44424
|
||||
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
|
||||
|
||||
shifted_df = df.groupby("b")[["a"]].shift([0, 1])
|
||||
expected_df = DataFrame(
|
||||
{"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
# series
|
||||
shifted_series = df.groupby("b")["a"].shift([0, 1])
|
||||
tm.assert_frame_equal(shifted_series, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_freq():
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
shifted_df = df.groupby("b")[["a"]].shift(
|
||||
[0, 1],
|
||||
freq="h",
|
||||
)
|
||||
expected_df = DataFrame(
|
||||
{
|
||||
"a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
|
||||
"a_1": [
|
||||
np.nan,
|
||||
1.0,
|
||||
2.0,
|
||||
3.0,
|
||||
4.0,
|
||||
5.0,
|
||||
],
|
||||
},
|
||||
index=date_range("1/1/2000", periods=6, freq="h"),
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_fill_value():
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
||||
)
|
||||
shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
|
||||
expected_df = DataFrame(
|
||||
{"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
    """Multiple periods with both ``fill_value`` and ``freq`` raise ``ValueError`` (GH#44424)."""
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
        index=date_range("1/1/2000", periods=5, freq="h"),
    )

    with pytest.raises(
        ValueError, match="Passing a 'freq' together with a 'fill_value'"
    ):
        frame.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
|
||||
|
||||
|
||||
def test_groupby_shift_multiple_periods_unsorted_index():
|
||||
# https://github.com/pandas-dev/pandas/pull/62843
|
||||
idx = date_range("1/1/2000", periods=4, freq="h")
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3], "b": [True, True, False]},
|
||||
index=[idx[2], idx[0], idx[1]],
|
||||
)
|
||||
result = df.groupby("b")[["a"]].shift([0, 1], freq="h")
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a_0": [1.0, 2.0, 3.0, np.nan],
|
||||
"a_1": [3.0, np.nan, 2.0, 1.0],
|
||||
},
|
||||
index=[idx[2], idx[0], idx[1], idx[3]],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,78 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    """Per-group ``is_monotonic_increasing`` on a grouped series (GH 17015)."""
    frame = DataFrame(
        {
            "A": [str(number) for number in range(1, 12)],
            "B": list("aaabbbcccdd"),
            "C": in_vals,
        }
    )
    result = frame.groupby("B").C.is_monotonic_increasing

    # Compare against the hand-written flags.
    group_labels = Index(list("abcd"), name="B")
    expected = Series(index=group_labels, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    by_hand = frame.groupby(["B"]).C.apply(lambda grp: grp.is_monotonic_increasing)
    tm.assert_series_equal(result, by_hand)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    """Per-group ``is_monotonic_decreasing`` on a grouped series (GH 17015).

    Parameters
    ----------
    in_vals : list
        Values for column "C"; grouped by column "B" into groups a-d.
    out_vals : list of bool
        Expected flag for each of the four groups, in label order.
    """
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check done for the increasing case.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_decreasing)
    tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,90 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_groupby_kurt_equivalence():
|
||||
# GH#40139
|
||||
# Test that that groupby kurt method (which uses libgroupby.group_kurt)
|
||||
# matches the results of operating group-by-group (which uses nanops.nankurt)
|
||||
nrows = 1000
|
||||
ngroups = 3
|
||||
ncols = 2
|
||||
nan_frac = 0.05
|
||||
|
||||
arr = np.random.default_rng(2).standard_normal((nrows, ncols))
|
||||
arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
|
||||
|
||||
df = pd.DataFrame(arr)
|
||||
grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
|
||||
gb = df.groupby(grps)
|
||||
|
||||
result = gb.kurt()
|
||||
|
||||
grpwise = [grp.kurt().to_frame(i).T for i, grp in gb]
|
||||
expected = pd.concat(grpwise, axis=0)
|
||||
expected.index = expected.index.astype("int64") # 32bit builds
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
        "Float64",
    ],
)
def test_groupby_kurt_arrow_float64(dtype):
    """Test groupby.kurt() with float64[pyarrow] and Float64 dtypes (GH#40139)."""
    frame = pd.DataFrame(
        {
            "x": [1.0, pd.NA, 3.2, 4.8, 2.3, 1.9, 8.9],
            "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
        },
        dtype=dtype,
    )
    expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype)

    # Map every row label to the same key so there is a single group.
    result = frame.groupby(by=lambda _label: 0).kurt()

    tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_kurt_noskipna():
|
||||
# GH#40139
|
||||
# Test groupby.kurt() with skipna = False
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9],
|
||||
"y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0],
|
||||
}
|
||||
)
|
||||
gb = df.groupby(by=lambda x: 0)
|
||||
|
||||
result = gb.kurt(skipna=False)
|
||||
expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]})
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_kurt_all_ones():
|
||||
# GH#40139
|
||||
# Test groupby.kurt() with constant values
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"x": [1.0] * 10,
|
||||
}
|
||||
)
|
||||
gb = df.groupby(by=lambda x: 0)
|
||||
|
||||
result = gb.kurt(skipna=False)
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"x": [0.0], # Same behavior as pd.DataFrame.kurt()
|
||||
}
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
@@ -0,0 +1,114 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_nlargest():
    """Grouped ``nlargest(3)``, first with distinct values and then with ties and ``keep="last"``."""
    labels = Series(list("a" * 5 + "b" * 5))
    group_level = list("aaabbb")

    # Distinct values.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([group_level, [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(values.groupby(labels).nlargest(3), expected)

    # Tied values, keeping the last occurrence.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([group_level, [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(values.groupby(labels).nlargest(3, keep="last"), expected)
|
||||
|
||||
|
||||
def test_nlargest_mi_grouper():
    """``nlargest(1)`` grouped by one level name of a two-level MultiIndex (see gh-21411)."""
    rng = np.random.default_rng(2)

    dts = date_range("20180101", periods=10)
    idx = MultiIndex.from_product(
        [dts, ["one", "two"]], names=["first", "second"]
    )
    ser = Series(rng.standard_normal(20), index=idx)

    result = ser.groupby("first").nlargest(1)

    # Every date keeps its "one" entry except the fourth, which keeps "two".
    winners = ["one"] * 10
    winners[3] = "two"
    exp_idx = MultiIndex.from_tuples(
        [(day, day, label) for day, label in zip(dts, winners)],
        names=["first", "first", "second"],
    )
    exp_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]
    expected = Series(exp_values, index=exp_idx)

    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
|
||||
|
||||
|
||||
def test_nsmallest():
    """Grouped ``nsmallest(3)``, first with distinct values and then with ties and ``keep="last"``."""
    labels = Series(list("a" * 5 + "b" * 5))
    group_level = list("aaabbb")

    # Distinct values.
    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([group_level, [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(values.groupby(labels).nsmallest(3), expected)

    # Tied values, keeping the last occurrence.
    values = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([group_level, [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(values.groupby(labels).nsmallest(3, keep="last"), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
def test_nlargest_and_smallest_noop(data, groups, dtype, nselect_method):
    """Test nlargest/smallest when it results in a noop.

    GH 15272, GH 16345, GH 29129: the input is already sorted and each
    group holds no more than ``n`` elements.
    """
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if nselect_method == "nlargest":
        data = list(reversed(data))
    ser = Series(data, name="a")

    selector = getattr(ser.groupby(groups), nselect_method)
    result = selector(n=2)

    if isinstance(groups, list):
        outer_level = np.array(groups, dtype=int)
    else:
        outer_level = groups
    expected_index = MultiIndex.from_arrays([outer_level, ser.index])
    expected = Series(data, index=expected_index, name="a")

    tm.assert_series_equal(result, expected)
|
||||
848
venv/Lib/site-packages/pandas/tests/groupby/methods/test_nth.py
Normal file
848
venv/Lib/site-packages/pandas/tests/groupby/methods/test_nth.py
Normal file
@@ -0,0 +1,848 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_first_last_nth(df):
    """Exercise ``first`` / ``last`` / ``nth`` on a frame grouped by column "A"."""
    value_cols = ["B", "C", "D"]
    by_a = df.groupby("A")

    # first(): rows 1 and 0, relabelled by group and sorted
    wanted = df.loc[[1, 0], value_cols]
    wanted.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(by_a.first(), wanted.sort_index())

    # nth(0) hands back the original rows untouched
    tm.assert_frame_equal(by_a.nth(0), df.loc[[0, 1]])

    # last(): rows 5 and 7, relabelled by group
    wanted = df.loc[[5, 7], value_cols]
    wanted.index = Index(["bar", "foo"], name="A")
    tm.assert_frame_equal(by_a.last(), wanted)

    tm.assert_frame_equal(by_a.nth(-1), df.iloc[[5, 7]])
    tm.assert_frame_equal(by_a.nth(1), df.iloc[[2, 3]])

    # it works!
    for reducer in ("first", "last"):
        getattr(by_a["B"], reducer)()
    by_a["B"].nth(0)

    # Blank out "B" for the whole "foo" group.
    blanked = df.copy()
    blanked.loc[blanked["A"] == "foo", "B"] = np.nan
    by_a = blanked.groupby("A")
    assert isna(by_a["B"].first()["foo"])
    assert isna(by_a["B"].last()["foo"])
    assert isna(by_a["B"].nth(0).iloc[0])

    # v0.14.0 whatsnew
    small = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    grouped_small = small.groupby("A")
    tm.assert_frame_equal(grouped_small.first(), small.iloc[[1, 2]].set_index("A"))
    tm.assert_frame_equal(grouped_small.nth(0, dropna="any"), small.iloc[[1, 2]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
    """``first``/``last`` with a null as the final value of a group.

    https://github.com/pandas-dev/pandas/issues/32123
    """
    frame = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = getattr(frame.groupby("a"), method)()

    picked = [1, 3] if method == "first" else [2, 3]
    # Expected values take whatever dtype the result column came back with.
    picked = np.array(picked, dtype=result["b"].dtype)
    expected = DataFrame({"b": picked}, index=Index([1, 2], name="a"))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
    """``nth`` returns the positional rows even when one holds a null.

    https://github.com/pandas-dev/pandas/issues/32123
    """
    frame = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
    result = frame.groupby("a").nth(index)

    if index == 0:
        expected = frame.iloc[[0, 2]]
    else:
        expected = frame.iloc[[1, 3]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    """None should be preserved as object dtype.

    https://github.com/pandas-dev/pandas/issues/32800
    """
    frame = DataFrame.from_dict({"id": ["a"], "value": [None]})
    reducer = getattr(frame.groupby("id", as_index=False), method)

    tm.assert_frame_equal(reducer(), frame)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    """``first``/``last`` on groups mixing None, NaN and real values (GH 32800, 38286)."""
    grouped = df.groupby("id")
    reducer = getattr(grouped, method)
    tm.assert_frame_equal(reducer(), expected)
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes():
    """``first`` / ``last`` / ``nth`` on a frame with mixed column dtypes."""
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8).astype("float32"),
        }
    )
    frame["E"] = True
    frame["F"] = 1
    value_cols = ["B", "C", "D", "E", "F"]

    def labelled(row_labels):
        # Pick the rows, relabel them by group name, then sort by that label.
        picked = frame.loc[row_labels, value_cols]
        picked.index = Index(["bar", "foo"], name="A")
        return picked.sort_index()

    # tests for first / last / nth
    by_a = frame.groupby("A")
    tm.assert_frame_equal(by_a.first(), labelled([1, 0]))
    tm.assert_frame_equal(by_a.last(), labelled([5, 7]))
    tm.assert_frame_equal(by_a.nth(1), frame.iloc[[2, 3]])
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes2():
    """``first`` keeps int64 when the index has a duplicated label (GH 2763)."""
    # Labels 0..9 with 9 repeated once at the end.
    labels = [*range(10), 9]
    ser = Series(data=range(11), index=labels, name="IntCol")
    assert ser.dtype == "int64"

    firsts = ser.groupby(level=0).first()
    assert firsts.dtype == "int64"
|
||||
|
||||
|
||||
def test_first_last_nth_nan_dtype():
    """``first`` / ``last`` / ``nth`` on an object column holding only None (GH 33591)."""
    frame = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
    grouped = frame.groupby("data")

    # Reductions are indexed by the group key.
    by_key = frame.set_index("data").nans
    for reducer in ("first", "last"):
        tm.assert_series_equal(getattr(grouped.nans, reducer)(), by_key)

    # nth keeps the original row index.
    untouched = frame.nans
    for position in (-1, 0):
        tm.assert_series_equal(grouped.nans.nth(position), untouched)
|
||||
|
||||
|
||||
def test_first_strings_timestamps():
    """``first`` on a frame whose column labels mix Timestamps and strings (GH 11244)."""
    frame = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    expected_columns = Index(
        [Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]
    )
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=expected_columns,
        index=Index(["e"], name="name"),
    )

    tm.assert_frame_equal(frame.groupby("name").first(), expected)
|
||||
|
||||
|
||||
def test_nth():
    """``nth`` for positive, negative and out-of-range positions, with and without ``dropna``."""
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")

    # Whole-frame selection: (position, expected rows).
    frame_cases = [
        (0, df.iloc[[0, 2]]),
        (1, df.iloc[[1]]),
        (2, df.loc[[]]),
        (-1, df.iloc[[1, 2]]),
        (-2, df.iloc[[0]]),
        (-3, df.loc[[]]),
    ]
    for position, wanted in frame_cases:
        tm.assert_frame_equal(gb.nth(position), wanted)

    # Single-column selection, as a series and as a one-column frame.
    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])

    # With dropna="any": (position, expected rows).
    dropna_cases = [
        (0, df.iloc[[1, 2]]),
        (-1, df.iloc[[1, 2]]),
        (7, df.iloc[:0]),
        (2, df.iloc[:0]),
    ]
    for position, wanted in dropna_cases:
        tm.assert_frame_equal(gb.nth(position, dropna="any"), wanted)
|
||||
|
||||
|
||||
def test_nth2():
    """``nth`` beyond the group size on a MultiIndexed frame.

    Out of bounds, regression from 0.13.1 (GH 6621).
    """
    raw = DataFrame(
        {
            "color": ["green", "green", "red", "red", "red"],
            "food": ["ham", "eggs", "eggs", "ham", "pork"],
            "two": [
                1.5456590000000001,
                -0.070345000000000005,
                -2.4004539999999999,
                0.46206000000000003,
                0.52350799999999997,
            ],
            "one": [
                0.56573799999999996,
                -0.9742360000000001,
                1.033801,
                -0.78543499999999999,
                0.70422799999999997,
            ],
        }
    )
    df = raw.set_index(["color", "food"])

    # Only the "red" group has a third row.
    third = df.groupby(level=0, as_index=False).nth(2)
    tm.assert_frame_equal(third, df.iloc[[-1]])

    # No group has a fourth row.
    fourth = df.groupby(level=0, as_index=False).nth(3)
    tm.assert_frame_equal(fourth, df.loc[[]])
|
||||
|
||||
|
||||
def test_nth3():
    """``first`` agrees with a manual ``apply``; an invalid ``dropna`` raises.

    GH 7559, from the vbench.
    """
    numbers = np.random.default_rng(2).integers(1, 10, (100, 2))
    frame = DataFrame(numbers, dtype="int64")
    values = frame[1]
    keys = frame[0]

    via_first = values.groupby(keys).first()
    via_apply = values.groupby(keys).apply(lambda grp: grp.iloc[0])
    tm.assert_series_equal(via_apply, via_first, check_names=False)
    assert via_first.name == 1
    assert via_apply.name == 1

    # validate first
    first_of_group_one = values[keys == 1].iloc[0]
    assert via_first.iloc[0] == first_of_group_one
    assert via_apply.iloc[0] == first_of_group_one

    with pytest.raises(ValueError, match="For a DataFrame"):
        values.groupby(keys, sort=False).nth(0, dropna=True)
|
||||
|
||||
|
||||
def test_nth4():
    """Doc example: ``nth(0, dropna="all")`` on a selected column."""
    frame = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])

    result = frame.groupby("A").B.nth(0, dropna="all")

    tm.assert_series_equal(result, frame.B.iloc[[1, 2]])
|
||||
|
||||
|
||||
def test_nth5():
    """Test multiple nth values passed as a list."""
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    gb = df.groupby("A")

    # (requested positions, expected rows)
    cases = [
        (0, df.iloc[[0, 3]]),
        ([0], df.iloc[[0, 3]]),
        ([0, 1], df.iloc[[0, 1, 3, 4]]),
        ([0, -1], df.iloc[[0, 2, 3, 4]]),
        ([0, 1, 2], df.iloc[[0, 1, 2, 3, 4]]),
        ([0, 1, -1], df.iloc[[0, 1, 2, 3, 4]]),
        ([2], df.iloc[[2]]),
        ([3, 4], df.loc[[]]),
    ]
    for positions, wanted in cases:
        tm.assert_frame_equal(gb.nth(positions), wanted)
|
||||
|
||||
|
||||
def test_nth_bdays(unit):
    """Pick the first, fourth and last two business days of each month with ``nth``."""
    business_dates = pd.date_range(
        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
    )
    frame = DataFrame(1, index=business_dates, columns=["a", "b"])

    # get the first, fourth and last two business days for each month
    year_month = [frame.index.year, frame.index.month]
    result = frame.groupby(year_month, as_index=False).nth([0, 3, -2, -1])

    date_strings = [
        "2014/4/1",
        "2014/4/4",
        "2014/4/29",
        "2014/4/30",
        "2014/5/1",
        "2014/5/6",
        "2014/5/29",
        "2014/5/30",
        "2014/6/2",
        "2014/6/5",
        "2014/6/27",
        "2014/6/30",
    ]
    expected_dates = pd.to_datetime(date_strings).as_unit(unit)
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_multi_grouper(three_group):
    """Test nth on multiple groupers (PR 9090, related to issue 8979)."""
    by_a_and_b = three_group.groupby(["A", "B"])

    first_rows = by_a_and_b.nth(0)

    tm.assert_frame_equal(first_rows, three_group.iloc[[0, 3, 4, 7]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """Compare ``first``/``last`` (``as_index=False``) with tz-aware expectations."""
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False
    frame = DataFrame(data)
    columns = ["id", "time", "foo"]

    for method, expected_data in (("first", expected_first), ("last", expected_last)):
        # whole-frame reduction
        whole = getattr(frame.groupby("id", as_index=False), method)()
        expected = DataFrame(expected_data)
        tm.assert_frame_equal(whole[columns], expected[columns])

        # reduction after selecting only the "time" column
        selected = getattr(frame.groupby("id", as_index=False)["time"], method)()
        tm.assert_frame_equal(selected, expected[["id", "time"]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
    """Run ``first``/``last`` on a frame with categorical and tz-aware columns."""
    # GH 21603
    cat_col = Series(list("abc")).astype("category")
    tz_col = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
    frame = DataFrame(
        {"group": [1, 1, 2], "category_string": cat_col, "datetimetz": tz_col}
    )
    result = getattr(frame.groupby("group"), method)()

    expected_columns = {
        "category_string": pd.Categorical([alpha, "c"], dtype=cat_col.dtype),
        "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
    }
    expected = DataFrame(expected_columns, index=Index([1, 2], name="group"))
    # bring the expected datetimes to the resolution under test
    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    """Reduce single-row groups and compare against the input values unchanged."""
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    frame = DataFrame({"a": [1, 2], "b": values})
    by_a = frame.groupby("a")
    group_index = Index([1, 2], name="a")

    # Series path: call the reduction on the selected column
    series_expected = Series(values, name="b", index=group_index)
    series_result = getattr(by_a["b"], function)()
    tm.assert_series_equal(series_result, series_expected)

    # DataFrame path: same reduction routed through ``agg``
    frame_expected = DataFrame({"b": values}, index=group_index)
    frame_result = by_a.agg({"b": function})
    tm.assert_frame_equal(frame_result, frame_expected)
|
||||
|
||||
|
||||
def test_nth_multi_index_as_expected():
    """``nth(0)`` grouped by two columns of an inline frame matches fixed rows."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    col_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    three_group = DataFrame({"A": col_a, "B": col_b, "C": col_c})

    first_rows = three_group.groupby(["A", "B"]).nth(0)
    wanted = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(first_rows, wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    """Compare ``head``/``tail`` with ``iloc`` rows, optionally after column selection."""
    frame = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    grouped = frame.groupby("A", as_index=as_index)
    expected = frame.iloc[expected_rows]

    # ``None`` means "no column selection"; an empty list is still a selection
    if columns is not None:
        grouped, expected = grouped[columns], expected[columns]

    tm.assert_frame_equal(getattr(grouped, op)(n), expected)
|
||||
|
||||
|
||||
def test_group_selection_cache():
    """Call ``head``/``tail`` and ``nth`` on one groupby object in both orders."""
    # GH 12839 nth, head, and tail should return same result consistently
    frame = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    first_of_each = frame.iloc[[0, 2]]

    # head/tail first, then nth, on the same groupby object
    for slicer in ("head", "tail"):
        grouped = frame.groupby("A")
        sliced = getattr(grouped, slicer)(n=2)
        nth_result = grouped.nth(0)
        tm.assert_frame_equal(sliced, frame)
        tm.assert_frame_equal(nth_result, first_of_each)

    # nth first, then head/tail, on the same groupby object
    for slicer in ("head", "tail"):
        grouped = frame.groupby("A")
        nth_result = grouped.nth(0)
        sliced = getattr(grouped, slicer)(n=2)
        tm.assert_frame_equal(nth_result, first_of_each)
        tm.assert_frame_equal(sliced, frame)
|
||||
|
||||
|
||||
def test_nth_empty():
    """``nth(10)`` on a one-row frame is compared with an empty slice of it."""
    # GH 16064
    frame = DataFrame(index=[0], columns=["a", "b", "c"])
    no_rows = slice(None, 0)

    # single key, then two keys
    for keys in ("a", ["a", "b"]):
        result = frame.groupby(keys).nth(10)
        tm.assert_frame_equal(result, frame.iloc[no_rows])
|
||||
|
||||
|
||||
def test_nth_column_order():
    """``nth`` results are compared with ``iloc`` rows of a frame whose columns are A, C, B."""
    # GH 20760
    # Check that nth preserves column order
    rows = [
        [1, "b", 100],
        [1, "a", 50],
        [1, "a", np.nan],
        [2, "c", 200],
        [2, "d", 150],
    ]
    frame = DataFrame(rows, columns=["A", "C", "B"])

    first = frame.groupby("A").nth(0)
    tm.assert_frame_equal(first, frame.iloc[[0, 3]])

    last_complete = frame.groupby("A").nth(-1, dropna="any")
    tm.assert_frame_equal(last_complete, frame.iloc[[1, 4]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """``nth(0)`` grouped on a column containing NaN is compared with rows 1 and 3."""
    # GH 26011
    frame = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
            "c": [1, 3, 5, 7, 9],
        }
    )
    grouped = frame.groupby("a")
    result = grouped.nth(0, dropna=dropna)

    wanted = frame.iloc[[1, 3]]
    tm.assert_frame_equal(result, wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
    """Series variant: ``nth(0)`` on column "b" grouped by a key containing NaN."""
    # GH 26454
    frame = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
        }
    )
    selected = frame.groupby("a")["b"]
    result = selected.nth(0, dropna=dropna)

    wanted = frame["b"].iloc[[1, 3]]
    tm.assert_series_equal(result, wanted)
|
||||
|
||||
|
||||
def test_first_categorical_and_datetime_data_nat():
    """``first`` on a frame with an all-NaT column and a categorical column."""
    # GH 20520
    nat = np.datetime64("NaT")
    frame = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": 5 * [nat],
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )
    result = frame.groupby("group").first()

    cat_dtype = pd.CategoricalDtype(["a", "b", "c"])
    expected = DataFrame(
        {
            "time": 3 * [nat],
            "categories": Series(["a", "c", "a"]).astype(cat_dtype),
        }
    )
    # the index is assigned after construction, as the values include a Series
    expected.index = Index(["first", "second", "third"], name="group")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_multi_key_groupby_categorical():
    """``first`` with two grouping keys on a frame holding a categorical column."""
    # GH 22512
    raw = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    frame = raw.astype({"D": "category"})
    result = frame.groupby(by=["A", "B"]).first()

    planet_dtype = pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(planet_dtype),
        }
    )
    group_keys = [(1, 100), (1, 200), (2, 100)]
    expected.index = MultiIndex.from_tuples(group_keys, names=["A", "B"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    """``first``/``last``/``nth(3)`` on an object Series padded with a null value."""
    # GH29645
    values = [nulls_fixture] * 3 + ["y", nulls_fixture]
    grouped = Series(values, index=[0, 0, 0, 0, 0], dtype=object).groupby(level=0)

    # "y" sits at position 3, so nth needs that position spelled out
    call_args = (3,) if method == "nth" else ()
    result = getattr(grouped, method)(*call_args)

    tm.assert_series_equal(result, Series(["y"], dtype=object))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Slices passed to ``nth`` by indexing and by call are compared with ``iloc`` rows."""
    # Test slices GH #42947
    via_getitem = slice_test_grouped.nth[arg]
    via_call = slice_test_grouped.nth(arg)
    wanted = slice_test_df.iloc[expected_rows]

    for outcome in (via_getitem, via_call):
        tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
def test_nth_indexed(slice_test_df, slice_test_grouped):
    """Index notation ``nth[0, 1, -2:]`` and the equivalent list call are both checked."""
    # Test index notation GH #44688
    via_getitem = slice_test_grouped.nth[0, 1, -2:]
    via_call = slice_test_grouped.nth([0, 1, slice(-2, None)])
    wanted = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    for outcome in (via_getitem, via_call):
        tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
def test_invalid_argument(slice_test_grouped):
    """Passing a float to ``nth`` must raise ``TypeError`` matching "Invalid index"."""
    # Test for error on invalid argument
    bad_position = 3.14
    with pytest.raises(TypeError, match="Invalid index"):
        slice_test_grouped.nth(bad_position)
|
||||
|
||||
|
||||
def test_negative_step(slice_test_grouped):
    """A slice with step -1 must raise ``ValueError`` matching "Invalid step"."""
    # Test for error on negative slice step
    reversed_slice = slice(None, None, -1)
    with pytest.raises(ValueError, match="Invalid step"):
        slice_test_grouped.nth(reversed_slice)
|
||||
|
||||
|
||||
def test_np_ints(slice_test_df, slice_test_grouped):
    """``nth`` accepts a NumPy integer array of positions."""
    # Test np ints work
    positions = np.array([0, 1])
    outcome = slice_test_grouped.nth(positions)
    wanted = slice_test_df.iloc[[0, 1, 2, 3, 4]]
    tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
def test_groupby_nth_interval():
    """``nth(0)`` grouped on both levels of a categorical-interval MultiIndex."""
    # GH#24205

    def interval_levels():
        # fresh level objects for each MultiIndex, as in two separate literals
        return [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ]

    source_index = MultiIndex(
        interval_levels(),
        [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
    )
    source = DataFrame({"col": range(len(source_index))}, index=source_index)
    result = source.groupby(level=[0, 1], observed=False).nth(0)

    expected_index = MultiIndex(
        interval_levels(),
        [[0, 0, 1], [0, 1, 0]],
    )
    expected = DataFrame([0, 1, 3], index=expected_index, columns=["col"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
    """With default ``dropna``, ``head``/``tail``/``nth`` return only the NaN-free row."""
    # GH#45089
    frame = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    wanted = DataFrame([["a", "z"]], columns=["X", "Y"])

    # (method name, value of its ``n`` keyword)
    for method, n in (("head", 1), ("tail", 1), ("nth", 0)):
        outcome = getattr(frame.groupby(["X", "Y"]), method)(n=n)
        tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
def test_head_tail_dropna_false():
    """With ``dropna=False``, ``head``/``tail``/``nth`` keep rows whose key has NaN."""
    # GH#45089
    rows = [["a", "z"], ["b", np.nan], ["c", np.nan]]
    frame = DataFrame(rows, columns=["X", "Y"])
    wanted = DataFrame(rows, columns=["X", "Y"])

    # (method name, value of its ``n`` keyword)
    for method, n in (("head", 1), ("tail", 1), ("nth", 0)):
        outcome = getattr(frame.groupby(["X", "Y"], dropna=False), method)(n=n)
        tm.assert_frame_equal(outcome, wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
    """``nth(0, dropna=...)`` after column selection is compared with ``loc`` rows."""
    # GH#11038, GH#53518
    frame = DataFrame(
        {
            "a": [1, 1, 2],
            "b": [np.nan, 3, 4],
            "c": [5, 6, 7],
        }
    )
    result = frame.groupby("a")[selection].nth(0, dropna=dropna)

    # row 0 has NaN in "b"; it is expected to be skipped for "any", and for
    # "all" unless both "b" and "c" are selected
    skips_first_row = dropna == "any" or (
        dropna == "all" and selection != ["b", "c"]
    )
    locs = [1, 2] if skips_first_row else [0, 2]
    tm.assert_equal(result, frame.loc[locs, selection])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_nth_int_like_precision(data):
    """``nth(0)`` must hand back the first value of ``data`` exactly."""
    # GH#6620, GH#9311
    frame = DataFrame({"a": [1, 1], "b": data})
    result = frame.groupby("a").nth(0)

    first_value = data[0]
    expected = DataFrame({"a": 1, "b": [first_value]})
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,459 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
    "a_vals,b_vals",
    [
        # Ints
        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ([1, 2, 3, 4], [4, 3, 2, 1]),
        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
        # Floats
        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
        # Missing data
        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
        # Timestamps
        (
            pd.date_range("1/1/18", freq="D", periods=5),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1],
        ),
        (
            pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
        ),
        # All NA
        ([np.nan] * 5, [np.nan] * 5),
    ],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q):
    """Groupby quantile must agree with ``Series.quantile`` computed per group."""
    combined = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])

    # reference values: one plain Series.quantile call per group
    per_group = [
        pd.Series(vals).quantile(q, interpolation=interpolation)
        for vals in (a_vals, b_vals)
    ]

    keys = ["a"] * len(a_vals) + ["b"] * len(b_vals)
    frame = DataFrame({"key": keys, "val": combined})

    expected = DataFrame(
        per_group, columns=["val"], index=Index(["a", "b"], name="key")
    )
    both_datetime = (
        combined.dtype.kind == "M" and expected.dtypes.values[0].kind == "M"
    )
    if both_datetime:
        # TODO(non-nano): this should be unnecessary once array_to_datetime
        #  correctly infers non-nano from Timestamp.unit
        expected = expected.astype(combined.dtype)

    result = frame.groupby("key").quantile(q, interpolation=interpolation)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array():
    """List-valued ``q`` adds the quantile as an extra index level."""
    # https://github.com/pandas-dev/pandas/issues/27526
    one_col = DataFrame({"A": [0, 1, 2, 3, 4]})
    labels = np.array([0, 0, 1, 1, 1], dtype=np.int64)
    result = one_col.groupby(labels).quantile([0.25])

    expected = DataFrame(
        {"A": [0.25, 2.50]},
        index=pd.MultiIndex.from_product([[0, 1], [0.25]]),
    )
    tm.assert_frame_equal(result, expected)

    # two columns, two quantiles
    two_col = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    quantile_index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

    labels = np.array([0, 0, 1, 1], dtype=np.int64)
    result = two_col.groupby(labels).quantile([0.25, 0.75])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]},
        index=quantile_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array2():
    """List-valued ``q`` on seeded random integer data, grouped by column "A"."""
    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
    rng = np.random.default_rng(2)
    values = rng.integers(0, 5, size=(10, 3), dtype=np.int64)
    frame = DataFrame(values, columns=list("ABC"))
    result = frame.groupby("A").quantile([0.3, 0.7])

    expected_index = pd.MultiIndex.from_product(
        [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
    )
    expected = DataFrame(
        {
            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
        },
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_no_sort():
    """With ``sort=False`` the result follows group appearance and ``q`` order."""
    frame = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    labels = np.array([1, 0, 1], dtype=np.int64)

    ascending_q = [0.25, 0.5, 0.75]
    result = frame.groupby(labels, sort=False).quantile(ascending_q)
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
    )
    tm.assert_frame_equal(result, expected)

    descending_q = [0.75, 0.25]
    result = frame.groupby(labels, sort=False).quantile(descending_q)
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_multiple_levels():
    """List-valued ``q`` with two grouping keys yields a three-level index."""
    frame = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = frame.groupby(["c", "d"]).quantile([0.25, 0.75])

    index_tuples = [
        ("a", "a", 0.25),
        ("a", "a", 0.75),
        ("a", "b", 0.25),
        ("a", "b", 0.75),
    ]
    expected_index = pd.MultiIndex.from_tuples(index_tuples, names=["c", "d", None])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]},
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    """List-valued ``q`` on a frame with integer column labels."""
    # GH30289
    nrow, ncol = frame_size
    # every cell of row ``r`` holds ``r % 4``, so at most four distinct groups
    cells = np.array([ncol * [row % 4] for row in range(nrow)])
    frame = DataFrame(cells, columns=range(ncol))

    n_groups = min(nrow, 4)
    n_keys = len(groupby)

    group_codes = [g for g in range(n_groups) for _ in q]
    quantile_codes = list(range(len(q))) * n_groups
    expected_index = pd.MultiIndex(
        levels=[np.arange(n_groups)] * n_keys + [q],
        codes=[group_codes] * n_keys + [quantile_codes],
        names=[*groupby, None],
    )

    expected_values = [
        [float(g)] * (ncol - n_keys) for g in range(n_groups) for _ in q
    ]
    value_columns = [col for col in range(ncol) if col not in groupby]
    expected = DataFrame(
        expected_values, index=expected_index, columns=value_columns
    )

    result = frame.groupby(groupby).quantile(q)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_raises():
    """Quantile on a string column must raise ``TypeError`` with the dtype message."""
    rows = [["foo", "a"], ["foo", "b"], ["foo", "c"]]
    frame = DataFrame(rows, columns=["key", "val"])
    grouped = frame.groupby("key")

    msg = "dtype '(object|str)' does not support operation 'quantile'"
    with pytest.raises(TypeError, match=msg):
        grouped.quantile()
|
||||
|
||||
|
||||
def test_quantile_out_of_bounds_q_raises():
    """``q`` values of 50 and -1 must each raise ``ValueError`` naming the value."""
    # https://github.com/pandas-dev/pandas/issues/27470
    frame = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    grouped = frame.groupby([0, 0, 0, 1, 1, 1])

    # (bad q, fragment the error message must contain)
    for bad_q, fragment in ((50, "Got '50.0' instead"), (-1, "Got '-1.0' instead")):
        with pytest.raises(ValueError, match=fragment):
            grouped.quantile(bad_q)
|
||||
|
||||
|
||||
def test_quantile_missing_group_values_no_segfaults():
    """Call quantile repeatedly on a groupby whose key column contains NaN."""
    # GH 28662
    keys = np.array([1.0, np.nan, 1.0])
    frame = DataFrame({"key": keys, "val": range(3)})
    grouped = frame.groupby("key")

    # Random segfaults; would have been guaranteed in loop
    attempts = 100
    for _ in range(attempts):
        grouped.quantile()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    """Quantile with and without an explicit 0.5 is compared with the same expectation."""
    # GH 28662, GH 33200, GH 33569
    frame = DataFrame({"key": key, "val": val})
    grouped = frame.groupby("key")

    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    # explicit q=0.5 first, then the no-argument call
    for call_args in ((0.5,), ()):
        tm.assert_frame_equal(grouped.quantile(*call_args), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 0, None] * 2, dtype="Int64"),
        pd.array([True, False, None] * 2, dtype="boolean"),
    ],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
    """Quantile of nullable integer/boolean data is expected as ``Float64``."""
    # https://github.com/pandas-dev/pandas/issues/33136
    frame = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
    result = frame.groupby("a")["b"].quantile(q)

    list_q = isinstance(q, list)
    if list_q:
        expected_index = pd.MultiIndex.from_product(
            (["x", "y"], q), names=["a", None]
        )
    else:
        expected_index = Index(["x", "y"], name="a")
    per_group = [0.0, 0.5, 1.0] if list_q else [0.5]

    # both groups hold the same values, hence the doubling
    expected = pd.Series(
        per_group * 2, index=expected_index, name="b", dtype="Float64"
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
    """A string column raises unless ``numeric_only=True`` is passed."""
    frame = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})

    if not numeric_only:
        msg = "dtype '.*' does not support operation 'quantile'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby("a").quantile(q, numeric_only=numeric_only)
        return

    # numeric_only=True must match selecting the numeric column by hand
    result = frame.groupby("a").quantile(q, numeric_only=numeric_only)
    expected = frame.groupby("a")[["b"]].quantile(q)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_float(any_float_dtype):
    """Quantile of a float group holding one value and one missing entry."""
    # GH#42849
    resolved = pd.Series([], dtype=any_float_dtype).dtype
    # NumPy dtypes take NaN as the missing marker, other dtypes take pd.NA
    missing = np.nan if isinstance(resolved, np.dtype) else pd.NA
    frame = DataFrame({"x": [1, 1], "y": [0.2, missing]}, dtype=any_float_dtype)
    group_index = Index([1.0], dtype=any_float_dtype, name="x")

    is_masked = any_float_dtype in ["Float32", "Float64"]
    expected_dtype = any_float_dtype if is_masked else None

    # scalar q
    result = frame.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [0.2], dtype=expected_dtype, index=group_index, name="y"
    )
    tm.assert_series_equal(result, expected)

    # list q
    result = frame.groupby("x")["y"].quantile([0.5, 0.75])
    expected = pd.Series(
        [0.2] * 2,
        index=pd.MultiIndex.from_product(
            (group_index, [0.5, 0.75]), names=["x", None]
        ),
        name="y",
        dtype=expected_dtype,
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_int(any_int_ea_dtype):
    """Quantile of nullable-integer data is expected as ``Float64``.

    Parameters
    ----------
    any_int_ea_dtype : str
        Fixture-supplied dtype used to build the frame and the expected index.
    """
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
    expected_index = Index([1], name="x", dtype=any_int_ea_dtype)

    # Series path
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [3.5],
        dtype="Float64",
        index=expected_index,
        name="y",
    )
    # result goes first (left), expected second (right), as everywhere else
    # in this module, so a failure report labels the two sides correctly
    tm.assert_series_equal(result, expected)

    # DataFrame path
    result = df.groupby("x").quantile(0.5)
    expected = DataFrame({"y": 3.5}, dtype="Float64", index=expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
    interpolation, val1, val2, any_numeric_ea_dtype
):
    """A group holding only ``pd.NA`` is expected to give ``pd.NA`` quantiles."""
    # GH#37493
    frame = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    quantiles = [0.5, 0.7]
    result = frame.groupby("a").quantile(q=quantiles, interpolation=interpolation)

    group_level = pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype)
    expected_index = pd.MultiIndex.from_arrays(
        [group_level, [0.5, 0.7, 0.5, 0.7]],
        names=["a", None],
    )
    expected = DataFrame(
        {"b": [val1, val2, pd.NA, pd.NA]},
        dtype=any_numeric_ea_dtype,
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
    interpolation, any_numeric_ea_dtype
):
    """Interpolating methods: the expected dtype is Float32 for Float32, else Float64."""
    # GH#37493
    frame = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    quantiles = [0.5, 0.75]
    result = frame.groupby("a").quantile(q=quantiles, interpolation=interpolation)

    keeps_dtype = any_numeric_ea_dtype == "Float32"
    expected_dtype = any_numeric_ea_dtype if keeps_dtype else "Float64"

    group_level = pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype)
    expected_index = pd.MultiIndex.from_arrays(
        [group_level, [0.5, 0.75, 0.5, 0.75]],
        names=["a", None],
    )
    expected = DataFrame(
        {"b": [2.0, 2.5, pd.NA, pd.NA]},
        dtype=expected_dtype,
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
    """Quantile of a column holding only ``pd.NA`` is expected to be ``pd.NA``.

    Parameters
    ----------
    dtype : str
        Masked float dtype used for the frame, the result and its index.
    """
    # GH#42849
    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
    result = df.groupby("x")["y"].quantile(0.5)

    expected = pd.Series(
        [pd.NA], dtype=dtype, index=Index([1.0], dtype=dtype, name="x"), name="y"
    )
    # result goes first (left), expected second (right), as everywhere else
    # in this module, so a failure report labels the two sides correctly
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_timedelta_quantile():
    """0.99 quantile of microsecond-unit timedeltas, two values per group."""
    # GH: 29485
    seconds = pd.to_timedelta(np.arange(4), unit="s").as_unit("us")
    frame = DataFrame({"value": seconds, "group": [1, 1, 2, 2]})
    result = frame.groupby("group").quantile(0.99)

    expected_values = [
        pd.Timedelta("0 days 00:00:00.990000"),
        pd.Timedelta("0 days 00:00:02.990000"),
    ]
    expected = DataFrame(
        {"value": expected_values},
        index=Index([1, 2], name="group"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_timestamp_groupby_quantile(unit):
    """Quantiles grouped by tz-aware timestamps floored to the hour."""
    # GH 33168
    minutes = pd.date_range(
        start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
    )
    hours = minutes.floor("1h")
    frame = DataFrame(
        {
            "timestamp": hours,
            "category": list(range(1, 101)),
            "value": list(range(101, 201)),
        }
    )
    quantiles = [0.2, 0.8]
    result = frame.groupby("timestamp").quantile(quantiles)

    # hours[::99] takes the first and last stamps as the first index level
    expected_index = pd.MultiIndex.from_product(
        [hours[::99], [0.2, 0.8]], names=("timestamp", None)
    )
    expected_rows = [
        {"category": 12.8, "value": 112.8},
        {"category": 48.2, "value": 148.2},
        {"category": 68.8, "value": 168.8},
        {"category": 92.2, "value": 192.2},
    ]
    expected = DataFrame(expected_rows, index=expected_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_dt64tz_period():
    """Grouped median of datetime, tz-aware, period and timedelta columns."""
    # GH#51373
    stamps = pd.date_range("2016-01-01", periods=1000, unit="ns")
    frame = pd.Series(stamps).to_frame().copy()
    frame[1] = stamps.tz_localize("US/Pacific")
    frame[2] = stamps.to_period("D")
    frame[3] = stamps - stamps[0]
    frame.iloc[-1] = pd.NaT

    # labels cycle 0..4, so group ``i`` is exactly ``frame.iloc[i::5]``
    labels = np.tile(np.arange(5), 200)
    result = frame.groupby(labels).quantile(0.5)

    # Check that we match the group-by-group result
    per_group = {i: frame.iloc[i::5].quantile(0.5) for i in range(5)}
    expected = DataFrame(per_group).T.infer_objects()
    expected.index = expected.index.astype(int)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_nonmulti_levels_order():
    """With ``sort=False`` the result's index levels stay in appearance order."""
    # Non-regression test for GH #53009
    source_tuples = [
        (sample, cat0, cat1)
        for sample in (0, 1)
        for cat0 in ("a", "b")
        for cat1 in ("B", "A")
    ]
    source_index = pd.MultiIndex.from_tuples(
        source_tuples, names=["sample", "cat0", "cat1"]
    )
    ser = pd.Series(range(8), index=source_index)
    result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])

    quantile_index = pd.MultiIndex.from_tuples(
        [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
    )
    expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=quantile_index)
    tm.assert_series_equal(result, expected)

    # We need to check that index levels are not sorted
    unsorted_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
    tm.assert_equal(result.index.levels, unsorted_levels)
|
||||
643
venv/Lib/site-packages/pandas/tests/groupby/methods/test_rank.py
Normal file
643
venv/Lib/site-packages/pandas/tests/groupby/methods/test_rank.py
Normal file
@@ -0,0 +1,643 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_rank_unordered_categorical_typeerror():
    """Expect TypeError from groupby rank on an unordered Categorical.

    Both the Series and the DataFrame groupby paths are exercised.
    """
    # GH#51034 should be TypeError, not NotImplementedError
    cat = pd.Categorical([], ordered=False)
    ser = Series(cat)
    df = ser.to_frame()

    msg = "Cannot perform rank with non-ordered Categorical"

    gb = ser.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb.rank()

    gb2 = df.groupby(cat, observed=False)
    with pytest.raises(TypeError, match=msg):
        gb2.rank()
|
||||
|
||||
|
||||
def test_rank_apply():
    """Compare groupby rank with ranking each group piece separately.

    Checked for both plain ranks and percentile ranks (``pct=True``).
    """
    lev1 = np.array(["a" * 10] * 100, dtype=object)
    lev2 = np.array(["b" * 10] * 130, dtype=object)
    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)

    df = DataFrame(
        {
            "value": np.random.default_rng(2).standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )

    result = df.groupby(["key1", "key2"]).value.rank()

    # Rank each group on its own, then restore the original row order.
    expected = [piece.value.rank() for _, piece in df.groupby(["key1", "key2"])]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(["key1", "key2"]).value.rank(pct=True)

    expected = [
        piece.value.rank(pct=True) for _, piece in df.groupby(["key1", "key2"])
    ]
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Check groupby rank for each ties method, sort direction and ``pct`` flag.

    ``vals`` is repeated once per group in ``grps``, so the expected ranks
    ``exp`` repeat once per group as well.
    """
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        # Rebuild the array so the parametrized numpy dtype is kept.
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Check groupby rank on data mixing +/-inf and NaN for each ``na_option``."""
    # GH 20561
    key = np.repeat(grps, len(vals))
    # Repeat the values once per group so every group sees the same data.
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )
    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            np.nan,
            pd.Timestamp("2018-01-08", tz="US/Pacific"),
            pd.Timestamp("2018-01-02", tz="US/Pacific"),
            pd.Timestamp("2018-01-06", tz="US/Pacific"),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            np.nan,
            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
            np.nan,
            np.nan,
        ],
        [
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            np.nan,
            pd.Timestamp("2018-01-08").to_period("D"),
            pd.Timestamp("2018-01-02").to_period("D"),
            pd.Timestamp("2018-01-06").to_period("D"),
            np.nan,
            np.nan,
        ],
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Check groupby rank on data with missing values.

    Covers every ties method and sort direction with ``na_option`` set to
    "keep" or "bottom", with and without ``pct``. ``vals`` is repeated once
    per group in ``grps``, as is the expected result ``exp``.
    """
    key = np.repeat(grps, len(vals))

    orig_vals = vals
    vals = list(vals) * len(grps)
    if isinstance(orig_vals, np.ndarray):
        # Rebuild the array so the parametrized numpy dtype is kept.
        vals = np.array(vals, dtype=orig_vals.dtype)

    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    """Check that two all-tied groups of five rows get identical ranks."""
    df = DataFrame(
        {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
    )
    result = df.groupby("key").rank(pct=pct)
    exp_df = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    """Check the average rank of four tied values for numpy and masked dtypes.

    With ``upper`` set, the dtype name is rewritten to its capitalized
    spelling (e.g. "uint64" -> "UInt64") and a Float64 result is expected.
    """
    if upper:
        # use IntegerDtype/FloatingDtype
        dtype = dtype[0].upper() + dtype[1:]
        dtype = dtype.replace("Ui", "UI")
    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    df["val"] = df["val"].astype(dtype)
    assert df["val"].dtype == dtype

    result = df.groupby("key").rank()
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    if upper:
        exp_df = exp_df.astype("Float64")
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(rank_method, ascending, na_option, pct, vals):
    """Compare groupby rank on strings with rank on equivalently ordered numbers.

    ``rank_method`` and ``ascending`` are not parametrized here; they are
    presumably fixtures supplied by the test suite's conftest.
    """
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    mask = df["val"].isna()

    gb = df.groupby("key")
    res = gb.rank(method=rank_method, ascending=ascending, na_option=na_option, pct=pct)

    # construct our expected by using numeric values with the same ordering
    if mask.any():
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
    else:
        df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})

    gb2 = df2.groupby("key")
    alt = gb2.rank(
        method=rank_method, ascending=ascending, na_option=na_option, pct=pct
    )

    tm.assert_frame_equal(res, alt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(rank_method, ascending, na_option, pct, vals):
    """Expect ValueError from groupby rank for an invalid ``na_option``.

    ``rank_method`` and ``ascending`` are not parametrized here; they are
    presumably fixtures supplied by the test suite's conftest.
    """
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        df.groupby("key").rank(
            method=rank_method, ascending=ascending, na_option=na_option, pct=pct
        )
|
||||
|
||||
|
||||
def test_rank_empty_group():
    """Check ``pct`` ranks when one group contains only a NaN value.

    Both the Series and the DataFrame groupby paths are checked.
    """
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})

    result = df.groupby(column).B.rank(pct=True)
    expected = Series([0.5, np.nan, 1.0], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby(column).rank(pct=True)
    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Check dense ``pct`` ranks, including groups whose values are partly NaN."""
    # GH 23666
    df = DataFrame({"A": input_key, "B": input_value})

    result = df.groupby("A").rank(method="dense", pct=True)
    expected = DataFrame({"B": output_value})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_rank_min_int():
    """Check that the minimum int64 value is ranked as a real value.

    The datetime-like column holds NaT in the same rows, and those rows are
    expected to rank as NaN.
    """
    # GH-32859
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [
                np.iinfo(np.int64).min,
                np.iinfo(np.int64).max,
                np.iinfo(np.int64).min,
            ],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    """Check dense ``pct`` ranks when adjacent rows of different groups are equal.

    Rows 2 and 3 share the same value (NaN or 3) but belong to different
    groups.
    """
    # GH#40518
    fill_value = np.nan if use_nan else 3
    df = DataFrame(
        [
            [-1, 1],
            [-1, 2],
            [1, fill_value],
            [-1, fill_value],
        ],
        columns=["group", "val"],
    )
    result = df.groupby(["group"])["val"].rank(
        method="dense",
        pct=True,
    )
    if use_nan:
        expected = Series([0.5, 1, np.nan, np.nan], name="val")
    else:
        expected = Series([1 / 3, 2 / 3, 1, 1], name="val")

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_non_unique_index():
    """Check ``pct`` rank when grouping by a repeated tz-aware index and a column."""
    # GH 16577
    df = DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
    )
    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
    expected = Series(
        [1.0, 1.0, 1.0, np.nan],
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
        name="value",
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_rank_categorical():
    """Compare groupby rank on ordered categoricals with rank on object dtype."""
    cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)

    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})

    gb = df.groupby("col1")

    res = gb.rank()

    # The same data cast to object serves as the reference result.
    expected = df.astype(object).groupby("col1").rank()
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", ["top", "bottom"])
|
||||
def test_groupby_op_with_nullables(na_option):
|
||||
# GH 54206
|
||||
df = DataFrame({"x": [None]}, dtype="Float64")
|
||||
result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
|
||||
expected = Series([1.0], dtype="Float64", name=result.name)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,154 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    """Check that sampling two equal groups of ten yields two rows per group.

    The expected objects reuse ``result.index`` because the sampled row
    labels are random.
    """
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=n, frac=frac)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=n, frac=frac)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_unbalanced_groups_shape():
    """Check that ``sample(n=5)`` returns five rows from groups of 10 and 20."""
    values = [1] * 10 + [2] * 20
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=5)
    values = [1] * 5 + [2] * 5
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=5)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_index_value_spans_groups():
    """Check sampling when the same index label appears in both groups."""
    values = [1] * 3 + [2] * 3
    # Label 2 appears in both group 1 and group 2.
    df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])

    result = df.groupby("a").sample(n=2)
    values = [1] * 2 + [2] * 2
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_n_and_frac_raises():
    """Expect ValueError when both ``n`` and ``frac`` are passed to sample."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Please enter a value for `frac` OR `n`, not both"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=1, frac=1.0)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=1, frac=1.0)
|
||||
|
||||
|
||||
def test_groupby_sample_frac_gt_one_without_replacement_raises():
    """Expect ValueError for ``frac`` > 1 when ``replace`` is False."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(frac=1.5, replace=False)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(frac=1.5, replace=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    """Expect ValueError for a negative or non-integer ``n``."""
    df = DataFrame({"a": [1, 2], "b": [1, 2]})

    # The expected message depends on which kind of invalid n was passed.
    if n < 0:
        msg = "A negative number of rows requested. Please provide `n` >= 0."
    else:
        msg = "Only integers accepted as `n` values"

    with pytest.raises(ValueError, match=msg):
        df.groupby("a").sample(n=n)

    with pytest.raises(ValueError, match=msg):
        df.groupby("a")["b"].sample(n=n)
|
||||
|
||||
|
||||
def test_groupby_sample_oversample():
    """Check that ``frac=2.0`` with replacement doubles each group of ten."""
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(frac=2.0, replace=True)
    values = [1] * 20 + [2] * 20
    expected = DataFrame({"a": values, "b": values}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
    expected = Series(values, name="b", index=result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_without_n_or_frac():
    """Check that sample with neither ``n`` nor ``frac`` returns one row per group."""
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values})

    result = df.groupby("a").sample(n=None, frac=None)
    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=None, frac=None)
    expected = Series([1, 2], name="b", index=result.index)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    """Check weighted sampling with replacement for string and integer indexes.

    The weights leave only the first row of each group selectable, so the
    result is deterministic.
    """
    # GH 39927 - tests for integer index needed
    values = [1] * 2 + [2] * 2
    df = DataFrame({"a": values, "b": values}, index=Index(index))

    result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
    expected = Series(values, name="b", index=Index(expected_index))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_with_selections():
    """Check sample on a groupby restricted to a subset of columns."""
    # GH 39928
    values = [1] * 10 + [2] * 10
    df = DataFrame({"a": values, "b": values, "c": values})

    result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_with_empty_inputs():
    """Check that sampling a groupby of an empty frame returns the empty frame."""
    # GH48459
    df = DataFrame({"a": [], "b": []})
    groupby_df = df.groupby("a")

    result = groupby_df.sample()
    expected = df
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,90 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    """Check that ``size()`` matches the length of each iterated group.

    ``df`` is not defined in this module; it is presumably a fixture supplied
    by the test suite's conftest.
    """
    grouped = df.groupby(by=by)
    result = grouped.size()
    for key, group in grouped:
        assert result[key] == len(group)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size_sort(sort, by):
    """Compare ``size()`` with counting rows per group through ``apply``.

    ``sort`` is not parametrized here; it is presumably a fixture supplied by
    the test suite's conftest.
    """
    df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
    left = df.groupby(by=by, sort=sort).size()
    right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
    # Names are not compared; only values and index must match.
    tm.assert_series_equal(left, right, check_names=False)
|
||||
|
||||
|
||||
def test_size_series_dataframe():
    """Check ``size()`` on a groupby of an empty frame returns an empty int64 Series."""
    # https://github.com/pandas-dev/pandas/issues/11699
    df = DataFrame(columns=["A", "B"])
    out = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(df.groupby("A").size(), out)
|
||||
|
||||
|
||||
def test_size_groupby_all_null():
    """Check ``size()`` when every grouping key is null returns an empty Series."""
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    df = DataFrame({"A": [None, None]})  # all-null groups
    result = df.groupby("A").size()
    expected = Series(dtype="int64", index=Index([], name="A"))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_size_period_index():
    """Check ``size()`` when grouping a one-row Series by a PeriodIndex level.

    The single value is 1, so the input Series doubles as the expected sizes.
    """
    # https://github.com/pandas-dev/pandas/issues/34010
    ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
    grp = ser.groupby(level="A")
    result = grp.size()
    tm.assert_series_equal(result, ser)
|
||||
|
||||
|
||||
def test_size_on_categorical(as_index):
    """Check ``size()`` with a categorical key and ``observed=False``.

    Unobserved key combinations are expected with size 0. ``as_index`` is
    presumably a fixture supplied by the test suite's conftest.
    """
    df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    df["A"] = df["A"].astype("category")
    result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()

    expected = DataFrame(
        [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
    )
    expected["A"] = expected["A"].astype("category")
    if as_index:
        # With as_index the result is an unnamed Series indexed by the keys.
        expected = expected.set_index(["A", "B"])["size"].rename(None)

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
|
||||
def test_size_series_masked_type_returns_Int64(dtype):
|
||||
# GH 54132
|
||||
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
|
||||
result = ser.groupby(level=0).size()
|
||||
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_size_strings(any_string_dtype, using_infer_string):
    """Check the result and index dtypes of ``size()`` for string-dtype frames.

    Both arguments are presumably fixtures supplied by the test suite's
    conftest.
    """
    # GH#55627
    dtype = any_string_dtype
    df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
    result = df.groupby("a")["b"].size()
    # Only "string[pyarrow]" is expected to produce masked Int64 counts.
    exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
    exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
    expected = Series(
        [2, 1],
        index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
        name="b",
        dtype=exp_dtype,
    )
    tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,27 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_groupby_skew_equivalence():
    """Compare groupby ``skew`` with computing ``skew`` on each group separately."""
    # Test that the groupby skew method (which uses libgroupby.group_skew)
    # matches the results of operating group-by-group (which uses nanops.nanskew)
    nrows = 1000
    ngroups = 3
    ncols = 2
    nan_frac = 0.05

    arr = np.random.default_rng(2).standard_normal((nrows, ncols))
    # Blank out whole rows selected by the random mask.
    arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan

    df = pd.DataFrame(arr)
    grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
    gb = df.groupby(grps)

    result = gb.skew()

    grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
    expected = pd.concat(grpwise, axis=0)
    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
    tm.assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
105
venv/Lib/site-packages/pandas/tests/groupby/test_all_methods.py
Normal file
105
venv/Lib/site-packages/pandas/tests/groupby/test_all_methods.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Tests that apply to all groupby operation methods.
|
||||
|
||||
The only tests that should appear here are those that use the `groupby_func` fixture.
|
||||
Even if it does use that fixture, prefer a more specific test file if it is available
|
||||
such as:
|
||||
|
||||
- test_categorical
|
||||
- test_groupby_dropna
|
||||
- test_groupby_subclass
|
||||
- test_raises
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.groupby import get_groupby_method_args
|
||||
|
||||
|
||||
def test_multiindex_group_all_columns_when_empty(groupby_func):
|
||||
# GH 32464
|
||||
df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
|
||||
gb = df.groupby(["a", "b", "c"], group_keys=True)
|
||||
method = getattr(gb, groupby_func)
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
if groupby_func == "corrwith":
|
||||
warn = Pandas4Warning
|
||||
warn_msg = "DataFrameGroupBy.corrwith is deprecated"
|
||||
else:
|
||||
warn = None
|
||||
warn_msg = ""
|
||||
with tm.assert_produces_warning(warn, match=warn_msg):
|
||||
result = method(*args).index
|
||||
expected = df.index
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
def test_duplicate_columns(request, groupby_func, as_index):
|
||||
# GH#50806
|
||||
if groupby_func == "corrwith":
|
||||
msg = "GH#50845 - corrwith fails when there are duplicate columns"
|
||||
request.applymarker(pytest.mark.xfail(reason=msg))
|
||||
df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
gb = df.groupby("a", as_index=as_index)
|
||||
result = getattr(gb, groupby_func)(*args)
|
||||
|
||||
expected_df = df.set_axis(["a", "b", "c"], axis=1)
|
||||
expected_args = get_groupby_method_args(groupby_func, expected_df)
|
||||
expected_gb = expected_df.groupby("a", as_index=as_index)
|
||||
expected = getattr(expected_gb, groupby_func)(*expected_args)
|
||||
if groupby_func not in ("size", "ngroup", "cumcount"):
|
||||
expected = expected.rename(columns={"c": "b"})
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "idx",
    [
        pd.Index(["a", "a"], name="foo"),
        pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
    ],
)
def test_dup_labels_output_shape(groupby_func, idx):
    """
    Duplicate column labels must survive a groupby operation.

    The result of each method is expected to have one row, two columns and
    the same column index as the input frame.
    """
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip(f"Not applicable for {groupby_func}")

    frame = DataFrame([[1, 1]], columns=idx)
    grouped = frame.groupby([0])
    args = get_groupby_method_args(groupby_func, frame)

    # Only corrwith is expected to emit a warning.
    is_corrwith = groupby_func == "corrwith"
    warn = Pandas4Warning if is_corrwith else None
    warn_msg = "DataFrameGroupBy.corrwith is deprecated" if is_corrwith else ""

    with tm.assert_produces_warning(warn, match=warn_msg):
        result = getattr(grouped, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
|
||||
|
||||
|
||||
def test_not_c_contiguous_mask(groupby_func):
|
||||
# https://github.com/pandas-dev/pandas/issues/61031
|
||||
if groupby_func == "corrwith":
|
||||
# corrwith is deprecated
|
||||
return
|
||||
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
|
||||
reversed = DataFrame(
|
||||
{"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0]
|
||||
)[::-1]
|
||||
assert not reversed["b"].array._mask.flags["C_CONTIGUOUS"]
|
||||
args = get_groupby_method_args(groupby_func, df)
|
||||
|
||||
gb_reversed = reversed.groupby("a")
|
||||
result = getattr(gb_reversed, groupby_func)(*args)
|
||||
gb = df.groupby("a")
|
||||
expected = getattr(gb, groupby_func)(*args)
|
||||
tm.assert_equal(result, expected)
|
||||
274
venv/Lib/site-packages/pandas/tests/groupby/test_api.py
Normal file
274
venv/Lib/site-packages/pandas/tests/groupby/test_api.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Tests of the groupby API, including internal consistency and with other pandas objects.
|
||||
|
||||
Tests in this file should only check the existence, names, and arguments of groupby
|
||||
methods. It should not test the results of any groupby operation.
|
||||
"""
|
||||
|
||||
import inspect
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
from pandas.core.groupby.base import (
|
||||
groupby_other_methods,
|
||||
reduction_kernels,
|
||||
transformation_kernels,
|
||||
)
|
||||
from pandas.core.groupby.generic import (
|
||||
DataFrameGroupBy,
|
||||
SeriesGroupBy,
|
||||
)
|
||||
|
||||
|
||||
def test_tab_completion(multiindex_dataframe_random_data):
|
||||
grp = multiindex_dataframe_random_data.groupby(level="second")
|
||||
results = {v for v in dir(grp) if not v.startswith("_")}
|
||||
expected = {
|
||||
"A",
|
||||
"B",
|
||||
"C",
|
||||
"agg",
|
||||
"aggregate",
|
||||
"apply",
|
||||
"boxplot",
|
||||
"filter",
|
||||
"first",
|
||||
"get_group",
|
||||
"groups",
|
||||
"hist",
|
||||
"indices",
|
||||
"last",
|
||||
"max",
|
||||
"mean",
|
||||
"median",
|
||||
"min",
|
||||
"ngroups",
|
||||
"nth",
|
||||
"ohlc",
|
||||
"plot",
|
||||
"prod",
|
||||
"size",
|
||||
"std",
|
||||
"sum",
|
||||
"transform",
|
||||
"var",
|
||||
"sem",
|
||||
"count",
|
||||
"nunique",
|
||||
"head",
|
||||
"describe",
|
||||
"cummax",
|
||||
"quantile",
|
||||
"rank",
|
||||
"cumprod",
|
||||
"tail",
|
||||
"resample",
|
||||
"cummin",
|
||||
"cumsum",
|
||||
"cumcount",
|
||||
"ngroup",
|
||||
"all",
|
||||
"shift",
|
||||
"skew",
|
||||
"kurt",
|
||||
"take",
|
||||
"pct_change",
|
||||
"any",
|
||||
"corr",
|
||||
"corrwith",
|
||||
"cov",
|
||||
"ndim",
|
||||
"diff",
|
||||
"idxmax",
|
||||
"idxmin",
|
||||
"ffill",
|
||||
"bfill",
|
||||
"rolling",
|
||||
"expanding",
|
||||
"pipe",
|
||||
"sample",
|
||||
"ewm",
|
||||
"value_counts",
|
||||
}
|
||||
assert results == expected
|
||||
|
||||
|
||||
def test_all_methods_categorized(multiindex_dataframe_random_data):
|
||||
grp = multiindex_dataframe_random_data.groupby(
|
||||
multiindex_dataframe_random_data.iloc[:, 0]
|
||||
)
|
||||
names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
|
||||
multiindex_dataframe_random_data.columns
|
||||
)
|
||||
new_names = set(names)
|
||||
new_names -= reduction_kernels
|
||||
new_names -= transformation_kernels
|
||||
new_names -= groupby_other_methods
|
||||
|
||||
assert not reduction_kernels & transformation_kernels
|
||||
assert not reduction_kernels & groupby_other_methods
|
||||
assert not transformation_kernels & groupby_other_methods
|
||||
|
||||
# new public method?
|
||||
if new_names:
|
||||
msg = f"""
|
||||
There are uncategorized methods defined on the Grouper class:
|
||||
{new_names}.
|
||||
|
||||
Was a new method recently added?
|
||||
|
||||
Every public method On Grouper must appear in exactly one the
|
||||
following three lists defined in pandas.core.groupby.base:
|
||||
- `reduction_kernels`
|
||||
- `transformation_kernels`
|
||||
- `groupby_other_methods`
|
||||
see the comments in pandas/core/groupby/base.py for guidance on
|
||||
how to fix this test.
|
||||
"""
|
||||
raise AssertionError(msg)
|
||||
|
||||
# removed a public method?
|
||||
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
|
||||
if names != all_categorized:
|
||||
msg = f"""
|
||||
Some methods which are supposed to be on the Grouper class
|
||||
are missing:
|
||||
{all_categorized - names}.
|
||||
|
||||
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
|
||||
If you removed a method, you should update them
|
||||
"""
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
def test_frame_consistency(groupby_func):
|
||||
# GH#48028
|
||||
if groupby_func in ("first", "last"):
|
||||
msg = "first and last don't exist for DataFrame anymore"
|
||||
pytest.skip(reason=msg)
|
||||
|
||||
if groupby_func in ("cumcount", "ngroup"):
|
||||
assert not hasattr(DataFrame, groupby_func)
|
||||
return
|
||||
|
||||
frame_method = getattr(DataFrame, groupby_func)
|
||||
gb_method = getattr(DataFrameGroupBy, groupby_func)
|
||||
result = set(inspect.signature(gb_method).parameters)
|
||||
if groupby_func == "size":
|
||||
# "size" is a method on GroupBy but property on DataFrame:
|
||||
expected = {"self"}
|
||||
else:
|
||||
expected = set(inspect.signature(frame_method).parameters)
|
||||
|
||||
# Exclude certain arguments from result and expected depending on the operation
|
||||
# Some of these may be purposeful inconsistencies between the APIs
|
||||
exclude_expected, exclude_result = set(), set()
|
||||
if groupby_func in ("any", "all"):
|
||||
exclude_expected = {"kwargs", "bool_only", "axis"}
|
||||
elif groupby_func in ("count",):
|
||||
exclude_expected = {"numeric_only", "axis"}
|
||||
elif groupby_func in ("nunique",):
|
||||
exclude_expected = {"axis"}
|
||||
elif groupby_func in ("max", "min"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
exclude_result = {"min_count", "engine", "engine_kwargs"}
|
||||
elif groupby_func in ("sum", "mean", "std", "var"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
exclude_result = {"engine", "engine_kwargs"}
|
||||
elif groupby_func in ("median", "prod", "sem"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
elif groupby_func in ("bfill", "ffill"):
|
||||
exclude_expected = {"inplace", "axis", "limit_area"}
|
||||
elif groupby_func in ("cummax", "cummin"):
|
||||
exclude_expected = {"axis", "skipna", "args"}
|
||||
elif groupby_func in ("cumprod", "cumsum"):
|
||||
exclude_expected = {"axis", "skipna"}
|
||||
elif groupby_func in ("pct_change",):
|
||||
exclude_expected = {"kwargs"}
|
||||
elif groupby_func in ("rank",):
|
||||
exclude_expected = {"numeric_only"}
|
||||
elif groupby_func in ("quantile",):
|
||||
exclude_expected = {"method", "axis"}
|
||||
elif groupby_func in ["corrwith"]:
|
||||
exclude_expected = {"min_periods"}
|
||||
if groupby_func not in ["pct_change", "size"]:
|
||||
exclude_expected |= {"axis"}
|
||||
|
||||
# Ensure excluded arguments are actually in the signatures
|
||||
assert result & exclude_result == exclude_result
|
||||
assert expected & exclude_expected == exclude_expected
|
||||
|
||||
result -= exclude_result
|
||||
expected -= exclude_expected
|
||||
assert result == expected
|
||||
|
||||
|
||||
def test_series_consistency(request, groupby_func):
|
||||
# GH#48028
|
||||
if groupby_func in ("first", "last"):
|
||||
msg = "first and last don't exist for Series anymore"
|
||||
pytest.skip(msg)
|
||||
|
||||
if groupby_func in ("cumcount", "corrwith", "ngroup"):
|
||||
assert not hasattr(Series, groupby_func)
|
||||
return
|
||||
|
||||
series_method = getattr(Series, groupby_func)
|
||||
gb_method = getattr(SeriesGroupBy, groupby_func)
|
||||
result = set(inspect.signature(gb_method).parameters)
|
||||
if groupby_func == "size":
|
||||
# "size" is a method on GroupBy but property on Series
|
||||
expected = {"self"}
|
||||
else:
|
||||
expected = set(inspect.signature(series_method).parameters)
|
||||
|
||||
# Exclude certain arguments from result and expected depending on the operation
|
||||
# Some of these may be purposeful inconsistencies between the APIs
|
||||
exclude_expected, exclude_result = set(), set()
|
||||
if groupby_func in ("any", "all"):
|
||||
exclude_expected = {"kwargs", "bool_only", "axis"}
|
||||
elif groupby_func in ("max", "min"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
exclude_result = {"min_count", "engine", "engine_kwargs"}
|
||||
elif groupby_func in ("sum", "mean", "std", "var"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
exclude_result = {"engine", "engine_kwargs"}
|
||||
elif groupby_func in ("median", "prod", "sem"):
|
||||
exclude_expected = {"axis", "kwargs"}
|
||||
elif groupby_func in ("bfill", "ffill"):
|
||||
exclude_expected = {"inplace", "axis", "limit_area"}
|
||||
elif groupby_func in ("cummax", "cummin"):
|
||||
exclude_expected = {"skipna", "args"}
|
||||
exclude_result = {"numeric_only"}
|
||||
elif groupby_func in ("cumprod", "cumsum"):
|
||||
exclude_expected = {"skipna"}
|
||||
exclude_result = {"numeric_only"}
|
||||
elif groupby_func in ("pct_change",):
|
||||
exclude_expected = {"kwargs"}
|
||||
elif groupby_func in ("rank",):
|
||||
exclude_expected = {"numeric_only"}
|
||||
elif groupby_func in ("idxmin", "idxmax"):
|
||||
exclude_expected = {"args", "kwargs"}
|
||||
elif groupby_func in ("quantile",):
|
||||
exclude_result = {"numeric_only"}
|
||||
if groupby_func not in [
|
||||
"diff",
|
||||
"pct_change",
|
||||
"count",
|
||||
"nunique",
|
||||
"quantile",
|
||||
"size",
|
||||
]:
|
||||
exclude_expected |= {"axis"}
|
||||
|
||||
# Ensure excluded arguments are actually in the signatures
|
||||
assert result & exclude_result == exclude_result
|
||||
assert expected & exclude_expected == exclude_expected
|
||||
|
||||
result -= exclude_result
|
||||
expected -= exclude_expected
|
||||
assert result == expected
|
||||
1543
venv/Lib/site-packages/pandas/tests/groupby/test_apply.py
Normal file
1543
venv/Lib/site-packages/pandas/tests/groupby/test_apply.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,67 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def assert_block_lengths(x):
    """
    Assert that the first block's ``mgr_locs`` is as long as ``x``; return 0.
    """
    n_items = len(x)
    first_block = x._mgr.blocks[0]
    assert n_items == len(first_block.mgr_locs)
    return 0
|
||||
|
||||
|
||||
def cumsum_max(x):
    """
    Run ``cumsum`` followed by ``max`` on ``x``, discard the value, return 0.
    """
    running_total = x.cumsum()
    running_total.max()
    return 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths])
def test_mgr_locs_updated(func):
    """
    Aggregating with a user function must see valid ``mgr_locs``.

    https://github.com/pandas-dev/pandas/issues/31802
    Some operations may require creating new blocks, which requires
    valid mgr_locs.
    """
    frame = pd.DataFrame(
        {"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}
    )
    group_index = pd.MultiIndex.from_product(
        [["a"], ["a", "b"]], names=["A", "B"]
    )
    # Both parametrized functions return 0 for every group.
    expected = pd.DataFrame({"C": [0, 0]}, index=group_index)

    result = frame.groupby(["A", "B"]).agg(func)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "binner,closed,expected",
    [
        ([0, 3, 6, 9], "left", [2, 5, 6]),
        ([0, 3, 6, 9], "right", [3, 6, 6]),
        ([0, 3, 6], "left", [2, 5]),
        ([0, 3, 6], "right", [3, 6]),
    ],
)
def test_generate_bins(binner, closed, expected):
    """
    Check ``lib.generate_bins_dt64`` for left- and right-closed bins.

    The values 1..6 are binned against ``binner`` and the result is
    compared with ``expected``, both as int64 arrays.
    """
    bin_edges = np.array(binner, dtype=np.int64)
    values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)

    result = lib.generate_bins_dt64(values, bin_edges, closed=closed)

    tm.assert_numpy_array_equal(result, np.array(expected, dtype=np.int64))
|
||||
2189
venv/Lib/site-packages/pandas/tests/groupby/test_categorical.py
Normal file
2189
venv/Lib/site-packages/pandas/tests/groupby/test_categorical.py
Normal file
File diff suppressed because it is too large
Load Diff
394
venv/Lib/site-packages/pandas/tests/groupby/test_counting.py
Normal file
394
venv/Lib/site-packages/pandas/tests/groupby/test_counting.py
Normal file
@@ -0,0 +1,394 @@
|
||||
from itertools import product
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Period,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCounting:
    """Tests for ``cumcount``, ``ngroup`` and ``count`` on groupby objects."""

    def test_cumcount(self):
        frame = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        by_a = frame.groupby("A")
        expected = Series([0, 1, 2, 0, 3])

        # The frame groupby and the column groupby must agree.
        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.cumcount())

    def test_cumcount_empty(self):
        frame_gb = DataFrame().groupby(level=0)
        series_gb = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        expected = Series(dtype="int64")

        for grouped in (frame_gb, series_gb):
            tm.assert_series_equal(expected, grouped.cumcount())

    def test_cumcount_dupe_index(self):
        dupe_index = [0] * 5
        frame = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=dupe_index
        )
        by_a = frame.groupby("A")
        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.cumcount())

    def test_cumcount_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        frame = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi
        )
        by_a = frame.groupby("A")
        expected = Series([0, 1, 2, 0, 3], index=mi)

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.cumcount())

    def test_cumcount_groupby_not_col(self):
        frame = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        # Group by an explicit list of labels rather than by a column.
        by_labels = frame.groupby([0, 0, 0, 1, 0])
        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        for grouped in (by_labels, by_labels.A):
            tm.assert_series_equal(expected, grouped.cumcount())

    def test_ngroup(self):
        frame = DataFrame({"A": list("aaaba")})
        by_a = frame.groupby("A")
        expected = Series([0, 0, 0, 1, 0])

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_distinct(self):
        frame = DataFrame({"A": list("abcde")})
        by_a = frame.groupby("A")
        expected = Series(range(5), dtype="int64")

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_one_group(self):
        frame = DataFrame({"A": [0] * 5})
        by_a = frame.groupby("A")
        expected = Series([0] * 5)

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_empty(self):
        frame_gb = DataFrame().groupby(level=0)
        series_gb = Series(dtype=object).groupby(level=0)

        # edge case, as this is usually considered float
        expected = Series(dtype="int64")

        for grouped in (frame_gb, series_gb):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_series_matches_frame(self):
        frame = DataFrame({"A": list("aaaba")})
        keys = Series(list("aaaba"))

        from_frame = frame.groupby(keys).ngroup()
        from_series = keys.groupby(keys).ngroup()
        tm.assert_series_equal(from_frame, from_series)

    def test_ngroup_dupe_index(self):
        frame = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        by_a = frame.groupby("A")
        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        frame = DataFrame({"A": list("aaaba")}, index=mi)
        by_a = frame.groupby("A")
        expected = Series([0, 0, 0, 1, 0], index=mi)

        for grouped in (by_a, by_a.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_groupby_not_col(self):
        frame = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        # Group by an explicit list of labels rather than by a column.
        by_labels = frame.groupby([0, 0, 0, 1, 0])
        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        for grouped in (by_labels, by_labels.A):
            tm.assert_series_equal(expected, grouped.ngroup())

    def test_ngroup_descending(self):
        frame = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        grouped = frame.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        # Descending numbers are the ascending ones counted from the far end.
        tm.assert_series_equal(descending, (grouped.ngroups - 1) - ascending)
        tm.assert_series_equal(ascending, grouped.ngroup(ascending=True))
        tm.assert_series_equal(descending, grouped.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        # verify one manually-worked out case works
        rows = [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]]
        frame = DataFrame(rows, columns=["A", "X"])
        grouped = frame.groupby(["A", "X"])

        g_ngroup = grouped.ngroup()
        g_cumcount = grouped.cumcount()

        tm.assert_series_equal(g_ngroup, Series([0, 1, 2, 0, 3]))
        tm.assert_series_equal(g_cumcount, Series([0, 0, 0, 1, 0]))

    def test_ngroup_cumcount_pair(self):
        # brute force comparison for all small series
        for values in product(range(3), repeat=4):
            grouped = DataFrame({"a": values}).groupby(["a"])

            # Group number: position of the value among the sorted uniques.
            rank = {val: pos for pos, val in enumerate(sorted(set(values)))}
            ngroupd = [rank[val] for val in values]

            # Cumulative count: earlier occurrences of the same value.
            seen = {}
            cumcounted = []
            for val in values:
                cumcounted.append(seen.get(val, 0))
                seen[val] = seen.get(val, 0) + 1

            tm.assert_series_equal(grouped.ngroup(), Series(ngroupd))
            tm.assert_series_equal(grouped.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self, sort):
        letters = np.random.default_rng(2).choice(list("abcdef"), 100)
        frame = DataFrame({"a": letters})
        grouped = frame.groupby("a", sort=sort)
        frame["group_id"] = -1
        frame["group_index"] = -1

        # Record, per row, the group's position in iteration order and the
        # row's position inside its group.
        for group_pos, (_, chunk) in enumerate(grouped):
            frame.loc[chunk.index, "group_id"] = group_pos
            for row_pos, label in enumerate(chunk.index):
                frame.loc[label, "group_index"] = row_pos

        tm.assert_series_equal(Series(frame["group_id"].values), grouped.ngroup())
        tm.assert_series_equal(
            Series(frame["group_index"].values), grouped.cumcount()
        )

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        # test for #13393, where DataframeGroupBy.count() fails
        # when counting a datetimelike column.
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"

        frame = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = frame.groupby("x").count()
        tm.assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        # GH21956
        frame = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = frame.groupby(["A", "B"]).C.count()

        empty_mi = MultiIndex(
            levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]
        )
        expected = Series([], index=empty_mi, dtype=np.int64, name="C")
        tm.assert_series_equal(result, expected, check_index_type=False)

    def test_count_groupby_column_with_nan_in_groupby_column(self):
        # https://github.com/pandas-dev/pandas/issues/32841
        frame = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
        res = frame.groupby(["B"]).count()

        # The expected index holds only the four non-NaN keys.
        keys = Index([0.0, 3.0, 4.0, 5.0], name="B")
        expected = DataFrame(index=keys, data={"A": [1, 1, 1, 1]})
        tm.assert_frame_equal(expected, res)

    def test_groupby_count_dateparseerror(self):
        dr = date_range(start="1/1/2012", freq="5min", periods=10)

        def second_is_even(label):
            return label[1] % 2 == 0

        def first_is_even(label):
            return label[0] % 2 == 0

        # BAD Example, datetimes first
        dates_first = Series(np.arange(10), index=[dr, np.arange(10)])
        result = dates_first.groupby(second_is_even).count()

        ints_first = Series(np.arange(10), index=[np.arange(10), dr])
        expected = ints_first.groupby(first_is_even).count()

        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_timedelta_cython_count():
    """Counting a timedelta64[ns] column per group gives 2 for each group."""
    deltas = np.arange(4).astype("timedelta64[ns]")
    frame = DataFrame({"g": list("ab" * 2), "delta": deltas})

    group_keys = Index(["a", "b"], name="g")
    expected = Series([2, 2], index=group_keys, name="delta")

    result = frame.groupby("g").delta.count()
    tm.assert_series_equal(expected, result)
|
||||
|
||||
|
||||
def test_count():
|
||||
n = 1 << 15
|
||||
dr = date_range("2015-08-30", periods=n // 10, freq="min")
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
|
||||
"2nd": np.random.default_rng(2).integers(0, 5, n),
|
||||
"3rd": np.random.default_rng(2).standard_normal(n).round(3),
|
||||
"4th": np.random.default_rng(2).integers(-10, 10, n),
|
||||
"5th": np.random.default_rng(2).choice(dr, n),
|
||||
"6th": np.random.default_rng(2).standard_normal(n).round(3),
|
||||
"7th": np.random.default_rng(2).standard_normal(n).round(3),
|
||||
"8th": np.random.default_rng(2).choice(dr, n)
|
||||
- np.random.default_rng(2).choice(dr, 1),
|
||||
"9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
|
||||
}
|
||||
)
|
||||
|
||||
for col in df.columns.drop(["1st", "2nd", "4th"]):
|
||||
df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan
|
||||
|
||||
df["9th"] = df["9th"].astype("category")
|
||||
|
||||
for key in ["1st", "2nd", ["1st", "2nd"]]:
|
||||
left = df.groupby(key).count()
|
||||
right = df.groupby(key).apply(DataFrame.count)
|
||||
tm.assert_frame_equal(left, right)
|
||||
|
||||
|
||||
def test_count_non_nulls():
    """
    ``count`` counts non-null values per group (GH#5610).

    Checked with ``as_index=True``, with ``as_index=False`` and for a single
    selected column.
    """
    rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]]
    df = DataFrame(rows, columns=["A", "B", "C"])

    expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
    expected.index.name = "A"

    count_as = df.groupby("A").count()
    count_not_as = df.groupby("A", as_index=False).count()
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    count_B = df.groupby("A")["B"].count()
    tm.assert_series_equal(count_B, expected["B"])
|
||||
|
||||
|
||||
def test_count_object():
    """Counting a string column per group gives 3 for each of the two groups."""
    frame = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
    group_keys = Index([2, 3], name="c")
    expected = Series([3, 3], index=group_keys, name="a")

    result = frame.groupby("c").a.count()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_object_nan():
    """NaN entries in a string column are left out of the per-group count."""
    frame = DataFrame(
        {"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}
    )
    group_keys = Index([2, 3], name="c")
    # The first group has one non-NaN value, the second has three.
    expected = Series([1, 3], index=group_keys, name="a")

    result = frame.groupby("c").a.count()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("typ", ["object", "float32"])
def test_count_cross_type(typ):
    """
    Counts must not depend on the dtype of the counted columns (GH8169).

    The counts for float64 columns are taken as the reference and compared
    with the counts after casting columns "a" and "b" to ``typ``.
    """
    left_block = np.random.default_rng(2).integers(0, 5, (10, 2))
    right_block = np.random.default_rng(2).integers(0, 2, (10, 2))
    # Set float64 dtype to avoid upcast when setting nan below
    vals = np.hstack((left_block, right_block)).astype("float64")

    df = DataFrame(vals, columns=["a", "b", "c", "d"])
    df[df == 2] = np.nan
    expected = df.groupby(["c", "d"]).count()

    for col in ("a", "b"):
        df[col] = df[col].astype(typ)
    result = df.groupby(["c", "d"]).count()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_lower_int_prec_count():
    """Counting int8, uint32 and int16 columns per group gives 2 everywhere."""
    data = {
        "a": np.array([0, 1, 2, 100], np.int8),
        "b": np.array([1, 2, 3, 6], np.uint32),
        "c": np.array([4, 5, 6, 8], np.int16),
        "grp": list("ab" * 2),
    }
    frame = DataFrame(data)

    group_keys = Index(list("ab"), name="grp")
    expected = DataFrame(
        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=group_keys
    )

    result = frame.groupby("grp").count()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_uses_size_on_exception():
    """
    ``count`` must still work for objects whose ``__eq__`` raises.

    Four such objects are split over two groups and the expected count is 2
    per group.
    """

    class RaisingObjectException(Exception):
        pass

    class RaisingObject:
        def __init__(self, msg="I will raise inside Cython") -> None:
            super().__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    raising_objects = [RaisingObject() for _ in range(4)]
    frame = DataFrame({"a": raising_objects, "grp": list("ab" * 2)})

    group_keys = Index(list("ab"), name="grp")
    expected = DataFrame({"a": [2, 2]}, index=group_keys)

    result = frame.groupby("grp").count()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_arrow_string_array(any_string_dtype):
    """
    ``count`` on a string column grouped by unique keys gives 1 per group.

    GH#54751. Skipped when pyarrow cannot be imported.
    """
    pytest.importorskip("pyarrow")

    strings = Series(["a", "b", "a"], dtype=any_string_dtype)
    frame = DataFrame({"a": [1, 2, 3], "b": strings})

    group_keys = Index([1, 2, 3], name="a")
    expected = DataFrame({"b": 1}, index=group_keys)

    result = frame.groupby("a").count()
    tm.assert_frame_equal(result, expected)
|
||||
332
venv/Lib/site-packages/pandas/tests/groupby/test_cumulative.py
Normal file
332
venv/Lib/site-packages/pandas/tests/groupby/test_cumulative.py
Normal file
@@ -0,0 +1,332 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import UnsupportedFunctionCall
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Fixture of dtypes with min and max values used for testing
    cummin and cummax

    Returns a ``(dtype, min_val, max_val)`` tuple.
    """
    dtype = request.param

    # Map the "Int64"/"Float64" names onto their numpy counterparts.
    np_type = {"Int64": np.int64, "Float64": np.float64}.get(dtype, dtype)

    # iinfo describes integer types, finfo describes floating point types.
    if np.dtype(np_type).kind == "i":
        limits = np.iinfo(np_type)
    else:
        limits = np.finfo(np_type)

    return (dtype, limits.min, limits.max)
|
||||
|
||||
|
||||
def test_groupby_cumprod():
    """
    Groupby ``cumprod`` must match applying ``cumprod`` per group (GH 4095).

    Checked on 10 integer rows and on 100 rows cast to float.
    """

    def check(frame):
        actual = frame.groupby("key")["value"].cumprod()
        per_group = frame.groupby("key", group_keys=False)["value"]
        expected = per_group.apply(lambda x: x.cumprod())
        expected.name = "value"
        tm.assert_series_equal(actual, expected)

    check(DataFrame({"key": ["b"] * 10, "value": 2}))

    float_frame = DataFrame({"key": ["b"] * 100, "value": 2})
    float_frame["value"] = float_frame["value"].astype(float)
    check(float_frame)
|
||||
|
||||
|
||||
def test_groupby_cumprod_overflow():
    """On int64 overflow grouped ``cumprod`` returns the same garbage as numpy."""
    # GH#37493 if we overflow we return garbage consistent with numpy
    frame = DataFrame({"key": ["b"] * 4, "value": 100_000})
    result = frame.groupby("key")["value"].cumprod()

    # the last entry is 100_000**4 wrapped around the int64 range
    wrapped = Series(
        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
        name="value",
    )
    tm.assert_series_equal(result, wrapped)

    via_apply = frame.groupby("key", group_keys=False)["value"].apply(
        lambda grp: grp.cumprod()
    )
    via_apply.name = "value"
    tm.assert_series_equal(result, via_apply)
|
||||
|
||||
|
||||
def test_groupby_cumprod_nan_influences_other_columns():
    """With ``skipna=False`` a NaN in "b" must leave column "c" untouched."""
    # GH#48064
    data = {"a": 1, "b": [1, np.nan, 2], "c": [1, 2, 3.0]}
    frame = DataFrame(data)

    result = frame.groupby("a").cumprod(numeric_only=True, skipna=False)

    wanted = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
    tm.assert_frame_equal(result, wanted)
|
||||
|
||||
|
||||
def test_cummin(dtypes_for_minmax):
|
||||
dtype = dtypes_for_minmax[0]
|
||||
|
||||
# GH 15048
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
|
||||
|
||||
df = base_df.astype(dtype)
|
||||
expected = DataFrame({"B": expected_mins}).astype(dtype)
|
||||
result = df.groupby("A").cummin()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cummin_min_value_for_dtype(dtypes_for_minmax):
|
||||
dtype = dtypes_for_minmax[0]
|
||||
min_val = dtypes_for_minmax[1]
|
||||
|
||||
# GH 15048
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
|
||||
expected = DataFrame({"B": expected_mins}).astype(dtype)
|
||||
df = base_df.astype(dtype)
|
||||
df.loc[[2, 6], "B"] = min_val
|
||||
df.loc[[1, 5], "B"] = min_val + 1
|
||||
expected.loc[[2, 3, 6, 7], "B"] = min_val
|
||||
expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val
|
||||
result = df.groupby("A").cummin()
|
||||
tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
expected = (
|
||||
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
|
||||
)
|
||||
tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
|
||||
|
||||
def test_cummin_nan_in_some_values(dtypes_for_minmax):
|
||||
# Explicit cast to float to avoid implicit cast when setting nan
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
base_df = base_df.astype({"B": "float"})
|
||||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
|
||||
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
|
||||
result = base_df.groupby("A").cummin()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
expected = (
|
||||
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cummin_datetime():
    """Grouped ``cummin`` works on a datetime column."""
    # GH 15561
    frame = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    wanted = Series(pd.to_datetime("2001"), index=[0], name="b")

    tm.assert_series_equal(wanted, frame.groupby("a")["b"].cummin())
|
||||
|
||||
|
||||
def test_cummin_getattr_series():
    """``cummin`` works on a column selected by attribute access."""
    # GH 15635
    frame = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})

    running_min = frame.groupby("a").b.cummin()

    tm.assert_series_equal(running_min, Series([1, 2, 1], name="b"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    """An all-missing column stays all-missing and keeps its dtype."""
    # numpy float uses NaN as its missing marker, the nullable dtypes use pd.NA
    missing = np.nan if dtype == "float" else pd.NA

    frame = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [missing] * 8})
    frame["B"] = frame["B"].astype(dtype)
    by_a = frame.groupby("A")

    wanted = DataFrame({"B": [missing] * 8}, dtype=dtype)

    # DataFrameGroupBy path
    tm.assert_frame_equal(wanted, getattr(by_a, method)())

    # SeriesGroupBy path
    tm.assert_frame_equal(wanted, getattr(by_a["B"], method)().to_frame())
|
||||
|
||||
|
||||
def test_cummax(dtypes_for_minmax):
|
||||
dtype = dtypes_for_minmax[0]
|
||||
|
||||
# GH 15048
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
|
||||
|
||||
df = base_df.astype(dtype)
|
||||
|
||||
expected = DataFrame({"B": expected_maxs}).astype(dtype)
|
||||
result = df.groupby("A").cummax()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cummax_min_value_for_dtype(dtypes_for_minmax):
|
||||
dtype = dtypes_for_minmax[0]
|
||||
max_val = dtypes_for_minmax[2]
|
||||
|
||||
# GH 15048
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
|
||||
|
||||
df = base_df.astype(dtype)
|
||||
df.loc[[2, 6], "B"] = max_val
|
||||
expected = DataFrame({"B": expected_maxs}).astype(dtype)
|
||||
expected.loc[[2, 3, 6, 7], "B"] = max_val
|
||||
result = df.groupby("A").cummax()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
expected = (
|
||||
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cummax_nan_in_some_values(dtypes_for_minmax):
|
||||
# Test nan in some values
|
||||
# Explicit cast to float to avoid implicit cast when setting nan
|
||||
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
|
||||
base_df = base_df.astype({"B": "float"})
|
||||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
|
||||
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
|
||||
result = base_df.groupby("A").cummax()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
expected = (
|
||||
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cummax_datetime():
    """Grouped ``cummax`` works on a datetime column."""
    # GH 15561
    frame = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    wanted = Series(pd.to_datetime("2001"), index=[0], name="b")

    tm.assert_series_equal(wanted, frame.groupby("a")["b"].cummax())
|
||||
|
||||
|
||||
def test_cummax_getattr_series():
    """``cummax`` works on a column selected by attribute access."""
    # GH 15635
    frame = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})

    running_max = frame.groupby("a").b.cummax()

    tm.assert_series_equal(running_max, Series([2, 1, 2], name="b"))
|
||||
|
||||
|
||||
def test_cummax_i8_at_implementation_bound():
    """``cummax`` is the identity on increasing values starting at NaT's i8 value."""
    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
    # for int64 dtype GH#46382
    nat_i8 = pd.NaT._value
    ints = Series([nat_i8 + offset for offset in range(5)])
    frame = DataFrame({"A": 1, "B": ints, "C": ints._values.view("M8[ns]")})

    result = frame.groupby("A").cummax()

    # both columns are already non-decreasing, so nothing should change
    tm.assert_frame_equal(result, frame[["B", "C"]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    """With ``skipna=False`` a missing value propagates only within its group."""
    # GH-34047
    frame = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    snapshot = frame.copy()

    grouped_col = frame.groupby(groups)["a"]
    result = getattr(grouped_col, method)(skipna=False)

    # check we didn't accidentally alter df
    tm.assert_frame_equal(frame, snapshot)

    tm.assert_series_equal(result, Series(expected_data, dtype=dtype, name="a"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    """A missing value in "a" must not NaN-fill column "b"."""
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    frame = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    both_cols = frame.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(both_cols, method)(skipna=False)

    wanted = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, wanted)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    """numpy-style extra arguments are rejected by groupby ``cumprod``/``cumsum``."""
    # see gh-12811
    frame = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    method = getattr(frame.groupby("A"), func)

    msg = "numpy operations are not valid with groupby"

    # extra positional arguments
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        method(1, 2, 3)
    # unknown keyword argument
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        method(foo=1)
|
||||
|
||||
|
||||
@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    """Large nullable-integer values come back unchanged from cummin/cummax."""
    values = [val, pd.NA]
    frame = DataFrame({"grp": [1, 1], "b": values}, dtype=dtype)

    result = frame.groupby("grp").transform(method)

    # a single value followed by NA: the transform should return the input as is
    tm.assert_frame_equal(result, DataFrame({"b": values}, dtype=dtype))
|
||||
|
||||
|
||||
def test_cython_api2(as_index):
|
||||
# this takes the fast apply path
|
||||
|
||||
# cumsum (GH5614)
|
||||
# GH 5755 - cumsum is a transformer and should ignore as_index
|
||||
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
|
||||
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
|
||||
result = df.groupby("A", as_index=as_index).cumsum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
638
venv/Lib/site-packages/pandas/tests/groupby/test_filters.py
Normal file
638
venv/Lib/site-packages/pandas/tests/groupby/test_filters.py
Normal file
@@ -0,0 +1,638 @@
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_filter_series():
    """``SeriesGroupBy.filter`` keeps or drops whole parity groups by their mean."""
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    by_parity = ser.groupby(ser.apply(lambda val: val % 2))

    def small_mean(grp):
        return grp.mean() < 10

    def large_mean(grp):
        return grp.mean() > 10

    odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    even = Series([20, 22, 24], index=[2, 4, 5])
    cases = ((small_mean, odd), (large_mean, even))

    for keep, survivors in cases:
        tm.assert_series_equal(by_parity.filter(keep), survivors)

    # Test dropna=False.
    for keep, survivors in cases:
        tm.assert_series_equal(
            by_parity.filter(keep, dropna=False), survivors.reindex(ser.index)
        )
|
||||
|
||||
|
||||
def test_filter_single_column_df():
    """``DataFrameGroupBy.filter`` on a one-column frame, with and without dropna."""
    frame = DataFrame([1, 3, 20, 5, 22, 24, 7])
    by_parity = frame.groupby(frame[0].apply(lambda val: val % 2))

    def small_mean(grp):
        return grp.mean() < 10

    def large_mean(grp):
        return grp.mean() > 10

    odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    even = DataFrame([20, 22, 24], index=[2, 4, 5])
    cases = ((small_mean, odd), (large_mean, even))

    for keep, survivors in cases:
        tm.assert_frame_equal(by_parity.filter(keep), survivors)

    # Test dropna=False.
    for keep, survivors in cases:
        tm.assert_frame_equal(
            by_parity.filter(keep, dropna=False), survivors.reindex(frame.index)
        )
|
||||
|
||||
|
||||
def test_filter_multi_column_df():
    """The filter function can combine several columns of the group."""
    frame = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    by_parity = frame.groupby(frame["A"].apply(lambda val: val % 2))

    def a_exceeds_b_by_ten(grp):
        return grp["A"].sum() - grp["B"].sum() > 10

    survivors = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(by_parity.filter(a_exceeds_b_by_ten), survivors)
|
||||
|
||||
|
||||
def test_filter_mixed_df():
    """``filter`` works on a frame mixing integer and string columns."""
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    by_parity = frame.groupby(frame["A"].apply(lambda val: val % 2))

    kept = by_parity.filter(lambda grp: grp["A"].sum() > 10)

    survivors = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(kept, survivors)
|
||||
|
||||
|
||||
def test_filter_out_all_groups():
    """When no group passes, ``filter`` returns an empty Series/DataFrame."""
    # Series case
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    ser_by_parity = ser.groupby(ser.apply(lambda val: val % 2))
    tm.assert_series_equal(
        ser_by_parity.filter(lambda grp: grp.mean() > 1000), ser[[]]
    )

    # DataFrame case
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_by_parity = frame.groupby(frame["A"].apply(lambda val: val % 2))
    tm.assert_frame_equal(
        frame_by_parity.filter(lambda grp: grp["A"].sum() > 1000), frame.loc[[]]
    )
|
||||
|
||||
|
||||
def test_filter_out_no_groups():
    """When every group passes, ``filter`` returns the original Series."""
    ser = Series([1, 3, 20, 5, 22, 24, 7])
    by_parity = ser.groupby(ser.apply(lambda val: val % 2))

    kept = by_parity.filter(lambda grp: grp.mean() > 0)

    tm.assert_series_equal(kept, ser)
|
||||
|
||||
|
||||
def test_filter_out_no_groups_dataframe():
    """When every group passes, ``filter`` returns the original DataFrame."""
    frame = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    by_parity = frame.groupby(frame["A"].apply(lambda val: val % 2))

    kept = by_parity.filter(lambda grp: grp["A"].mean() > 0)

    tm.assert_frame_equal(kept, frame)
|
||||
|
||||
|
||||
def test_filter_out_all_groups_in_df():
    """With ``dropna=False`` and no passing group every cell becomes NaN."""
    # GH12768
    frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})

    kept = frame.groupby("a").filter(lambda grp: grp["b"].sum() > 5, dropna=False)

    all_nan = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(all_nan, kept)
|
||||
|
||||
|
||||
def test_filter_out_all_groups_in_df_dropna_true():
    """With ``dropna=True`` and no passing group the result is an empty int64 frame."""
    # GH12768
    frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})

    kept = frame.groupby("a").filter(lambda grp: grp["b"].sum() > 5, dropna=True)

    empty = DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(empty, kept)
|
||||
|
||||
|
||||
def test_filter_condition_raises():
    """A ValueError raised inside the filter function surfaces as a TypeError."""

    def raise_if_sum_is_zero(x):
        total = x.sum()
        if total == 0:
            raise ValueError
        return total > 0

    ser = Series([-1, 0, 1, 2])
    by_parity = ser.groupby(ser.apply(lambda val: val % 2))

    msg = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=msg):
        by_parity.filter(raise_if_sum_is_zero)
|
||||
|
||||
|
||||
def test_filter_bad_shapes():
    """Filter functions that do not return a scalar raise TypeError."""
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    col_b = frame["B"]
    frame_groups = frame.groupby("B")
    series_groups = col_b.groupby(col_b)

    non_scalar_msg = "filter function returned a DataFrame, but expected a scalar bool"
    series_msg = "the filter must return a boolean result"

    # (filter function, message expected from the DataFrameGroupBy call)
    cases = [
        (lambda x: x, non_scalar_msg),
        (lambda x: x == 1, non_scalar_msg),
        (lambda x: np.outer(x, x), "can't multiply sequence by non-int of type 'str'"),
    ]

    for func, frame_msg in cases:
        with pytest.raises(TypeError, match=frame_msg):
            frame_groups.filter(func)
        with pytest.raises(TypeError, match=series_msg):
            series_groups.filter(func)
|
||||
|
||||
|
||||
def test_filter_nan_is_false():
    """A filter function returning NaN drops every group."""
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    col_b = frame["B"]

    def always_nan(grp):
        return np.nan

    tm.assert_frame_equal(frame.groupby(frame["B"]).filter(always_nan), frame.loc[[]])
    tm.assert_series_equal(col_b.groupby(col_b).filter(always_nan), col_b[[]])
|
||||
|
||||
|
||||
def test_filter_pdna_is_false():
    """A filter function returning ``pd.NA`` drops every group."""
    # in particular, don't raise in filter trying to call bool(pd.NA)
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    col_b = frame["B"]

    def always_na(grp):
        return pd.NA

    frame_result = frame.groupby(frame["B"]).filter(always_na)
    tm.assert_frame_equal(frame_result, frame.loc[[]])

    series_result = col_b.groupby(col_b).filter(always_na)
    tm.assert_series_equal(series_result, col_b[[]])
|
||||
|
||||
|
||||
def test_filter_against_workaround_ints():
    """``filter`` selects the same values as masking with a boolean ``transform``."""
    # Series of ints
    ser = Series(np.random.default_rng(2).integers(0, 100, 10))
    by_tens = ser.groupby(ser.apply(lambda val: np.round(val, -1)))

    def mean_above_ten(grp):
        return grp.mean() > 10

    via_mask = ser[by_tens.transform(mean_above_ten).astype("bool")]
    via_filter = by_tens.filter(mean_above_ten)

    tm.assert_series_equal(via_filter.sort_values(), via_mask.sort_values())
|
||||
|
||||
|
||||
def test_filter_against_workaround_floats():
    """``filter`` selects the same values as masking with a boolean ``transform``."""
    # Series of floats
    ser = 100 * Series(np.random.default_rng(2).random(10))
    by_tens = ser.groupby(ser.apply(lambda val: np.round(val, -1)))

    def mean_above_ten(grp):
        return grp.mean() > 10

    via_mask = ser[by_tens.transform(mean_above_ten).astype("bool")]
    via_filter = by_tens.filter(mean_above_ten)

    tm.assert_series_equal(via_filter.sort_values(), via_mask.sort_values())
|
||||
|
||||
|
||||
def test_filter_against_workaround_dataframe():
    """``filter`` matches boolean-mask indexing via ``transform`` on a mixed frame."""
    # Set up DataFrame of ints, floats, strings.
    N = 10
    half = N / 2
    alphabet = np.array(list(ascii_lowercase))
    picked_letters = alphabet.take(
        np.random.default_rng(2).integers(0, 26, N, dtype=int)
    )
    frame = DataFrame(
        {
            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
            "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
            "letters": Series(picked_letters),
        }
    )

    # Group by ints; filter on floats.
    by_ints = frame.groupby("ints")
    mask = by_ints.floats.transform(lambda grp: grp.mean() > half).astype("bool")
    via_filter = by_ints.filter(lambda grp: grp["floats"].mean() > half)
    tm.assert_frame_equal(via_filter, frame[mask])

    # Group by floats (rounded); filter on strings.
    by_floats = frame.groupby(frame.floats.apply(lambda val: np.round(val, -1)))
    mask = by_floats.letters.transform(lambda grp: len(grp) < half).astype("bool")
    via_filter = by_floats.filter(lambda grp: len(grp.letters) < half)
    tm.assert_frame_equal(via_filter, frame[mask])

    # Group by strings; filter on ints.
    by_letters = frame.groupby("letters")
    mask = by_letters.ints.transform(lambda grp: grp.mean() > half).astype("bool")
    via_filter = by_letters.filter(lambda grp: grp["ints"].mean() > half)
    tm.assert_frame_equal(via_filter, frame[mask])
|
||||
|
||||
|
||||
def test_filter_using_len():
    """Groups can be filtered on their length (DataFrame case)."""
    # GH 4447
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    by_b = frame.groupby("B")

    # only the "b" group has more than two rows
    only_b = DataFrame(
        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
        index=range(2, 6),
    )
    tm.assert_frame_equal(by_b.filter(lambda grp: len(grp) > 2), only_b)

    # no group has more than four rows
    tm.assert_frame_equal(by_b.filter(lambda grp: len(grp) > 4), frame.loc[[]])
|
||||
|
||||
|
||||
def test_filter_using_len_series():
    """Groups can be filtered on their length (Series case)."""
    # GH 4447
    ser = Series(list("aabbbbcc"), name="B")
    by_value = ser.groupby(ser)

    # only the "b" group has more than two entries
    only_b = Series(4 * ["b"], index=range(2, 6), name="B")
    tm.assert_series_equal(by_value.filter(lambda grp: len(grp) > 2), only_b)

    # no group has more than four entries
    tm.assert_series_equal(by_value.filter(lambda grp: len(grp) > 4), ser[[]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
)
def test_filter_maintains_ordering(index):
    """Rows kept by ``filter`` stay in their original order for any index."""
    # GH 4621
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    pid = frame["pid"]
    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    def tag_repeats(grp):
        return len(grp) > 1

    frame_result = frame.groupby("tag").filter(tag_repeats)
    tm.assert_frame_equal(frame_result, frame.iloc[kept])

    series_result = pid.groupby(frame["tag"]).filter(tag_repeats)
    tm.assert_series_equal(series_result, pid.iloc[kept])
|
||||
|
||||
|
||||
def test_filter_multiple_timestamp():
    """filter/transform work when grouping by a string and a Timestamp column."""
    # GH 10114
    frame = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )
    by_b_and_c = frame.groupby(["B", "C"])

    def keep_all(grp):
        return True

    # SeriesGroupBy
    tm.assert_series_equal(frame["A"], by_b_and_c["A"].filter(keep_all))

    sizes = by_b_and_c["A"].transform(len)
    tm.assert_series_equal(sizes, Series([2, 3, 2, 3, 3], name="A"))

    # DataFrameGroupBy
    tm.assert_frame_equal(frame, by_b_and_c.filter(keep_all))

    sums = by_b_and_c.transform("sum")
    tm.assert_frame_equal(sums, DataFrame({"A": [2, 8, 2, 8, 8]}))

    sizes = by_b_and_c.transform(len)
    tm.assert_frame_equal(sizes, DataFrame({"A": [2, 3, 2, 3, 3]}))
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_int_index():
    """filter/transform keep rows and labels under a repeated integer index."""
    # GH4620
    labels = [1, 1, 1, 2, 1, 1, 0, 1]
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=labels,
    )
    pid = frame["pid"]
    frame_by_tag = frame.groupby("tag")
    pid_by_tag = pid.groupby(frame["tag"])

    def tag_repeats(grp):
        return len(grp) > 1

    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    # Filter DataFrame
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats), frame.iloc[kept])

    # Cast to avoid upcast when setting nan below
    blanked = frame.copy().astype("float64")
    blanked.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats, dropna=False), blanked)

    # Filter Series
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats), pid.take(kept))

    # expected values made manually because this can get confusing!
    blanked_pid = Series(
        [np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], labels, name="pid"
    )
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats, dropna=False), blanked_pid)

    # Transform Series
    group_sizes = Series([1, 2, 2, 1, 2, 1, 1, 2], labels, name="pid")
    tm.assert_series_equal(pid_by_tag.transform(len), group_sizes)

    # Transform (a column from) DataFrameGroupBy
    tm.assert_series_equal(frame_by_tag.pid.transform(len), group_sizes)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_multiple_non_unique_int_index():
    """filter/transform keep rows and labels when several int labels repeat."""
    # GH4620
    labels = [1, 1, 1, 2, 0, 0, 0, 1]
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=labels,
    )
    pid = frame["pid"]
    frame_by_tag = frame.groupby("tag")
    pid_by_tag = pid.groupby(frame["tag"])

    def tag_repeats(grp):
        return len(grp) > 1

    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    # Filter DataFrame
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats), frame.iloc[kept])

    # Cast to avoid upcast when setting nan below
    blanked = frame.copy().astype("float64")
    blanked.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats, dropna=False), blanked)

    # Filter Series
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats), pid.take(kept))

    # expected values made manually because this can get confusing!
    blanked_pid = Series(
        [np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], labels, name="pid"
    )
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats, dropna=False), blanked_pid)

    # Transform Series
    group_sizes = Series([1, 2, 2, 1, 2, 1, 1, 2], labels, name="pid")
    tm.assert_series_equal(pid_by_tag.transform(len), group_sizes)

    # Transform (a column from) DataFrameGroupBy
    tm.assert_series_equal(frame_by_tag.pid.transform(len), group_sizes)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_float_index():
    """filter/transform keep rows and labels under a repeated float index."""
    # GH4620
    labels = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=labels,
    )
    pid = frame["pid"]
    frame_by_tag = frame.groupby("tag")
    pid_by_tag = pid.groupby(frame["tag"])

    def tag_repeats(grp):
        return len(grp) > 1

    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    # Filter DataFrame
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats), frame.iloc[kept])

    # Cast to avoid upcast when setting nan below
    blanked = frame.copy().astype("float64")
    blanked.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats, dropna=False), blanked)

    # Filter Series
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats), pid.take(kept))

    # expected values made manually because this can get confusing!
    blanked_pid = Series(
        [np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], labels, name="pid"
    )
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats, dropna=False), blanked_pid)

    # Transform Series
    group_sizes = Series([1, 2, 2, 1, 2, 1, 1, 2], labels, name="pid")
    tm.assert_series_equal(pid_by_tag.transform(len), group_sizes)

    # Transform (a column from) DataFrameGroupBy
    tm.assert_series_equal(frame_by_tag.pid.transform(len), group_sizes)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_timestamp_index():
    """filter/transform keep rows and labels under a repeated Timestamp index."""
    # GH4620
    sep, octo, nov = (
        Timestamp("2013-09-30 00:05:00"),
        Timestamp("2013-10-30 00:05:00"),
        Timestamp("2013-11-30 00:05:00"),
    )
    labels = [octo, octo, octo, nov, octo, octo, sep, octo]
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=labels,
    )
    pid = frame["pid"]
    frame_by_tag = frame.groupby("tag")
    pid_by_tag = pid.groupby(frame["tag"])

    def tag_repeats(grp):
        return len(grp) > 1

    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    # Filter DataFrame
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats), frame.iloc[kept])

    # Cast to avoid upcast when setting nan below
    blanked = frame.copy().astype("float64")
    blanked.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats, dropna=False), blanked)

    # Filter Series
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats), pid.take(kept))

    # expected values made manually because this can get confusing!
    blanked_pid = Series(
        [np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], labels, name="pid"
    )
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats, dropna=False), blanked_pid)

    # Transform Series
    group_sizes = Series([1, 2, 2, 1, 2, 1, 1, 2], labels, name="pid")
    tm.assert_series_equal(pid_by_tag.transform(len), group_sizes)

    # Transform (a column from) DataFrameGroupBy
    tm.assert_series_equal(frame_by_tag.pid.transform(len), group_sizes)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_string_index():
    """filter/transform keep rows and labels under a repeated string index."""
    # GH4620
    labels = list("bbbcbbab")
    frame = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=labels,
    )
    pid = frame["pid"]
    frame_by_tag = frame.groupby("tag")
    pid_by_tag = pid.groupby(frame["tag"])

    def tag_repeats(grp):
        return len(grp) > 1

    # positions of the rows whose tag occurs more than once
    kept = [1, 2, 4, 7]

    # Filter DataFrame
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats), frame.iloc[kept])

    # Cast to avoid upcast when setting nan below
    blanked = frame.copy().astype("float64")
    blanked.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(frame_by_tag.filter(tag_repeats, dropna=False), blanked)

    # Filter Series
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats), pid.take(kept))

    # expected values made manually because this can get confusing!
    blanked_pid = Series(
        [np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], labels, name="pid"
    )
    tm.assert_series_equal(pid_by_tag.filter(tag_repeats, dropna=False), blanked_pid)

    # Transform Series
    group_sizes = Series([1, 2, 2, 1, 2, 1, 1, 2], labels, name="pid")
    tm.assert_series_equal(pid_by_tag.transform(len), group_sizes)

    # Transform (a column from) DataFrameGroupBy
    tm.assert_series_equal(frame_by_tag.pid.transform(len), group_sizes)
|
||||
|
||||
|
||||
def test_filter_has_access_to_grouped_cols():
    """The filter function can read the column the frame is grouped by."""
    frame = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    by_a = frame.groupby("A")

    # previously didn't have access to col A #????
    kept = by_a.filter(lambda grp: grp["A"].sum() == 2)

    tm.assert_frame_equal(kept, frame.iloc[[0, 1]])
|
||||
|
||||
|
||||
def test_filter_enforces_scalarness():
    """A filter function returning a per-row boolean Series is rejected."""
    rows = [
        ["best", "a", "x"],
        ["worst", "b", "y"],
        ["best", "c", "x"],
        ["best", "d", "y"],
        ["worst", "d", "y"],
        ["worst", "d", "y"],
        ["best", "d", "z"],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    by_c = frame.groupby("c")

    with pytest.raises(TypeError, match="filter function returned a.*"):
        by_c.filter(lambda g: g["a"] == "best")
|
||||
|
||||
|
||||
def test_filter_non_bool_raises():
    """A filter function returning a non-boolean scalar is rejected."""
    rows = [
        ["best", "a", 1],
        ["worst", "b", 1],
        ["best", "c", 1],
        ["best", "d", 1],
        ["worst", "d", 1],
        ["worst", "d", 1],
        ["best", "d", 1],
    ]
    frame = DataFrame(rows, columns=["a", "b", "c"])
    by_a = frame.groupby("a")

    with pytest.raises(TypeError, match="filter function returned a.*"):
        by_a.filter(lambda g: g.c.mean())
|
||||
|
||||
|
||||
def test_filter_dropna_with_empty_groups():
    # GH 10780
    # Every group is rejected by the predicate; check both dropna modes.
    labels = np.repeat([1, 2, 3], 3)
    ser = Series(np.random.default_rng(2).random(9), index=labels)
    by_level = ser.groupby(level=0)

    def mean_above_one(grp):
        # Generator.random draws from [0, 1), so no group mean exceeds 1.
        return grp.mean() > 1

    # dropna=False: same index as the input, all values replaced by NaN.
    all_nan = by_level.filter(mean_above_one, dropna=False)
    tm.assert_series_equal(all_nan, Series([np.nan] * 9, index=labels))

    # dropna=True: nothing is left, but dtypes are still float64 / int index.
    nothing_left = by_level.filter(mean_above_one, dropna=True)
    empty = Series(index=pd.Index([], dtype=int), dtype=np.float64)
    tm.assert_series_equal(nothing_left, empty)
|
||||
|
||||
|
||||
def test_filter_consistent_result_before_after_agg_func():
    # GH 17091
    # filter() on the same groupby object must give the same answer whether or
    # not an aggregation has been run on that object in between.
    def build_frame():
        return DataFrame({"data": range(6), "key": list("ABCABC")})

    gb = build_frame().groupby("key")
    expected = build_frame()

    before = gb.filter(lambda _: True)
    tm.assert_frame_equal(before, expected)

    # The aggregation result is intentionally discarded.
    gb.sum()

    after = gb.filter(lambda _: True)
    tm.assert_frame_equal(after, expected)
|
||||
|
||||
|
||||
def test_filter_with_non_values():
    # GH 62501
    # With dropna=False the row whose key is missing forms its own group, so an
    # always-true filter must hand back the frame unchanged.
    frame = DataFrame([[1], [None]], columns=["a"])

    kept = frame.groupby("a", dropna=False).filter(lambda _: True)

    tm.assert_frame_equal(kept, frame)
|
||||
|
||||
|
||||
def test_filter_with_non_values_multi_index():
    # GH 62501
    # Same as the single-key case, but grouping on two keys with missing
    # values in the first, the second, and both.
    rows = [
        [1, 2],
        [3, None],
        [None, 4],
        [None, None],
    ]
    frame = DataFrame(rows, columns=["a", "b"])

    kept = frame.groupby(["a", "b"], dropna=False).filter(lambda _: True)

    tm.assert_frame_equal(kept, frame)
|
||||
3004
venv/Lib/site-packages/pandas/tests/groupby/test_groupby.py
Normal file
3004
venv/Lib/site-packages/pandas/tests/groupby/test_groupby.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,692 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas.core.dtypes.missing import na_value_for_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.groupby import get_groupby_method_args
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    # GH 3729: a single row carries a null in key "b"; it must end up in one
    # NA group (dropna=False) or be left out entirely (dropna=True).
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d", "e"])
    result = frame.groupby(["a", "b"], dropna=dropna).sum()

    expected_index = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])
    if not dropna:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be put back into level "b" by hand.
        expected_index = expected_index.set_levels(["A", "B", np.nan], level="b")

    tm.assert_frame_equal(result, pd.DataFrame(outputs, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    # GH 3729: nulls appear in both keys and with two (possibly different)
    # null representations; equal key positions must still share a group.
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d", "e"])
    result = frame.groupby(["a", "b"], dropna=dropna).sum()

    expected_index = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])
    if not dropna:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be put back into both levels by hand.
        levels_with_na = [["A", "B", np.nan], ["A", "B", np.nan]]
        expected_index = expected_index.set_levels(levels_with_na)

    tm.assert_frame_equal(result, pd.DataFrame(outputs, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729: single-key grouping where one row has None as its key.
    rows = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d"])

    result = frame.groupby("a", dropna=dropna).sum()

    expected_index = pd.Index(idx, name="a")
    tm.assert_frame_equal(result, pd.DataFrame(outputs, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    # Group a Series by its own index level, which contains one NaN label.
    values = pd.Series([1, 2, 3, 3], index=idx)

    totals = values.groupby(level=0, dropna=dropna).sum()

    tm.assert_series_equal(totals, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    # Group a Series by an explicit list of keys, the last of which is NaN.
    speeds = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )
    keys = ["a", "b", "a", np.nan]

    means = speeds.groupby(keys, dropna=dropna).mean()

    tm.assert_series_equal(means, expected)
|
||||
|
||||
|
||||
def test_grouper_dropna_propagation(dropna):
    # GH 36604
    # The dropna flag given to groupby() must be visible on the internal grouper.
    frame = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})

    grouper = frame.groupby("A", dropna=dropna)._grouper

    assert grouper.dropna == dropna
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(0, 4),
        list("abcd"),
        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
    ],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
    # GH35014 & GH35612
    # transform(len) must give the same group sizes for the whole groupby, a
    # list-selected column and a scalar-selected column.
    # The last row has a null key: NaN when it is dropped, otherwise a group of 1.
    sizes = [2, 2, 1, np.nan if dropna else 1]

    frame = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
    gb = frame.groupby("A", dropna=dropna)

    whole = gb.transform(len)
    tm.assert_frame_equal(whole, pd.DataFrame({"B": sizes}, index=index))

    list_selected = gb[["B"]].transform(len)
    tm.assert_frame_equal(list_selected, pd.DataFrame({"B": sizes}, index=index))

    scalar_selected = gb["B"].transform(len)
    tm.assert_series_equal(scalar_selected, pd.Series(sizes, index=index, name="B"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729: per-column aggregations (sum / max / min) over two keys where
    # one row has None in key "b".
    rows = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c", "d", "e"])
    how = {"c": "sum", "d": "max", "e": "min"}

    result = frame.groupby(["a", "b"], dropna=dropna).agg(how)

    expected_index = pd.MultiIndex.from_tuples(tuples, names=["a", "b"])
    if not dropna:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be put back into level "b" by hand.
        expected_index = expected_index.set_levels(["A", "B", np.nan], level="b")

    tm.assert_frame_equal(result, pd.DataFrame(outputs, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    # Datetime-like keys mixed with two null values: the nulls must share one
    # group when kept, and vanish when dropped.
    keys = [
        datetime1,
        unique_nulls_fixture,
        datetime2,
        unique_nulls_fixture2,
        datetime1,
        datetime1,
    ]
    frame = pd.DataFrame({"values": [1, 2, 3, 4, 5, 6], "dt": keys})

    # The NA label only shows up in the result when it is not dropped.
    labels = [datetime1, datetime2]
    if not dropna:
        labels.append(np.nan)

    result = frame.groupby("dt", dropna=dropna).agg({"values": "sum"})

    expected = pd.DataFrame({"values": values}, index=pd.Index(labels, name="dt"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should be same.
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889
    # apply() returning a fresh frame per group yields a (group, position)
    # MultiIndex; check it with and without a NaN group.
    frame = pd.DataFrame(data)

    def enumerate_rows(grp):
        return pd.DataFrame({"values": range(len(grp))})

    result = frame.groupby("groups", dropna=dropna).apply(enumerate_rows)

    # strict=False on purpose: when the NaN group is dropped, selected_data is
    # one entry shorter than data["groups"] and the trailing NaN key is cut off.
    pairs = tuple(zip(data["groups"], selected_data["values"], strict=False))
    expected_index = pd.MultiIndex.from_tuples(pairs, names=["groups", None])
    if not dropna and levels:
        # MultiIndex.from_* currently drops NA from the levels, so the NA
        # entry has to be put back by hand.
        expected_index = expected_index.set_levels(levels, level="groups")

    tm.assert_frame_equal(result, pd.DataFrame(selected_data, index=expected_index))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
    # GH#46783
    # Every group has exactly one row, so summing with dropna=False must give
    # back the input re-indexed by the grouping keys.
    source = pd.DataFrame(
        {
            "a": [1, np.nan],
            "b": [1, 1],
            "c": [2, 3],
        }
    )

    expected = source.set_index(keys)
    if series:
        expected = expected["c"]
    elif input_index == ["a", "b"] and keys == ["a"]:
        # Column b should not be aggregated
        expected = expected[["c"]]

    grouped_obj = source if input_index is None else source.set_index(input_index)
    gb = grouped_obj.groupby(keys, dropna=False)
    result = gb["c"].sum() if series else gb.sum()

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_nan_included():
    # GH 35646
    # With dropna=False, .indices must contain an entry for the NaN key,
    # listed after the real keys.
    frame = pd.DataFrame(
        {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    )
    indices = frame.groupby("group", dropna=False).indices

    # Expected row positions, in key order: "g1", "g2", NaN.
    expected_positions = [
        np.array([0, 2], dtype=np.intp),
        np.array([3], dtype=np.intp),
        np.array([1, 4], dtype=np.intp),
    ]
    for actual, wanted in zip(indices.values(), expected_positions, strict=True):
        tm.assert_numpy_array_equal(actual, wanted)

    # NaN != NaN, so the NaN key is checked with isnan instead of ==.
    keys = list(indices.keys())
    assert np.isnan(keys[2])
    assert keys[0:2] == ["g1", "g2"]
|
||||
|
||||
|
||||
def test_groupby_drop_nan_with_multi_index():
    # GH 39895
    # Grouping a one-row frame by both of its index levels (one of them NaN)
    # with dropna=False must return that same frame.
    frame = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"]).set_index(
        ["a", "b"]
    )

    firsts = frame.groupby(["a", "b"], dropna=False).first()

    tm.assert_frame_equal(firsts, frame)
|
||||
|
||||
|
||||
# y > x and z is the missing value
@pytest.mark.parametrize(
    "sequence",
    [
        "xyzy",
        "xxyz",
        "yzxz",
        "zzzz",
        "zyzx",
        "yyyy",
        "zzxy",
        "xyxy",
    ],
)
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
        "datetime64[ns]",
        "period[D]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence, dtype, test_series, as_index):
    # GH#46584, GH#48794
    # With sort=False and dropna=False, groups (including the NA group) must
    # come out in order of first appearance, for every supported key dtype.

    # Concrete values standing in for the labels "x", "y" and "z"; which ones
    # are used depends on dtype.
    if dtype in ("string", "string[pyarrow]"):
        x_val, y_val, na_val = "x", "y", pd.NA
    elif dtype in ("datetime64[ns]", "period[D]"):
        x_val, y_val, na_val = "2016-01-01", "2017-01-01", pd.NA
    elif dtype is not None and dtype.startswith(("I", "U", "F")):
        x_val, y_val, na_val = 1, 2, pd.NA
    else:
        x_val, y_val, na_val = 1, 2, np.nan
    uniques = {"x": x_val, "y": y_val, "z": na_val}

    key_column = pd.Series([uniques[label] for label in sequence], dtype=dtype)
    df = pd.DataFrame({"key": key_column, "a": [0, 1, 2, 3]})
    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
    # issues with hashing np.nan. Column "a" equals the row position, so the
    # position itself is what gets accumulated; dict insertion order gives the
    # first-appearance ordering.
    summed = {}
    for position, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + position

    group_keys = [uniques[label] for label in summed]
    if dtype == "category":
        index = pd.CategoricalIndex(
            group_keys,
            df["key"].cat.categories,
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(pd.array(group_keys, dtype=dtype), name="key")
    else:
        index = pd.Index(group_keys, dtype=dtype, name="key")

    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()
    if not as_index:
        expected = expected.reset_index()
        if dtype is not None and dtype.startswith("Sparse"):
            # Cast the key column back to the sparse dtype after reset_index.
            expected["key"] = expected["key"].astype(dtype)

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
    sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
    # GH#48506 - groups should always result in using the null for the dtype
    frame = pd.DataFrame({"a": [1, 2]})
    keys = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
    target = frame["a"] if test_series else frame

    result = target.groupby(keys, dropna=False, sort=sort).sum()

    # Both rows fall into one null group, labelled with the NA value that
    # belongs to the keys' dtype.
    null_label = pd.Index([na_value_for_dtype(keys.dtype)])
    expected = pd.DataFrame({"a": [3]}, index=null_label)
    if test_series:
        tm.assert_series_equal(result, expected["a"])
    else:
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
    # Reductions on a categorical key with dropna=False are checked against the
    # same reduction with dropna left at its default, after the nulls have
    # been replaced by an extra category.

    # Ensure there is at least one null value by appending to the end
    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )

    # Strategy: Compare to dropna=True by filling null values with a new code
    df_filled = df.copy()
    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)

    # Decide where the grouping key(s) live: column, single index, MultiIndex.
    if index_kind == "multi":
        keys = ["x", "x2"]
        df["x2"] = df["x"]
        df = df.set_index(["x", "x2"])
        df_filled["x2"] = df_filled["x"]
        df_filled = df_filled.set_index(["x", "x2"])
    else:
        keys = ["x"]
        if index_kind == "single":
            df = df.set_index("x")
            df_filled = df_filled.set_index("x")

    args = get_groupby_method_args(reduction_func, df)
    args_filled = get_groupby_method_args(reduction_func, df_filled)
    if reduction_func == "corrwith" and index_kind == "range":
        # Don't include the grouping columns so we can call reset_index
        args = (args[0].drop(columns=keys),)
        args_filled = (args_filled[0].drop(columns=keys),)

    gb_keepna = df.groupby(
        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
    )

    if not observed and reduction_func in ["idxmin", "idxmax"]:
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            getattr(gb_keepna, reduction_func)(*args)
        return

    # corrwith is the only reduction expected to warn; the same expectation
    # applies to both the reference and the tested call.
    if reduction_func == "corrwith":
        warn = Pandas4Warning
        msg = "DataFrameGroupBy.corrwith is deprecated"
    else:
        warn = None
        msg = ""

    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
    with tm.assert_produces_warning(warn, match=msg):
        expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()

    # Drop the stand-in category so its rows read as null again.
    expected["x"] = expected["x"].cat.remove_categories([4])
    if index_kind == "multi":
        expected["x2"] = expected["x2"].cat.remove_categories([4])
    if as_index:
        index_cols = ["x", "x2"] if index_kind == "multi" else "x"
        expected = expected.set_index(index_cols)

    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
        # expected was computed with a RangeIndex; need to translate to index values
        values = expected["y"].values.tolist()
        if index_kind == "single":
            values = [np.nan if e == 4 else e for e in values]
            expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
        else:
            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
            expected["y"] = values

    if reduction_func == "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.rename(columns={0: "size"})
        if as_index:
            expected = expected["size"].rename(None)

    with tm.assert_produces_warning(warn, match=msg):
        result = getattr(gb_keepna, reduction_func)(*args)

    # size will return a Series, others are DataFrame
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_transformers(transformation_func, observed, sort, as_index):
    # GH#36327
    # Transformations with dropna=False are checked against the dropna=True
    # result, with the rows of the null group patched in by hand.
    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )
    args = get_groupby_method_args(transformation_func, df)

    # Compute result for null group
    null_group_values = df[df["x"].isnull()]["y"]
    null_count = len(null_group_values)
    if transformation_func == "cumcount":
        null_group_data = list(range(null_count))
    elif transformation_func == "ngroup":
        if sort:
            # Used for observed=True and observed=False alike.
            # TODO: Should this be 3 when observed is False?
            na_group = df["x"].nunique(dropna=False) - 1
        else:
            # Unsorted: count the distinct non-null keys before the first null row.
            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
        null_group_data = null_count * [na_group]
    else:
        null_group_data = getattr(null_group_values, transformation_func)(*args)
    null_group_result = pd.DataFrame({"y": null_group_data})

    gb_keepna = df.groupby(
        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)

    result = getattr(gb_keepna, transformation_func)(*args)
    expected = getattr(gb_dropna, transformation_func)(*args)

    # Overwrite the null-group rows of the dropna=True result with the values
    # computed above; df has a default RangeIndex so labels double as positions.
    null_positions = df[df["x"].isnull()].index.tolist()
    is_series = expected.ndim == 1
    for position, value in zip(
        null_positions,
        null_group_result.values.ravel(),
        strict=True,
    ):
        if is_series:
            expected.iloc[position] = value
        else:
            expected.iloc[position, 0] = value

    if transformation_func == "ngroup":
        # Non-null groups numbered at or after the null group shift up by one.
        expected[df["x"].notnull() & expected.ge(na_group)] += 1
    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
        expected = expected.astype("int64")

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
    # GH#36327
    # head()/tail() on a categorical key with dropna=False must also return
    # rows from the null group.
    values = np.random.default_rng(2).choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
    result = getattr(gb, method)()

    # For tail, scan from the end so "first five" means "last five".
    is_tail = method == "tail"
    ordered = values[::-1] if is_tail else values

    def first_five_of(member):
        # Keep only the first five positions flagged in ``member``.
        return member & (member.cumsum() <= 5)

    # Take the top 5 values from each group
    mask = (
        first_five_of(ordered == 1)
        | first_five_of(ordered == 2)
        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
        | first_five_of(ordered == None)  # noqa: E711
    )
    if is_tail:
        # Flip the mask back into original row order.
        mask = mask[::-1]
    expected = df[mask]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_categorical_agg():
    # GH#36327
    # agg() with a user-supplied callable must match the built-in sum() on a
    # categorical key grouped with dropna=False.
    keys = np.random.default_rng(2).choice([1, 2, None], 30)
    frame = pd.DataFrame(
        {"x": pd.Categorical(keys, categories=[1, 2, 3]), "y": range(len(keys))}
    )
    gb = frame.groupby("x", dropna=False, observed=False)

    via_callable = gb.agg(lambda col: col.sum())
    via_builtin = gb.sum()

    tm.assert_frame_equal(via_callable, via_builtin)
|
||||
|
||||
|
||||
def test_categorical_transform():
    # GH#36327
    # transform() with a user-supplied callable must match transform("sum") on
    # a categorical key grouped with dropna=False.
    keys = np.random.default_rng(2).choice([1, 2, None], 30)
    frame = pd.DataFrame(
        {"x": pd.Categorical(keys, categories=[1, 2, 3]), "y": range(len(keys))}
    )
    gb = frame.groupby("x", dropna=False, observed=False)

    via_callable = gb.transform(lambda col: col.sum())
    via_name = gb.transform("sum")

    tm.assert_frame_equal(via_callable, via_name)
|
||||
@@ -0,0 +1,152 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.groupby import get_groupby_method_args
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings(
|
||||
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "obj",
    [
        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
        tm.SubclassedSeries(np.arange(0, 10), name="A"),
    ],
)
def test_groupby_preserves_subclass(obj, groupby_func):
    # GH28330 -- preserve subclass through groupby operations
    is_corrwith = groupby_func == "corrwith"

    if isinstance(obj, Series) and is_corrwith:
        pytest.skip(f"Not applicable for Series and {groupby_func}")

    # One group per row.
    grouped = obj.groupby(np.arange(0, 10))

    # Groups should preserve subclass type
    assert isinstance(grouped.get_group(0), type(obj))

    args = get_groupby_method_args(groupby_func, obj)

    # Only corrwith is expected to warn; msg is ignored when warn is None.
    warn = Pandas4Warning if is_corrwith else None
    msg = f"{type(grouped).__name__}.corrwith is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        direct = getattr(grouped, groupby_func)(*args)
    with tm.assert_produces_warning(warn, match=msg):
        via_agg = grouped.agg(groupby_func, *args)

    # Reduction or transformation kernels should preserve type; these kernels
    # are expected to return a Series even when called on a DataFrame groupby.
    series_returning = {"ngroup", "cumcount", "size"}
    if isinstance(obj, DataFrame) and groupby_func in series_returning:
        expected_type = tm.SubclassedSeries
    else:
        expected_type = type(obj)
    assert isinstance(direct, expected_type)

    # Confirm .agg() groupby operations return same results
    if isinstance(direct, DataFrame):
        tm.assert_frame_equal(direct, via_agg)
    else:
        tm.assert_series_equal(direct, via_agg)
|
||||
|
||||
|
||||
def test_groupby_preserves_metadata():
    # GH-37343
    # Attributes listed in ``_metadata`` must be carried over to every group.
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    assert "testattr" in custom_df._metadata
    custom_df.testattr = "hello"
    for _, group_df in custom_df.groupby("c"):
        assert group_df.testattr == "hello"

    # GH-45314
    # The same must hold for the groups handed to a user function by apply().
    def func(group):
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        assert group.testattr == "hello"
        return group.testattr

    # One "hello" per distinct value of column "c".
    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))

    # The original repeated this exact call and assertion twice in a row; the
    # second copy exercised nothing new, so it is checked once here.
    result = custom_df.groupby("c").apply(func)
    tm.assert_series_equal(result, expected)

    # https://github.com/pandas-dev/pandas/pull/56761
    result = custom_df.groupby("c")[["a", "b"]].apply(func)
    tm.assert_series_equal(result, expected)

    def func2(group):
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        return group.testattr

    # Series groupby: both apply() and agg() must hand over subclassed groups
    # that still carry the attribute.
    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"
    result = custom_series.groupby(custom_df["c"]).apply(func2)
    tm.assert_series_equal(result, expected)
    result = custom_series.groupby(custom_df["c"]).agg(func2)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_apply_preserves_metadata():
    # GH#62134 - Test that apply() preserves metadata when returning DataFrames/Series
    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
    custom_df.testattr = "hello"

    def sum_func(group):
        # Each group must arrive as the subclass with its metadata intact.
        assert isinstance(group, tm.SubclassedDataFrame)
        assert hasattr(group, "testattr")
        assert group.testattr == "hello"
        return group.sum()

    frame_result = custom_df.groupby("c").apply(sum_func)
    assert hasattr(frame_result, "testattr"), (
        "DataFrame apply() should preserve metadata"
    )
    assert frame_result.testattr == "hello"

    custom_series = tm.SubclassedSeries([1, 2, 3])
    custom_series.testattr = "hello"

    def sum_series_func(group):
        # Same checks for the Series groupby path.
        assert isinstance(group, tm.SubclassedSeries)
        assert hasattr(group, "testattr")
        assert group.testattr == "hello"
        return group.sum()

    series_result = custom_series.groupby(custom_df["c"]).apply(sum_series_func)
    assert hasattr(series_result, "testattr"), (
        "Series apply() should preserve metadata"
    )
    assert series_result.testattr == "hello"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
    # GH28330 -- preserve subclass through groupby.resample()
    purchase_times = [
        datetime(2013, 9, 1, 13, 0),
        datetime(2013, 9, 1, 13, 5),
        datetime(2013, 10, 1, 20, 0),
        datetime(2013, 10, 3, 10, 0),
        datetime(2013, 12, 2, 12, 0),
        datetime(2013, 9, 2, 14, 0),
    ]
    buyers = Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object)
    purchases = obj(
        {
            "Buyer": buyers,
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": purchase_times,
        }
    ).set_index("Date")

    # Confirm groupby.resample() preserves dataframe type
    resampled = purchases.groupby("Buyer").resample("5D").sum()
    assert isinstance(resampled, obj)
|
||||
1216
venv/Lib/site-packages/pandas/tests/groupby/test_grouping.py
Normal file
1216
venv/Lib/site-packages/pandas/tests/groupby/test_grouping.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,72 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
@pytest.mark.parametrize("levels", [["inner"], ["inner", "outer"]])
def test_grouper_index_level_as_string(levels, key_strs, groupers):
    # Grouping by an index level's name as a plain string must match grouping
    # with an explicit pd.Grouper(level=...).
    frame = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    ).set_index(levels)

    # numeric_only=True is passed whenever a string column ("B" and/or
    # "outer") is left among the value columns; otherwise mean() runs bare.
    if "B" not in key_strs or "outer" in frame.columns:
        mean_kwargs = {"numeric_only": True}
    else:
        mean_kwargs = {}

    by_name = frame.groupby(key_strs).mean(**mean_kwargs)
    by_grouper = frame.groupby(groupers).mean(**mean_kwargs)

    tm.assert_frame_equal(by_name, by_grouper)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(levels):
    """Grouping a Series by index-level names matches grouping by ``pd.Grouper``.

    Parameters
    ----------
    levels : str or list of str
        Index level name(s) of the three-level Series built below.
    """
    # Compute expected result
    df = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    series = df.set_index(["outer", "inner", "B"])["A"]
    # Mirror the shape of ``levels``: a list of Groupers for a list, one otherwise
    if isinstance(levels, list):
        groupers = [pd.Grouper(level=lv) for lv in levels]
    else:
        groupers = pd.Grouper(level=levels)

    expected = series.groupby(groupers).mean()

    # Compute and check result
    result = series.groupby(levels).mean()
    tm.assert_series_equal(result, expected)
|
||||
310
venv/Lib/site-packages/pandas/tests/groupby/test_indexing.py
Normal file
310
venv/Lib/site-packages/pandas/tests/groupby/test_indexing.py
Normal file
@@ -0,0 +1,310 @@
|
||||
# Test GroupBy._positional_selector positional grouped indexing GH#42864
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [2, [5]],
        [5, []],
        [-1, [3, 4, 7]],
        [-2, [1, 6]],
        [-6, []],
    ],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
    """A single integer selector returns the rows ``iloc[expected_rows]`` of the frame.

    ``slice_test_df`` and ``slice_test_grouped`` are fixtures defined outside
    this file.
    """
    # Test single integer
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_slice(slice_test_df, slice_test_grouped):
    """A single ``0:3:2`` slice selector returns rows 0, 1, 4 and 5 of the frame."""
    # Test single slice
    result = slice_test_grouped._positional_selector[0:3:2]
    expected = slice_test_df.iloc[[0, 1, 4, 5]]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [[0, 2], [0, 1, 4, 5]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [range(0, 3, 2), [0, 1, 4, 5]],
        [{0, 2}, [0, 1, 4, 5]],
    ],
    ids=[
        "list",
        "negative",
        "range",
        "set",
    ],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Integer-valued iterables (list, range, set) work as positional selectors."""
    # Test lists of integers and integer valued iterables
    result = slice_test_grouped._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_ints(slice_test_df, slice_test_grouped):
    """A tuple of integers ``[0, 2, -1]`` selects the union of those positions."""
    # Test tuple of ints
    result = slice_test_grouped._positional_selector[0, 2, -1]
    expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_slices(slice_test_df, slice_test_grouped):
    """A tuple of slices ``[:2, -2:]`` selects the union of both slices."""
    # Test tuple of slices
    result = slice_test_grouped._positional_selector[:2, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_mix(slice_test_df, slice_test_grouped):
    """Integers and slices can be mixed in one selector tuple."""
    # Test mixed tuple of ints and slices
    result = slice_test_grouped._positional_selector[0, 1, -2:]
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [0, [0, 1, 4]],
        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_as_index(slice_test_df, arg, expected_rows):
    """Selection also works on a groupby created without an explicit ``as_index``."""
    # Test the default as_index behaviour
    result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_doc_examples():
    """Run the two ``_positional_selector`` examples from the documentation."""
    # Test the examples in the documentation
    df = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
    )

    grouped = df.groupby("A", as_index=False)

    # Slice: second row of every group
    result = grouped._positional_selector[1:2]
    expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])

    tm.assert_frame_equal(result, expected)

    # Tuple of ints: second and last row of every group
    result = grouped._positional_selector[1, -1]
    expected = pd.DataFrame(
        [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiindex():
    """``nth(slice(3, -3))`` per date equals slicing each date's rows with ``[3:-3]``."""
    # Test the multiindex mentioned as the use-case in the documentation

    def _make_df_from_data(data):
        # Build a frame indexed by (Date, Item) from {date: [(item, A, B), ...]}
        rows = {}
        for date in data:
            for level in data[date]:
                rows[(date, level[0])] = {"A": level[1], "B": level[2]}

        df = pd.DataFrame.from_dict(rows, orient="index")
        df.index.names = ("Date", "Item")
        return df

    rng = np.random.default_rng(2)
    ndates = 100
    nitems = 20
    dates = pd.date_range("20130101", periods=ndates, freq="D")
    items = [f"item {i}" for i in range(nitems)]

    multiindex_data = {}
    for date in dates:
        # Each date gets between 9 and 20 items, ordered by their "A" value
        nitems_for_date = nitems - rng.integers(0, 12)
        levels = [
            (item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
            for item in items[:nitems_for_date]
        ]
        levels.sort(key=lambda x: x[1])
        multiindex_data[date] = levels

    df = _make_df_from_data(multiindex_data)
    result = df.groupby("Date", as_index=False).nth(slice(3, -3))

    sliced = {date: values[3:-3] for date, values in multiindex_data.items()}
    expected = _make_df_from_data(sliced)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
    """Compare ``_positional_selector`` slices with grouped ``head``/``tail``.

    Parameters
    ----------
    arg : int
        Row count passed to ``head``/``tail`` (negative values allowed).
    method : {"head", "tail"}
        Which end of each group is selected.
    simulated : bool
        If True, the expected frame is built from explicit ``iloc`` positions
        instead of calling ``grouped.head``/``grouped.tail``.
    """
    # Test gives the same results as grouped head and tail
    n_groups = 100
    n_rows_per_group = 30

    # Rows are interleaved: row j of group g sits at position j * n_groups + g
    data = {
        "group": [
            f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
        ],
        "value": [
            f"group {g} row {j}"
            for j in range(n_rows_per_group)
            for g in range(n_groups)
        ],
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)
    # Number of rows kept per group; a negative arg drops rows from the other end
    size = arg if arg >= 0 else n_rows_per_group + arg

    if method == "head":
        result = grouped._positional_selector[:arg]

        if simulated:
            indices = [
                j * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if j * n_groups + i < n_groups * n_rows_per_group
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.head(arg)

    else:
        result = grouped._positional_selector[-arg:]

        if simulated:
            indices = [
                (n_rows_per_group + j - size) * n_groups + i
                for j in range(size)
                for i in range(n_groups)
                if (n_rows_per_group + j - size) * n_groups + i >= 0
            ]
            expected = df.iloc[indices]

        else:
            expected = grouped.tail(arg)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
    """With a single group, ``_positional_selector[start:stop:step]`` equals ``df.iloc``."""
    # Test that a single group gives the same results as DataFrame.iloc
    n_rows = 30

    data = {
        "group": ["group 0"] * n_rows,
        "value": list(range(n_rows)),
    }
    df = pd.DataFrame(data)
    grouped = df.groupby("group", as_index=False)

    result = grouped._positional_selector[start:stop:step]
    expected = df.iloc[start:stop:step]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_series():
    """``_positional_selector[1:2]`` on a grouped Series returns each group's second value."""
    # Test grouped Series
    ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
    grouped = ser.groupby(level=0)
    result = grouped._positional_selector[1:2]
    expected = pd.Series([2, 5], index=["a", "b"])

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
    """``_positional_selector[::step]`` keeps every ``step``-th row of each group."""
    # Test slice with various step values
    data = [["x", f"x{i}"] for i in range(5)]
    data += [["y", f"y{i}"] for i in range(4)]
    data += [["z", f"z{i}"] for i in range(3)]
    df = pd.DataFrame(data, columns=["A", "B"])

    grouped = df.groupby("A", as_index=False)

    result = grouped._positional_selector[::step]

    data = [["x", f"x{i}"] for i in range(0, 5, step)]
    data += [["y", f"y{i}"] for i in range(0, 4, step)]
    data += [["z", f"z{i}"] for i in range(0, 3, step)]

    # Groups x, y, z start at rows 0, 5 and 9 of the original frame
    index = [0 + i for i in range(0, 5, step)]
    index += [5 + i for i in range(0, 4, step)]
    index += [9 + i for i in range(0, 3, step)]

    expected = pd.DataFrame(data, columns=["A", "B"], index=index)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_columns_on_iter():
    """Iterating a column-selected groupby yields frames with only those columns."""
    # GitHub issue #44821
    df = pd.DataFrame({k: range(10) for k in "ABC"})

    # Group-by and select columns
    cols = ["A", "B"]
    for _, dg in df.groupby(df.A < 4)[cols]:
        tm.assert_index_equal(dg.columns, pd.Index(cols))
        assert "C" not in dg.columns
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
    """Selecting a duplicated column label after groupby yields duplicated result columns.

    Parameters
    ----------
    func : callable
        Container type wrapping the column labels ``["A", "B", "A"]``.
    """
    # GH#44924
    df = pd.DataFrame(
        {
            "A": [1, 2],
            "B": [3, 3],
            "C": ["G", "G"],
        }
    )
    result = df.groupby("C")[func(["A", "B", "A"])].mean()
    expected = pd.DataFrame(
        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_get_nonexisting_groups():
    """``get_group`` raises ``KeyError`` for a key combination that is not in the data."""
    # GH#32492
    df = pd.DataFrame(
        data={
            "A": ["a1", "a2", None],
            "B": ["b1", "b2", "b1"],
            "val": [1, 2, 3],
        }
    )
    grps = df.groupby(by=["A", "B"])

    # NOTE: ``match`` is a regex, so the parentheses here act as a group and the
    # pattern effectively searches for "'a2', 'b1'" in the error message.
    msg = "('a2', 'b1')"
    with pytest.raises(KeyError, match=msg):
        grps.get_group(("a2", "b1"))
|
||||
344
venv/Lib/site-packages/pandas/tests/groupby/test_libgroupby.py
Normal file
344
venv/Lib/site-packages/pandas/tests/groupby/test_libgroupby.py
Normal file
@@ -0,0 +1,344 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import groupby as libgroupby
|
||||
from pandas._libs.groupby import (
|
||||
group_cumprod,
|
||||
group_cumsum,
|
||||
group_mean,
|
||||
group_sum,
|
||||
group_var,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import ensure_platform_int
|
||||
|
||||
from pandas import isna
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype, rtol", [("float32", 1e-2), ("float64", 1e-5)])
class TestGroupVar:
    """Checks of ``group_var`` against NumPy's ``std(ddof=1) ** 2``.

    Every test receives ``dtype`` (the float dtype of values and output) and
    ``rtol`` (the relative tolerance used with ``np.allclose``).
    """

    def test_group_var_generic_1d(self, dtype, rtol):
        """One value column, five groups of three rows each."""
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 1))).astype(dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((15, 1)).astype(dtype)
        labels = np.tile(np.arange(5), (3,)).astype("intp")

        # labels cycle 0..4, so column-major reshape puts each group in one row
        expected_out = (
            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
        )[:, np.newaxis]
        expected_counts = counts + 3

        group_var(out, counts, values, labels)
        assert np.allclose(out, expected_out, rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self, dtype, rtol):
        """One value column where all five rows belong to a single group."""
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((1, 1))).astype(dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * prng.random((5, 1)).astype(dtype)
        labels = np.zeros(5, dtype="intp")

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        group_var(out, counts, values, labels)

        assert np.allclose(out, expected_out, rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self, dtype, rtol):
        """Two value columns, five groups of two rows each, no missing values."""
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(dtype)
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = counts + 2

        group_var(out, counts, values, labels)
        assert np.allclose(out, expected_out, rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self, dtype, rtol):
        """Two value columns where the second column is entirely NaN."""
        prng = np.random.default_rng(2)

        out = (np.nan * np.ones((5, 2))).astype(dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.random((10, 2)).astype(dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.vstack(
            [
                values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
                np.nan * np.ones(5),
            ]
        ).T.astype(dtype)
        expected_counts = counts + 2

        group_var(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self, dtype, rtol):
        """Variance of three identical values is non-negative and almost zero."""
        # Regression test from GH 10448.

        out = np.array([[np.nan]], dtype=dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=dtype)
        labels = np.zeros(3, dtype="intp")

        group_var(out, counts, values, labels)

        assert counts[0] == 3
        assert out[0, 0] >= 0
        tm.assert_almost_equal(out[0, 0], 0.0)
|
||||
|
||||
|
||||
def test_group_var_large_inputs():
    """``group_var`` of 10**6 values near 10**12 is close to 1/12."""
    dtype = np.float64
    prng = np.random.default_rng(2)

    out = np.array([[np.nan]], dtype=dtype)
    counts = np.array([0], dtype="int64")
    # Uniform samples in [0, 1) shifted by a large offset
    values = (prng.random((10**6, 1)) + 10**12).astype(dtype)
    labels = np.zeros(10**6, dtype="intp")

    group_var(out, counts, values, labels)

    assert counts[0] == 10**6
    tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
    """``group_ohlc`` yields [first, max, min, last] per group, or NaN for an all-NaN group."""
    obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)

    bins = np.array([6, 12, 20])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    # Bin edges -> one label per row: 6 rows of 0, 6 rows of 1, 8 rows of 2
    labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

    func = libgroupby.group_ohlc
    func(out, counts, obj[:, None], labels)

    def _ohlc(group):
        # Reference implementation for a single group
        if isna(group).all():
            return np.repeat(np.nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])

    tm.assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

    # Blank out the first group and check that its output row becomes NaN
    obj[:6] = np.nan
    func(out, counts, obj[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float32, np.float64])
@pytest.mark.parametrize(
    "pd_op, np_op",
    [
        (group_cumsum, np.cumsum),
        (group_cumprod, np.cumprod),
    ],
)
def test_cython_group_transform(dtype, pd_op, np_op):
    """With one group, the grouped cumulative op equals the NumPy cumulative op.

    Parameters
    ----------
    dtype : numpy dtype
        Dtype of the input column.
    pd_op : callable
        ``group_cumsum`` or ``group_cumprod``.
    np_op : callable
        The matching NumPy function.
    """
    # see gh-4095
    is_datetimelike = False

    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    answer = np.zeros_like(data)

    labels = np.array([0, 0, 0, 0], dtype=np.intp)
    ngroups = 1
    pd_op(answer, data, labels, ngroups, is_datetimelike)

    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
|
||||
|
||||
|
||||
def test_cython_group_transform_algos():
    """``group_cumprod``/``group_cumsum`` with a NaN in the data, and with timedelta data."""
    # see gh-4095
    is_datetimelike = False

    # with nans
    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
    ngroups = 1

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumprod(actual, data, labels, ngroups, is_datetimelike)
    # The NaN stays NaN in place; the running product continues after it
    expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    actual = np.zeros_like(data)
    actual.fill(np.nan)
    group_cumsum(actual, data, labels, ngroups, is_datetimelike)
    expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)

    # timedelta
    is_datetimelike = True
    data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
    actual = np.zeros_like(data, dtype="int64")
    # The data is handed over as its int64 view
    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
    expected = np.array(
        [
            np.timedelta64(1, "ns"),
            np.timedelta64(2, "ns"),
            np.timedelta64(3, "ns"),
            np.timedelta64(4, "ns"),
            np.timedelta64(5, "ns"),
        ]
    )
    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
|
||||
|
||||
|
||||
def test_cython_group_mean_datetimelike():
    """With ``is_datetimelike=True`` the NaT entry is left out of the mean (2, 4 -> 3)."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    # Timedeltas are passed as the float64 cast of their int64 view
    data = (
        np.array(
            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=True)

    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
|
||||
|
||||
|
||||
def test_cython_group_mean_wrong_min_count():
    """``group_mean`` raises ``AssertionError`` mentioning ``min_count`` when given ``min_count=0``."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.zeros(1, dtype="int64")
    data = np.zeros(1, dtype="float64")[:, None]
    labels = np.zeros(1, dtype=np.intp)

    with pytest.raises(AssertionError, match="min_count"):
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
|
||||
|
||||
|
||||
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    """With ``is_datetimelike=False`` NaT-valued floats are averaged like ordinary numbers."""
    actual = np.zeros(shape=(1, 1), dtype="float64")
    counts = np.array([0], dtype="int64")
    data = (
        np.array(
            [np.timedelta64("NaT"), np.timedelta64("NaT")],
            dtype="m8[ns]",
        )[:, None]
        .view("int64")
        .astype("float64")
    )
    labels = np.zeros(len(data), dtype=np.intp)

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    # Expected value is the plain arithmetic mean of the two entries
    tm.assert_numpy_array_equal(
        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
    )
|
||||
|
||||
|
||||
def test_cython_group_mean_Inf_at_beginning_and_end():
|
||||
# GH 50367
|
||||
actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
|
||||
counts = np.array([0, 0], dtype="int64")
|
||||
data = np.array(
|
||||
[[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
|
||||
dtype="float64",
|
||||
)
|
||||
labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
|
||||
|
||||
group_mean(actual, counts, data, labels, is_datetimelike=False)
|
||||
|
||||
expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
actual,
|
||||
expected,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, out",
|
||||
[
|
||||
([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
|
||||
([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
|
||||
([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
|
||||
([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
|
||||
],
|
||||
)
|
||||
def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
|
||||
# GH #53606
|
||||
actual = np.array([[np.nan], [np.nan]], dtype="float64")
|
||||
counts = np.array([0, 0], dtype="int64")
|
||||
data = np.array(values, dtype="float64")
|
||||
labels = np.array([0, 1, 1], dtype=np.intp)
|
||||
|
||||
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
|
||||
|
||||
expected = np.array(out, dtype="float64")
|
||||
|
||||
tm.assert_numpy_array_equal(
|
||||
actual,
|
||||
expected,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, expected_values",
|
||||
[
|
||||
(np.finfo(np.float64).max, [[np.inf]]),
|
||||
(np.finfo(np.float64).min, [[-np.inf]]),
|
||||
(
|
||||
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).max * 1j),
|
||||
[[complex(-np.inf, np.inf)]],
|
||||
),
|
||||
(
|
||||
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).min * 1j),
|
||||
[[complex(np.inf, -np.inf)]],
|
||||
),
|
||||
(
|
||||
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).max * 1j),
|
||||
[[complex(np.inf, np.inf)]],
|
||||
),
|
||||
(
|
||||
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).min * 1j),
|
||||
[[complex(-np.inf, -np.inf)]],
|
||||
),
|
||||
(
|
||||
np.complex128(3.0 + np.finfo(np.float64).min * 1j),
|
||||
[[complex(9.0, -np.inf)]],
|
||||
),
|
||||
(
|
||||
np.complex128(np.finfo(np.float64).max + 3 * 1j),
|
||||
[[complex(np.inf, 9.0)]],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_cython_group_sum_overflow(values, expected_values):
|
||||
# GH-60303
|
||||
data = np.array([[values] for _ in range(3)])
|
||||
labels = np.array([0, 0, 0], dtype=np.intp)
|
||||
counts = np.array([0], dtype="int64")
|
||||
|
||||
expected = np.array(expected_values, dtype=values.dtype)
|
||||
actual = np.zeros_like(expected)
|
||||
|
||||
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
|
||||
|
||||
tm.assert_numpy_array_equal(actual, expected)
|
||||
89
venv/Lib/site-packages/pandas/tests/groupby/test_missing.py
Normal file
89
venv/Lib/site-packages/pandas/tests/groupby/test_missing.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    """The columns' name ("idx") survives a grouped ``ffill``/``bfill``."""
    # GH: 29764 groupby loses index sometimes
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=Index(["type", "a", "b"], name="idx"),
    )
    df_grouped = df.groupby(["type"])[["a", "b"]]
    result = getattr(df_grouped, func)().columns
    expected = Index(["a", "b"], name="idx")
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    """Grouped ``ffill``/``bfill`` works on a frame with two columns named "field1"."""
    # GH: 25610 ValueError with duplicate column names
    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    df2 = DataFrame({"field1": [1, np.nan, 4]})
    # Concatenating side by side produces the duplicated "field1" label
    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    result = getattr(df_grouped, func)()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    """Grouped ``ffill``/``bfill`` when the second group key may be NaN.

    Parameters
    ----------
    dropna : bool
        Passed to ``groupby``; supplied by a fixture defined outside this file.
    method : {"ffill", "bfill"}
        Fill method under test.
    has_nan_group : bool
        If True, the second group's key is NaN instead of "b".
    """
    # GH 34725

    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])

    # Reindexing with -1 (a label that does not exist) inserts all-NaN rows
    ridx = [-1, 0, -1, -1, 1, -1]
    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

    group_b = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

    # Keyed by (method, dropna, has_nan_group); values are reindex labels
    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }

    ridx = expected_rows.get((method, dropna, has_nan_group))
    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    """``min_count`` decides whether a group with one valid value reports it or NaN.

    Parameters
    ----------
    func : str
        Name of the groupby reduction to call.
    min_count : int
        Value passed through to the reduction.
    value : float
        Expected result for column "b", which has exactly one non-NaN entry.
    """
    # GH#37821
    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
    result = getattr(df.groupby("a"), func)(min_count=min_count)
    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_indices_with_missing():
    """``GroupBy.indices`` has no entry for the row whose "a" key is NaN."""
    # GH 9304
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    g = df.groupby(["a", "b"])
    result = g.indices
    # Dict equality works here because every array holds a single element
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected
|
||||
82
venv/Lib/site-packages/pandas/tests/groupby/test_numba.py
Normal file
82
venv/Lib/site-packages/pandas/tests/groupby/test_numba.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_arm
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.util.version import Version
|
||||
|
||||
# Module-level marks applied to every test in this file.
pytestmark = [pytest.mark.single_cpu]

# Skip the whole module when numba cannot be imported.
numba = pytest.importorskip("numba")
# Additionally skip on ARM when the installed numba compares equal to 0.61.
pytestmark.append(
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    )
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
    """Compare groupby reductions run with ``engine="numba"`` against the default engine.

    ``sort``, ``nogil``, ``parallel``, ``nopython`` and
    ``numba_supported_reductions`` are fixtures defined outside this file; the
    last one unpacks into a method name and its keyword arguments.
    """

    def test_cython_vs_numba_frame(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        """DataFrame groupby: numba result equals the default-engine result."""
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_frame_equal(result, expected)

    def test_cython_vs_numba_getitem(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        """Single column selected from a DataFrame groupby: numba equals default."""
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = df.groupby("a", sort=sort)["c"]
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_cython_vs_numba_series(
        self, sort, nogil, parallel, nopython, numba_supported_reductions
    ):
        """Series grouped by its index level: numba equals default."""
        func, kwargs = numba_supported_reductions
        ser = Series(range(3), index=[1, 2, 1], name="foo")
        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
        gb = ser.groupby(level=0, sort=sort)
        result = getattr(gb, func)(
            engine="numba", engine_kwargs=engine_kwargs, **kwargs
        )
        expected = getattr(gb, func)(**kwargs)
        tm.assert_series_equal(result, expected)

    def test_as_index_false_unsupported(self, numba_supported_reductions):
        """``as_index=False`` with ``engine="numba"`` raises ``NotImplementedError``."""
        func, kwargs = numba_supported_reductions
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a", as_index=False)
        with pytest.raises(NotImplementedError, match="as_index=False"):
            getattr(gb, func)(engine="numba", **kwargs)

    def test_no_engine_doesnt_raise(self):
        """``agg`` with a dict works while the global ``compute.use_numba`` option is on."""
        # GH55520
        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
        gb = df.groupby("a")
        # Make sure behavior of functions w/out engine argument don't raise
        # when the global use_numba option is set
        with option_context("compute.use_numba", True):
            res = gb.agg({"b": "first"})
        # Reference result is computed with the option back at its previous value
        expected = gb.agg({"b": "first"})
        tm.assert_frame_equal(res, expected)
|
||||
445
venv/Lib/site-packages/pandas/tests/groupby/test_numeric_only.py
Normal file
445
venv/Lib/site-packages/pandas/tests/groupby/test_numeric_only.py
Normal file
@@ -0,0 +1,445 @@
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.groupby import get_groupby_method_args
|
||||
|
||||
|
||||
class TestNumericOnly:
    # make sure that we are passing thru kwargs to our agg functions

    @pytest.fixture
    def df(self):
        # GH3668
        # GH5724
        # One grouping key plus one column for each dtype exercised below.
        data = {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": Series(["a", "b", "c"], dtype="str"),
            "object": Series(["a", "b", "c"], dtype=object),
            "category_string": Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": date_range("20130101", periods=3),
            "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        }
        # The explicit column order is the dict's insertion order.
        return DataFrame(data, columns=list(data))

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        # mean / median
        numeric_columns = Index(["int", "float", "category_int"])
        group_index = Index([1, 2], name="group")

        # Only the columns named in ``columns=`` end up in the expected frame.
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=group_index,
            columns=["int", "float", "category_int"],
        )

        grouped = df.groupby("group")
        result = getattr(grouped, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        self._check(df, method, expected.columns, numeric_columns)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        # TODO: min, max *should* handle
        # categorical (ordered) dtype
        columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        columns = Index(
            [
                "int",
                "float",
                "string",
                "object",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        numeric_columns = Index(["int", "float", "category_int"])
        if method == "cumsum":
            # cumsum loses string
            all_columns = Index(["int", "float", "category_int", "timedelta"])
        else:
            all_columns = Index(
                ["int", "float", "string", "category_int", "timedelta"]
            )

        self._check(df, method, all_columns, numeric_columns)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        columns = Index(["int", "float", "category_int"])
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        # like min, max, but don't include strings
        # GH#15561: numeric_only=False set by default like min/max
        columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )
        self._check(df, method, columns, columns)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        # Call ``method`` on ``df.groupby("group")`` twice: once with default
        # arguments and once with numeric_only=False, checking each time for
        # either the expected error or the expected result columns.
        kernel = getattr(df.groupby("group"), method)

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if method.startswith("cum"):
            exception = (NotImplementedError, TypeError)
        else:
            exception = TypeError

        object_failed = re.escape(f"agg function failed [how->{method},dtype->object]")
        string_failed = re.escape(f"agg function failed [how->{method},dtype->string]")
        str_unsupported = f"dtype 'str' does not support operation '{method}'"
        unordered = f"Cannot perform {method} with non-ordered Categorical"

        default_patterns = None
        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            default_patterns = [
                "Categorical is not ordered",
                unordered,
                object_failed,
                # cumsum/cummin/cummax/cumprod
                "function is not implemented for this dtype",
                str_unsupported,
            ]
        elif method in ("sum", "mean", "median", "prod"):
            default_patterns = [
                "category type does not support sum operations",
                object_failed,
                string_failed,
                str_unsupported,
            ]

        if default_patterns is not None:
            with pytest.raises(exception, match="|".join(default_patterns)):
                kernel()
        else:
            result = kernel()
            tm.assert_index_equal(result.columns, expected_columns_numeric)

        if method in ("first", "last"):
            result = kernel(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)
            return

        explicit_patterns = [
            "Categorical is not ordered",
            "category type does not support",
            "function is not implemented for this dtype",
            unordered,
            object_failed,
            string_failed,
            str_unsupported,
        ]
        with pytest.raises(exception, match="|".join(explicit_patterns)):
            kernel(numeric_only=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("kurt", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    # GH#46072
    # has_arg: Whether the op has a numeric_only arg
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})

    args = get_groupby_method_args(kernel, df)
    kwargs = {}
    if numeric_only is not lib.no_default:
        kwargs["numeric_only"] = numeric_only

    method = getattr(df.groupby(keys), kernel)

    # corrwith calls are expected to emit a deprecation warning; nothing else is.
    if kernel == "corrwith":
        warn, warn_msg = Pandas4Warning, "DataFrameGroupBy.corrwith is deprecated"
    else:
        warn, warn_msg = None, ""

    works_on_any_dtype = kernel in ("first", "last") or (
        # kernels that work on any dtype and don't have numeric_only arg
        kernel in ("any", "all", "bfill", "ffill", "nth", "nunique")
        and numeric_only is lib.no_default
    )

    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif works_on_any_dtype:
        result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if kernel.startswith("cum") else TypeError

        kernel_specific = {
            "quantile": "dtype 'object' does not support operation 'quantile'",
            "idxmin": "'<' not supported between instances of 'type' and 'type'",
            "idxmax": "'>' not supported between instances of 'type' and 'type'",
        }
        if kernel in kernel_specific:
            msg = kernel_specific[kernel]
        else:
            msg = "|".join(
                [
                    "not allowed for this dtype",
                    "cannot be performed against 'object' dtypes",
                    "must be a string or a real number",
                    "unsupported operand type",
                    "function is not implemented for this dtype",
                    re.escape(f"agg function failed [how->{kernel},dtype->object]"),
                ]
            )
        with pytest.raises(exception, match=msg):
            with tm.assert_produces_warning(warn, match=warn_msg):
                method(*args, **kwargs)
    elif not has_arg and numeric_only is not lib.no_default:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    # GH#46560
    grouper = [0, 0, 1]
    values = [1, 0, 0]

    ser = Series(values, dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)
    # Same data without the dtype override, used as the reference result.
    expected_method = getattr(Series(values).groupby(grouper), groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
        "kurt",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if dtype is object:
        if groupby_func in fails_on_numeric_object:
            msg = (
                "dtype 'object' does not support operation 'quantile'"
                if groupby_func == "quantile"
                else "is not supported for object dtype"
            )
            with pytest.raises(TypeError, match=msg):
                method(*args)
        else:
            result = method(*args)
            expected = expected_method(*args)
            if groupby_func in obj_result:
                expected = expected.astype(object)
            tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "kurt",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, numeric_only=True)
    elif dtype is object:
        object_msgs = [
            "SeriesGroupBy.sem called with numeric_only=True and dtype object",
            "Series.skew does not allow numeric_only=True with non-numeric",
            "cum(sum|prod|min|max) is not supported for object dtype",
            r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
        ]
        with pytest.raises(TypeError, match="|".join(object_msgs)):
            method(*args, numeric_only=True)
    elif dtype == bool and groupby_func == "quantile":
        with pytest.raises(TypeError, match="Cannot use quantile with bool dtype"):
            # GH#51424
            method(*args, numeric_only=False)
    else:
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
|
||||
80
venv/Lib/site-packages/pandas/tests/groupby/test_pipe.py
Normal file
80
venv/Lib/site-packages/pandas/tests/groupby/test_pipe.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_pipe():
    # Test the pipe method of DataFrameGroupBy.
    # Issue #17871

    random_state = np.random.default_rng(2)

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": random_state.standard_normal(8),
            "C": random_state.standard_normal(8),
        }
    )

    def f(dfgb):
        # GroupBy -> Series
        return dfgb.B.max() - dfgb.C.min().min()

    def square(srs):
        # Series -> Series
        return srs**2

    # This chains the GroupBy.pipe and the NDFrame.pipe methods
    result = df.groupby("A").pipe(f).pipe(square)

    index = Index(["bar", "foo"], name="A")
    expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

    # Pass (result, expected), matching test_pipe_args, so that a failure
    # reports the computed values on the left and the reference on the right.
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_pipe_args():
    # Test passing args to the pipe method of DataFrameGroupBy.
    # Issue #17871

    frame = DataFrame(
        {
            "group": ["A", "A", "B", "B", "C"],
            "x": [1.0, 2.0, 3.0, 2.0, 5.0],
            "y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
        }
    )

    def regroup_above(dfgb, arg1):
        # Keep groups whose mean of y exceeds arg1, then group again.
        kept = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
        return kept.groupby("group")

    def shifted_share(dfgb, arg2):
        totals = dfgb.sum()
        return totals / dfgb.sum().sum() + arg2

    def combine(df, arg3):
        return df.x + df.y - arg3

    grouped = frame.groupby("group")
    result = (
        grouped.pipe(regroup_above, 0).pipe(shifted_share, 10).pipe(combine, 100)
    )

    # Assert the results here
    expected = pd.Series(
        [-79.5160891089, -78.4839108911], index=Index(["A", "B"], name="group")
    )
    tm.assert_series_equal(result, expected)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])

    def sum_times_count(grp):
        return grp.sum() * grp.count()

    result = ser.groupby(ser).pipe(sum_times_count)

    expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))
    tm.assert_series_equal(result, expected)
|
||||
741
venv/Lib/site-packages/pandas/tests/groupby/test_raises.py
Normal file
741
venv/Lib/site-packages/pandas/tests/groupby/test_raises.py
Normal file
@@ -0,0 +1,741 @@
|
||||
# Only tests that raise an error and have no better location should go here.
|
||||
# Tests for specific groupby methods should go in their respective
|
||||
# test file.
|
||||
|
||||
import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import Pandas4Warning
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Grouper,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.groupby import get_groupby_method_args
|
||||
|
||||
|
||||
# The same row labels are offered to ``by`` as a list, an array and a mapping.
_BY_ROW_LABELS = [0, 0, 0, 1, 2, 2, 2, 3, 3]


@pytest.fixture(
    params=[
        "a",
        ["a"],
        ["a", "b"],
        Grouper(key="a"),
        lambda x: x % 2,
        _BY_ROW_LABELS,
        np.array(_BY_ROW_LABELS),
        dict(zip(range(9), _BY_ROW_LABELS, strict=True)),
        Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
        [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
    ]
)
def by(request):
    # Yields each supported form of the ``by`` argument in turn.
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=(True, False))
def groupby_series(request):
    # When True, the tests select column "d" from the groupby before calling.
    select_column = request.param
    return select_column
|
||||
|
||||
|
||||
@pytest.fixture
def df_with_string_col():
    # Nine rows; column "d" holds single-character strings.
    return DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df_with_datetime_col():
    # Nine rows; column "d" is one datetime scalar given for every row.
    stamp = datetime.datetime(2005, 1, 1, 10, 30, 23, 540000)
    return DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": stamp,
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def df_with_cat_col():
    # Nine rows; column "d" is an ordered Categorical whose category "d" is
    # declared but never used by any value.
    categorical = Categorical(
        ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    return DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": categorical,
        }
    )
|
||||
|
||||
|
||||
def _call_and_check(
    klass, msg, how, gb, groupby_func, args, warn_category=None, warn_msg=""
):
    """
    Run ``groupby_func`` on ``gb`` and check the warning/exception it produces.

    Parameters
    ----------
    klass : exception type, tuple of exception types, or None
        Exception expected from the call; None means the call must succeed.
    msg : str
        Pattern passed to ``pytest.raises(match=...)`` when ``klass`` is set.
    how : str
        "method" calls ``gb.<groupby_func>(*args)``, "agg" calls
        ``gb.agg(groupby_func, *args)``, anything else calls
        ``gb.transform(groupby_func, *args)``.
    gb : groupby object
    groupby_func : str or callable
    args : sequence
        Positional arguments forwarded to the call.
    warn_category, warn_msg
        Forwarded to ``tm.assert_produces_warning``.
    """

    def invoke():
        if how == "method":
            getattr(gb, groupby_func)(*args)
            return
        dispatcher = gb.agg if how == "agg" else gb.transform
        dispatcher(groupby_func, *args)

    with tm.assert_produces_warning(
        warn_category, match=warn_msg, check_stacklevel=False
    ):
        if klass is not None:
            with pytest.raises(klass, match=msg):
                invoke()
        else:
            invoke()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(
    how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
):
    df = df_with_string_col
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # (exception, message) expected for each kernel; (None, "") means success.
    no_error = (None, "")
    conversion_error = (ValueError, "could not convert string to float")
    expectations = dict.fromkeys(
        [
            "all",
            "any",
            "bfill",
            "count",
            "cumcount",
            "ffill",
            "first",
            "idxmax",
            "idxmin",
            "last",
            "max",
            "min",
            "ngroup",
            "nunique",
            "rank",
            "shift",
            "size",
            "sum",
        ],
        no_error,
    )
    for cumulative in ("cummax", "cummin", "cumprod", "cumsum"):
        expectations[cumulative] = (
            (NotImplementedError, TypeError),
            f"(function|{cumulative}) is not (implemented|supported) "
            "for (this|object) dtype",
        )
    for reducer in ("mean", "median", "prod"):
        expectations[reducer] = (
            TypeError,
            re.escape(f"agg function failed [how->{reducer},dtype->object]"),
        )
    for converting in ("sem", "skew", "kurt", "std"):
        expectations[converting] = conversion_error
    expectations.update(
        {
            "corrwith": (TypeError, "Could not convert"),
            "diff": (TypeError, "unsupported operand type"),
            "pct_change": (TypeError, "unsupported operand type"),
            "quantile": (
                TypeError,
                "dtype 'object' does not support operation 'quantile'",
            ),
            "var": (
                TypeError,
                re.escape("agg function failed [how->var,dtype->"),
            ),
        }
    )
    klass, msg = expectations[groupby_func]

    if using_infer_string:
        unsupported_for_str = (
            "prod",
            "mean",
            "median",
            "cumsum",
            "cumprod",
            "std",
            "sem",
            "var",
            "skew",
            "kurt",
            "quantile",
        )
        if groupby_func in unsupported_for_str:
            msg = f"dtype 'str' does not support operation '{groupby_func}'"
            if groupby_func in ("sem", "std", "skew", "kurt"):
                # The object-dtype raises ValueError when trying to convert to numeric.
                klass = TypeError
        elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
            # This doesn't go through EA._groupby_op so the message isn't controlled
            # there.
            msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'"
        elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
            # This doesn't go through EA._groupby_op so the message isn't controlled
            # there.
            msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'"
        elif groupby_func in ("cummin", "cummax"):
            msg = msg.replace("object", "str")
        elif groupby_func == "corrwith":
            msg = "Cannot perform reduction 'mean' with string dtype"

    warn_category, warn_msg = None, ""
    if groupby_func == "corrwith":
        warn_category = Pandas4Warning
        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
    grouped = df_with_string_col.groupby(by=by)
    target = grouped["d"] if groupby_series else grouped

    def always_fails(x):
        raise TypeError("Test error message")

    # The TypeError raised inside the UDF must reach the caller.
    apply_udf = getattr(target, how)
    with pytest.raises(TypeError, match="Test error message"):
        apply_udf(always_fails)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_string_np(
    how,
    by,
    groupby_series,
    groupby_func_np,
    df_with_string_col,
    using_infer_string,
):
    # GH#50749
    grouped = df_with_string_col.groupby(by=by)
    gb = grouped["d"] if groupby_series else grouped

    mean_msg = "|".join(
        [
            "Could not convert string .* to numeric",
            "Cannot perform reduction 'mean' with string dtype",
        ]
    )
    expectations = {
        np.sum: (None, ""),
        np.mean: (TypeError, mean_msg),
    }
    klass, msg = expectations[groupby_func_np]

    if using_infer_string:
        if groupby_func_np is np.mean:
            klass = TypeError
        func_name = groupby_func_np.__name__
        msg = f"Cannot perform reduction '{func_name}' with string dtype"

    _call_and_check(klass, msg, how, gb, groupby_func_np, ())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_datetime(
    how, by, groupby_series, groupby_func, df_with_datetime_col
):
    df = df_with_datetime_col
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # (exception, message) expected for each kernel; (None, "") means success.
    expectations = dict.fromkeys(
        [
            "bfill",
            "count",
            "cumcount",
            "cummax",
            "cummin",
            "diff",
            "ffill",
            "first",
            "idxmax",
            "idxmin",
            "last",
            "max",
            "mean",
            "median",
            "min",
            "ngroup",
            "nunique",
            "quantile",
            "rank",
            "sem",
            "shift",
            "size",
            "std",
        ],
        (None, ""),
    )
    for boolean_op in ("all", "any"):
        expectations[boolean_op] = (
            TypeError,
            f"'{boolean_op}' with datetime64 dtypes is no longer supported",
        )
    for unsupported in ("cumprod", "cumsum", "prod", "var"):
        expectations[unsupported] = (
            TypeError,
            f"datetime64 type does not support operation '{unsupported}'",
        )
    for moment in ("skew", "kurt"):
        expectations[moment] = (
            TypeError,
            "|".join(
                [
                    r"dtype datetime64\[ns\] does not support operation",
                    f"datetime64 type does not support operation '{moment}'",
                ]
            ),
        )
    expectations.update(
        {
            "corrwith": (TypeError, "cannot perform __mul__ with this index type"),
            "pct_change": (
                TypeError,
                "cannot perform __truediv__ with this index type",
            ),
            "sum": (TypeError, "datetime64 type does not support operation 'sum"),
        }
    )
    klass, msg = expectations[groupby_func]

    warn_category, warn_msg = None, ""
    if groupby_func == "corrwith":
        warn_category = Pandas4Warning
        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
    grouped = df_with_datetime_col.groupby(by=by)
    target = grouped["d"] if groupby_series else grouped

    def always_fails(x):
        raise TypeError("Test error message")

    # The TypeError raised inside the UDF must reach the caller.
    apply_udf = getattr(target, how)
    with pytest.raises(TypeError, match="Test error message"):
        apply_udf(always_fails)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_datetime_np(
    how, by, groupby_series, groupby_func_np, df_with_datetime_col
):
    # GH#50749
    grouped = df_with_datetime_col.groupby(by=by)
    gb = grouped["d"] if groupby_series else grouped

    sum_msg = re.escape("datetime64[us] does not support operation 'sum'")
    expectations = {
        np.sum: (TypeError, sum_msg),
        np.mean: (None, ""),
    }
    klass, msg = expectations[groupby_func_np]
    _call_and_check(klass, msg, how, gb, groupby_func_np, ())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"])
def test_groupby_raises_timedelta(func):
    # Column "d" is one timedelta scalar given for every row.
    one_day = datetime.timedelta(days=1)
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": one_day,
        }
    )
    grouped = frame.groupby(by="a")
    expected_msg = "timedelta64 type does not support .* operations"

    _call_and_check(TypeError, expected_msg, "method", grouped, func, [])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category(
    how, by, groupby_series, groupby_func, df_with_cat_col
):
    # GH#50749
    df = df_with_cat_col
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # (exception, message) expected for each kernel; (None, "") means success.
    expectations = dict.fromkeys(
        [
            "all",
            "any",
            "bfill",
            "count",
            "cumcount",
            "ffill",
            "first",
            "idxmax",
            "idxmin",
            "last",
            "max",
            "min",
            "ngroup",
            "nunique",
            "rank",
            "shift",
            "size",
        ],
        (None, ""),
    )
    for cumulative in ("cummax", "cummin", "cumprod", "cumsum"):
        expectations[cumulative] = (
            (NotImplementedError, TypeError),
            f"(category type does not support {cumulative} operations|"
            "category dtype not supported|"
            f"{cumulative} is not supported for category dtype)",
        )
    for aggregation in ("mean", "median", "sem", "std", "var"):
        expectations[aggregation] = (
            TypeError,
            "|".join(
                [
                    f"'Categorical' .* does not support operation '{aggregation}'",
                    f"category dtype does not support aggregation '{aggregation}'",
                ]
            ),
        )
    for moment in ("skew", "kurt"):
        expectations[moment] = (
            TypeError,
            "|".join(
                [
                    f"dtype category does not support operation '{moment}'",
                    f"category type does not support {moment} operations",
                ]
            ),
        )
    for reducer in ("prod", "sum"):
        expectations[reducer] = (
            TypeError,
            f"category type does not support {reducer} operations",
        )
    expectations.update(
        {
            "corrwith": (
                TypeError,
                r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
            ),
            "diff": (
                TypeError,
                r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
            ),
            "pct_change": (
                TypeError,
                r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
            ),
            "quantile": (TypeError, "No matching signature found"),
        }
    )
    klass, msg = expectations[groupby_func]

    warn_category, warn_msg = None, ""
    if groupby_func == "corrwith":
        warn_category = Pandas4Warning
        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
    """A TypeError raised by a user function surfaces from agg/transform (GH#50749)."""
    grouped = df_with_cat_col.groupby(by=by)
    target = grouped["d"] if groupby_series else grouped

    def always_raises(values):
        raise TypeError("Test error message")

    with pytest.raises(TypeError, match="Test error message"):
        getattr(target, how)(always_raises)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_category_np(
    how, by, groupby_series, groupby_func_np, df_with_cat_col
):
    """Passing np.sum / np.mean to agg/transform on categorical data (GH#50749)."""
    grouped = df_with_cat_col.groupby(by=by)
    if groupby_series:
        grouped = grouped["d"]

    # expected exception class and message pattern per numpy function
    expected_errors = {
        np.sum: (TypeError, "dtype category does not support operation 'sum'"),
        np.mean: (TypeError, "dtype category does not support operation 'mean'"),
    }
    exc_type, exc_msg = expected_errors[groupby_func_np]
    _call_and_check(exc_type, exc_msg, how, grouped, groupby_func_np, ())
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:In a future version, the keys")
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
    how,
    by,
    groupby_series,
    groupby_func,
    observed,
    df_with_cat_col,
):
    """
    Check every groupby method against its expected error on categorical data.

    GH#50749. Column "a" of the fixture frame is overwritten with an ordered
    Categorical whose categories include "d", a value that does not occur in
    the data.
    """
    frame = df_with_cat_col
    frame["a"] = Categorical(
        list("aaaabbbbc"),
        categories=list("abcd"),
        ordered=True,
    )
    method_args = get_groupby_method_args(groupby_func, frame)
    grouped = frame.groupby(by=by, observed=observed)

    if groupby_series:
        grouped = grouped["d"]

        if groupby_func == "corrwith":
            assert not hasattr(grouped, "corrwith")
            return

    has_empty_groups = not observed and any(
        members.empty for members in grouped.groups.values()
    )
    if how == "transform":
        # empty groups will be ignored
        has_empty_groups = False

    no_error = (None, "")
    idx_outcome = (
        (ValueError, "empty group due to unobserved categories")
        if has_empty_groups
        else no_error
    )

    def cumulative(op):
        # cumulative ops may fail with either exception class
        return (
            (NotImplementedError, TypeError),
            f"({op} is not supported for category dtype|"
            "category dtype not supported|"
            f"category type does not support {op} operations)",
        )

    def type_error(*patterns):
        # TypeError matching any one of the given message patterns
        return (TypeError, "|".join(patterns))

    outcomes = {
        "all": no_error,
        "any": no_error,
        "bfill": no_error,
        "corrwith": (
            TypeError,
            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
        ),
        "count": no_error,
        "cumcount": no_error,
        "cummax": cumulative("cummax"),
        "cummin": cumulative("cummin"),
        "cumprod": cumulative("cumprod"),
        "cumsum": cumulative("cumsum"),
        "diff": (TypeError, "unsupported operand type"),
        "ffill": no_error,
        "first": no_error,
        "idxmax": idx_outcome,
        "idxmin": idx_outcome,
        "last": no_error,
        "max": no_error,
        "mean": (TypeError, "category dtype does not support aggregation 'mean'"),
        "median": (TypeError, "category dtype does not support aggregation 'median'"),
        "min": no_error,
        "ngroup": no_error,
        "nunique": no_error,
        "pct_change": (TypeError, "unsupported operand type"),
        "prod": (TypeError, "category type does not support prod operations"),
        "quantile": (TypeError, "No matching signature found"),
        "rank": no_error,
        "sem": type_error(
            "'Categorical' .* does not support operation 'sem'",
            "category dtype does not support aggregation 'sem'",
        ),
        "shift": no_error,
        "size": no_error,
        "skew": type_error(
            "category type does not support skew operations",
            "dtype category does not support operation 'skew'",
        ),
        "kurt": type_error(
            "category type does not support kurt operations",
            "dtype category does not support operation 'kurt'",
        ),
        "std": type_error(
            "'Categorical' .* does not support operation 'std'",
            "category dtype does not support aggregation 'std'",
        ),
        "sum": (TypeError, "category type does not support sum operations"),
        "var": type_error(
            "'Categorical' .* does not support operation 'var'",
            "category dtype does not support aggregation 'var'",
        ),
    }
    klass, msg = outcomes[groupby_func]

    is_corrwith = groupby_func == "corrwith"
    warn_category = Pandas4Warning if is_corrwith else None
    warn_msg = "DataFrameGroupBy.corrwith is deprecated" if is_corrwith else ""
    _call_and_check(
        klass, msg, how, grouped, groupby_func, method_args, warn_category, warn_msg
    )
|
||||
1538
venv/Lib/site-packages/pandas/tests/groupby/test_reductions.py
Normal file
1538
venv/Lib/site-packages/pandas/tests/groupby/test_reductions.py
Normal file
File diff suppressed because it is too large
Load Diff
984
venv/Lib/site-packages/pandas/tests/groupby/test_timegrouper.py
Normal file
984
venv/Lib/site-packages/pandas/tests/groupby/test_timegrouper.py
Normal file
@@ -0,0 +1,984 @@
|
||||
"""
|
||||
test with the TimeGrouper / grouping with datetimes
|
||||
"""
|
||||
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
timezone,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
offsets,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.groupby.grouper import Grouper
|
||||
from pandas.core.groupby.ops import BinGrouper
|
||||
|
||||
|
||||
@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame with a "Quantity" column and a "Date" column containing one NaT.

    Used by groupby_with_truncated_bingrouper; kept as its own fixture so
    test_groupby_apply_timegrouper_with_nat_apply_squeeze can reuse it.
    """
    quantities = [18, 3, 5, 1, 9, 3]
    dates = [
        Timestamp(2013, 9, 1, 13, 0),
        Timestamp(2013, 9, 1, 13, 5),
        Timestamp(2013, 10, 1, 20, 0),
        Timestamp(2013, 10, 3, 10, 0),
        pd.NaT,
        Timestamp(2013, 9, 2, 14, 0),
    ]
    return DataFrame({"Quantity": quantities, "Date": dates})
|
||||
|
||||
|
||||
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy of the shared frame on "Date" using 5-day bins.

    Before returning, the fixture asserts that
    len(gb._grouper.result_index) differs from len(gb._grouper.codes),
    which is the case the dependent tests are meant to exercise.

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    as either the index or an index level.
    """
    five_day_bins = Grouper(key="Date", freq="5D")
    grouped = frame_for_truncated_bingrouper.groupby(five_day_bins)

    # check we're testing the case we're interested in
    grouper = grouped._grouper
    assert len(grouper.result_index) != len(grouper.codes)

    return grouped
|
||||
|
||||
|
||||
class TestGroupBy:
|
||||
def test_groupby_with_timegrouper(self, using_infer_string):
|
||||
# GH 4161
|
||||
# TimeGrouper requires a sorted index
|
||||
# also verifies that the resultant index has the correct name
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
|
||||
"Quantity": [18, 3, 5, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 9, 1, 13, 0),
|
||||
datetime(2013, 9, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 3, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 9, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
# GH 6908 change target column's order
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
df = df.set_index(["Date"])
|
||||
|
||||
exp_dti = date_range(
|
||||
"20130901",
|
||||
"20131205",
|
||||
freq="5D",
|
||||
name="Date",
|
||||
inclusive="left",
|
||||
unit=df.index.unit,
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"Buyer": "" if using_infer_string else 0, "Quantity": 0},
|
||||
index=exp_dti,
|
||||
)
|
||||
# Cast to object/str to avoid implicit cast when setting
|
||||
# entry to "CarlCarlCarl"
|
||||
expected = expected.astype({"Buyer": object})
|
||||
if using_infer_string:
|
||||
expected = expected.astype({"Buyer": "str"})
|
||||
expected.iloc[0, 0] = "CarlCarlCarl"
|
||||
expected.iloc[6, 0] = "CarlCarl"
|
||||
expected.iloc[18, 0] = "Joe"
|
||||
expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")
|
||||
|
||||
result1 = df.resample("5D").sum()
|
||||
tm.assert_frame_equal(result1, expected)
|
||||
|
||||
df_sorted = df.sort_index()
|
||||
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
|
||||
tm.assert_frame_equal(result2, expected)
|
||||
|
||||
result3 = df.groupby(Grouper(freq="5D")).sum()
|
||||
tm.assert_frame_equal(result3, expected)
|
||||
|
||||
@pytest.mark.parametrize("should_sort", [True, False])
def test_groupby_with_timegrouper_methods(self, should_sort):
    """Grouping by Grouper(freq="6ME") exposes the usual GroupBy API (GH 3881)."""
    frame = DataFrame(
        {
            "Branch": ["A", "A", "A", "A", "A", "B"],
            "Buyer": ["Carl", "Mark", "Carl", "Joe", "Joe", "Carl"],
            "Quantity": [1, 3, 5, 8, 9, 3],
            "Date": [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ],
        }
    )
    if should_sort:
        frame = frame.sort_values(by="Quantity", ascending=False)

    # keep "Date" as a column as well as the index
    frame = frame.set_index("Date", drop=False)
    grouped = frame.groupby(Grouper(freq="6ME"))

    assert grouped.group_keys
    assert isinstance(grouped._grouper, BinGrouper)

    group_map = grouped.groups
    assert isinstance(group_map, dict)
    assert len(group_map) == 3
|
||||
|
||||
def test_timegrouper_with_reg_groups(self):
    """
    Combine a time-based Grouper with a regular column key (GH 3794).

    Part one groups a date-indexed frame by year-end and by 6-month-start
    bins together with "Buyer". Part two uses data from two consecutive
    days and checks daily and month-end bins, addressing the dates by index,
    by ``key=``, and by ``level=`` (name and position), plus the error cases
    for an unknown key, an unknown level, and key and level given together.
    Every check runs on the frame in original order and sorted by "Quantity".
    """
    df_original = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ],
        }
    ).set_index("Date")

    df_sorted = df_original.sort_values(by="Quantity", ascending=False)

    for df in [df_original, df_sorted]:
        # yearly bins: everything lands on 2013-12-31
        expected = DataFrame(
            {
                "Buyer": "Carl Joe Mark".split(),
                "Quantity": [10, 18, 3],
                "Date": [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])

        result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        # 6-month-start bins: labelled 2013-01-01 and 2013-07-01
        expected = DataFrame(
            {
                "Buyer": "Carl Mark Carl Joe".split(),
                "Quantity": [1, 3, 9, 18],
                "Date": [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])
        result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

    df_original = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ],
        }
    ).set_index("Date")

    df_sorted = df_original.sort_values(by="Quantity", ascending=False)
    for df in [df_original, df_sorted]:
        expected = DataFrame(
            {
                "Buyer": "Carl Joe Mark Carl Joe".split(),
                "Quantity": [6, 8, 3, 4, 10],
                "Date": [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])

        result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
        expected = DataFrame(
            {
                "Buyer": "Carl Joe Mark".split(),
                "Quantity": [10, 18, 3],
                "Date": [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])
        tm.assert_frame_equal(result, expected)

        # passing the name
        df = df.reset_index()
        result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
            df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()

        # passing the level
        df = df.set_index("Date")
        result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)
        result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)

        with pytest.raises(ValueError, match="The level foo is not valid"):
            df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()

        # multi names: "Date" now exists both as the index and as a column
        # whose values are the index shifted by two month ends
        df = df.copy()
        df["Date"] = df.index + offsets.MonthEnd(2)
        result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        expected = DataFrame(
            {
                "Buyer": "Carl Joe Mark".split(),
                "Quantity": [10, 18, 3],
                "Date": [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])
        tm.assert_frame_equal(result, expected)

        # error as we have both a level and a name!
        msg = "The Grouper cannot specify both a key and a level!"
        with pytest.raises(ValueError, match=msg):
            df.groupby(
                [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
            ).sum()

        # single groupers
        expected = DataFrame(
            [[31]],
            columns=["Quantity"],
            index=DatetimeIndex(
                [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
            ),
        )
        result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        # grouping on the shifted "Date" column moves the label one month end on
        expected.index = expected.index.shift(1)
        assert expected.index.freq == offsets.MonthEnd()
        result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
def test_timegrouper_with_reg_groups_freq(self, freq):
    """Grouper(freq) plus "user_id" matches groupby-then-resample (GH 6764)."""
    # GH 6764 multiple grouping with/without sort
    stamps = pd.to_datetime(
        [
            "20121002",
            "20121007",
            "20130130",
            "20130202",
            "20130305",
            "20121002",
            "20121207",
            "20130130",
            "20130202",
            "20130305",
            "20130202",
            "20130305",
        ]
    )
    costs = [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801]
    frame = DataFrame(
        {
            "date": stamps,
            "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            "whole_cost": costs,
            "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
        }
    ).set_index("date")

    # reference result: resample within each user
    per_user = frame.groupby("user_id")["whole_cost"].resample(freq)
    summed = per_user.sum(min_count=1)  # XXX
    expected = summed.dropna().reorder_levels(["date", "user_id"])
    expected = expected.sort_index().astype("int64")
    expected.name = "whole_cost"

    keys_sorted = [Grouper(freq=freq), "user_id"]
    from_sorted = frame.sort_index().groupby(keys_sorted)["whole_cost"].sum()
    tm.assert_series_equal(from_sorted, expected)

    keys_unsorted = [Grouper(freq=freq), "user_id"]
    from_unsorted = frame.groupby(keys_unsorted)["whole_cost"].sum()
    tm.assert_series_equal(from_unsorted, expected)
|
||||
|
||||
def test_timegrouper_get_group(self):
|
||||
# GH 6914
|
||||
|
||||
df_original = DataFrame(
|
||||
{
|
||||
"Buyer": "Carl Joe Joe Carl Joe Carl".split(),
|
||||
"Quantity": [18, 3, 5, 1, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 9, 1, 13, 0),
|
||||
datetime(2013, 9, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 3, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 9, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
# single grouping
|
||||
expected_list = [
|
||||
df_original.iloc[[0, 1, 5]],
|
||||
df_original.iloc[[2, 3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(Grouper(freq="ME", key="Date"))
|
||||
for t, expected in zip(dt_list, expected_list, strict=True):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group(dt)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# multiple grouping
|
||||
expected_list = [
|
||||
df_original.iloc[[1]],
|
||||
df_original.iloc[[3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
|
||||
for (b, t), expected in zip(g_list, expected_list, strict=True):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group((b, dt))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# with index
|
||||
df_original = df_original.set_index("Date")
|
||||
df_reordered = df_original.sort_values(by="Quantity")
|
||||
|
||||
expected_list = [
|
||||
df_original.iloc[[0, 1, 5]],
|
||||
df_original.iloc[[2, 3]],
|
||||
df_original.iloc[[4]],
|
||||
]
|
||||
|
||||
for df in [df_original, df_reordered]:
|
||||
grouped = df.groupby(Grouper(freq="ME"))
|
||||
for t, expected in zip(dt_list, expected_list, strict=True):
|
||||
dt = Timestamp(t)
|
||||
result = grouped.get_group(dt)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_timegrouper_apply_return_type_series(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_series(x):
|
||||
return Series([x["value"].sum()], ("sum",))
|
||||
|
||||
expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
|
||||
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
|
||||
tm.assert_frame_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_timegrouper_apply_return_type_value(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_value(x):
|
||||
return x.value.sum()
|
||||
|
||||
expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
|
||||
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
|
||||
tm.assert_series_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_groupby_groups_datetimeindex(self):
|
||||
# GH#1430
|
||||
periods = 1000
|
||||
ind = date_range(start="2012/1/1", freq="5min", periods=periods)
|
||||
df = DataFrame(
|
||||
{"high": np.arange(periods), "low": np.arange(periods)}, index=ind
|
||||
)
|
||||
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
|
||||
|
||||
# it works!
|
||||
groups = grouped.groups
|
||||
assert isinstance(next(iter(groups.keys())), datetime)
|
||||
|
||||
def test_groupby_groups_datetimeindex2(self):
|
||||
# GH#11442
|
||||
index = date_range("2015/01/01", periods=5, name="date")
|
||||
df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
|
||||
result = df.groupby(level="date").groups
|
||||
dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
|
||||
expected = {
|
||||
Timestamp(date): DatetimeIndex([date], name="date") for date in dates
|
||||
}
|
||||
tm.assert_dict_equal(result, expected)
|
||||
|
||||
grouped = df.groupby(level="date")
|
||||
for date in dates:
|
||||
result = grouped.get_group(date)
|
||||
data = [[df.loc[date, "A"], df.loc[date, "B"]]]
|
||||
expected_index = DatetimeIndex(
|
||||
[date], name="date", freq="D", dtype=index.dtype
|
||||
)
|
||||
expected = DataFrame(data, columns=list("AB"), index=expected_index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_groups_datetimeindex_tz(self):
|
||||
# GH 3950
|
||||
dates = [
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
]
|
||||
df = DataFrame(
|
||||
{
|
||||
"label": ["a", "a", "a", "b", "b", "b"],
|
||||
"datetime": dates,
|
||||
"value1": np.arange(6, dtype="int64"),
|
||||
"value2": [1, 2] * 3,
|
||||
}
|
||||
)
|
||||
df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))
|
||||
|
||||
exp_idx1 = DatetimeIndex(
|
||||
[
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
tz="US/Pacific",
|
||||
name="datetime",
|
||||
)
|
||||
exp_idx2 = Index(["a", "b"] * 3, name="label")
|
||||
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
||||
expected = DataFrame(
|
||||
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(["datetime", "label"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# by level
|
||||
didx = DatetimeIndex(dates, tz="Asia/Tokyo")
|
||||
df = DataFrame(
|
||||
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
|
||||
index=didx,
|
||||
)
|
||||
|
||||
exp_idx = DatetimeIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
tz="Asia/Tokyo",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(level=0).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_datetime64_handling_groupby(self):
|
||||
# it works!
|
||||
df = DataFrame(
|
||||
[(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
|
||||
columns=["a", "date"],
|
||||
)
|
||||
result = df.groupby("a").first()
|
||||
assert result["date"][3] == Timestamp("2012-07-03")
|
||||
|
||||
def test_groupby_multi_timezone(self):
|
||||
# combining multiple / different timezones yields UTC
|
||||
df = DataFrame(
|
||||
{
|
||||
"value": range(5),
|
||||
"date": [
|
||||
"2000-01-28 16:47:00",
|
||||
"2000-01-29 16:48:00",
|
||||
"2000-01-30 16:49:00",
|
||||
"2000-01-31 16:50:00",
|
||||
"2000-01-01 16:50:00",
|
||||
],
|
||||
"tz": [
|
||||
"America/Chicago",
|
||||
"America/Chicago",
|
||||
"America/Los_Angeles",
|
||||
"America/Chicago",
|
||||
"America/New_York",
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
result = df.groupby("tz", group_keys=False).date.apply(
|
||||
lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
|
||||
)
|
||||
|
||||
expected = Series(
|
||||
[
|
||||
Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
|
||||
Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
|
||||
Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
|
||||
],
|
||||
name="date",
|
||||
dtype=object,
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
tz = "America/Chicago"
|
||||
res_values = df.groupby("tz").date.get_group(tz)
|
||||
result = pd.to_datetime(res_values).dt.tz_localize(tz)
|
||||
exp_values = Series(
|
||||
["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
|
||||
index=[0, 1, 3],
|
||||
name="date",
|
||||
)
|
||||
expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_groups_periods(self):
|
||||
dates = [
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
]
|
||||
df = DataFrame(
|
||||
{
|
||||
"label": ["a", "a", "a", "b", "b", "b"],
|
||||
"period": [pd.Period(d, freq="h") for d in dates],
|
||||
"value1": np.arange(6, dtype="int64"),
|
||||
"value2": [1, 2] * 3,
|
||||
}
|
||||
)
|
||||
|
||||
exp_idx1 = pd.PeriodIndex(
|
||||
[
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 07:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 08:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
"2011-07-19 09:00:00",
|
||||
],
|
||||
freq="h",
|
||||
name="period",
|
||||
)
|
||||
exp_idx2 = Index(["a", "b"] * 3, name="label")
|
||||
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
||||
expected = DataFrame(
|
||||
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(["period", "label"]).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# by level
|
||||
didx = pd.PeriodIndex(dates, freq="h")
|
||||
df = DataFrame(
|
||||
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
|
||||
index=didx,
|
||||
)
|
||||
|
||||
exp_idx = pd.PeriodIndex(
|
||||
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||||
freq="h",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
result = df.groupby(level=0).sum()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_first_datetime64(self):
|
||||
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
|
||||
df[1] = df[1].astype("M8[ns]")
|
||||
|
||||
assert issubclass(df[1].dtype.type, np.datetime64)
|
||||
|
||||
result = df.groupby(level=0).first()
|
||||
got_dt = result[1].dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
result = df[1].groupby(level=0).first()
|
||||
got_dt = result.dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
def test_groupby_max_datetime64(self):
|
||||
# GH 5869
|
||||
# datetimelike dtype conversion from int
|
||||
df = DataFrame({"A": Timestamp("20130101").as_unit("s"), "B": np.arange(5)})
|
||||
# TODO: can we retain second reso in .apply here?
|
||||
expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
|
||||
result = df.groupby("A")["A"].max()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_datetime64_32_bit(self):
|
||||
# GH 6410 / numpy 4328
|
||||
# 32-bit under 1.9-dev indexing issue
|
||||
|
||||
df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
|
||||
result = df.groupby("A")["B"].transform("min")
|
||||
expected = Series([Timestamp("2000-01-1")] * 2, name="B")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_with_timezone_selection(self):
|
||||
# GH 11616
|
||||
# Test that column selection returns output in correct timezone.
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"factor": np.random.default_rng(2).integers(0, 3, size=60),
|
||||
"time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
|
||||
}
|
||||
)
|
||||
df1 = df.groupby("factor").max()["time"]
|
||||
df2 = df.groupby("factor")["time"].max()
|
||||
tm.assert_series_equal(df1, df2)
|
||||
|
||||
def test_timezone_info(self):
|
||||
# see gh-11682: Timezone info lost when broadcasting
|
||||
# scalar datetime to DataFrame
|
||||
utc = timezone.utc
|
||||
df = DataFrame({"a": [1], "b": [datetime.now(utc)]})
|
||||
assert df["b"][0].tzinfo == utc
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
df["b"] = datetime.now(utc)
|
||||
assert df["b"][0].tzinfo == utc
|
||||
|
||||
def test_datetime_count(self):
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
|
||||
)
|
||||
result = df.groupby("a").dates.count()
|
||||
expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_first_last_max_min_on_time_data(self):
|
||||
# GH 10295
|
||||
# Verify that NaT is not in the result of max, min, first and last on
|
||||
# Dataframe with datetime or timedelta values.
|
||||
df_test = DataFrame(
|
||||
{
|
||||
"dt": [
|
||||
np.nan,
|
||||
"2015-07-24 10:10",
|
||||
"2015-07-25 11:11",
|
||||
"2015-07-23 12:12",
|
||||
np.nan,
|
||||
],
|
||||
"td": [
|
||||
np.nan,
|
||||
timedelta(days=1),
|
||||
timedelta(days=2),
|
||||
timedelta(days=3),
|
||||
np.nan,
|
||||
],
|
||||
}
|
||||
)
|
||||
df_test.dt = pd.to_datetime(df_test.dt)
|
||||
df_test["group"] = "A"
|
||||
df_ref = df_test[df_test.dt.notna()]
|
||||
|
||||
grouped_test = df_test.groupby("group")
|
||||
grouped_ref = df_ref.groupby("group")
|
||||
|
||||
tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
|
||||
tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
|
||||
tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
|
||||
tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
|
||||
|
||||
def test_nunique_with_timegrouper_and_nat(self):
|
||||
# GH 17575
|
||||
test = DataFrame(
|
||||
{
|
||||
"time": [
|
||||
Timestamp("2016-06-28 09:35:35"),
|
||||
pd.NaT,
|
||||
Timestamp("2016-06-28 16:46:28"),
|
||||
],
|
||||
"data": ["1", "2", "3"],
|
||||
}
|
||||
)
|
||||
|
||||
grouper = Grouper(key="time", freq="h")
|
||||
result = test.groupby(grouper)["data"].nunique()
|
||||
expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
|
||||
expected.index = expected.index._with_freq(None)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_scalar_call_versus_list_call(self):
|
||||
# Issue: 17530
|
||||
data_frame = {
|
||||
"location": ["shanghai", "beijing", "shanghai"],
|
||||
"time": Series(
|
||||
["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
|
||||
dtype="datetime64[ns]",
|
||||
),
|
||||
"value": [1, 2, 3],
|
||||
}
|
||||
data_frame = DataFrame(data_frame).set_index("time")
|
||||
grouper = Grouper(freq="D")
|
||||
|
||||
grouped = data_frame.groupby(grouper)
|
||||
result = grouped.count()
|
||||
grouped = data_frame.groupby([grouper])
|
||||
expected = grouped.count()
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_grouper_period_index(self):
|
||||
# GH 32108
|
||||
periods = 2
|
||||
index = pd.period_range(
|
||||
start="2018-01", periods=periods, freq="M", name="Month"
|
||||
)
|
||||
period_series = Series(range(periods), index=index)
|
||||
result = period_series.groupby(period_series.index.month).sum()
|
||||
|
||||
expected = Series(
|
||||
range(periods), index=Index(range(1, periods + 1), name=index.name)
|
||||
)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_dict_returns(
    self, groupby_with_truncated_bingrouper
):
    """
    GH#43500: ``apply`` with a dict-returning function.

    Covers the case where gb._grouper.result_index and
    gb._grouper.group_keys_seq have different lengths and the result goes
    through the `isinstance(values[0], dict)` path.
    """
    grouped = groupby_with_truncated_bingrouper

    result = grouped["Quantity"].apply(lambda grp: {"foo": len(grp)})

    # Build the expected bins with the same resolution as the source column.
    resolution = grouped.obj["Date"]._values.unit
    bins = date_range(
        "2013-09-01", "2013-10-01", freq="5D", name="Date", unit=resolution
    )
    expected_index = MultiIndex.from_arrays([bins, ["foo"] * len(bins)])
    expected = Series(
        [3, 0, 0, 0, 0, 0, 2], index=expected_index, name="Quantity"
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
    self, groupby_with_truncated_bingrouper
):
    """
    GH#43500: ``apply`` with a scalar-returning function.

    Previously raised ValueError because an index with incorrect length
    was used in wrap_applied_result.
    """
    grouped = groupby_with_truncated_bingrouper

    # first value of each group, NaN for empty groups
    result = grouped["Quantity"].apply(
        lambda grp: grp.iloc[0] if len(grp) else np.nan
    )

    resolution = grouped.obj["Date"]._values.unit
    bins = date_range(
        "2013-09-01", "2013-10-01", freq="5D", name="Date", unit=resolution
    )
    expected = Series(
        [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
        index=bins._with_freq(None),
        name="Quantity",
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
    self, frame_for_truncated_bingrouper
):
    """``apply`` returning a Series when there is exactly one non-NaT group."""
    frame = frame_for_truncated_bingrouper

    # We need to create a GroupBy object with only one non-NaT group,
    # so use a huge freq so that all non-NaT dates will be grouped together
    century_grouper = Grouper(key="Date", freq="100YE")
    grouped = frame.groupby(century_grouper)

    # check that we will go through the singular_series path
    # in _wrap_applied_output_series
    assert grouped.ngroups == 1
    assert grouped._selected_obj.index.nlevels == 1

    # function that returns a Series
    result = grouped.apply(lambda grp: grp["Quantity"] * 2)

    expected_index = Index(
        [Timestamp("2013-12-31")], dtype=frame["Date"].dtype, name="Date"
    )
    expected_columns = Index([0, 1, 5, 2, 3], name="Quantity")
    expected = DataFrame(
        [[36, 6, 6, 10, 2]],
        index=expected_index,
        columns=expected_columns,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.single_cpu
def test_groupby_agg_numba_timegrouper_with_nat(
    self, groupby_with_truncated_bingrouper
):
    """
    numba-engine ``nanmean`` matches the built-in "mean" aggregation.

    See discussion in GH#43487. Checked for both the Series selection
    (``gb["Quantity"]``) and the one-column frame selection
    (``gb[["Quantity"]]``).
    """
    pytest.importorskip("numba")

    grouped = groupby_with_truncated_bingrouper

    # Series selection
    series_gb = grouped["Quantity"]
    numba_series = series_gb.aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    tm.assert_series_equal(numba_series, series_gb.aggregate("mean"))

    # one-column DataFrame selection
    numba_frame = grouped[["Quantity"]].aggregate(
        lambda values, index: np.nanmean(values), engine="numba"
    )
    reference_frame = grouped[["Quantity"]].aggregate("mean")
    tm.assert_frame_equal(numba_frame, reference_frame)
|
||||
|
||||
@td.skip_if_no("pyarrow")
def test_pyarrow_index_retention(self):
    """
    A daily Grouper keeps the pyarrow timestamp dtype on the result index.

    https://github.com/pandas-dev/pandas/issues/63518
    """
    arrow_dtype = "timestamp[ns, America/Denver][pyarrow]"
    source_index = Index(
        [
            Timestamp("2013-01-01"),
            Timestamp("2013-01-01"),
            Timestamp("2013-01-02"),
        ],
        dtype=arrow_dtype,
    )
    frame = DataFrame({"a": [1, 2, 3]}, index=source_index)

    grouped = frame.groupby(Grouper(freq="D"))
    result = grouped._grouper.result_index

    # the two distinct days, still with the pyarrow-backed dtype
    expected = Index(
        [Timestamp("2013-01-01"), Timestamp("2013-01-02")],
        dtype=arrow_dtype,
    )
    tm.assert_index_equal(result, expected)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,331 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_arm
|
||||
from pandas.errors import NumbaUtilError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.util.version import Version
|
||||
|
||||
# Every test in this module requires numba; importorskip skips the whole
# module when it cannot be imported.
numba = pytest.importorskip("numba")

# Module-wide marks: single-CPU execution, plus a skip for the exact
# numba 0.61 / ARM combination.
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.skipif(
        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
    ),
]
|
||||
|
||||
|
||||
def test_correct_function_signature():
    """A UDF that does not take ``(values, index, ...)`` raises NumbaUtilError."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return x + 1

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    expected_message = "The first 2"

    # DataFrameGroupBy
    frame_gb = data.groupby("key")
    with pytest.raises(NumbaUtilError, match=expected_message):
        frame_gb.transform(incorrect_function, engine="numba")

    # SeriesGroupBy
    series_gb = data.groupby("key")["data"]
    with pytest.raises(NumbaUtilError, match=expected_message):
        series_gb.transform(incorrect_function, engine="numba")
|
||||
|
||||
|
||||
def test_check_nopython_kwargs():
    """
    Keyword arguments are bound to the UDF before numba validates it.

    A missing ``a`` fails at Python signature binding (TypeError). A
    keyword-only ``a`` binds but is rejected by the numba check
    (NumbaUtilError). A positional-or-keyword ``a`` works.
    """
    pytest.importorskip("numba")

    def incorrect_function(values, index, *, a):
        return values + a

    def correct_function(values, index, a):
        return values + a

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    missing_either = "missing a required (keyword-only argument|argument): 'a'"
    missing_plain = "missing a required argument: 'a'"
    unsupported = "numba does not support"

    # py signature binding
    with pytest.raises(TypeError, match=missing_either):
        data.groupby("key").transform(incorrect_function, engine="numba", b=1)
    with pytest.raises(TypeError, match=missing_plain):
        data.groupby("key").transform(correct_function, engine="numba", b=1)

    with pytest.raises(TypeError, match=missing_either):
        data.groupby("key")["data"].transform(
            incorrect_function, engine="numba", b=1
        )
    with pytest.raises(TypeError, match=missing_plain):
        data.groupby("key")["data"].transform(
            correct_function, engine="numba", b=1
        )

    # numba signature check after binding
    with pytest.raises(NumbaUtilError, match=unsupported):
        data.groupby("key").transform(incorrect_function, engine="numba", a=1)
    frame_result = data.groupby("key").transform(
        correct_function, engine="numba", a=1
    )
    tm.assert_frame_equal(data[["data"]] + 1, frame_result)

    with pytest.raises(NumbaUtilError, match=unsupported):
        data.groupby("key")["data"].transform(
            incorrect_function, engine="numba", a=1
        )
    series_result = data.groupby("key")["data"].transform(
        correct_function, engine="numba", a=1
    )
    tm.assert_series_equal(data["data"] + 1, series_result)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_numba_vs_cython(jit, frame_or_series, nogil, parallel, nopython, as_index):
    """numba-engine transform of ``values + 1`` matches the cython engine."""
    pytest.importorskip("numba")

    def add_one(values, index):
        return values + 1

    udf = add_one
    if jit:
        # Test accepted jitted functions
        import numba

        udf = numba.jit(add_one)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0, as_index=as_index)
    if frame_or_series is Series:
        grouped = grouped[1]

    numba_options = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    result = grouped.transform(udf, engine="numba", engine_kwargs=numba_options)
    expected = grouped.transform(lambda x: x + 1, engine="cython")

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
def test_cache(jit, frame_or_series, nogil, parallel, nopython):
    """Switching between UDFs must still return each function's own result."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return values + 1

    def func_2(values, index):
        return values * 5

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    numba_options = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if frame_or_series is Series:
        grouped = grouped[1]

    rounds = (
        (func_1, lambda x: x + 1),
        (func_2, lambda x: x * 5),
        # Retest func_1 which should use the cache
        (func_1, lambda x: x + 1),
    )
    for numba_udf, cython_udf in rounds:
        result = grouped.transform(
            numba_udf, engine="numba", engine_kwargs=numba_options
        )
        expected = grouped.transform(cython_udf, engine="cython")
        tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
def test_use_global_config():
    """``compute.use_numba`` with ``engine=None`` matches ``engine="numba"``."""
    pytest.importorskip("numba")

    def add_one(values, index):
        return values + 1

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    # explicit engine first, then the engine chosen through the option
    explicit = grouped.transform(add_one, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = grouped.transform(add_one, engine=None)

    tm.assert_frame_equal(explicit, via_option)
|
||||
|
||||
|
||||
# TODO: Test more than just reductions (e.g. actual transformations).
def test_string_cython_vs_numba(numba_supported_reductions):
    """
    String-named reductions transform identically with numba and cython.

    Parameters
    ----------
    numba_supported_reductions : tuple
        Fixture unpacked below into ``(reduction_name, kwargs)``.

    Notes
    -----
    This test is deliberately not parametrized over an ``agg_func``
    argument: the reduction is taken from the fixture alone, so such a
    parameter would only repeat the identical test once per value.
    """
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0)

    # DataFrameGroupBy
    result = grouped.transform(reduction, engine="numba", **kwargs)
    expected = grouped.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(result, expected)

    # SeriesGroupBy
    result = grouped[1].transform(reduction, engine="numba", **kwargs)
    expected = grouped[1].transform(reduction, engine="cython", **kwargs)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_args_not_cached():
    """GH 41647: a changed positional argument must not reuse a stale result."""
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    frame = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = frame.groupby("id")["x"]

    # summing the last n ones of every group must give n, for n = 1 then 2
    for n_last, total in ((1, 1.0), (2, 2.0)):
        result = grouped_x.transform(sum_last, n_last, engine="numba")
        expected = Series([total] * 4, name="x")
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_data_correctly_passed():
    """GH 43133: the UDF's ``index`` argument carries the frame's index values."""
    pytest.importorskip("numba")

    def f(values, index):
        return index - 1

    row_labels = [-1, -2, -3]
    frame = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=row_labels)

    result = frame.groupby("group").transform(f, engine="numba")

    # every output value is its own row label minus one
    expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=row_labels)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_order_consistency_preserved():
    """GH 57069: an identity transform keeps values aligned to a reversed index."""
    pytest.importorskip("numba")

    def f(values, index):
        return values

    descending = range(3, -1, -1)
    frame = DataFrame(
        {"vals": [0.0, 1.0, 2.0, 3.0], "group": [0, 1, 0, 1]}, index=descending
    )

    result = frame.groupby("group")["vals"].transform(f, engine="numba")

    expected = Series([0.0, 1.0, 2.0, 3.0], index=range(3, -1, -1), name="vals")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_engine_kwargs_not_cached():
    """
    Different ``engine_kwargs`` must not reuse the same jitted function.

    The UDF returns the sum of the three flags it closes over, so the
    result shows which flag values the compiled function saw.
    """
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        return nogil + parallel + nopython

    frame = DataFrame({"value": [0, 0, 0]})

    # first call: True + False + True == 2
    first_options = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    first = frame.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=first_options
    )
    tm.assert_frame_equal(first, DataFrame({"value": [2.0, 2.0, 2.0]}))

    # flip one flag (rebinding the closed-over name): False + False + True == 1
    nogil = False
    second_options = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    second = frame.groupby(level=0).transform(
        func_kwargs, engine="numba", engine_kwargs=second_options
    )
    tm.assert_frame_equal(second, DataFrame({"value": [1.0, 1.0, 1.0]}))
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Grouping a MultiIndexed frame by one of its levels works with numba."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    numba_options = {"nopython": nopython, "nogil": nogil, "parallel": parallel}

    result = frame.groupby("A").transform(
        numba_func, engine="numba", engine_kwargs=numba_options
    )

    # same index as the input, with "C" replaced by the constant 1.0
    expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Grouping by two index levels with a numba UDF raises NotImplementedError."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    frame = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    numba_options = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    two_level_gb = frame.groupby(["A", "B"])

    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        two_level_gb.transform(
            numba_func, engine="numba", engine_kwargs=numba_options
        )
|
||||
|
||||
|
||||
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """Two-key groupby: a named reduction matches between numba and cython."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions

    key_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    key_b = ["one", "one", "two", "three", "two", "two", "one", "three"]
    frame = DataFrame(
        {
            "A": key_a,
            "B": key_b,
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    grouped = frame.groupby(["A", "B"])

    from_numba = grouped.transform(reduction, engine="numba", **kwargs)
    from_cython = grouped.transform(reduction, engine="cython", **kwargs)
    tm.assert_frame_equal(from_numba, from_cython)
|
||||
|
||||
|
||||
def test_multilabel_udf_numba_vs_cython():
    """Two-key groupby: a min-max scaling UDF matches between numba and cython."""
    pytest.importorskip("numba")

    key_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    key_b = ["one", "one", "two", "three", "two", "two", "one", "three"]
    frame = DataFrame(
        {
            "A": key_a,
            "B": key_b,
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.random.default_rng(2).standard_normal(8),
        }
    )
    grouped = frame.groupby(["A", "B"])

    # numba UDFs take (values, index); the cython path takes the group only
    from_numba = grouped.transform(
        lambda values, index: (values - values.min()) / (values.max() - values.min()),
        engine="numba",
    )
    from_cython = grouped.transform(
        lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
    )
    tm.assert_frame_equal(from_numba, from_cython)
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user