# Source: craftbeerpi4-pione/venv/lib/python3.8/site-packages/pandas/tests/reshape/test_pivot.py
# (2077 lines, 69 KiB, Python)

from datetime import date, datetime, timedelta
from itertools import product
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
MultiIndex,
Series,
concat,
date_range,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape.pivot import pivot_table
@pytest.fixture(params=[True, False])
def dropna(request):
    """Parametrized fixture: hands each test the current ``dropna`` flag."""
    flag = request.param
    return flag
@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))])
def interval_values(request, closed):
    """Build a Categorical of intervals from the parametrized bounds.

    ``closed`` is another fixture (not defined in this part of the file); it
    is forwarded as the third argument of ``IntervalIndex.from_arrays``.
    """
    lower, upper = request.param
    intervals = pd.IntervalIndex.from_arrays(lower, upper, closed)
    return Categorical(intervals)
class TestPivotTable:
def setup_method(self, method):
    """Store a fresh 11-row frame on ``self.data`` before each test.

    Columns A, B and C hold string labels; D, E and F hold random normal
    draws.
    """
    labels_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    labels_b = [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ]
    labels_c = [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ]
    columns = {"A": labels_a, "B": labels_b, "C": labels_c}
    # Draw D, E, F in this order so the random stream is consumed exactly
    # as a dict literal with these three keys would consume it.
    for name in ("D", "E", "F"):
        columns[name] = np.random.randn(11)
    self.data = DataFrame(columns)
def test_pivot_table(self, observed):
index = ["A", "B"]
columns = "C"
table = pivot_table(
self.data, values="D", index=index, columns=columns, observed=observed
)
table2 = self.data.pivot_table(
values="D", index=index, columns=columns, observed=observed
)
tm.assert_frame_equal(table, table2)
# this works
pivot_table(self.data, values="D", index=index, observed=observed)
if len(index) > 1:
assert table.index.names == tuple(index)
else:
assert table.index.name == index[0]
if len(columns) > 1:
assert table.columns.names == columns
else:
assert table.columns.name == columns[0]
expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack()
tm.assert_frame_equal(table, expected)
def test_pivot_table_categorical_observed_equal(self, observed):
# issue #24923
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
)
expected = df.pivot_table(
index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0
)
expected.index = expected.index.astype("category")
expected.columns = expected.columns.astype("category")
df.col1 = df.col1.astype("category")
df.col2 = df.col2.astype("category")
result = df.pivot_table(
index="col1",
values="col3",
columns="col2",
aggfunc=np.sum,
fill_value=0,
observed=observed,
)
tm.assert_frame_equal(result, expected)
def test_pivot_table_nocols(self):
df = DataFrame(
{"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
)
rs = df.pivot_table(columns="cols", aggfunc=np.sum)
xp = df.pivot_table(index="cols", aggfunc=np.sum).T
tm.assert_frame_equal(rs, xp)
rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"})
xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T
tm.assert_frame_equal(rs, xp)
def test_pivot_table_dropna(self):
df = DataFrame(
{
"amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000},
"customer": {0: "A", 1: "A", 2: "B", 3: "C"},
"month": {0: 201307, 1: 201309, 2: 201308, 3: 201310},
"product": {0: "a", 1: "b", 2: "c", 3: "d"},
"quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
}
)
pv_col = df.pivot_table(
"quantity", "month", ["customer", "product"], dropna=False
)
pv_ind = df.pivot_table(
"quantity", ["customer", "product"], "month", dropna=False
)
m = MultiIndex.from_tuples(
[
("A", "a"),
("A", "b"),
("A", "c"),
("A", "d"),
("B", "a"),
("B", "b"),
("B", "c"),
("B", "d"),
("C", "a"),
("C", "b"),
("C", "c"),
("C", "d"),
],
names=["customer", "product"],
)
tm.assert_index_equal(pv_col.columns, m)
tm.assert_index_equal(pv_ind.index, m)
def test_pivot_table_categorical(self):
cat1 = Categorical(
["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True
)
cat2 = Categorical(
["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True
)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True)
2021-01-30 22:29:33 +01:00
exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
tm.assert_frame_equal(result, expected)
def test_pivot_table_dropna_categoricals(self, dropna):
# GH 15193
categories = ["a", "b", "c", "d"]
df = DataFrame(
{
"A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
"B": [1, 2, 3, 1, 2, 3, 1, 2, 3],
"C": range(0, 9),
}
)
df["A"] = df["A"].astype(CDT(categories, ordered=False))
result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna)
expected_columns = Series(["a", "b", "c"], name="A")
expected_columns = expected_columns.astype(CDT(categories, ordered=False))
expected_index = Series([1, 2, 3], name="B")
expected = DataFrame(
[[0, 3, 6], [1, 4, 7], [2, 5, 8]],
index=expected_index,
columns=expected_columns,
)
if not dropna:
# add back the non observed to compare
expected = expected.reindex(columns=Categorical(categories)).astype("float")
tm.assert_frame_equal(result, expected)
def test_pivot_with_non_observable_dropna(self, dropna):
# gh-21133
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
2021-01-30 22:29:33 +01:00
"A": pd.Categorical(
[np.nan, "low", "high", "low", "high"],
categories=["low", "high"],
ordered=True,
),
"B": range(5),
}
)
result = df.pivot_table(index="A", values="B", dropna=dropna)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
{"B": [2, 3]},
2021-01-30 22:29:33 +01:00
index=pd.Index(
pd.Categorical.from_codes(
[0, 1], categories=["low", "high"], ordered=True
),
name="A",
),
)
tm.assert_frame_equal(result, expected)
# gh-21378
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
2021-01-30 22:29:33 +01:00
"A": pd.Categorical(
["left", "low", "high", "low", "high"],
categories=["low", "high", "left"],
ordered=True,
),
"B": range(5),
}
)
result = df.pivot_table(index="A", values="B", dropna=dropna)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
{"B": [2, 3, 0]},
2021-01-30 22:29:33 +01:00
index=pd.Index(
pd.Categorical.from_codes(
[0, 1, 2], categories=["low", "high", "left"], ordered=True
),
name="A",
),
)
tm.assert_frame_equal(result, expected)
def test_pivot_with_interval_index(self, interval_values, dropna):
# GH 25814
df = DataFrame({"A": interval_values, "B": 1})
result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A"))
tm.assert_frame_equal(result, expected)
def test_pivot_with_interval_index_margins(self):
# GH 25815
ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2])
df = DataFrame(
{
"A": np.arange(4, 0, -1, dtype=np.intp),
"B": ["a", "b", "a", "b"],
2021-01-30 22:29:33 +01:00
"C": pd.Categorical(ordered_cat, ordered=True).sort_values(
ascending=False
),
}
)
pivot_tab = pd.pivot_table(
df, index="C", columns="B", values="A", aggfunc="sum", margins=True
)
result = pivot_tab["All"]
expected = Series(
[3, 7, 10],
index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"),
name="All",
dtype=np.intp,
)
tm.assert_series_equal(result, expected)
def test_pass_array(self):
result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C)
expected = self.data.pivot_table("D", index="A", columns="C")
tm.assert_frame_equal(result, expected)
def test_pass_function(self):
result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C)
expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C")
tm.assert_frame_equal(result, expected)
def test_pivot_table_multiple(self):
index = ["A", "B"]
columns = "C"
table = pivot_table(self.data, index=index, columns=columns)
expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
tm.assert_frame_equal(table, expected)
def test_pivot_dtypes(self):
# can convert dtypes
f = DataFrame(
{
"a": ["cat", "bat", "cat", "bat"],
"v": [1, 2, 3, 4],
"i": ["a", "b", "a", "b"],
}
)
assert f.dtypes["v"] == "int64"
z = pivot_table(
f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum
)
result = z.dtypes
expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i"))
tm.assert_series_equal(result, expected)
# cannot convert dtypes
f = DataFrame(
{
"a": ["cat", "bat", "cat", "bat"],
"v": [1.5, 2.5, 3.5, 4.5],
"i": ["a", "b", "a", "b"],
}
)
assert f.dtypes["v"] == "float64"
z = pivot_table(
f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean
)
result = z.dtypes
expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i"))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "columns,values",
    [
        ("bool1", ["float1", "float2"]),
        ("bool1", ["float1", "float2", "bool1"]),
        ("bool2", ["float1", "float2", "bool1"]),
    ],
)
def test_pivot_preserve_dtypes(self, columns, values):
    """Float value columns stay float64; columns whose first label starts with "b" are object."""
    # GH 7142 regression test
    base = np.arange(5, dtype=np.float64)
    frame = DataFrame(
        {
            "float1": base,
            "float2": base + 2.0,
            "bool1": base <= 2,
            "bool2": base <= 3,
        }
    )

    pivoted = frame.reset_index().pivot_table(
        index="index", columns=columns, values=values
    )

    actual = dict(pivoted.dtypes)
    wanted = {}
    for label in pivoted:
        # label[0] is the first level of the column label.
        if label[0].startswith("b"):
            wanted[label] = np.dtype("O")
        else:
            wanted[label] = np.dtype("float64")
    assert actual == wanted
def test_pivot_no_values(self):
# GH 14380
idx = pd.DatetimeIndex(
["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"]
)
2021-01-30 22:29:33 +01:00
df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx)
res = df.pivot_table(index=df.index.month, columns=df.index.day)
2021-01-30 22:29:33 +01:00
exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)])
exp = pd.DataFrame(
[[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns
)
tm.assert_frame_equal(res, exp)
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
"A": [1, 2, 3, 4, 5],
"dt": pd.date_range("2011-01-01", freq="D", periods=5),
},
index=idx,
)
2021-01-30 22:29:33 +01:00
res = df.pivot_table(
index=df.index.month, columns=pd.Grouper(key="dt", freq="M")
)
exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))])
exp_columns.names = [None, "dt"]
2021-01-30 22:29:33 +01:00
exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns)
tm.assert_frame_equal(res, exp)
res = df.pivot_table(
2021-01-30 22:29:33 +01:00
index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M")
)
2021-01-30 22:29:33 +01:00
exp = pd.DataFrame(
[3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns
)
tm.assert_frame_equal(res, exp)
def test_pivot_multi_values(self):
result = pivot_table(
self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0
)
expected = pivot_table(
self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0
)
tm.assert_frame_equal(result, expected)
def test_pivot_multi_functions(self):
f = lambda func: pivot_table(
self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func
)
result = f([np.mean, np.std])
means = f(np.mean)
stds = f(np.std)
expected = concat([means, stds], keys=["mean", "std"], axis=1)
tm.assert_frame_equal(result, expected)
# margins not supported??
f = lambda func: pivot_table(
self.data,
values=["D", "E"],
index=["A", "B"],
columns="C",
aggfunc=func,
margins=True,
)
result = f([np.mean, np.std])
means = f(np.mean)
stds = f(np.std)
expected = concat([means, stds], keys=["mean", "std"], axis=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_index_with_nan(self, method):
    """Pivot keeps NaN labels on the index/columns, via the method or the function.

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """

    def pivot(frame, index, columns, values):
        # Keyword arguments are accepted by old and new pandas alike;
        # newer releases no longer accept these arguments positionally.
        if method:
            return frame.pivot(index=index, columns=columns, values=values)
        return pd.pivot(frame, index=index, columns=columns, values=values)

    # GH 3588
    nan = np.nan
    df = DataFrame(
        {
            "a": ["R1", "R2", nan, "R4"],
            "b": ["C1", "C2", "C3", "C4"],
            "c": [10, 15, 17, 20],
        }
    )
    result = pivot(df, "a", "b", "c")
    expected = DataFrame(
        [
            [nan, nan, 17, nan],
            [10, nan, nan, nan],
            [nan, 15, nan, nan],
            [nan, nan, nan, 20],
        ],
        index=Index([nan, "R1", "R2", "R4"], name="a"),
        columns=Index(["C1", "C2", "C3", "C4"], name="b"),
    )
    tm.assert_frame_equal(result, expected)
    # This check always uses the method form, whatever ``method`` is.
    tm.assert_frame_equal(
        df.pivot(index="b", columns="a", values="c"), expected.T
    )

    # GH9491
    df = DataFrame(
        {
            "a": pd.date_range("2014-02-01", periods=6, freq="D"),
            "c": 100 + np.arange(6),
        }
    )
    df["b"] = df["a"] - pd.Timestamp("2014-02-02")
    df.loc[1, "a"] = df.loc[3, "a"] = nan
    df.loc[1, "b"] = df.loc[4, "b"] = nan

    pv = pivot(df, "a", "b", "c")
    assert pv.notna().values.sum() == len(df)

    for _, row in df.iterrows():
        assert pv.loc[row["a"], row["b"]] == row["c"]

    result = pivot(df, "b", "a", "c")
    tm.assert_frame_equal(result, pv.T)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_tz(self, method):
    """Pivot on tz-aware datetime keys, with and without an explicit ``values``.

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    # GH 5878
    df = DataFrame(
        {
            "dt1": [
                datetime(2013, 1, 1, 9, 0),
                datetime(2013, 1, 2, 9, 0),
                datetime(2013, 1, 1, 9, 0),
                datetime(2013, 1, 2, 9, 0),
            ],
            "dt2": [
                datetime(2014, 1, 1, 9, 0),
                datetime(2014, 1, 1, 9, 0),
                datetime(2014, 1, 2, 9, 0),
                datetime(2014, 1, 2, 9, 0),
            ],
            "data1": np.arange(4, dtype="int64"),
            "data2": np.arange(4, dtype="int64"),
        }
    )
    df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
    df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))

    def pivot(**kwargs):
        # Dispatch to the method or the top-level function form.
        if method:
            return df.pivot(**kwargs)
        return pd.pivot(df, **kwargs)

    exp_index = pd.DatetimeIndex(
        ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific"
    )

    # No ``values``: both data columns appear under a two-level column index.
    exp_col1 = Index(["data1", "data1", "data2", "data2"])
    exp_col2 = pd.DatetimeIndex(
        ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo"
    )
    exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
    expected = DataFrame(
        [[0, 2, 0, 2], [1, 3, 1, 3]],
        index=exp_index,
        columns=exp_col,
    )
    pv = pivot(index="dt1", columns="dt2")
    tm.assert_frame_equal(pv, expected)

    # Single ``values`` column: flat column index.
    expected = DataFrame(
        [[0, 2], [1, 3]],
        index=exp_index,
        columns=pd.DatetimeIndex(
            ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo"
        ),
    )
    pv = pivot(index="dt1", columns="dt2", values="data1")
    tm.assert_frame_equal(pv, expected)
def test_pivot_tz_in_values(self):
# GH 14948
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
[
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
},
{
"uid": "aa",
"ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"),
},
]
)
df = df.set_index("ts").reset_index()
mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0))
result = pd.pivot_table(
df.set_index("ts").reset_index(),
values="ts",
index=["uid"],
columns=[mins],
aggfunc=np.min,
)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
[
[
pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
]
],
2021-01-30 22:29:33 +01:00
index=pd.Index(["aa"], name="uid"),
columns=pd.DatetimeIndex(
[
pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"),
pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"),
],
name="ts",
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_periods(self, method):
    """Pivot on Period-valued keys, with and without an explicit ``values``.

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    df = DataFrame(
        {
            "p1": [
                pd.Period("2013-01-01", "D"),
                pd.Period("2013-01-02", "D"),
                pd.Period("2013-01-01", "D"),
                pd.Period("2013-01-02", "D"),
            ],
            "p2": [
                pd.Period("2013-01", "M"),
                pd.Period("2013-01", "M"),
                pd.Period("2013-02", "M"),
                pd.Period("2013-02", "M"),
            ],
            "data1": np.arange(4, dtype="int64"),
            "data2": np.arange(4, dtype="int64"),
        }
    )

    def pivot(**kwargs):
        # Dispatch to the method or the top-level function form.
        if method:
            return df.pivot(**kwargs)
        return pd.pivot(df, **kwargs)

    exp_index = pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D")

    # No ``values``: both data columns appear under a two-level column index.
    exp_col1 = Index(["data1", "data1", "data2", "data2"])
    exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M")
    exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
    expected = DataFrame(
        [[0, 2, 0, 2], [1, 3, 1, 3]],
        index=exp_index,
        columns=exp_col,
    )
    pv = pivot(index="p1", columns="p2")
    tm.assert_frame_equal(pv, expected)

    # Single ``values`` column: flat column index.
    expected = DataFrame(
        [[0, 2], [1, 3]],
        index=exp_index,
        columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"),
    )
    pv = pivot(index="p1", columns="p2", values="data1")
    tm.assert_frame_equal(pv, expected)
def test_pivot_periods_with_margins(self):
# GH 28323
df = DataFrame(
{
"a": [1, 1, 2, 2],
"b": [
pd.Period("2019Q1"),
pd.Period("2019Q2"),
pd.Period("2019Q1"),
pd.Period("2019Q2"),
],
"x": 1.0,
}
)
expected = DataFrame(
data=1.0,
2021-01-30 22:29:33 +01:00
index=pd.Index([1, 2, "All"], name="a"),
columns=pd.Index(
[pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b"
),
)
result = df.pivot_table(index="a", columns="b", values="x", margins=True)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
    "values",
    [
        ["baz", "zoo"],
        np.array(["baz", "zoo"]),
        pd.Series(["baz", "zoo"]),
        pd.Index(["baz", "zoo"]),
    ],
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_list_like_values(self, values, method):
    """``values`` may be any list-like of column names (list, array, Series, Index).

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    # issue #17160
    df = pd.DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )

    if method:
        result = df.pivot(index="foo", columns="bar", values=values)
    else:
        result = pd.pivot(df, index="foo", columns="bar", values=values)

    data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]]
    index = Index(data=["one", "two"], name="foo")
    columns = MultiIndex(
        levels=[["baz", "zoo"], ["A", "B", "C"]],
        codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
        names=[None, "bar"],
    )
    expected = DataFrame(data=data, index=index, columns=columns, dtype="object")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "values",
    [
        ["bar", "baz"],
        np.array(["bar", "baz"]),
        pd.Series(["bar", "baz"]),
        pd.Index(["bar", "baz"]),
    ],
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_list_like_values_nans(self, values, method):
    """List-like ``values`` where the pivoted table has missing (NaN) cells.

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    # issue #17160
    df = pd.DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )

    if method:
        result = df.pivot(index="zoo", columns="foo", values=values)
    else:
        result = pd.pivot(df, index="zoo", columns="foo", values=values)

    data = [
        [np.nan, "A", np.nan, 4],
        [np.nan, "C", np.nan, 6],
        [np.nan, "B", np.nan, 5],
        ["A", np.nan, 1, np.nan],
        ["B", np.nan, 2, np.nan],
        ["C", np.nan, 3, np.nan],
    ]
    index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo")
    columns = MultiIndex(
        levels=[["bar", "baz"], ["one", "two"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
        names=[None, "foo"],
    )
    expected = DataFrame(data=data, index=index, columns=columns, dtype="object")
    tm.assert_frame_equal(result, expected)
def test_pivot_columns_none_raise_error(self):
# GH 30924
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]}
)
msg = r"pivot\(\) missing 1 required argument: 'columns'"
with pytest.raises(TypeError, match=msg):
df.pivot(index="col1", values="col3")
@pytest.mark.xfail(
    reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966"
)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_multiindex(self, method):
    """Pivot a frame with two-level columns using tuple column names (marked xfail).

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    # issue #17160
    rows = [
        ["one", "A", 1, "x"],
        ["one", "B", 2, "y"],
        ["one", "C", 3, "z"],
        ["two", "A", 4, "q"],
        ["two", "B", 5, "w"],
        ["two", "C", 6, "t"],
    ]
    two_level_columns = MultiIndex(
        levels=[["bar", "baz"], ["first", "second"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
    )
    df = DataFrame(
        data=rows,
        index=Index(data=[0, 1, 2, 3, 4, 5]),
        columns=two_level_columns,
        dtype="object",
    )

    pivot_kwargs = {
        "index": ("bar", "first"),
        "columns": ("bar", "second"),
        "values": ("baz", "first"),
    }
    if method:
        result = df.pivot(**pivot_kwargs)
    else:
        result = pd.pivot(df, **pivot_kwargs)

    row_labels = ["one", "two"]
    expected = DataFrame(
        {
            "A": Series([1, 4], index=row_labels),
            "B": Series([2, 5], index=row_labels),
            "C": Series([3, 6], index=row_labels),
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", [True, False])
def test_pivot_with_tuple_of_values(self, method):
    """A tuple passed as ``values`` raises KeyError for that tuple.

    ``method`` selects ``DataFrame.pivot`` (True) or ``pd.pivot`` (False).
    """
    # issue #17160
    df = pd.DataFrame(
        {
            "foo": ["one", "one", "one", "two", "two", "two"],
            "bar": ["A", "B", "C", "A", "B", "C"],
            "baz": [1, 2, 3, 4, 5, 6],
            "zoo": ["x", "y", "z", "q", "w", "t"],
        }
    )
    with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"):
        # tuple is seen as a single column name
        if method:
            df.pivot(index="zoo", columns="foo", values=("bar", "baz"))
        else:
            pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz"))
def test_margins(self):
def _check_output(
result, values_col, index=["A", "B"], columns=["C"], margins_col="All"
):
col_margins = result.loc[result.index[:-1], margins_col]
expected_col_margins = self.data.groupby(index)[values_col].mean()
tm.assert_series_equal(col_margins, expected_col_margins, check_names=False)
assert col_margins.name == margins_col
result = result.sort_index()
index_margins = result.loc[(margins_col, "")].iloc[:-1]
expected_ix_margins = self.data.groupby(columns)[values_col].mean()
tm.assert_series_equal(
index_margins, expected_ix_margins, check_names=False
)
assert index_margins.name == (margins_col, "")
grand_total_margins = result.loc[(margins_col, ""), margins_col]
expected_total_margins = self.data[values_col].mean()
assert grand_total_margins == expected_total_margins
# column specified
result = self.data.pivot_table(
values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean
)
_check_output(result, "D")
# Set a different margins_name (not 'All')
result = self.data.pivot_table(
values="D",
index=["A", "B"],
columns="C",
margins=True,
aggfunc=np.mean,
margins_name="Totals",
)
_check_output(result, "D", margins_col="Totals")
# no column specified
table = self.data.pivot_table(
index=["A", "B"], columns="C", margins=True, aggfunc=np.mean
)
for value_col in table.columns.levels[0]:
_check_output(table[value_col], value_col)
# no col
# to help with a buglet
self.data.columns = [k * 2 for k in self.data.columns]
table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean)
for value_col in table.columns:
totals = table.loc[("All", ""), value_col]
assert totals == self.data[value_col].mean()
table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean")
for item in ["DD", "EE", "FF"]:
totals = table.loc[("All", ""), item]
assert totals == self.data[item].mean()
@pytest.mark.parametrize(
    "columns, aggfunc, values, expected_columns",
    [
        (
            "A",
            np.mean,
            [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]],
            Index(["bar", "All", "foo", "All"], name="A"),
        ),
        (
            ["A", "B"],
            "sum",
            [[9, 13, 22, 5, 6, 11], [14, 18, 32, 11, 11, 22]],
            MultiIndex.from_tuples(
                [
                    ("bar", "one"),
                    ("bar", "two"),
                    ("bar", "All"),
                    ("foo", "one"),
                    ("foo", "two"),
                    ("foo", "All"),
                ],
                names=["A", "B"],
            ),
        ),
    ],
)
def test_margin_with_only_columns_defined(
    self, columns, aggfunc, values, expected_columns
):
    """pivot_table with ``margins=True`` and only ``columns`` given (no index)."""
    # GH 31016
    df = pd.DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
            "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
            "C": [
                "small",
                "large",
                "large",
                "small",
                "small",
                "large",
                "small",
                "small",
                "large",
            ],
            "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
            "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
        }
    )

    result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc)
    # The expected frame has one row per value column, "D" and "E".
    expected = pd.DataFrame(
        values, index=Index(["D", "E"]), columns=expected_columns
    )

    tm.assert_frame_equal(result, expected)
def test_margins_dtype(self):
# GH 17013
df = self.data.copy()
df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3)
mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")]
mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
expected = DataFrame(
{"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi
).rename_axis("C", axis=1)
expected["All"] = expected["dull"] + expected["shiny"]
result = df.pivot_table(
values="D",
index=["A", "B"],
columns="C",
margins=True,
aggfunc=np.sum,
fill_value=0,
)
tm.assert_frame_equal(expected, result)
@pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)")
def test_margins_dtype_len(self):
    """Counts via ``aggfunc=len`` with margins against an integer expected frame (marked xfail)."""
    index_tuples = list(product(["bar", "foo"], ["one", "two"]))
    index_tuples.append(("All", ""))
    expected_index = MultiIndex.from_tuples(index_tuples, names=("A", "B"))

    expected = DataFrame(
        {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]},
        index=expected_index,
    )
    expected = expected.rename_axis("C", axis=1)
    expected["All"] = expected["dull"] + expected["shiny"]

    result = self.data.pivot_table(
        values="D",
        index=["A", "B"],
        columns="C",
        margins=True,
        aggfunc=len,
        fill_value=0,
    )

    tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
def test_pivot_table_multiindex_only(self, cols):
    """pivot_table with only ``columns`` given as a pair of int and/or str labels."""
    # GH 17038
    first, second = cols[0], cols[1]
    frame = DataFrame({first: [1, 2, 3], second: [1, 2, 3], "v": [4, 5, 6]})

    pivoted = frame.pivot_table(values="v", columns=cols)

    wanted_columns = MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols)
    wanted = DataFrame([[4, 5, 6]], columns=wanted_columns, index=Index(["v"]))
    tm.assert_frame_equal(pivoted, wanted)
def test_pivot_table_retains_tz(self):
dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam")
df = DataFrame({"A": np.random.randn(3), "B": np.random.randn(3), "C": dti})
result = df.pivot_table(index=["B", "C"], dropna=False)
# check tz retention
assert result.index.levels[1].equals(dti)
def test_pivot_integer_columns(self):
# caused by upstream bug in unstack
d = date.min
data = list(
product(
["foo", "bar"],
["A", "B", "C"],
["x1", "x2"],
[d + timedelta(i) for i in range(20)],
[1.0],
)
)
df = DataFrame(data)
table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])
df2 = df.rename(columns=str)
table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"])
tm.assert_frame_equal(table, table2, check_names=False)
def test_pivot_no_level_overlap(self):
# GH #1181
data = DataFrame(
{
"a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2,
"b": [0, 0, 0, 0, 1, 1, 1, 1] * 2,
"c": (["foo"] * 4 + ["bar"] * 4) * 2,
"value": np.random.randn(16),
}
)
table = data.pivot_table("value", index="a", columns=["b", "c"])
grouped = data.groupby(["a", "b", "c"])["value"].mean()
expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all")
tm.assert_frame_equal(table, expected)
def test_pivot_columns_lexsorted(self):
n = 10000
dtype = np.dtype(
[
("Index", object),
("Symbol", object),
("Year", int),
("Month", int),
("Day", int),
("Quantity", int),
("Price", float),
]
)
products = np.array(
[
("SP500", "ADBE"),
("SP500", "NVDA"),
("SP500", "ORCL"),
("NDQ100", "AAPL"),
("NDQ100", "MSFT"),
("NDQ100", "GOOG"),
("FTSE", "DGE.L"),
("FTSE", "TSCO.L"),
("FTSE", "GSK.L"),
],
dtype=[("Index", object), ("Symbol", object)],
)
items = np.empty(n, dtype=dtype)
iproduct = np.random.randint(0, len(products), n)
items["Index"] = products["Index"][iproduct]
items["Symbol"] = products["Symbol"][iproduct]
dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31))
dates = dr[np.random.randint(0, len(dr), n)]
items["Year"] = dates.year
items["Month"] = dates.month
items["Day"] = dates.day
items["Price"] = np.random.lognormal(4.0, 2.0, n)
df = DataFrame(items)
pivoted = df.pivot_table(
"Price",
index=["Month", "Day"],
columns=["Index", "Symbol", "Year"],
aggfunc="mean",
)
assert pivoted.columns.is_monotonic
def test_pivot_complex_aggfunc(self):
f = {"D": ["std"], "E": ["sum"]}
expected = self.data.groupby(["A", "B"]).agg(f).unstack("B")
result = self.data.pivot_table(index="A", columns="B", aggfunc=f)
tm.assert_frame_equal(result, expected)
def test_margins_no_values_no_cols(self):
# Regression test on pivot table: no values or cols passed.
result = self.data[["A", "B"]].pivot_table(
index=["A", "B"], aggfunc=len, margins=True
)
result_list = result.tolist()
assert sum(result_list[:-1]) == result_list[-1]
def test_margins_no_values_two_rows(self):
# Regression test on pivot table: no values passed but rows are a
# multi-index
result = self.data[["A", "B", "C"]].pivot_table(
index=["A", "B"], columns="C", aggfunc=len, margins=True
)
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
def test_margins_no_values_one_row_one_col(self):
# Regression test on pivot table: no values passed but row and col
# defined
result = self.data[["A", "B"]].pivot_table(
index="A", columns="B", aggfunc=len, margins=True
)
assert result.All.tolist() == [4.0, 7.0, 11.0]
def test_margins_no_values_two_row_two_cols(self):
# Regression test on pivot table: no values passed but rows and cols
# are multi-indexed
self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
result = self.data[["A", "B", "C", "D"]].pivot_table(
index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True
)
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
@pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]])
def test_pivot_table_with_margins_set_margin_name(self, margin_name):
    """Each parametrized ``margins_name`` raises ValueError for three axis layouts."""
    # see gh-3335
    msg = (
        f'Conflicting name "{margin_name}" in margins|'
        "margins_name argument must be a string"
    )
    layouts = [
        # multi-index index
        (["A", "B"], ["C"]),
        # multi-index column
        (["C"], ["A", "B"]),
        # non-multi-index index/column
        (["A"], ["B"]),
    ]
    for index_keys, column_keys in layouts:
        with pytest.raises(ValueError, match=msg):
            pivot_table(
                self.data,
                values="D",
                index=index_keys,
                columns=column_keys,
                margins=True,
                margins_name=margin_name,
            )
def test_pivot_timegrouper(self):
    # Exercise pivot_table with time-based Groupers on either axis: given by
    # frequency alone, by ``key``, by ``level``, and with two Groupers at once.
    def summed(frame, index, columns):
        # Every pivot in this test sums "Quantity" with np.sum.
        return pivot_table(
            frame,
            index=index,
            columns=columns,
            values="Quantity",
            aggfunc=np.sum,
        )

    nan = np.nan
    buyers = Index("Carl Joe Mark".split(), name="Buyer")

    df = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 1, 1),
                datetime(2013, 1, 1),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 12, 2),
                datetime(2013, 12, 2),
            ],
        }
    ).set_index("Date")

    # annual frequency, Grouper applied to the index
    expected = DataFrame(
        np.array([[10, 18, 3]], dtype="int64"),
        index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A", name="Date"),
        columns=buyers,
    )
    tm.assert_frame_equal(summed(df, Grouper(freq="A"), "Buyer"), expected)
    tm.assert_frame_equal(summed(df, "Buyer", Grouper(freq="A")), expected.T)

    # six-month frequency; this expectation is reused by the key/level cases
    expected = DataFrame(
        np.array([[1, nan, 3], [9, 18, nan]]),
        index=pd.DatetimeIndex(
            [datetime(2013, 1, 1), datetime(2013, 7, 1)], freq="6MS", name="Date"
        ),
        columns=buyers,
    )
    tm.assert_frame_equal(summed(df, Grouper(freq="6MS"), "Buyer"), expected)
    tm.assert_frame_equal(summed(df, "Buyer", Grouper(freq="6MS")), expected.T)

    # passing the name
    df = df.reset_index()
    tm.assert_frame_equal(
        summed(df, Grouper(freq="6MS", key="Date"), "Buyer"), expected
    )
    tm.assert_frame_equal(
        summed(df, "Buyer", Grouper(freq="6MS", key="Date")), expected.T
    )

    msg = "'The grouper name foo is not found'"
    with pytest.raises(KeyError, match=msg):
        summed(df, Grouper(freq="6MS", key="foo"), "Buyer")
    with pytest.raises(KeyError, match=msg):
        summed(df, "Buyer", Grouper(freq="6MS", key="foo"))

    # passing the level
    df = df.set_index("Date")
    tm.assert_frame_equal(
        summed(df, Grouper(freq="6MS", level="Date"), "Buyer"), expected
    )
    tm.assert_frame_equal(
        summed(df, "Buyer", Grouper(freq="6MS", level="Date")), expected.T
    )

    msg = "The level foo is not valid"
    with pytest.raises(ValueError, match=msg):
        summed(df, Grouper(freq="6MS", level="foo"), "Buyer")
    with pytest.raises(ValueError, match=msg):
        summed(df, "Buyer", Grouper(freq="6MS", level="foo"))

    # double grouper
    df = DataFrame(
        {
            "Branch": "A A A A A A A B".split(),
            "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
            "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
            "Date": [
                datetime(2013, 11, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 11, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 12, 5, 14, 0),
            ],
            "PayDay": [
                datetime(2013, 10, 4, 0, 0),
                datetime(2013, 10, 15, 13, 5),
                datetime(2013, 9, 5, 20, 0),
                datetime(2013, 11, 2, 10, 0),
                datetime(2013, 10, 7, 20, 0),
                datetime(2013, 9, 5, 10, 0),
                datetime(2013, 12, 30, 12, 0),
                datetime(2013, 11, 20, 14, 0),
            ],
        }
    )
    sep = datetime(2013, 9, 30)
    oct_ = datetime(2013, 10, 31)
    nov = datetime(2013, 11, 30)
    dec = datetime(2013, 12, 31)
    month_ends = [sep, oct_, nov, dec]

    # one monthly Grouper per axis
    expected = DataFrame(
        np.array(
            [
                [nan, 3, nan, nan],
                [6, nan, 1, 9],
                [nan, 9, nan, nan],
                [nan, nan, 3, nan],
            ]
        ),
        index=pd.DatetimeIndex(month_ends, freq="M", name="Date"),
        columns=pd.DatetimeIndex(month_ends, freq="M", name="PayDay"),
    )
    result = summed(
        df, Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")
    )
    tm.assert_frame_equal(result, expected)
    result = summed(
        df, Grouper(freq="M", key="PayDay"), Grouper(freq="M", key="Date")
    )
    tm.assert_frame_equal(result, expected.T)

    # both monthly Groupers on the same axis
    tuples = [
        (sep, oct_),
        (oct_, sep),
        (oct_, nov),
        (oct_, dec),
        (nov, oct_),
        (dec, nov),
    ]
    idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"])
    expected = DataFrame(
        np.array([[3, nan], [6, nan], [1, nan], [9, nan], [9, nan], [nan, 3]]),
        index=idx,
        columns=Index(["A", "B"], name="Branch"),
    )
    result = summed(
        df,
        [Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")],
        ["Branch"],
    )
    tm.assert_frame_equal(result, expected)
    result = summed(
        df,
        ["Branch"],
        [Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")],
    )
    tm.assert_frame_equal(result, expected.T)
def test_pivot_datetime_tz(self):
dates1 = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
dates2 = [
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"dt1": dates1,
"dt2": dates2,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))
exp_idx = pd.DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
tz="US/Pacific",
name="dt1",
)
exp_col1 = Index(["value1", "value1"])
exp_col2 = Index(["a", "b"], name="label")
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col)
result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
tm.assert_frame_equal(result, expected)
exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"])
exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2)
exp_col3 = pd.DatetimeIndex(
["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4,
tz="Asia/Tokyo",
name="dt2",
)
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
expected = DataFrame(
np.array(
[
[0, 3, 1, 2, 0, 3, 1, 2],
[1, 4, 2, 1, 1, 4, 2, 1],
[2, 5, 1, 2, 2, 5, 1, 2],
],
dtype="int64",
),
index=exp_idx,
columns=exp_col,
)
result = pivot_table(
df,
index=["dt1"],
columns=["dt2"],
values=["value1", "value2"],
aggfunc=[np.sum, np.mean],
)
tm.assert_frame_equal(result, expected)
def test_pivot_dtaccessor(self):
# GH 8103
dates1 = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
dates2 = [
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-01-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
"2013-02-01 15:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"dt1": dates1,
"dt2": dates2,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d))
df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d))
result = pivot_table(
df, index="label", columns=df["dt1"].dt.hour, values="value1"
)
exp_idx = Index(["a", "b"], name="label")
expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
index=exp_idx,
columns=Index([7, 8, 9], name="dt1"),
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1"
)
expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
index=Index([1, 2], name="dt2"),
columns=Index([7, 8, 9], name="dt1"),
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df,
index=df["dt2"].dt.year.values,
columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
values="value1",
)
exp_col = MultiIndex.from_arrays(
[[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"]
)
expected = DataFrame(
np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col
)
tm.assert_frame_equal(result, expected)
result = pivot_table(
df,
index=np.array(["X", "X", "X", "X", "Y", "Y"]),
columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
values="value1",
)
expected = DataFrame(
np.array(
[[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]
),
index=["X", "Y"],
columns=exp_col,
)
tm.assert_frame_equal(result, expected)
def test_daily(self):
rng = date_range("1/1/2000", "12/31/2004", freq="D")
ts = Series(np.random.randn(len(rng)), index=rng)
annual = pivot_table(
DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear
)
annual.columns = annual.columns.droplevel(0)
doy = np.asarray(ts.index.dayofyear)
for i in range(1, 367):
subset = ts[doy == i]
subset.index = subset.index.year
result = annual[i].dropna()
tm.assert_series_equal(result, subset, check_names=False)
assert result.name == i
def test_monthly(self):
rng = date_range("1/1/2000", "12/31/2004", freq="M")
ts = Series(np.random.randn(len(rng)), index=rng)
2021-01-30 22:29:33 +01:00
annual = pivot_table(
pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month
)
annual.columns = annual.columns.droplevel(0)
month = ts.index.month
for i in range(1, 13):
subset = ts[month == i]
subset.index = subset.index.year
result = annual[i].dropna()
tm.assert_series_equal(result, subset, check_names=False)
assert result.name == i
def test_pivot_table_with_iterator_values(self):
# GH 12017
aggs = {"D": "sum", "E": "mean"}
pivot_values_list = pd.pivot_table(
self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs
)
pivot_values_keys = pd.pivot_table(
self.data, index=["A"], values=aggs.keys(), aggfunc=aggs
)
tm.assert_frame_equal(pivot_values_keys, pivot_values_list)
agg_values_gen = (value for value in aggs.keys())
pivot_values_gen = pd.pivot_table(
self.data, index=["A"], values=agg_values_gen, aggfunc=aggs
)
tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
def test_pivot_table_margins_name_with_aggfunc_list(self):
# GH 13354
margins_name = "Weekly"
2021-01-30 22:29:33 +01:00
costs = pd.DataFrame(
{
"item": ["bacon", "cheese", "bacon", "cheese"],
"cost": [2.5, 4.5, 3.2, 3.3],
"day": ["M", "M", "T", "T"],
}
)
table = costs.pivot_table(
index="item",
columns="day",
margins=True,
margins_name=margins_name,
aggfunc=[np.mean, max],
)
2021-01-30 22:29:33 +01:00
ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item")
tups = [
("mean", "cost", "M"),
("mean", "cost", "T"),
("mean", "cost", margins_name),
("max", "cost", "M"),
("max", "cost", "T"),
("max", "cost", margins_name),
]
2021-01-30 22:29:33 +01:00
cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"])
expected = pd.DataFrame(table.values, index=ix, columns=cols)
tm.assert_frame_equal(table, expected)
@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
def test_categorical_margins(self, observed):
    # GH 10989
    # Mean of "x" by "y" and "z" with margins; ``observed`` is forwarded as
    # the ``dropna`` argument.
    df = DataFrame(
        {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
    )

    expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
    expected.index = Index([0, 1, "All"], name="y")
    expected.columns = Index([0, 1, "All"], name="z")

    table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
    tm.assert_frame_equal(table, expected)
@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
def test_categorical_margins_category(self, observed):
    # Same table as the non-categorical margins test, but with the "y" and
    # "z" keys converted to category dtype before pivoting.
    df = DataFrame(
        {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
    )

    expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
    expected.index = Index([0, 1, "All"], name="y")
    expected.columns = Index([0, 1, "All"], name="z")

    df.y = df.y.astype("category")
    df.z = df.z.astype("category")
    table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
    tm.assert_frame_equal(table, expected)
def test_margins_casted_to_float(self, observed):
# GH 24893
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
"A": [2, 4, 6, 8],
"B": [1, 4, 5, 8],
"C": [1, 3, 4, 6],
"D": ["X", "X", "Y", "Y"],
}
)
result = pd.pivot_table(df, index="D", margins=True)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
{"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]},
2021-01-30 22:29:33 +01:00
index=pd.Index(["X", "Y", "All"], name="D"),
)
tm.assert_frame_equal(result, expected)
def test_pivot_with_categorical(self, observed, ordered):
# gh-21370
idx = [np.nan, "low", "high", "low", np.nan]
col = [np.nan, "A", "B", np.nan, "A"]
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
2021-01-30 22:29:33 +01:00
"In": pd.Categorical(idx, categories=["low", "high"], ordered=ordered),
"Col": pd.Categorical(col, categories=["A", "B"], ordered=ordered),
"Val": range(1, 6),
}
)
# case with index/columns/value
result = df.pivot_table(
index="In", columns="Col", values="Val", observed=observed
)
expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col")
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols
)
expected.index = Index(
2021-01-30 22:29:33 +01:00
pd.Categorical(
["low", "high"], categories=["low", "high"], ordered=ordered
),
name="In",
)
tm.assert_frame_equal(result, expected)
# case with columns/value
result = df.pivot_table(columns="Col", values="Val", observed=observed)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"])
)
tm.assert_frame_equal(result, expected)
def test_categorical_aggfunc(self, observed):
# GH 9534
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]}
)
df["C1"] = df["C1"].astype("category")
result = df.pivot_table(
"V", index="C1", columns="C2", dropna=observed, aggfunc="count"
)
expected_index = pd.CategoricalIndex(
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
2021-01-30 22:29:33 +01:00
expected_columns = pd.Index(["a", "b"], name="C2")
expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
expected = pd.DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
tm.assert_frame_equal(result, expected)
def test_categorical_pivot_index_ordering(self, observed):
# GH 8731
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
"Sales": [100, 120, 220],
"Month": ["January", "January", "January"],
"Year": [2013, 2014, 2013],
}
)
months = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
df["Month"] = df["Month"].astype("category").cat.set_categories(months)
result = df.pivot_table(
values="Sales",
index="Month",
columns="Year",
2021-01-30 22:29:33 +01:00
dropna=observed,
aggfunc="sum",
)
expected_columns = pd.Int64Index([2013, 2014], name="Year")
expected_index = pd.CategoricalIndex(
2021-01-30 22:29:33 +01:00
["January"], categories=months, ordered=False, name="Month"
)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(
[[320, 120]], index=expected_index, columns=expected_columns
)
2021-01-30 22:29:33 +01:00
if not observed:
result = result.dropna().astype(np.int64)
tm.assert_frame_equal(result, expected)
def test_pivot_table_not_series(self):
# GH 4386
# pivot_table always returns a DataFrame
# when values is not list like and columns is None
# and aggfunc is not instance of list
df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]})
result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum)
m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"])
expected = DataFrame([3, 4, 5], index=m, columns=["col1"])
tm.assert_frame_equal(result, expected)
result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum)
expected = DataFrame(
[[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]],
index=Index([1, 3, 9], name="col3"),
columns=Index(["C", "D", "E"], name="col2"),
)
tm.assert_frame_equal(result, expected)
result = df.pivot_table("col1", index="col3", aggfunc=[np.sum])
m = MultiIndex.from_arrays([["sum"], ["col1"]])
expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m)
tm.assert_frame_equal(result, expected)
def test_pivot_margins_name_unicode(self):
# issue #13292
greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae"
2021-01-30 22:29:33 +01:00
frame = pd.DataFrame({"foo": [1, 2, 3]})
table = pd.pivot_table(
frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek
)
2021-01-30 22:29:33 +01:00
index = pd.Index([1, 2, 3, greek], dtype="object", name="foo")
expected = pd.DataFrame(index=index)
tm.assert_frame_equal(table, expected)
def test_pivot_string_as_func(self):
# GH #18713
# for correctness purposes
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": range(11),
}
)
result = pivot_table(data, index="A", columns="B", aggfunc="sum")
mi = MultiIndex(
levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"]
)
expected = DataFrame(
{("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}},
columns=mi,
).rename_axis("A")
tm.assert_frame_equal(result, expected)
result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"])
mi = MultiIndex(
levels=[["sum", "mean"], ["C"], ["one", "two"]],
codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]],
names=[None, None, "B"],
)
expected = DataFrame(
{
("mean", "C", "one"): {"bar": 5.0, "foo": 3.25},
("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667},
("sum", "C", "one"): {"bar": 15, "foo": 13},
("sum", "C", "two"): {"bar": 7, "foo": 20},
},
columns=mi,
).rename_axis("A")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "f, f_numpy",
    [
        ("sum", np.sum),
        ("mean", np.mean),
        ("std", np.std),
        (["sum", "mean"], [np.sum, np.mean]),
        (["sum", "std"], [np.sum, np.std]),
        (["std", "mean"], [np.std, np.mean]),
    ],
)
def test_pivot_string_func_vs_func(self, f, f_numpy):
    # GH #18713
    # for consistency purposes
    # The aggregation named by string must give the same table as its numpy
    # counterpart.
    layout = {"index": "A", "columns": "B"}
    by_name = pivot_table(self.data, aggfunc=f, **layout)
    by_callable = pivot_table(self.data, aggfunc=f_numpy, **layout)
    tm.assert_frame_equal(by_name, by_callable)
@pytest.mark.slow
def test_pivot_number_of_levels_larger_than_int32(self):
    # GH 20601
    # 2 ** 16 distinct labels on each axis; the pivot is expected to raise
    # the overflow error below.
    size = 2 ** 16
    df = DataFrame({"ind1": np.arange(size), "ind2": np.arange(size), "count": 0})

    msg = "Unstacked DataFrame is too big, causing int32 overflow"
    with pytest.raises(ValueError, match=msg):
        df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count")
def test_pivot_table_aggfunc_dropna(self, dropna):
# GH 22159
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
"fruit": ["apple", "peach", "apple"],
"size": [1, 1, 2],
"taste": [7, 6, 6],
}
)
def ret_one(x):
return 1
def ret_sum(x):
return sum(x)
def ret_none(x):
return np.nan
result = pd.pivot_table(
df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna
)
data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]]
2021-01-30 22:29:33 +01:00
col = pd.MultiIndex.from_product(
[["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]],
names=[None, "fruit"],
)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame(data, index=["size", "taste"], columns=col)
if dropna:
expected = expected.dropna(axis="columns")
tm.assert_frame_equal(result, expected)
def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
# GH 22159
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]}
)
result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna)
data = [[2.5, np.nan], [1, np.nan]]
2021-01-30 22:29:33 +01:00
col = pd.Index(["one", "two"], name="A")
expected = pd.DataFrame(data, index=["x", "y"], columns=col)
if dropna:
expected = expected.dropna(axis="columns")
tm.assert_frame_equal(result, expected)
def test_pivot_table_empty_aggfunc(self):
# GH 9186
2021-01-30 22:29:33 +01:00
df = pd.DataFrame(
{
"A": [2, 2, 3, 3, 2],
"id": [5, 6, 7, 8, 9],
"C": ["p", "q", "q", "p", "q"],
"D": [None, None, None, None, None],
}
)
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
2021-01-30 22:29:33 +01:00
expected = pd.DataFrame()
tm.assert_frame_equal(result, expected)
def test_pivot_table_no_column_raises(self):
    # GH 10326
    # Asking for a values column that does not exist must raise a KeyError
    # naming that column, even with a user-defined aggfunc.
    def agg(arr):
        return np.mean(arr)

    foo = DataFrame({"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]})
    with pytest.raises(KeyError, match="notpresent"):
        foo.pivot_table("notpresent", "X", "Y", aggfunc=agg)