from datetime import date, datetime, timedelta from itertools import product import numpy as np import pytest import pandas as pd from pandas import ( Categorical, DataFrame, Grouper, Index, MultiIndex, Series, concat, date_range, ) import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import pivot_table @pytest.fixture(params=[True, False]) def dropna(request): return request.param @pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) def interval_values(request, closed): left, right = request.param return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) class TestPivotTable: def setup_method(self, method): self.data = DataFrame( { "A": [ "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo", ], "B": [ "one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one", ], "C": [ "dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny", ], "D": np.random.randn(11), "E": np.random.randn(11), "F": np.random.randn(11), } ) def test_pivot_table(self, observed): index = ["A", "B"] columns = "C" table = pivot_table( self.data, values="D", index=index, columns=columns, observed=observed ) table2 = self.data.pivot_table( values="D", index=index, columns=columns, observed=observed ) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values="D", index=index, observed=observed) if len(index) > 1: assert table.index.names == tuple(index) else: assert table.index.name == index[0] if len(columns) > 1: assert table.columns.names == columns else: assert table.columns.name == columns[0] expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_categorical_observed_equal(self, observed): # issue #24923 df = pd.DataFrame( {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} ) expected = df.pivot_table( index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 ) expected.index = expected.index.astype("category") expected.columns = expected.columns.astype("category") df.col1 = df.col1.astype("category") df.col2 = df.col2.astype("category") result = df.pivot_table( index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0, observed=observed, ) tm.assert_frame_equal(result, expected) def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) rs = df.pivot_table(columns="cols", aggfunc=np.sum) xp = df.pivot_table(index="cols", aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T tm.assert_frame_equal(rs, xp) def test_pivot_table_dropna(self): df = DataFrame( { "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000}, "customer": {0: "A", 1: "A", 2: "B", 3: "C"}, "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310}, "product": {0: "a", 1: "b", 2: "c", 3: "d"}, "quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}, } ) pv_col = df.pivot_table( "quantity", "month", ["customer", "product"], dropna=False ) pv_ind = df.pivot_table( "quantity", ["customer", "product"], "month", dropna=False ) m = MultiIndex.from_tuples( [ ("A", "a"), ("A", "b"), ("A", "c"), ("A", "d"), ("B", "a"), ("B", "b"), ("B", "c"), ("B", "d"), ("C", "a"), ("C", "b"), ("C", "c"), ("C", "d"), ], names=["customer", "product"], ) tm.assert_index_equal(pv_col.columns, m) tm.assert_index_equal(pv_ind.index, m) def test_pivot_table_categorical(self): cat1 = Categorical( ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True ) cat2 = Categorical( ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) tm.assert_frame_equal(result, expected) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ["a", "b", "c", "d"] df = DataFrame( { "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], "C": range(0, 9), } ) df["A"] = df["A"].astype(CDT(categories, ordered=False)) result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = Series(["a", "b", "c"], name="A") expected_columns = expected_columns.astype(CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( [[0, 3, 6], [1, 4, 7], [2, 5, 8]], index=expected_index, columns=expected_columns, ) if not dropna: # add back the non observed to compare expected = expected.reindex(columns=Categorical(categories)).astype("float") tm.assert_frame_equal(result, expected) def test_pivot_with_non_observable_dropna(self, dropna): # gh-21133 df = pd.DataFrame( { "A": pd.Categorical( [np.nan, "low", "high", "low", "high"], categories=["low", "high"], ordered=True, ), "B": range(5), } ) result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( {"B": [2, 3]}, index=pd.Index( pd.Categorical.from_codes( [0, 1], categories=["low", "high"], ordered=True ), name="A", ), ) tm.assert_frame_equal(result, expected) # gh-21378 df = pd.DataFrame( { "A": pd.Categorical( ["left", "low", "high", "low", "high"], categories=["low", "high", "left"], ordered=True, ), "B": range(5), } ) result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( {"B": [2, 3, 0]}, index=pd.Index( pd.Categorical.from_codes( [0, 1, 2], categories=["low", "high", "left"], ordered=True ), name="A", ), ) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index_margins(self): # GH 25815 ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2]) df = DataFrame( { "A": np.arange(4, 0, -1, dtype=np.intp), "B": ["a", "b", "a", "b"], "C": pd.Categorical(ordered_cat, ordered=True).sort_values( ascending=False ), } ) pivot_tab = pd.pivot_table( df, index="C", columns="B", values="A", aggfunc="sum", margins=True ) result = pivot_tab["All"] expected = Series( [3, 7, 10], index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"), name="All", dtype=np.intp, ) tm.assert_series_equal(result, expected) def test_pass_array(self): result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) expected = self.data.pivot_table("D", index="A", columns="C") tm.assert_frame_equal(result, expected) def test_pass_function(self): result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C) expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C") tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): index = ["A", "B"] columns = "C" table = pivot_table(self.data, index=index, columns=columns) expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_dtypes(self): # can convert dtypes f = DataFrame( { "a": ["cat", "bat", "cat", "bat"], "v": [1, 2, 3, 4], "i": ["a", "b", "a", "b"], } ) assert f.dtypes["v"] == "int64" z = pivot_table( f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum ) result = z.dtypes expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) # cannot convert dtypes f = DataFrame( { "a": ["cat", "bat", "cat", "bat"], "v": [1.5, 2.5, 3.5, 4.5], "i": ["a", "b", "a", "b"], } ) assert f.dtypes["v"] == "float64" z = pivot_table( f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean ) result = z.dtypes expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "columns,values", [ ("bool1", ["float1", "float2"]), ("bool1", ["float1", "float2", "bool1"]), ("bool2", ["float1", "float2", "bool1"]), ], ) def test_pivot_preserve_dtypes(self, columns, values): # GH 7142 regression test v = np.arange(5, dtype=np.float64) df = DataFrame( {"float1": v, "float2": v + 2.0, "bool1": v <= 2, "bool2": v <= 3} ) df_res = df.reset_index().pivot_table( index="index", columns=columns, values=values ) result = dict(df_res.dtypes) expected = { col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") for col in df_res } assert result == expected def test_pivot_no_values(self): # GH 14380 idx = pd.DatetimeIndex( ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"] ) df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) res = df.pivot_table(index=df.index.month, columns=df.index.day) exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)]) exp = pd.DataFrame( [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns ) tm.assert_frame_equal(res, exp) df = pd.DataFrame( { "A": [1, 2, 3, 4, 5], "dt": pd.date_range("2011-01-01", freq="D", periods=5), }, index=idx, ) res = df.pivot_table( index=df.index.month, columns=pd.Grouper(key="dt", freq="M") ) exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) exp_columns.names = [None, "dt"] exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) tm.assert_frame_equal(res, exp) res = df.pivot_table( index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M") ) exp = pd.DataFrame( [3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns ) tm.assert_frame_equal(res, exp) def test_pivot_multi_values(self): result = pivot_table( self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 ) expected = pivot_table( self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 ) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): f = lambda func: pivot_table( self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? f = lambda func: pivot_table( self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func, margins=True, ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", [True, False]) def test_pivot_index_with_nan(self, method): # GH 3588 nan = np.nan df = DataFrame( { "a": ["R1", "R2", nan, "R4"], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, 17, 20], } ) if method: result = df.pivot("a", "b", "c") else: result = pd.pivot(df, "a", "b", "c") expected = DataFrame( [ [nan, nan, 17, nan], [10, nan, nan, nan], [nan, 15, nan, nan], [nan, nan, nan, 20], ], index=Index([nan, "R1", "R2", "R4"], name="a"), columns=Index(["C1", "C2", "C3", "C4"], name="b"), ) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) # GH9491 df = DataFrame( { "a": pd.date_range("2014-02-01", periods=6, freq="D"), "c": 100 + np.arange(6), } ) df["b"] = df["a"] - pd.Timestamp("2014-02-02") df.loc[1, "a"] = df.loc[3, "a"] = nan df.loc[1, "b"] = df.loc[4, "b"] = nan if method: pv = df.pivot("a", "b", "c") else: pv = pd.pivot(df, "a", "b", "c") assert pv.notna().values.sum() == len(df) for _, row in df.iterrows(): assert pv.loc[row["a"], row["b"]] == row["c"] if method: result = df.pivot("b", "a", "c") else: result = pd.pivot(df, "b", "a", "c") tm.assert_frame_equal(result, pv.T) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tz(self, method): # GH 5878 df = DataFrame( { "dt1": [ datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0), datetime(2013, 1, 1, 9, 0), datetime(2013, 1, 2, 9, 0), ], "dt2": [ datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 1, 9, 0), datetime(2014, 1, 2, 9, 0), datetime(2014, 1, 2, 9, 0), ], "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.DatetimeIndex( ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" ) exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], index=pd.DatetimeIndex( ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" ), columns=exp_col, ) if method: pv = df.pivot(index="dt1", columns="dt2") else: pv = pd.pivot(df, index="dt1", columns="dt2") tm.assert_frame_equal(pv, expected) expected = DataFrame( [[0, 2], [1, 3]], index=pd.DatetimeIndex( ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" ), columns=pd.DatetimeIndex( ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" ), ) if method: pv = df.pivot(index="dt1", columns="dt2", values="data1") else: pv = pd.pivot(df, index="dt1", columns="dt2", values="data1") tm.assert_frame_equal(pv, expected) def test_pivot_tz_in_values(self): # GH 14948 df = pd.DataFrame( [ { "uid": "aa", "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), }, { "uid": "aa", "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), }, { "uid": "aa", "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), }, { "uid": "aa", "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), }, { "uid": "aa", "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), }, ] ) df = df.set_index("ts").reset_index() mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)) result = pd.pivot_table( df.set_index("ts").reset_index(), values="ts", index=["uid"], columns=[mins], aggfunc=np.min, ) expected = pd.DataFrame( [ [ pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), ] ], index=pd.Index(["aa"], name="uid"), columns=pd.DatetimeIndex( [ pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"), pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"), ], name="ts", ), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", [True, False]) def test_pivot_periods(self, method): df = DataFrame( { "p1": [ pd.Period("2013-01-01", "D"), pd.Period("2013-01-02", "D"), pd.Period("2013-01-01", "D"), pd.Period("2013-01-02", "D"), ], "p2": [ pd.Period("2013-01", "M"), pd.Period("2013-01", "M"), pd.Period("2013-02", "M"), pd.Period("2013-02", "M"), ], "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M") exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), columns=exp_col, ) if method: pv = df.pivot(index="p1", columns="p2") else: pv = pd.pivot(df, index="p1", columns="p2") tm.assert_frame_equal(pv, expected) expected = DataFrame( [[0, 2], [1, 3]], index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"), ) if method: pv = df.pivot(index="p1", columns="p2", values="data1") else: pv = pd.pivot(df, index="p1", columns="p2", values="data1") tm.assert_frame_equal(pv, expected) def test_pivot_periods_with_margins(self): # GH 28323 df = DataFrame( { "a": [1, 1, 2, 2], "b": [ pd.Period("2019Q1"), pd.Period("2019Q2"), pd.Period("2019Q1"), pd.Period("2019Q2"), ], "x": 1.0, } ) expected = DataFrame( data=1.0, index=pd.Index([1, 2, "All"], name="a"), columns=pd.Index( [pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b" ), ) result = df.pivot_table(index="a", columns="b", values="x", margins=True) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "values", [ ["baz", "zoo"], np.array(["baz", "zoo"]), pd.Series(["baz", "zoo"]), pd.Index(["baz", "zoo"]), ], ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values(self, values, method): # issue #17160 df = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } ) if method: result = df.pivot(index="foo", columns="bar", values=values) else: result = pd.pivot(df, index="foo", columns="bar", values=values) data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]] index = Index(data=["one", "two"], name="foo") columns = MultiIndex( levels=[["baz", "zoo"], ["A", "B", "C"]], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=[None, "bar"], ) expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "values", [ ["bar", "baz"], np.array(["bar", "baz"]), pd.Series(["bar", "baz"]), pd.Index(["bar", "baz"]), ], ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values_nans(self, values, method): # issue #17160 df = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } ) if method: result = df.pivot(index="zoo", columns="foo", values=values) else: result = pd.pivot(df, index="zoo", columns="foo", values=values) data = [ [np.nan, "A", np.nan, 4], [np.nan, "C", np.nan, 6], [np.nan, "B", np.nan, 5], ["A", np.nan, 1, np.nan], ["B", np.nan, 2, np.nan], ["C", np.nan, 3, np.nan], ] index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo") columns = MultiIndex( levels=[["bar", "baz"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[None, "foo"], ) expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) def test_pivot_columns_none_raise_error(self): # GH 30924 df = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]} ) msg = r"pivot\(\) missing 1 required argument: 'columns'" with pytest.raises(TypeError, match=msg): df.pivot(index="col1", values="col3") @pytest.mark.xfail( reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" ) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_multiindex(self, method): # issue #17160 index = Index(data=[0, 1, 2, 3, 4, 5]) data = [ ["one", "A", 1, "x"], ["one", "B", 2, "y"], ["one", "C", 3, "z"], ["two", "A", 4, "q"], ["two", "B", 5, "w"], ["two", "C", 6, "t"], ] columns = MultiIndex( levels=[["bar", "baz"], ["first", "second"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], ) df = DataFrame(data=data, index=index, columns=columns, dtype="object") if method: result = df.pivot( index=("bar", "first"), columns=("bar", "second"), values=("baz", "first"), ) else: result = pd.pivot( df, index=("bar", "first"), columns=("bar", "second"), values=("baz", "first"), ) data = { "A": Series([1, 4], index=["one", "two"]), "B": Series([2, 5], index=["one", "two"]), "C": Series([3, 6], index=["one", "two"]), } expected = DataFrame(data) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tuple_of_values(self, method): # issue #17160 df = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], "bar": ["A", "B", "C", "A", "B", "C"], "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } ) with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"): # tuple is seen as a single column name if method: df.pivot(index="zoo", columns="foo", values=("bar", "baz")) else: pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz")) def test_margins(self): def _check_output( result, values_col, index=["A", "B"], columns=["C"], margins_col="All" ): col_margins = result.loc[result.index[:-1], margins_col] expected_col_margins = self.data.groupby(index)[values_col].mean() tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) assert col_margins.name == margins_col result = result.sort_index() index_margins = result.loc[(margins_col, "")].iloc[:-1] expected_ix_margins = self.data.groupby(columns)[values_col].mean() tm.assert_series_equal( index_margins, expected_ix_margins, check_names=False ) assert index_margins.name == (margins_col, "") grand_total_margins = result.loc[(margins_col, ""), margins_col] expected_total_margins = self.data[values_col].mean() assert grand_total_margins == expected_total_margins # column specified result = self.data.pivot_table( values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean ) _check_output(result, "D") # Set a different margins_name (not 'All') result = self.data.pivot_table( values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean, margins_name="Totals", ) _check_output(result, "D", margins_col="Totals") # no column specified table = self.data.pivot_table( index=["A", "B"], columns="C", margins=True, aggfunc=np.mean ) for value_col in table.columns.levels[0]: _check_output(table[value_col], value_col) # no col # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) for value_col in table.columns: totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() @pytest.mark.parametrize( "columns, aggfunc, values, expected_columns", [ ( "A", np.mean, [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]], Index(["bar", "All", "foo", "All"], name="A"), ), ( ["A", "B"], "sum", [[9, 13, 22, 5, 6, 11], [14, 18, 32, 11, 11, 22]], MultiIndex.from_tuples( [ ("bar", "one"), ("bar", "two"), ("bar", "All"), ("foo", "one"), ("foo", "two"), ("foo", "All"), ], names=["A", "B"], ), ), ], ) def test_margin_with_only_columns_defined( self, columns, aggfunc, values, expected_columns ): # GH 31016 df = pd.DataFrame( { "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], "C": [ "small", "large", "large", "small", "small", "large", "small", "small", "large", ], "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], } ) result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) expected = pd.DataFrame( values, index=Index(["D", "E"]), columns=expected_columns ) tm.assert_frame_equal(result, expected) def test_margins_dtype(self): # GH 17013 df = self.data.copy() df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) expected = DataFrame( {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi ).rename_axis("C", axis=1) expected["All"] = expected["dull"] + expected["shiny"] result = df.pivot_table( values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.sum, fill_value=0, ) tm.assert_frame_equal(expected, result) @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)") def test_margins_dtype_len(self): mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) expected = DataFrame( {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi ).rename_axis("C", axis=1) expected["All"] = expected["dull"] + expected["shiny"] result = self.data.pivot_table( values="D", index=["A", "B"], columns="C", margins=True, aggfunc=len, fill_value=0, ) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) result = df2.pivot_table(values="v", columns=cols) expected = DataFrame( [[4, 5, 6]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), index=Index(["v"]), ) tm.assert_frame_equal(result, expected) def test_pivot_table_retains_tz(self): dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam") df = DataFrame({"A": np.random.randn(3), "B": np.random.randn(3), "C": dti}) result = df.pivot_table(index=["B", "C"], dropna=False) # check tz retention assert result.index.levels[1].equals(dti) def test_pivot_integer_columns(self): # caused by upstream bug in unstack d = date.min data = list( product( ["foo", "bar"], ["A", "B", "C"], ["x1", "x2"], [d + timedelta(i) for i in range(20)], [1.0], ) ) df = DataFrame(data) table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) tm.assert_frame_equal(table, table2, check_names=False) def test_pivot_no_level_overlap(self): # GH #1181 data = DataFrame( { "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2, "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2, "c": (["foo"] * 4 + ["bar"] * 4) * 2, "value": np.random.randn(16), } ) table = data.pivot_table("value", index="a", columns=["b", "c"]) grouped = data.groupby(["a", "b", "c"])["value"].mean() expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all") tm.assert_frame_equal(table, expected) def test_pivot_columns_lexsorted(self): n = 10000 dtype = np.dtype( [ ("Index", object), ("Symbol", object), ("Year", int), ("Month", int), ("Day", int), ("Quantity", int), ("Price", float), ] ) products = np.array( [ ("SP500", "ADBE"), ("SP500", "NVDA"), ("SP500", "ORCL"), ("NDQ100", "AAPL"), ("NDQ100", "MSFT"), ("NDQ100", "GOOG"), ("FTSE", "DGE.L"), ("FTSE", "TSCO.L"), ("FTSE", "GSK.L"), ], dtype=[("Index", object), ("Symbol", object)], ) items = np.empty(n, dtype=dtype) iproduct = np.random.randint(0, len(products), n) items["Index"] = products["Index"][iproduct] items["Symbol"] = products["Symbol"][iproduct] dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items["Year"] = dates.year items["Month"] = dates.month items["Day"] = dates.day items["Price"] = np.random.lognormal(4.0, 2.0, n) df = DataFrame(items) pivoted = df.pivot_table( "Price", index=["Month", "Day"], columns=["Index", "Symbol", "Year"], aggfunc="mean", ) assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): f = {"D": ["std"], "E": ["sum"]} expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") result = self.data.pivot_table(index="A", columns="B", aggfunc=f) tm.assert_frame_equal(result, expected) def test_margins_no_values_no_cols(self): # Regression test on pivot table: no values or cols passed. result = self.data[["A", "B"]].pivot_table( index=["A", "B"], aggfunc=len, margins=True ) result_list = result.tolist() assert sum(result_list[:-1]) == result_list[-1] def test_margins_no_values_two_rows(self): # Regression test on pivot table: no values passed but rows are a # multi-index result = self.data[["A", "B", "C"]].pivot_table( index=["A", "B"], columns="C", aggfunc=len, margins=True ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] def test_margins_no_values_one_row_one_col(self): # Regression test on pivot table: no values passed but row and col # defined result = self.data[["A", "B"]].pivot_table( index="A", columns="B", aggfunc=len, margins=True ) assert result.All.tolist() == [4.0, 7.0, 11.0] def test_margins_no_values_two_row_two_cols(self): # Regression test on pivot table: no values passed but rows and cols # are multi-indexed self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] result = self.data[["A", "B", "C", "D"]].pivot_table( index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) def test_pivot_table_with_margins_set_margin_name(self, margin_name): # see gh-3335 msg = ( f'Conflicting name "{margin_name}" in margins|' "margins_name argument must be a string" ) with pytest.raises(ValueError, match=msg): # multi-index index pivot_table( self.data, values="D", index=["A", "B"], columns=["C"], margins=True, margins_name=margin_name, ) with pytest.raises(ValueError, match=msg): # multi-index column pivot_table( self.data, values="D", index=["C"], columns=["A", "B"], margins=True, margins_name=margin_name, ) with pytest.raises(ValueError, match=msg): # non-multi-index index/column pivot_table( self.data, values="D", index=["A"], columns=["B"], margins=True, margins_name=margin_name, ) def test_pivot_timegrouper(self): df = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 1, 1), datetime(2013, 1, 1), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 12, 2), datetime(2013, 12, 2), ], } ).set_index("Date") expected = DataFrame( np.array([10, 18, 3], dtype="int64").reshape(1, 3), index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A"), columns="Carl Joe Mark".split(), ) expected.index.name = "Date" expected.columns.name = "Buyer" result = pivot_table( df, index=Grouper(freq="A"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="A"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) expected = DataFrame( np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), index=pd.DatetimeIndex( [datetime(2013, 1, 1), datetime(2013, 7, 1)], freq="6MS" ), columns="Carl Joe Mark".split(), ) expected.index.name = "Date" expected.columns.name = "Buyer" result = pivot_table( df, index=Grouper(freq="6MS"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="6MS"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() result = pivot_table( df, index=Grouper(freq="6MS", key="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", key="Date"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) msg = "'The grouper name foo is not found'" with pytest.raises(KeyError, match=msg): pivot_table( df, index=Grouper(freq="6MS", key="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) with pytest.raises(KeyError, match=msg): pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", key="foo"), values="Quantity", aggfunc=np.sum, ) # passing the level df = df.set_index("Date") result = pivot_table( df, index=Grouper(freq="6MS", level="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", level="Date"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) msg = "The level foo is not valid" with pytest.raises(ValueError, match=msg): pivot_table( df, index=Grouper(freq="6MS", level="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum, ) with pytest.raises(ValueError, match=msg): pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", level="foo"), values="Quantity", aggfunc=np.sum, ) # double grouper df = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 11, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 2, 12, 0), datetime(2013, 12, 5, 14, 0), ], "PayDay": [ datetime(2013, 10, 4, 0, 0), datetime(2013, 10, 15, 13, 5), datetime(2013, 9, 5, 20, 0), datetime(2013, 11, 2, 10, 0), datetime(2013, 10, 7, 20, 0), datetime(2013, 9, 5, 10, 0), datetime(2013, 12, 30, 12, 0), datetime(2013, 11, 20, 14, 0), ], } ) result = pivot_table( df, index=Grouper(freq="M", key="Date"), columns=Grouper(freq="M", key="PayDay"), values="Quantity", aggfunc=np.sum, ) expected = DataFrame( np.array( [ np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan, ] ).reshape(4, 4), index=pd.DatetimeIndex( [ datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31), ], freq="M", ), columns=pd.DatetimeIndex( [ datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31), ], freq="M", ), ) expected.index.name = "Date" expected.columns.name = "PayDay" tm.assert_frame_equal(result, expected) result = pivot_table( df, index=Grouper(freq="M", key="PayDay"), columns=Grouper(freq="M", key="Date"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) tuples = [ (datetime(2013, 9, 30), datetime(2013, 10, 31)), (datetime(2013, 10, 31), datetime(2013, 9, 30)), (datetime(2013, 10, 31), datetime(2013, 11, 30)), (datetime(2013, 10, 31), datetime(2013, 12, 31)), (datetime(2013, 11, 30), datetime(2013, 10, 31)), (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]) expected = DataFrame( np.array( [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3] ).reshape(6, 2), index=idx, columns=["A", "B"], ) expected.columns.name = "Branch" result = pivot_table( df, index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], columns=["Branch"], values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=["Branch"], columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): dates1 = [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ] dates2 = [ "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", ] df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], "dt1": dates1, "dt2": dates2, "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) exp_idx = pd.DatetimeIndex( ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], tz="US/Pacific", name="dt1", ) exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"]) exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) exp_col3 = pd.DatetimeIndex( ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, tz="Asia/Tokyo", name="dt2", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame( np.array( [ [0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2], ], dtype="int64", ), index=exp_idx, columns=exp_col, ) result = pivot_table( df, index=["dt1"], columns=["dt2"], values=["value1", "value2"], aggfunc=[np.sum, np.mean], ) tm.assert_frame_equal(result, expected) def test_pivot_dtaccessor(self): # GH 8103 dates1 = [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ] dates2 = [ "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", ] df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], "dt1": dates1, "dt2": dates2, "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) result = pivot_table( df, index="label", columns=df["dt1"].dt.hour, values="value1" ) exp_idx = Index(["a", "b"], name="label") expected = DataFrame( {7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=exp_idx, columns=Index([7, 8, 9], name="dt1"), ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1" ) expected = DataFrame( {7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=Index([1, 2], name="dt2"), columns=Index([7, 8, 9], name="dt1"), ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=df["dt2"].dt.year.values, columns=[df["dt1"].dt.hour, df["dt2"].dt.month], values="value1", ) exp_col = MultiIndex.from_arrays( [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"] ) expected = DataFrame( np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=np.array(["X", "X", "X", "X", "Y", "Y"]), columns=[df["dt1"].dt.hour, df["dt2"].dt.month], values="value1", ) expected = DataFrame( np.array( [[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]] ), index=["X", "Y"], columns=exp_col, ) tm.assert_frame_equal(result, expected) def test_daily(self): rng = date_range("1/1/2000", "12/31/2004", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) annual = pivot_table( DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear ) annual.columns = annual.columns.droplevel(0) doy = np.asarray(ts.index.dayofyear) for i in range(1, 367): subset = ts[doy == i] subset.index = subset.index.year result = annual[i].dropna() tm.assert_series_equal(result, subset, check_names=False) assert result.name == i def test_monthly(self): rng = date_range("1/1/2000", "12/31/2004", freq="M") ts = Series(np.random.randn(len(rng)), index=rng) annual = pivot_table( pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month ) annual.columns = annual.columns.droplevel(0) month = ts.index.month for i in range(1, 13): subset = ts[month == i] subset.index = subset.index.year result = annual[i].dropna() tm.assert_series_equal(result, subset, check_names=False) assert result.name == i def test_pivot_table_with_iterator_values(self): # GH 12017 aggs = {"D": "sum", "E": "mean"} pivot_values_list = pd.pivot_table( self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs ) pivot_values_keys = pd.pivot_table( self.data, index=["A"], values=aggs.keys(), aggfunc=aggs ) tm.assert_frame_equal(pivot_values_keys, pivot_values_list) agg_values_gen = (value for value in aggs.keys()) pivot_values_gen = pd.pivot_table( self.data, index=["A"], values=agg_values_gen, aggfunc=aggs ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) def test_pivot_table_margins_name_with_aggfunc_list(self): # GH 13354 margins_name = "Weekly" costs = pd.DataFrame( { "item": ["bacon", "cheese", "bacon", "cheese"], "cost": [2.5, 4.5, 3.2, 3.3], "day": ["M", "M", "T", "T"], } ) table = costs.pivot_table( index="item", columns="day", margins=True, margins_name=margins_name, aggfunc=[np.mean, max], ) ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item") tups = [ ("mean", "cost", "M"), ("mean", "cost", "T"), ("mean", "cost", margins_name), ("max", "cost", "M"), ("max", "cost", "T"), ("max", "cost", margins_name), ] cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"]) expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins(self, observed): # GH 10989 df = pd.DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) expected.index = Index([0, 1, "All"], name="y") expected.columns = Index([0, 1, "All"], name="z") table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") def test_categorical_margins_category(self, observed): df = pd.DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) expected.index = Index([0, 1, "All"], name="y") expected.columns = Index([0, 1, "All"], name="z") df.y = df.y.astype("category") df.z = df.z.astype("category") table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) def test_margins_casted_to_float(self, observed): # GH 24893 df = pd.DataFrame( { "A": [2, 4, 6, 8], "B": [1, 4, 5, 8], "C": [1, 3, 4, 6], "D": ["X", "X", "Y", "Y"], } ) result = pd.pivot_table(df, index="D", margins=True) expected = pd.DataFrame( {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, index=pd.Index(["X", "Y", "All"], name="D"), ) tm.assert_frame_equal(result, expected) def test_pivot_with_categorical(self, observed, ordered): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] col = [np.nan, "A", "B", np.nan, "A"] df = pd.DataFrame( { "In": pd.Categorical(idx, categories=["low", "high"], ordered=ordered), "Col": pd.Categorical(col, categories=["A", "B"], ordered=ordered), "Val": range(1, 6), } ) # case with index/columns/value result = df.pivot_table( index="In", columns="Col", values="Val", observed=observed ) expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col") expected = pd.DataFrame( data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols ) expected.index = Index( pd.Categorical( ["low", "high"], categories=["low", "high"], ordered=ordered ), name="In", ) tm.assert_frame_equal(result, expected) # case with columns/value result = df.pivot_table(columns="Col", values="Val", observed=observed) expected = pd.DataFrame( data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"]) ) tm.assert_frame_equal(result, expected) def test_categorical_aggfunc(self, observed): # GH 9534 df = pd.DataFrame( {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} ) df["C1"] = df["C1"].astype("category") result = df.pivot_table( "V", index="C1", columns="C2", dropna=observed, aggfunc="count" ) expected_index = pd.CategoricalIndex( ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) expected_columns = pd.Index(["a", "b"], name="C2") expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) tm.assert_frame_equal(result, expected) def test_categorical_pivot_index_ordering(self, observed): # GH 8731 df = pd.DataFrame( { "Sales": [100, 120, 220], "Month": ["January", "January", "January"], "Year": [2013, 2014, 2013], } ) months = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", ] df["Month"] = df["Month"].astype("category").cat.set_categories(months) result = df.pivot_table( values="Sales", index="Month", columns="Year", dropna=observed, aggfunc="sum", ) expected_columns = pd.Int64Index([2013, 2014], name="Year") expected_index = pd.CategoricalIndex( ["January"], categories=months, ordered=False, name="Month" ) expected = pd.DataFrame( [[320, 120]], index=expected_index, columns=expected_columns ) if not observed: result = result.dropna().astype(np.int64) tm.assert_frame_equal(result, expected) def test_pivot_table_not_series(self): # GH 4386 # pivot_table always returns a DataFrame # when values is not list like and columns is None # and aggfunc is not instance of list df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]}) result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum) m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"]) expected = DataFrame([3, 4, 5], index=m, columns=["col1"]) tm.assert_frame_equal(result, expected) result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum) expected = DataFrame( [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], index=Index([1, 3, 9], name="col3"), columns=Index(["C", "D", "E"], name="col2"), ) tm.assert_frame_equal(result, expected) result = df.pivot_table("col1", index="col3", aggfunc=[np.sum]) m = MultiIndex.from_arrays([["sum"], ["col1"]]) expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m) tm.assert_frame_equal(result, expected) def test_pivot_margins_name_unicode(self): # issue #13292 greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" frame = pd.DataFrame({"foo": [1, 2, 3]}) table = pd.pivot_table( frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = pd.Index([1, 2, 3, greek], dtype="object", name="foo") expected = pd.DataFrame(index=index) tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 # for correctness purposes data = DataFrame( { "A": [ "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo", ], "B": [ "one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one", ], "C": range(11), } ) result = pivot_table(data, index="A", columns="B", aggfunc="sum") mi = MultiIndex( levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"] ) expected = DataFrame( {("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}}, columns=mi, ).rename_axis("A") tm.assert_frame_equal(result, expected) result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"]) mi = MultiIndex( levels=[["sum", "mean"], ["C"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], names=[None, None, "B"], ) expected = DataFrame( { ("mean", "C", "one"): {"bar": 5.0, "foo": 3.25}, ("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667}, ("sum", "C", "one"): {"bar": 15, "foo": 13}, ("sum", "C", "two"): {"bar": 7, "foo": 20}, }, columns=mi, ).rename_axis("A") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "f, f_numpy", [ ("sum", np.sum), ("mean", np.mean), ("std", np.std), (["sum", "mean"], [np.sum, np.mean]), (["sum", "std"], [np.sum, np.std]), (["std", "mean"], [np.std, np.mean]), ], ) def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes result = pivot_table(self.data, index="A", columns="B", aggfunc=f) expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 df = DataFrame( {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} ) msg = "Unstacked DataFrame is too big, causing int32 overflow" with pytest.raises(ValueError, match=msg): df.pivot_table( index="ind1", columns="ind2", values="count", aggfunc="count" ) def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159 df = pd.DataFrame( { "fruit": ["apple", "peach", "apple"], "size": [1, 1, 2], "taste": [7, 6, 6], } ) def ret_one(x): return 1 def ret_sum(x): return sum(x) def ret_none(x): return np.nan result = pd.pivot_table( df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna ) data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]] col = pd.MultiIndex.from_product( [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]], names=[None, "fruit"], ) expected = pd.DataFrame(data, index=["size", "taste"], columns=col) if dropna: expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, expected) def test_pivot_table_aggfunc_scalar_dropna(self, dropna): # GH 22159 df = pd.DataFrame( {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} ) result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] col = pd.Index(["one", "two"], name="A") expected = pd.DataFrame(data, index=["x", "y"], columns=col) if dropna: expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, expected) def test_pivot_table_empty_aggfunc(self): # GH 9186 df = pd.DataFrame( { "A": [2, 2, 3, 3, 2], "id": [5, 6, 7, 8, 9], "C": ["p", "q", "q", "p", "q"], "D": [None, None, None, None, None], } ) result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) expected = pd.DataFrame() tm.assert_frame_equal(result, expected) def test_pivot_table_no_column_raises(self): # GH 10326 def agg(l): return np.mean(l) foo = pd.DataFrame( {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} ) with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg)