mirror of
https://github.com/PiBrewing/craftbeerpi4.git
synced 2024-12-24 22:44:56 +01:00
1341 lines
46 KiB
Python
1341 lines
46 KiB
Python
|
from datetime import datetime
|
||
|
import itertools
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
class TestDataFrameReshape:
|
||
|
def test_pivot(self):
|
||
|
data = {
|
||
|
"index": ["A", "B", "C", "C", "B", "A"],
|
||
|
"columns": ["One", "One", "One", "Two", "Two", "Two"],
|
||
|
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
|
||
|
}
|
||
|
|
||
|
frame = DataFrame(data)
|
||
|
pivoted = frame.pivot(index="index", columns="columns", values="values")
|
||
|
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||
|
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||
|
}
|
||
|
)
|
||
|
|
||
|
expected.index.name, expected.columns.name = "index", "columns"
|
||
|
tm.assert_frame_equal(pivoted, expected)
|
||
|
|
||
|
# name tracking
|
||
|
assert pivoted.index.name == "index"
|
||
|
assert pivoted.columns.name == "columns"
|
||
|
|
||
|
# don't specify values
|
||
|
pivoted = frame.pivot(index="index", columns="columns")
|
||
|
assert pivoted.index.name == "index"
|
||
|
assert pivoted.columns.names == (None, "columns")
|
||
|
|
||
|
def test_pivot_duplicates(self):
    """Pivoting must fail when an (index, column) pair occurs more than once."""
    data = DataFrame(
        {
            "a": ["bar", "bar", "foo", "foo", "foo"],
            "b": ["one", "two", "one", "one", "two"],
            "c": [1.0, 2.0, 3.0, 3.0, 4.0],
        }
    )
    # ("foo", "one") appears twice, so the reshape is ambiguous.
    # Arguments are passed by keyword: positional use of pivot() is deprecated.
    with pytest.raises(ValueError, match="duplicate entries"):
        data.pivot(index="a", columns="b", values="c")
|
||
|
|
||
|
def test_pivot_empty(self):
|
||
|
df = DataFrame(columns=["a", "b", "c"])
|
||
|
result = df.pivot("a", "b", "c")
|
||
|
expected = DataFrame()
|
||
|
tm.assert_frame_equal(result, expected, check_names=False)
|
||
|
|
||
|
def test_pivot_integer_bug(self):
|
||
|
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
|
||
|
|
||
|
result = df.pivot(index=1, columns=0, values=2)
|
||
|
repr(result)
|
||
|
tm.assert_index_equal(result.columns, Index(["A", "B"], name=0))
|
||
|
|
||
|
def test_pivot_index_none(self):
|
||
|
# gh-3962
|
||
|
data = {
|
||
|
"index": ["A", "B", "C", "C", "B", "A"],
|
||
|
"columns": ["One", "One", "One", "Two", "Two", "Two"],
|
||
|
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
|
||
|
}
|
||
|
|
||
|
frame = DataFrame(data).set_index("index")
|
||
|
result = frame.pivot(columns="columns", values="values")
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||
|
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||
|
}
|
||
|
)
|
||
|
|
||
|
expected.index.name, expected.columns.name = "index", "columns"
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# omit values
|
||
|
result = frame.pivot(columns="columns")
|
||
|
|
||
|
expected.columns = pd.MultiIndex.from_tuples(
|
||
|
[("values", "One"), ("values", "Two")], names=[None, "columns"]
|
||
|
)
|
||
|
expected.index.name = "index"
|
||
|
tm.assert_frame_equal(result, expected, check_names=False)
|
||
|
assert result.index.name == "index"
|
||
|
assert result.columns.names == (None, "columns")
|
||
|
expected.columns = expected.columns.droplevel(0)
|
||
|
result = frame.pivot(columns="columns", values="values")
|
||
|
|
||
|
expected.columns.name = "columns"
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_stack_unstack(self, float_frame):
|
||
|
df = float_frame.copy()
|
||
|
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
|
||
|
|
||
|
stacked = df.stack()
|
||
|
stacked_df = DataFrame({"foo": stacked, "bar": stacked})
|
||
|
|
||
|
unstacked = stacked.unstack()
|
||
|
unstacked_df = stacked_df.unstack()
|
||
|
|
||
|
tm.assert_frame_equal(unstacked, df)
|
||
|
tm.assert_frame_equal(unstacked_df["bar"], df)
|
||
|
|
||
|
unstacked_cols = stacked.unstack(0)
|
||
|
unstacked_cols_df = stacked_df.unstack(0)
|
||
|
tm.assert_frame_equal(unstacked_cols.T, df)
|
||
|
tm.assert_frame_equal(unstacked_cols_df["bar"].T, df)
|
||
|
|
||
|
def test_stack_mixed_level(self):
|
||
|
# GH 18310
|
||
|
levels = [range(3), [3, "a", "b"], [1, 2]]
|
||
|
|
||
|
# flat columns:
|
||
|
df = DataFrame(1, index=levels[0], columns=levels[1])
|
||
|
result = df.stack()
|
||
|
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# MultiIndex columns:
|
||
|
df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:]))
|
||
|
result = df.stack(1)
|
||
|
expected = DataFrame(
|
||
|
1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1]
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# as above, but used labels in level are actually of homogeneous type
|
||
|
result = df[["a", "b"]].stack(1)
|
||
|
expected = expected[["a", "b"]]
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_not_consolidated(self):
|
||
|
# Gh#34708
|
||
|
df = pd.DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
|
||
|
df2 = df[["x"]]
|
||
|
df2["y"] = df["y"]
|
||
|
assert len(df2._mgr.blocks) == 2
|
||
|
|
||
|
res = df2.unstack()
|
||
|
expected = df.unstack()
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
def test_unstack_fill(self):
|
||
|
|
||
|
# GH #9746: fill_value keyword argument for Series
|
||
|
# and DataFrame unstack
|
||
|
|
||
|
# From a series
|
||
|
data = Series([1, 2, 4, 5], dtype=np.int16)
|
||
|
data.index = MultiIndex.from_tuples(
|
||
|
[("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
|
||
|
)
|
||
|
|
||
|
result = data.unstack(fill_value=-1)
|
||
|
expected = DataFrame(
|
||
|
{"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# From a series with incorrect data type for fill_value
|
||
|
result = data.unstack(fill_value=0.5)
|
||
|
expected = DataFrame(
|
||
|
{"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=float
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# GH #13971: fill_value when unstacking multiple levels:
|
||
|
df = DataFrame(
|
||
|
{"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
|
||
|
).set_index(["x", "y", "z"])
|
||
|
unstacked = df.unstack(["x", "y"], fill_value=0)
|
||
|
key = ("w", "b", "j")
|
||
|
expected = unstacked[key]
|
||
|
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
stacked = unstacked.stack(["x", "y"])
|
||
|
stacked.index = stacked.index.reorder_levels(df.index.names)
|
||
|
# Workaround for GH #17886 (unnecessarily casts to float):
|
||
|
stacked = stacked.astype(np.int64)
|
||
|
result = stacked.loc[df.index]
|
||
|
tm.assert_frame_equal(result, df)
|
||
|
|
||
|
# From a series
|
||
|
s = df["w"]
|
||
|
result = s.unstack(["x", "y"], fill_value=0)
|
||
|
expected = unstacked["w"]
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_fill_frame(self):
|
||
|
|
||
|
# From a dataframe
|
||
|
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
|
||
|
df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
|
||
|
df.index = MultiIndex.from_tuples(
|
||
|
[("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
|
||
|
)
|
||
|
|
||
|
result = df.unstack(fill_value=-1)
|
||
|
|
||
|
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
|
||
|
expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
|
||
|
expected.columns = MultiIndex.from_tuples(
|
||
|
[("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# From a mixed type dataframe
|
||
|
df["A"] = df["A"].astype(np.int16)
|
||
|
df["B"] = df["B"].astype(np.float64)
|
||
|
|
||
|
result = df.unstack(fill_value=-1)
|
||
|
expected["A"] = expected["A"].astype(np.int16)
|
||
|
expected["B"] = expected["B"].astype(np.float64)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# From a dataframe with incorrect data type for fill_value
|
||
|
result = df.unstack(fill_value=0.5)
|
||
|
|
||
|
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
|
||
|
expected = DataFrame(rows, index=list("xyz"), dtype=float)
|
||
|
expected.columns = MultiIndex.from_tuples(
|
||
|
[("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_fill_frame_datetime(self):
|
||
|
|
||
|
# Test unstacking with date times
|
||
|
dv = pd.date_range("2012-01-01", periods=4).values
|
||
|
data = Series(dv)
|
||
|
data.index = MultiIndex.from_tuples(
|
||
|
[("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
|
||
|
)
|
||
|
|
||
|
result = data.unstack()
|
||
|
expected = DataFrame(
|
||
|
{"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = data.unstack(fill_value=dv[0])
|
||
|
expected = DataFrame(
|
||
|
{"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_fill_frame_timedelta(self):
|
||
|
|
||
|
# Test unstacking with time deltas
|
||
|
td = [Timedelta(days=i) for i in range(4)]
|
||
|
data = Series(td)
|
||
|
data.index = MultiIndex.from_tuples(
|
||
|
[("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
|
||
|
)
|
||
|
|
||
|
result = data.unstack()
|
||
|
expected = DataFrame(
|
||
|
{"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = data.unstack(fill_value=td[1])
|
||
|
expected = DataFrame(
|
||
|
{"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_fill_frame_period(self):
|
||
|
|
||
|
# Test unstacking with period
|
||
|
periods = [
|
||
|
Period("2012-01"),
|
||
|
Period("2012-02"),
|
||
|
Period("2012-03"),
|
||
|
Period("2012-04"),
|
||
|
]
|
||
|
data = Series(periods)
|
||
|
data.index = MultiIndex.from_tuples(
|
||
|
[("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
|
||
|
)
|
||
|
|
||
|
result = data.unstack()
|
||
|
expected = DataFrame(
|
||
|
{"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = data.unstack(fill_value=periods[1])
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"a": [periods[0], periods[1], periods[3]],
|
||
|
"b": [periods[1], periods[2], periods[1]],
|
||
|
},
|
||
|
index=["x", "y", "z"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_fill_frame_categorical(self):
    """Unstack categorical values; the fill must be an existing category."""
    # Test unstacking with categorical
    data = pd.Series(["a", "b", "c", "a"], dtype="category")
    data.index = pd.MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
    )
    cats = list("abc")
    row_labels = list("xyz")

    # By default missing values will be NaN
    # ("x" is not one of the categories, so it stands in for a missing cell)
    result = data.unstack()
    expected = DataFrame(
        {
            "a": pd.Categorical(list("axa"), categories=cats),
            "b": pd.Categorical(list("bcx"), categories=cats),
        },
        index=row_labels,
    )
    tm.assert_frame_equal(result, expected)

    # Fill with non-category results in a ValueError
    msg = r"'fill_value=d' is not present in"
    with pytest.raises(ValueError, match=msg):
        data.unstack(fill_value="d")

    # Fill with category value replaces missing values as expected
    result = data.unstack(fill_value="c")
    expected = DataFrame(
        {
            "a": pd.Categorical(list("aca"), categories=cats),
            "b": pd.Categorical(list("bcc"), categories=cats),
        },
        index=row_labels,
    )
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_tuplename_in_multiindex(self):
|
||
|
# GH 19966
|
||
|
idx = pd.MultiIndex.from_product(
|
||
|
[["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]
|
||
|
)
|
||
|
df = pd.DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
|
||
|
result = df.unstack(("A", "a"))
|
||
|
|
||
|
expected = pd.DataFrame(
|
||
|
[[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
|
||
|
columns=pd.MultiIndex.from_tuples(
|
||
|
[
|
||
|
("d", "a"),
|
||
|
("d", "b"),
|
||
|
("d", "c"),
|
||
|
("e", "a"),
|
||
|
("e", "b"),
|
||
|
("e", "c"),
|
||
|
],
|
||
|
names=[None, ("A", "a")],
|
||
|
),
|
||
|
index=pd.Index([1, 2, 3], name=("B", "b")),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
    "unstack_idx, expected_values, expected_index, expected_columns",
    [
        (
            ("A", "a"),
            [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
            pd.MultiIndex.from_tuples(
                [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]
            ),
            pd.MultiIndex.from_tuples(
                [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")],
                names=[None, ("A", "a")],
            ),
        ),
        (
            (("A", "a"), "B"),
            [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]],
            pd.Index([3, 4], name="C"),
            pd.MultiIndex.from_tuples(
                [
                    ("d", "a", 1),
                    ("d", "a", 2),
                    ("d", "b", 1),
                    ("d", "b", 2),
                    ("e", "a", 1),
                    ("e", "a", 2),
                    ("e", "b", 1),
                    ("e", "b", 2),
                ],
                names=[None, ("A", "a"), "B"],
            ),
        ),
    ],
)
def test_unstack_mixed_type_name_in_multiindex(
    self, unstack_idx, expected_values, expected_index, expected_columns
):
    """Unstack by level names when tuple and string names are mixed."""
    # GH 19966
    level_values = [["a", "b"], [1, 2], [3, 4]]
    idx = pd.MultiIndex.from_product(level_values, names=[("A", "a"), "B", "C"])
    df = pd.DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)

    result = df.unstack(unstack_idx)

    expected = pd.DataFrame(
        expected_values, index=expected_index, columns=expected_columns
    )
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_preserve_dtypes(self):
|
||
|
# Checks fix for #11847
|
||
|
df = pd.DataFrame(
|
||
|
dict(
|
||
|
state=["IL", "MI", "NC"],
|
||
|
index=["a", "b", "c"],
|
||
|
some_categories=pd.Series(["a", "b", "c"]).astype("category"),
|
||
|
A=np.random.rand(3),
|
||
|
B=1,
|
||
|
C="foo",
|
||
|
D=pd.Timestamp("20010102"),
|
||
|
E=pd.Series([1.0, 50.0, 100.0]).astype("float32"),
|
||
|
F=pd.Series([3.0, 4.0, 5.0]).astype("float64"),
|
||
|
G=False,
|
||
|
H=pd.Series([1, 200, 923442], dtype="int8"),
|
||
|
)
|
||
|
)
|
||
|
|
||
|
def unstack_and_compare(df, column_name):
|
||
|
unstacked1 = df.unstack([column_name])
|
||
|
unstacked2 = df.unstack(column_name)
|
||
|
tm.assert_frame_equal(unstacked1, unstacked2)
|
||
|
|
||
|
df1 = df.set_index(["state", "index"])
|
||
|
unstack_and_compare(df1, "index")
|
||
|
|
||
|
df1 = df.set_index(["state", "some_categories"])
|
||
|
unstack_and_compare(df1, "some_categories")
|
||
|
|
||
|
df1 = df.set_index(["F", "C"])
|
||
|
unstack_and_compare(df1, "F")
|
||
|
|
||
|
df1 = df.set_index(["G", "B", "state"])
|
||
|
unstack_and_compare(df1, "B")
|
||
|
|
||
|
df1 = df.set_index(["E", "A"])
|
||
|
unstack_and_compare(df1, "E")
|
||
|
|
||
|
df1 = df.set_index(["state", "index"])
|
||
|
s = df1["A"]
|
||
|
unstack_and_compare(s, "index")
|
||
|
|
||
|
def test_stack_ints(self):
|
||
|
columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3)))
|
||
|
df = DataFrame(np.random.randn(30, 27), columns=columns)
|
||
|
|
||
|
tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1))
|
||
|
tm.assert_frame_equal(
|
||
|
df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)
|
||
|
)
|
||
|
|
||
|
df_named = df.copy()
|
||
|
return_value = df_named.columns.set_names(range(3), inplace=True)
|
||
|
assert return_value is None
|
||
|
|
||
|
tm.assert_frame_equal(
|
||
|
df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1)
|
||
|
)
|
||
|
|
||
|
def test_stack_mixed_levels(self):
    """Stack by level lists that mix string names with integer names."""
    columns = MultiIndex.from_tuples(
        [
            ("A", "cat", "long"),
            ("B", "cat", "long"),
            ("A", "dog", "short"),
            ("B", "dog", "short"),
        ],
        names=["exp", "animal", "hair_length"],
    )
    df = DataFrame(np.random.randn(4, 4), columns=columns)

    animal_hair_stacked = df.stack(level=["animal", "hair_length"])
    exp_hair_stacked = df.stack(level=["exp", "hair_length"])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    df2 = df.copy()
    df2.columns.names = ["exp", "animal", 1]
    for level, reference in (
        (["animal", 1], animal_hair_stacked),
        (["exp", 1], exp_hair_stacked),
    ):
        tm.assert_frame_equal(df2.stack(level=level), reference, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    msg = (
        "level should contain all level names or all level numbers, not "
        "a mixture of the two"
    )
    with pytest.raises(ValueError, match=msg):
        df2.stack(level=["animal", 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    df3 = df.copy()
    df3.columns.names = ["exp", "animal", 0]
    tm.assert_frame_equal(
        df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False
    )
|
||
|
|
||
|
def test_stack_int_level_names(self):
|
||
|
columns = MultiIndex.from_tuples(
|
||
|
[
|
||
|
("A", "cat", "long"),
|
||
|
("B", "cat", "long"),
|
||
|
("A", "dog", "short"),
|
||
|
("B", "dog", "short"),
|
||
|
],
|
||
|
names=["exp", "animal", "hair_length"],
|
||
|
)
|
||
|
df = DataFrame(np.random.randn(4, 4), columns=columns)
|
||
|
|
||
|
exp_animal_stacked = df.stack(level=["exp", "animal"])
|
||
|
animal_hair_stacked = df.stack(level=["animal", "hair_length"])
|
||
|
exp_hair_stacked = df.stack(level=["exp", "hair_length"])
|
||
|
|
||
|
df2 = df.copy()
|
||
|
df2.columns.names = [0, 1, 2]
|
||
|
tm.assert_frame_equal(
|
||
|
df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False
|
||
|
)
|
||
|
tm.assert_frame_equal(
|
||
|
df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False
|
||
|
)
|
||
|
tm.assert_frame_equal(
|
||
|
df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False
|
||
|
)
|
||
|
|
||
|
# Out-of-order int column names
|
||
|
df3 = df.copy()
|
||
|
df3.columns.names = [2, 0, 1]
|
||
|
tm.assert_frame_equal(
|
||
|
df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False
|
||
|
)
|
||
|
tm.assert_frame_equal(
|
||
|
df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False
|
||
|
)
|
||
|
tm.assert_frame_equal(
|
||
|
df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False
|
||
|
)
|
||
|
|
||
|
def test_unstack_bool(self):
|
||
|
df = DataFrame(
|
||
|
[False, False],
|
||
|
index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
|
||
|
columns=["col"],
|
||
|
)
|
||
|
rs = df.unstack()
|
||
|
xp = DataFrame(
|
||
|
np.array([[False, np.nan], [np.nan, False]], dtype=object),
|
||
|
index=["a", "b"],
|
||
|
columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
|
||
|
)
|
||
|
tm.assert_frame_equal(rs, xp)
|
||
|
|
||
|
def test_unstack_level_binding(self):
|
||
|
# GH9856
|
||
|
mi = pd.MultiIndex(
|
||
|
levels=[["foo", "bar"], ["one", "two"], ["a", "b"]],
|
||
|
codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
|
||
|
names=["first", "second", "third"],
|
||
|
)
|
||
|
s = pd.Series(0, index=mi)
|
||
|
result = s.unstack([1, 2]).stack(0)
|
||
|
|
||
|
expected_mi = pd.MultiIndex(
|
||
|
levels=[["foo", "bar"], ["one", "two"]],
|
||
|
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
||
|
names=["first", "second"],
|
||
|
)
|
||
|
|
||
|
expected = pd.DataFrame(
|
||
|
np.array(
|
||
|
[[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
|
||
|
),
|
||
|
index=expected_mi,
|
||
|
columns=pd.Index(["a", "b"], name="third"),
|
||
|
)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_to_series(self, float_frame):
|
||
|
# check reversibility
|
||
|
data = float_frame.unstack()
|
||
|
|
||
|
assert isinstance(data, Series)
|
||
|
undo = data.unstack().T
|
||
|
tm.assert_frame_equal(undo, float_frame)
|
||
|
|
||
|
# check NA handling
|
||
|
data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
|
||
|
data.index = Index(["a", "b", "c"])
|
||
|
result = data.unstack()
|
||
|
|
||
|
midx = MultiIndex(
|
||
|
levels=[["x", "y"], ["a", "b", "c"]],
|
||
|
codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
|
||
|
)
|
||
|
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# check composability of unstack
|
||
|
old_data = data.copy()
|
||
|
for _ in range(4):
|
||
|
data = data.unstack()
|
||
|
tm.assert_frame_equal(old_data, data)
|
||
|
|
||
|
def test_unstack_dtypes(self):
|
||
|
|
||
|
# GH 2929
|
||
|
rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]]
|
||
|
|
||
|
df = DataFrame(rows, columns=list("ABCD"))
|
||
|
result = df.dtypes
|
||
|
expected = Series([np.dtype("int64")] * 4, index=list("ABCD"))
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# single dtype
|
||
|
df2 = df.set_index(["A", "B"])
|
||
|
df3 = df2.unstack("B")
|
||
|
result = df3.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("int64")] * 4,
|
||
|
index=pd.MultiIndex.from_arrays(
|
||
|
[["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
|
||
|
),
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# mixed
|
||
|
df2 = df.set_index(["A", "B"])
|
||
|
df2["C"] = 3.0
|
||
|
df3 = df2.unstack("B")
|
||
|
result = df3.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("float64")] * 2 + [np.dtype("int64")] * 2,
|
||
|
index=pd.MultiIndex.from_arrays(
|
||
|
[["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
|
||
|
),
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
df2["D"] = "foo"
|
||
|
df3 = df2.unstack("B")
|
||
|
result = df3.dtypes
|
||
|
expected = Series(
|
||
|
[np.dtype("float64")] * 2 + [np.dtype("object")] * 2,
|
||
|
index=pd.MultiIndex.from_arrays(
|
||
|
[["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
|
||
|
),
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# GH7405
|
||
|
for c, d in (
|
||
|
(np.zeros(5), np.zeros(5)),
|
||
|
(np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
|
||
|
):
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": ["a"] * 5,
|
||
|
"C": c,
|
||
|
"D": d,
|
||
|
"B": pd.date_range("2012-01-01", periods=5),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
right = df.iloc[:3].copy(deep=True)
|
||
|
|
||
|
df = df.set_index(["A", "B"])
|
||
|
df["D"] = df["D"].astype("int64")
|
||
|
|
||
|
left = df.iloc[:3].unstack(0)
|
||
|
right = right.set_index(["A", "B"]).unstack(0)
|
||
|
right[("D", "a")] = right[("D", "a")].astype("int64")
|
||
|
|
||
|
assert left.shape == (3, 2)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
def test_unstack_non_unique_index_names(self):
    """A level name that is used twice cannot select a level."""
    duplicated_names = ["c1", "c1"]
    idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=duplicated_names)
    df = DataFrame([1, 2], index=idx)

    msg = "The name c1 occurs multiple times, use a level number"
    with pytest.raises(ValueError, match=msg):
        df.unstack("c1")

    # same restriction on the column axis
    transposed = df.T
    with pytest.raises(ValueError, match=msg):
        transposed.stack("c1")
|
||
|
|
||
|
def test_unstack_unused_levels(self):
|
||
|
# GH 17845: unused codes in index make unstack() cast int to float
|
||
|
idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1]
|
||
|
df = pd.DataFrame([[1, 0]] * 3, index=idx)
|
||
|
|
||
|
result = df.unstack()
|
||
|
exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]])
|
||
|
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
assert (result.columns.levels[1] == idx.levels[1]).all()
|
||
|
|
||
|
# Unused items on both levels
|
||
|
levels = [[0, 1, 7], [0, 1, 2, 3]]
|
||
|
codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
|
||
|
idx = pd.MultiIndex(levels, codes)
|
||
|
block = np.arange(4).reshape(2, 2)
|
||
|
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
|
||
|
result = df.unstack()
|
||
|
expected = pd.DataFrame(
|
||
|
np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
assert (result.columns.levels[1] == idx.levels[1]).all()
|
||
|
|
||
|
# With mixed dtype and NaN
|
||
|
levels = [["a", 2, "c"], [1, 3, 5, 7]]
|
||
|
codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
|
||
|
idx = pd.MultiIndex(levels, codes)
|
||
|
data = np.arange(8)
|
||
|
df = pd.DataFrame(data.reshape(4, 2), index=idx)
|
||
|
|
||
|
cases = (
|
||
|
(0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]),
|
||
|
(1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]),
|
||
|
)
|
||
|
for level, idces, col_level, idx_level in cases:
|
||
|
result = df.unstack(level=level)
|
||
|
exp_data = np.zeros(18) * np.nan
|
||
|
exp_data[idces] = data
|
||
|
cols = pd.MultiIndex.from_product([[0, 1], col_level])
|
||
|
expected = pd.DataFrame(
|
||
|
exp_data.reshape(3, 6), index=idx_level, columns=cols
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("cols", [["A", "C"], slice(None)])
def test_unstack_unused_level(self, cols):
    """Unstack a selection whose innermost level still lists an unused label."""
    # GH 18562 : unused codes on the unstacked level
    records = [[2010, "a", "I"], [2011, "b", "II"]]
    df = pd.DataFrame(records, columns=["A", "B", "C"])

    ind = df.set_index(["A", "B", "C"], drop=False)
    only_first = (slice(None), slice(None), "I")
    selection = ind.loc[only_first, cols]
    result = selection.unstack()

    expected = ind.iloc[[0]][cols]
    expected.columns = MultiIndex.from_product(
        [expected.columns, ["I"]], names=[None, "C"]
    )
    expected.index = expected.index.droplevel("C")
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_long_index(self):
|
||
|
# PH 32624: Error when using a lot of indices to unstack.
|
||
|
# The error occurred only, if a lot of indices are used.
|
||
|
df = pd.DataFrame(
|
||
|
[[1]],
|
||
|
columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]),
|
||
|
index=pd.MultiIndex.from_tuples(
|
||
|
[[0, 0, 1, 0, 0, 0, 1]],
|
||
|
names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
|
||
|
),
|
||
|
)
|
||
|
result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
|
||
|
expected = pd.DataFrame(
|
||
|
[[1]],
|
||
|
columns=pd.MultiIndex.from_tuples(
|
||
|
[[0, 0, 1, 0, 0, 0, 1]],
|
||
|
names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
|
||
|
),
|
||
|
index=pd.Index([0], name="i1"),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_multi_level_cols(self):
|
||
|
# PH 24729: Unstack a df with multi level columns
|
||
|
df = pd.DataFrame(
|
||
|
[[0.0, 0.0], [0.0, 0.0]],
|
||
|
columns=pd.MultiIndex.from_tuples(
|
||
|
[["B", "C"], ["B", "D"]], names=["c1", "c2"]
|
||
|
),
|
||
|
index=pd.MultiIndex.from_tuples(
|
||
|
[[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"],
|
||
|
),
|
||
|
)
|
||
|
assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
|
||
|
|
||
|
def test_unstack_multi_level_rows_and_cols(self):
|
||
|
# PH 28306: Unstack df with multi level cols and rows
|
||
|
df = pd.DataFrame(
|
||
|
[[1, 2], [3, 4], [-1, -2], [-3, -4]],
|
||
|
columns=pd.MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
|
||
|
index=pd.MultiIndex.from_tuples(
|
||
|
[
|
||
|
["m1", "P3", 222],
|
||
|
["m1", "A5", 111],
|
||
|
["m2", "P3", 222],
|
||
|
["m2", "A5", 111],
|
||
|
],
|
||
|
names=["i1", "i2", "i3"],
|
||
|
),
|
||
|
)
|
||
|
result = df.unstack(["i3", "i2"])
|
||
|
expected = df.unstack(["i3"]).unstack(["i2"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unstack_nan_index(self): # GH7466
|
||
|
def cast(val):
|
||
|
val_str = "" if val != val else val
|
||
|
return f"{val_str:1}"
|
||
|
|
||
|
def verify(df):
|
||
|
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
|
||
|
rows, cols = df.notna().values.nonzero()
|
||
|
for i, j in zip(rows, cols):
|
||
|
left = sorted(df.iloc[i, j].split("."))
|
||
|
right = mk_list(df.index[i]) + mk_list(df.columns[j])
|
||
|
right = sorted(map(cast, right))
|
||
|
assert left == right
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"jim": ["a", "b", np.nan, "d"],
|
||
|
"joe": ["w", "x", "y", "z"],
|
||
|
"jolie": ["a.w", "b.x", " .y", "d.z"],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
left = df.set_index(["jim", "joe"]).unstack()["jolie"]
|
||
|
right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
for idx in itertools.permutations(df.columns[:2]):
|
||
|
mi = df.set_index(list(idx))
|
||
|
for lev in range(2):
|
||
|
udf = mi.unstack(level=lev)
|
||
|
assert udf.notna().values.sum() == len(df)
|
||
|
verify(udf["jolie"])
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"1st": ["d"] * 3
|
||
|
+ [np.nan] * 5
|
||
|
+ ["a"] * 2
|
||
|
+ ["c"] * 3
|
||
|
+ ["e"] * 2
|
||
|
+ ["b"] * 5,
|
||
|
"2nd": ["y"] * 2
|
||
|
+ ["w"] * 3
|
||
|
+ [np.nan] * 3
|
||
|
+ ["z"] * 4
|
||
|
+ [np.nan] * 3
|
||
|
+ ["x"] * 3
|
||
|
+ [np.nan] * 2,
|
||
|
"3rd": [
|
||
|
67,
|
||
|
39,
|
||
|
53,
|
||
|
72,
|
||
|
57,
|
||
|
80,
|
||
|
31,
|
||
|
18,
|
||
|
11,
|
||
|
30,
|
||
|
59,
|
||
|
50,
|
||
|
62,
|
||
|
59,
|
||
|
76,
|
||
|
52,
|
||
|
14,
|
||
|
53,
|
||
|
60,
|
||
|
51,
|
||
|
],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
df["4th"], df["5th"] = (
|
||
|
df.apply(lambda r: ".".join(map(cast, r)), axis=1),
|
||
|
df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
|
||
|
)
|
||
|
|
||
|
for idx in itertools.permutations(["1st", "2nd", "3rd"]):
|
||
|
mi = df.set_index(list(idx))
|
||
|
for lev in range(3):
|
||
|
udf = mi.unstack(level=lev)
|
||
|
assert udf.notna().values.sum() == 2 * len(df)
|
||
|
for col in ["4th", "5th"]:
|
||
|
verify(udf[col])
|
||
|
|
||
|
# GH7403
|
||
|
df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
|
||
|
df.iloc[3, 1] = np.NaN
|
||
|
left = df.set_index(["A", "B"]).unstack(0)
|
||
|
|
||
|
vals = [
|
||
|
[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
|
||
|
[np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
|
||
|
]
|
||
|
vals = list(map(list, zip(*vals)))
|
||
|
idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
|
||
|
cols = MultiIndex(
|
||
|
levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
|
||
|
)
|
||
|
|
||
|
right = DataFrame(vals, columns=cols, index=idx)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
|
||
|
df.iloc[2, 1] = np.NaN
|
||
|
left = df.set_index(["A", "B"]).unstack(0)
|
||
|
|
||
|
vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
|
||
|
cols = MultiIndex(
|
||
|
levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
|
||
|
)
|
||
|
idx = Index([np.nan, 0, 1, 2, 3], name="B")
|
||
|
right = DataFrame(vals, columns=cols, index=idx)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
df = pd.DataFrame(
|
||
|
{"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}
|
||
|
)
|
||
|
df.iloc[3, 1] = np.NaN
|
||
|
left = df.set_index(["A", "B"]).unstack(0)
|
||
|
|
||
|
vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
|
||
|
cols = MultiIndex(
|
||
|
levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
|
||
|
)
|
||
|
idx = Index([np.nan, 0, 1, 2, 3], name="B")
|
||
|
right = DataFrame(vals, columns=cols, index=idx)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
# GH7401
|
||
|
df = pd.DataFrame(
|
||
|
{
|
||
|
"A": list("aaaaabbbbb"),
|
||
|
"B": (date_range("2012-01-01", periods=5).tolist() * 2),
|
||
|
"C": np.arange(10),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
df.iloc[3, 1] = np.NaN
|
||
|
left = df.set_index(["A", "B"]).unstack()
|
||
|
|
||
|
vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
|
||
|
idx = Index(["a", "b"], name="A")
|
||
|
cols = MultiIndex(
|
||
|
levels=[["C"], date_range("2012-01-01", periods=5)],
|
||
|
codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
|
||
|
names=[None, "B"],
|
||
|
)
|
||
|
|
||
|
right = DataFrame(vals, columns=cols, index=idx)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
# GH4862
|
||
|
vals = [
|
||
|
["Hg", np.nan, np.nan, 680585148],
|
||
|
["U", 0.0, np.nan, 680585148],
|
||
|
["Pb", 7.07e-06, np.nan, 680585148],
|
||
|
["Sn", 2.3614e-05, 0.0133, 680607017],
|
||
|
["Ag", 0.0, 0.0133, 680607017],
|
||
|
["Hg", -0.00015, 0.0133, 680607017],
|
||
|
]
|
||
|
df = DataFrame(
|
||
|
vals,
|
||
|
columns=["agent", "change", "dosage", "s_id"],
|
||
|
index=[17263, 17264, 17265, 17266, 17267, 17268],
|
||
|
)
|
||
|
|
||
|
left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()
|
||
|
|
||
|
vals = [
|
||
|
[np.nan, np.nan, 7.07e-06, np.nan, 0.0],
|
||
|
[0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
|
||
|
]
|
||
|
|
||
|
idx = MultiIndex(
|
||
|
levels=[[680585148, 680607017], [0.0133]],
|
||
|
codes=[[0, 1], [-1, 0]],
|
||
|
names=["s_id", "dosage"],
|
||
|
)
|
||
|
|
||
|
cols = MultiIndex(
|
||
|
levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
|
||
|
codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
|
||
|
names=[None, "agent"],
|
||
|
)
|
||
|
|
||
|
right = DataFrame(vals, columns=cols, index=idx)
|
||
|
tm.assert_frame_equal(left, right)
|
||
|
|
||
|
left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
|
||
|
tm.assert_frame_equal(left.unstack(), right)
|
||
|
|
||
|
# GH9497 - multiple unstack with nulls
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"1st": [1, 2, 1, 2, 1, 2],
|
||
|
"2nd": pd.date_range("2014-02-01", periods=6, freq="D"),
|
||
|
"jim": 100 + np.arange(6),
|
||
|
"joe": (np.random.randn(6) * 10).round(2),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
|
||
|
df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
|
||
|
df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan
|
||
|
|
||
|
left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
|
||
|
assert left.notna().values.sum() == 2 * len(df)
|
||
|
|
||
|
for col in ["jim", "joe"]:
|
||
|
for _, r in df.iterrows():
|
||
|
key = r["1st"], (col, r["2nd"], r["3rd"])
|
||
|
assert r[col] == left.loc[key]
|
||
|
|
||
|
def test_stack_datetime_column_multiIndex(self):
|
||
|
# GH 8039
|
||
|
t = datetime(2014, 1, 1)
|
||
|
df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
|
||
|
result = df.stack()
|
||
|
|
||
|
eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
|
||
|
ecols = MultiIndex.from_tuples([(t, "A")])
|
||
|
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_stack_partial_multiIndex(self):
|
||
|
# GH 8844
|
||
|
def _test_stack_with_multiindex(multiindex):
|
||
|
df = DataFrame(
|
||
|
np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
|
||
|
columns=multiindex,
|
||
|
)
|
||
|
for level in (-1, 0, 1, [0, 1], [1, 0]):
|
||
|
result = df.stack(level=level, dropna=False)
|
||
|
|
||
|
if isinstance(level, int):
|
||
|
# Stacking a single level should not make any all-NaN rows,
|
||
|
# so df.stack(level=level, dropna=False) should be the same
|
||
|
# as df.stack(level=level, dropna=True).
|
||
|
expected = df.stack(level=level, dropna=True)
|
||
|
if isinstance(expected, Series):
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
df.columns = MultiIndex.from_tuples(
|
||
|
df.columns.to_numpy(), names=df.columns.names
|
||
|
)
|
||
|
expected = df.stack(level=level, dropna=False)
|
||
|
if isinstance(expected, Series):
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
full_multiindex = MultiIndex.from_tuples(
|
||
|
[("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
|
||
|
names=["Upper", "Lower"],
|
||
|
)
|
||
|
for multiindex_columns in (
|
||
|
[0, 1, 2, 3, 4],
|
||
|
[0, 1, 2, 3],
|
||
|
[0, 1, 2, 4],
|
||
|
[0, 1, 2],
|
||
|
[1, 2, 3],
|
||
|
[2, 3, 4],
|
||
|
[0, 1],
|
||
|
[0, 2],
|
||
|
[0, 3],
|
||
|
[0],
|
||
|
[2],
|
||
|
[4],
|
||
|
):
|
||
|
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
|
||
|
if len(multiindex_columns) > 1:
|
||
|
multiindex_columns.reverse()
|
||
|
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
|
||
|
|
||
|
df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]])
|
||
|
result = df.stack(dropna=False)
|
||
|
expected = DataFrame(
|
||
|
[[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
|
||
|
index=MultiIndex(
|
||
|
levels=[[0, 1], ["u", "x", "y", "z"]],
|
||
|
codes=[[0, 0, 1, 1], [1, 3, 1, 3]],
|
||
|
names=[None, "Lower"],
|
||
|
),
|
||
|
columns=Index(["B", "C"], name="Upper"),
|
||
|
dtype=df.dtypes[0],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize("labels", [list("yxz"), list("yxy")])
def test_stack_preserve_categorical_dtype(self, ordered, labels):
    """Stack a one-row frame whose columns are a CategoricalIndex (GH13854).

    The stacked Series is compared against one indexed by the product of the
    original row index and the same CategoricalIndex.
    """
    cat_columns = pd.CategoricalIndex(
        labels, categories=list("xyz"), ordered=ordered
    )
    frame = DataFrame([[10, 11, 12]], columns=cat_columns)
    stacked = frame.stack()

    # The expected index relies on `MultiIndex.from_product` preserving the
    # categorical dtype - that behaviour is tested elsewhere.
    expected_index = pd.MultiIndex.from_product([frame.index, cat_columns])
    expected = Series([10, 11, 12], index=expected_index)

    tm.assert_series_equal(stacked, expected)
|
||
|
|
||
|
def test_stack_preserve_categorical_dtype_values(self):
|
||
|
# GH-23077
|
||
|
cat = pd.Categorical(["a", "a", "b", "c"])
|
||
|
df = pd.DataFrame({"A": cat, "B": cat})
|
||
|
result = df.stack()
|
||
|
index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]])
|
||
|
expected = pd.Series(
|
||
|
pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
    "index, columns",
    [
        (row_labels, pd.MultiIndex.from_product([[1, 2], ["a", "b"]]))
        for row_labels in ([0, 0, 1, 1], [0, 0, 2, 3], [0, 1, 2, 3])
    ],
)
def test_stack_multi_columns_non_unique_index(self, index, columns):
    """Stack two-level columns over row indexes that may repeat (GH-28301).

    The stacked frame must equal a frame rebuilt from its own values and
    index tuples, and the codes of both indexes must match as well.
    """
    frame = pd.DataFrame(index=index, columns=columns).fillna(1)
    stacked = frame.stack()

    # Reference index reconstructed from the tuples of the stacked index.
    rebuilt_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy())
    expected = pd.DataFrame(
        stacked.to_numpy(), index=rebuilt_index, columns=stacked.columns
    )
    tm.assert_frame_equal(stacked, expected)

    tm.assert_numpy_array_equal(
        np.asarray(stacked.index.codes), np.asarray(rebuilt_index.codes)
    )
|
||
|
|
||
|
@pytest.mark.parametrize("level", [0, 1])
def test_unstack_mixed_extension_types(self, level):
    """Unstack a frame mixing a nullable-integer and a categorical column.

    Two things are checked for the given ``level``: every resulting column
    keeps the dtype of the column it came from, and the values match what
    unstacking an object-dtype copy of the frame produces.
    """
    index = pd.MultiIndex.from_tuples(
        [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"]
    )
    df = pd.DataFrame(
        {
            # Nullable integer column built through the public constructor.
            "A": pd.array([0, 1, None], dtype="Int64"),
            "B": pd.Categorical(["a", "a", "b"]),
        },
        index=index,
    )

    result = df.unstack(level=level)
    expected = df.astype(object).unstack(level=level)

    # Each source column is expected to contribute two result columns.
    expected_dtypes = pd.Series(
        [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
    )
    tm.assert_series_equal(result.dtypes, expected_dtypes)
    tm.assert_frame_equal(result.astype(object), expected)
|
||
|
|
||
|
@pytest.mark.parametrize("level", [0, "baz"])
def test_unstack_swaplevel_sortlevel(self, level):
    """Unstack, swap the column levels, then sort the columns (GH 20994).

    ``level`` selects the sort level either by position or by name; the
    expected frame is the same for both parametrized values.
    """
    row_index = pd.MultiIndex.from_product(
        [[0], ["d", "c"]], names=["bar", "baz"]
    )
    df = pd.DataFrame(
        [[0, 2], [1, 3]],
        index=row_index,
        columns=pd.Index(["B", "A"], name="foo"),
    )

    expected_columns = pd.MultiIndex.from_tuples(
        [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]
    )
    expected = pd.DataFrame([[3, 1, 2, 0]], columns=expected_columns)
    expected.index.name = "bar"

    swapped = df.unstack().swaplevel(axis=1)
    result = swapped.sort_index(axis=1, level=level)
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_unstack_fill_frame_object():
    """Unstack an object-dtype Series with and without ``fill_value`` (GH12815)."""
    series_index = pd.MultiIndex.from_tuples(
        [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
    )
    data = pd.Series(["a", "b", "c", "a"], dtype="object", index=series_index)
    row_labels = list("xyz")

    # Without a fill value the missing combinations come back as NaN.
    expected_default = pd.DataFrame(
        {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=row_labels
    )
    tm.assert_frame_equal(data.unstack(), expected_default)

    # With a fill value the missing combinations take that value instead.
    expected_filled = pd.DataFrame(
        {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=row_labels
    )
    tm.assert_frame_equal(data.unstack(fill_value="d"), expected_filled)
|
||
|
|
||
|
|
||
|
def test_unstack_timezone_aware_values():
    """Unstack a frame that holds a tz-aware timestamp column (GH 18338).

    The expected frame carries the same UTC timestamp next to the string
    column, under a two-level column index named ``(None, "b")``.
    """
    stamp = pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")
    column_order = ["timestamp", "a", "b", "c"]
    df = pd.DataFrame(
        {"timestamp": [stamp], "a": ["a"], "b": ["b"], "c": ["c"]},
        columns=column_order,
    )

    result = df.set_index(["a", "b"]).unstack()

    expected_columns = pd.MultiIndex(
        levels=[["timestamp", "c"], ["b"]],
        codes=[[0, 1], [0, 0]],
        names=[None, "b"],
    )
    expected = pd.DataFrame(
        [[stamp, "c"]],
        index=pd.Index(["a"], name="a"),
        columns=expected_columns,
    )
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_stack_timezone_aware_values():
    """Stack a single tz-aware datetime column into a Series (GH 19420).

    The expected Series reuses the same tz-aware range as its values, with
    the row labels and the column label ``"A"`` as a two-level index.
    """
    stamps = pd.date_range(
        freq="D", start="20180101", end="20180103", tz="America/New_York"
    )
    stacked = pd.DataFrame({"A": stamps}, index=["a", "b", "c"]).stack()

    expected_index = pd.MultiIndex(
        levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]
    )
    expected = pd.Series(stamps, index=expected_index)
    tm.assert_series_equal(stacked, expected)
|
||
|
|
||
|
|
||
|
def test_unstacking_multi_index_df():
    """Unstack two index levels in one call with ``fill_value=0`` (see gh-30740).

    After moving ``gender`` into the columns, ``employed`` and ``kids`` are
    unstacked together.  The result is compared against an explicit frame in
    which every combination absent from the input holds the fill value.
    """
    df = DataFrame(
        {
            "name": ["Alice", "Bob"],
            "score": [9.5, 8],
            "employed": [False, True],
            "kids": [0, 0],
            "gender": ["female", "male"],
        }
    )
    df = df.set_index(["name", "employed", "kids", "gender"])
    df = df.unstack(["gender"], fill_value=0)

    result = df.unstack(["employed", "kids"], fill_value=0)

    expected = DataFrame(
        [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]],
        index=Index(["Alice", "Bob"], name="name"),
        columns=MultiIndex.from_tuples(
            [
                ("score", "female", False, 0),
                ("score", "female", True, 0),
                ("score", "male", False, 0),
                ("score", "male", True, 0),
            ],
            names=[None, "gender", "employed", "kids"],
        ),
    )
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_stack_positional_level_duplicate_column_names():
    """Stack level 0 by position when both column levels share the name "a".

    https://github.com/pandas-dev/pandas/issues/36353
    """
    duplicate_named_columns = pd.MultiIndex.from_product(
        [("x", "y"), ("y", "z")], names=["a", "a"]
    )
    frame = pd.DataFrame([[1, 1, 1, 1]], columns=duplicate_named_columns)
    stacked = frame.stack(0)

    expected = pd.DataFrame(
        [[1, 1], [1, 1]],
        index=pd.MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]),
        columns=pd.Index(["y", "z"], name="a"),
    )

    tm.assert_frame_equal(stacked, expected)
|
||
|
|
||
|
|
||
|
def test_unstack_with_missing_int_cast_to_float():
    """Unstack two integer columns when one index combination is missing.

    https://github.com/pandas-dev/pandas/issues/37115

    The expected frame holds floats throughout, with NaN left in the ``v``
    column for the missing combination.
    """
    raw = {"a": ["A", "A", "B"], "b": ["ca", "cb", "cb"], "v": [10] * 3}
    df = DataFrame(raw).set_index(["a", "b"])

    # add another int column to get 2 blocks
    df["is_"] = 1
    assert len(df._mgr.blocks) == 2

    result = df.unstack("b")
    # Fill the gap in one of the unstacked columns only; the other keeps NaN.
    filled_column = ("is_", "ca")
    result[filled_column] = result[filled_column].fillna(0)

    expected_columns = MultiIndex.from_tuples(
        [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")],
        names=[None, "b"],
    )
    expected = DataFrame(
        [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
        index=Index(["A", "B"], dtype="object", name="a"),
        columns=expected_columns,
    )
    tm.assert_frame_equal(result, expected)
|