mirror of
synced 2025-02-17 00:33:24 +01:00
1299 lines
44 KiB
1299 lines
44 KiB
from collections import OrderedDict
from datetime import date, datetime
import itertools
import operator
import re
import numpy as np
import pytest
from pandas._libs.internals import BlockPlacement
import pandas as pd
from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series
import pandas._testing as tm
import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray
from pandas.core.internals import BlockManager, SingleBlockManager, make_block
@pytest.fixture
def mgr():
    """
    Fixture providing a mixed-dtype BlockManager built by ``create_mgr``.

    The description covers float, object, bool, int and complex items, two
    datetime items forced into separate blocks (``datetime-1``/``datetime-2``)
    and two tz-aware datetime items.
    """
    # Tests request ``mgr`` as an argument, so this must be a pytest fixture.
    return create_mgr(
        "a: f8; b: object; c: f8; d: object; e: f8;"
        "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;"
        "k: M8[ns, US/Eastern]; l: M8[ns, CET];"
    )
def assert_block_equal(left, right):
    """
    Assert that two blocks match in values, dtype and manager placement.

    Both ``mgr_locs`` attributes must be ``BlockPlacement`` instances and
    must expand to equal index arrays.
    """
    tm.assert_numpy_array_equal(left.values, right.values)
    assert left.dtype == right.dtype

    placements = [left.mgr_locs, right.mgr_locs]
    for locs in placements:
        assert isinstance(locs, BlockPlacement)

    left_locs, right_locs = (locs.as_array for locs in placements)
    tm.assert_numpy_array_equal(left_locs, right_locs)
def get_numeric_mat(shape):
    """
    Return an array of ``shape`` whose entries equal their first-axis index.

    The result is a strided view over ``np.arange(shape[0])``: the first axis
    steps by one element and every trailing axis has stride 0, so each row
    repeats a single value.

    Parameters
    ----------
    shape : tuple of int
        Shape of the returned array; ``shape[0]`` sets the value range.

    Returns
    -------
    np.ndarray
    """
    arr = np.arange(shape[0])
    return np.lib.stride_tricks.as_strided(
        x=arr, shape=shape, strides=(arr.itemsize,) + (0,) * (len(shape) - 1)
    )
# Default length of the non-item axis; used as ``item_shape = (N,)`` when the
# helper functions are called without an explicit item_shape / num_rows.
N = 10
def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Build a test Block of the requested kind.

    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    Parameters
    ----------
    typestr : str
        One of the identifiers listed above.
    placement : slice or list-like of int
        Manager locations of the block's items; wrapped in BlockPlacement.
    item_shape : tuple of int, optional
        Shape of a single item; defaults to ``(N,)``.
    num_offset : int, default 0
        Added to the generated numeric values so that separate blocks can
        hold distinct data.

    Raises
    ------
    ValueError
        If ``typestr`` is not supported.
    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N,)

    shape = (num_items,) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in (
        "float",
        "f8",
        "f4",
        "f2",
        "int",
        "i8",
        "i4",
        "i2",
        "i1",
        "uint",
        "u8",
        "u4",
        "u2",
        "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset], shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category",):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2",):
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        # Shift the stored (non-fill) values through a view of sp_values.
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return make_block(values, placement=placement, ndim=len(shape))
def create_single_mgr(typestr, num_rows=None):
    """
    Build a SingleBlockManager holding one block of kind ``typestr``.

    Parameters
    ----------
    typestr : str
        Block type identifier understood by ``create_block``.
    num_rows : int, optional
        Number of rows; defaults to ``N``.
    """
    if num_rows is None:
        num_rows = N

    return SingleBlockManager(
        create_block(typestr, placement=slice(0, num_rows), item_shape=()),
        # NOTE(review): the original call was truncated after the block
        # argument; a default integer axis of matching length is assumed
        # here -- confirm against upstream.
        Index(np.arange(num_rows)),
    )
def create_mgr(descr, item_shape=None):
    """
    Construct BlockManager from string description.

    String description syntax looks similar to np.matrix initializer.  It looks
    like this::

        a,b,c: f8; d,e,f: i8

    Rules are rather simple:

    * see list of supported datatypes in `create_block` method
    * components are semicolon-separated
    * each component is `NAME,NAME,NAME: DTYPE_ID`
    * whitespace around colons & semicolons are removed
    * components with same DTYPE_ID are combined into single block
    * to force multiple blocks with same dtype, use '-SUFFIX'::

        'a:f8-1; b:f8-2; c:f8-foobar'

    Parameters
    ----------
    descr : str
        Manager description in the syntax above.
    item_shape : tuple of int, optional
        Shape of a single item; defaults to ``(N,)``.
    """
    if item_shape is None:
        item_shape = (N,)

    offset = 0
    mgr_items = []
    block_placements = OrderedDict()
    for d in descr.split(";"):
        d = d.strip()
        if not len(d):
            # tolerate empty components such as a trailing ';'
            continue
        names, blockstr = d.partition(":")[::2]
        blockstr = blockstr.strip()
        names = names.strip().split(",")

        mgr_items.extend(names)
        placement = list(np.arange(len(names)) + offset)
        # components sharing a DTYPE_ID accumulate into one placement list
        try:
            block_placements[blockstr].extend(placement)
        except KeyError:
            block_placements[blockstr] = placement
        offset += len(names)

    mgr_items = Index(mgr_items)

    blocks = []
    num_offset = 0
    for blockstr, placement in block_placements.items():
        # drop the optional '-SUFFIX' that only serves to split blocks
        typestr = blockstr.split("-")[0]
        blocks.append(
            create_block(
                typestr, placement, item_shape=item_shape, num_offset=num_offset
            )
        )
        num_offset += len(placement)

    return BlockManager(
        sorted(blocks, key=lambda b: b.mgr_locs[0]),
        [mgr_items] + [np.arange(n) for n in item_shape],
    )
class TestBlock:
    """
    Tests for individual blocks: construction, pickling, placement,
    basic attributes, copying and item deletion.
    """

    def setup_method(self, method):
        # One block per basic kind, placed at disjoint manager locations.
        self.fblock = create_block("float", [0, 2, 4])
        self.cblock = create_block("complex", [7])
        self.oblock = create_block("object", [1, 3])
        self.bool_block = create_block("bool", [5])

    def test_constructor(self):
        int32block = create_block("i4", [0])
        assert int32block.dtype == np.int32

    def test_pickle(self):
        def _check(blk):
            assert_block_equal(tm.round_trip_pickle(blk), blk)

        # NOTE(review): the calls below were missing from the truncated
        # source; they exercise every block built in setup_method.
        _check(self.fblock)
        _check(self.cblock)
        _check(self.oblock)
        _check(self.bool_block)

    def test_mgr_locs(self):
        assert isinstance(self.fblock.mgr_locs, BlockPlacement)
        tm.assert_numpy_array_equal(
            self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64)
        )

    def test_attrs(self):
        assert self.fblock.shape == self.fblock.values.shape
        assert self.fblock.dtype == self.fblock.values.dtype
        assert len(self.fblock) == len(self.fblock.values)

    def test_copy(self):
        cop = self.fblock.copy()
        assert cop is not self.fblock
        assert_block_equal(self.fblock, cop)

    def test_delete(self):
        # NOTE(review): the delete() calls were missing from the truncated
        # source; the indices follow from the placements asserted afterwards.
        newb = self.fblock.copy()
        newb.delete(0)
        assert isinstance(newb.mgr_locs, BlockPlacement)
        tm.assert_numpy_array_equal(
            newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64)
        )
        assert (newb.values[0] == 1).all()

        newb = self.fblock.copy()
        newb.delete(1)
        assert isinstance(newb.mgr_locs, BlockPlacement)
        tm.assert_numpy_array_equal(
            newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64)
        )
        assert (newb.values[1] == 2).all()

        newb = self.fblock.copy()
        newb.delete(2)
        tm.assert_numpy_array_equal(
            newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64)
        )
        assert (newb.values[1] == 1).all()

        # deleting past the last item must fail
        newb = self.fblock.copy()
        with pytest.raises(IndexError, match=None):
            newb.delete(3)
# Tests for BlockManager: attributes, pickling, item get/set, dtype
# conversion, interleaving, consolidation, reindexing and equality.
# NOTE(review): this copy of the class has lost its indentation and a number
# of lines (closing brackets, `else:` branches, parametrize decorator openers
# and some assertion-call openers); restore from upstream before running.
class TestBlockManager:
# Two f8 blocks forced apart by suffixes: 2 blocks, 6 items.
def test_attrs(self):
mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2")
assert mgr.nblocks == 2
assert len(mgr) == 6
# is_mixed_type is False for a single dtype (even across two blocks) and
# True once dtypes differ.
def test_is_mixed_dtype(self):
assert not create_mgr("a,b:f8").is_mixed_type
assert not create_mgr("a:f8-1; b:f8-2").is_mixed_type
assert create_mgr("a,b:f8; c,d: f4").is_mixed_type
assert create_mgr("a,b:f8; c,d: object").is_mixed_type
# Building a manager from blocks with overlapping placements must raise;
# distinct placements must construct cleanly.
def test_duplicate_ref_loc_failure(self):
tmp_mgr = create_mgr("a:bool; a: f8")
axes, blocks = tmp_mgr.axes, tmp_mgr.blocks
blocks[0].mgr_locs = np.array([0])
blocks[1].mgr_locs = np.array([0])
# test trying to create block manager with overlapping ref locs
msg = "Gaps in blk ref_locs"
with pytest.raises(AssertionError, match=msg):
mgr = BlockManager(blocks, axes)
blocks[0].mgr_locs = np.array([0])
blocks[1].mgr_locs = np.array([1])
mgr = BlockManager(blocks, axes)
# Pickle round-trip preserves the data; consolidation flags exist on the
# unpickled manager and are reset to False.
def test_pickle(self, mgr):
mgr2 = tm.round_trip_pickle(mgr)
tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
# GH2431
assert hasattr(mgr2, "_is_consolidated")
assert hasattr(mgr2, "_known_consolidated")
# reset to False on load
assert not mgr2._is_consolidated
assert not mgr2._known_consolidated
# Pickle round-trip with duplicated item names.
@pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"])
def test_non_unique_pickle(self, mgr_string):
mgr = create_mgr(mgr_string)
mgr2 = tm.round_trip_pickle(mgr)
tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
# Pickle round-trip for categorical data in both manager flavours.
def test_categorical_block_pickle(self):
mgr = create_mgr("a: category")
mgr2 = tm.round_trip_pickle(mgr)
tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
smgr = create_single_mgr("category")
smgr2 = tm.round_trip_pickle(smgr)
tm.assert_series_equal(Series(smgr), Series(smgr2))
# iget(i) returns the i-th row of the block values.
def test_iget(self):
cols = Index(list("abc"))
values = np.random.rand(3, 3)
block = make_block(values=values.copy(), placement=np.arange(3))
mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])
tm.assert_almost_equal(mgr.iget(0).internal_values(), values[0])
tm.assert_almost_equal(mgr.iget(1).internal_values(), values[1])
tm.assert_almost_equal(mgr.iget(2).internal_values(), values[2])
# insert() and iset() with string data on an int manager.
# NOTE(review): the assertion-call openers around the "bar"/"foo"
# comparisons are missing in this copy.
def test_set(self):
mgr = create_mgr("a,b,c: int", item_shape=(3,))
mgr.insert(len(mgr.items), "d", np.array(["foo"] * 3))
mgr.iset(1, np.array(["bar"] * 3))
tm.assert_numpy_array_equal(mgr.iget(0).internal_values(), np.array([0] * 3))
mgr.iget(1).internal_values(), np.array(["bar"] * 3, dtype=np.object_)
tm.assert_numpy_array_equal(mgr.iget(2).internal_values(), np.array([2] * 3))
mgr.iget(3).internal_values(), np.array(["foo"] * 3, dtype=np.object_)
# iset() with data of another dtype changes the dtype reported by iget().
# NOTE(review): `np.float_` below is an alias that newer NumPy releases no
# longer provide -- confirm the targeted NumPy version.
def test_set_change_dtype(self, mgr):
mgr.insert(len(mgr.items), "baz", np.zeros(N, dtype=bool))
mgr.iset(mgr.items.get_loc("baz"), np.repeat("foo", N))
idx = mgr.items.get_loc("baz")
assert mgr.iget(idx).dtype == np.object_
mgr2 = mgr.consolidate()
mgr2.iset(mgr2.items.get_loc("baz"), np.repeat("foo", N))
idx = mgr2.items.get_loc("baz")
assert mgr2.iget(idx).dtype == np.object_
mgr2.insert(len(mgr2.items), "quux", tm.randn(N).astype(int))
idx = mgr2.items.get_loc("quux")
assert mgr2.iget(idx).dtype == np.int_
mgr2.iset(mgr2.items.get_loc("quux"), tm.randn(N))
assert mgr2.iget(idx).dtype == np.float_
# Shallow copies share the underlying buffers; deep copies do not.
# NOTE(review): the `else:` lines separating the ndarray and non-ndarray
# branches are missing in this copy.
def test_copy(self, mgr):
cp = mgr.copy(deep=False)
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
# view assertion
tm.assert_equal(cp_blk.values, blk.values)
if isinstance(blk.values, np.ndarray):
assert cp_blk.values.base is blk.values.base
# DatetimeTZBlock has DatetimeIndex values
assert cp_blk.values._data.base is blk.values._data.base
cp = mgr.copy(deep=True)
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
# copy assertion we either have a None for a base or in case of
# some blocks it is an array (e.g. datetimetz), but was copied
tm.assert_equal(cp_blk.values, blk.values)
if not isinstance(cp_blk.values, np.ndarray):
assert cp_blk.values._data.base is not blk.values._data.base
assert cp_blk.values.base is None and blk.values.base is None
# Two sparse blocks interleave to float64.
def test_sparse(self):
mgr = create_mgr("a: sparse-1; b: sparse-2")
# what to test here?
assert mgr.as_array().dtype == np.float64
# Sparse blocks mixed with a dense f8 block stay as three blocks.
def test_sparse_mixed(self):
mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8")
assert len(mgr.blocks) == 3
assert isinstance(mgr, BlockManager)
# TODO: what to test here?
# NOTE(review): the next two lines look like the arguments of a parametrize
# decorator whose opening line is missing.
"mgr_string, dtype",
[("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)],
# as_array() dtype for mixed float widths.
def test_as_array_float(self, mgr_string, dtype):
mgr = create_mgr(mgr_string)
assert mgr.as_array().dtype == dtype
# NOTE(review): decorator arguments again; opener and list brackets missing.
"mgr_string, dtype",
("a: bool-1; b: bool-2", np.bool_),
("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64),
("c: i4; d: i2; e: u1", np.int32),
# as_array() dtype for bool-only and mixed-width integer managers.
def test_as_array_int_bool(self, mgr_string, dtype):
mgr = create_mgr(mgr_string)
assert mgr.as_array().dtype == dtype
# Two naive datetime blocks interleave to M8[ns].
def test_as_array_datetime(self):
mgr = create_mgr("h: datetime-1; g: datetime-2")
assert mgr.as_array().dtype == "M8[ns]"
# tz-aware items keep their tz dtype individually, object when interleaved.
def test_as_array_datetime_tz(self):
mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]")
assert mgr.iget(0).dtype == "datetime64[ns, US/Eastern]"
assert mgr.iget(1).dtype == "datetime64[ns, CET]"
assert mgr.as_array().dtype == "object"
# astype() on an all-float manager and, with errors="ignore", on a mixed one.
# NOTE(review): an `else:` appears to be missing between the last two
# assertions (they contradict each other when t != np.int64).
@pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"])
def test_astype(self, t):
# coerce all
mgr = create_mgr("c: f4; d: f2; e: f8")
t = np.dtype(t)
tmgr = mgr.astype(t)
assert tmgr.iget(0).dtype.type == t
assert tmgr.iget(1).dtype.type == t
assert tmgr.iget(2).dtype.type == t
# mixed
mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8")
t = np.dtype(t)
tmgr = mgr.astype(t, errors="ignore")
assert tmgr.iget(2).dtype.type == t
assert tmgr.iget(4).dtype.type == t
assert tmgr.iget(5).dtype.type == t
assert tmgr.iget(6).dtype.type == t
assert tmgr.iget(0).dtype.type == np.object_
assert tmgr.iget(1).dtype.type == np.object_
if t != np.int64:
assert tmgr.iget(3).dtype.type == np.datetime64
assert tmgr.iget(3).dtype.type == t
# convert(): no-op on numeric managers; with numeric=True, object columns of
# numeric strings become int64/float64 while non-numeric strings stay object.
def test_convert(self):
def _compare(old_mgr, new_mgr):
""" compare the blocks, numeric compare ==, object don't """
old_blocks = set(old_mgr.blocks)
new_blocks = set(new_mgr.blocks)
assert len(old_blocks) == len(new_blocks)
# compare non-numeric
for b in old_blocks:
found = False
for nb in new_blocks:
if (b.values == nb.values).all():
found = True
assert found
for b in new_blocks:
found = False
for ob in old_blocks:
if (b.values == ob.values).all():
found = True
assert found
# noops
mgr = create_mgr("f: i8; g: f8")
new_mgr = mgr.convert()
_compare(mgr, new_mgr)
# convert
mgr = create_mgr("a,b,foo: object; f: i8; g: f8")
mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
new_mgr = mgr.convert(numeric=True)
assert new_mgr.iget(0).dtype == np.int64
assert new_mgr.iget(1).dtype == np.float64
assert new_mgr.iget(2).dtype == np.object_
assert new_mgr.iget(3).dtype == np.int64
assert new_mgr.iget(4).dtype == np.float64
mgr = create_mgr(
"a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2"
mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
new_mgr = mgr.convert(numeric=True)
assert new_mgr.iget(0).dtype == np.int64
assert new_mgr.iget(1).dtype == np.float64
assert new_mgr.iget(2).dtype == np.object_
assert new_mgr.iget(3).dtype == np.int32
assert new_mgr.iget(4).dtype == np.bool_
# NOTE(review): the next assert uses `expr, message` form, so np.datetime64
# is only the failure message and nothing is compared -- `==` was probably
# intended.
assert new_mgr.iget(5).dtype.type, np.datetime64
assert new_mgr.iget(6).dtype == np.int64
assert new_mgr.iget(7).dtype == np.float64
assert new_mgr.iget(8).dtype == np.float16
# Two items sharing one categorical block description must fail the
# size check when the manager is built.
def test_invalid_ea_block(self):
with pytest.raises(AssertionError, match="block.size != values.size"):
create_mgr("a: category; b: category")
with pytest.raises(AssertionError, match="block.size != values.size"):
create_mgr("a: category2; b: category2")
# Interleaving one or two items of the same dtype keeps that dtype.
def test_interleave(self):
# self
for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]:
mgr = create_mgr(f"a: {dtype}")
assert mgr.as_array().dtype == dtype
mgr = create_mgr(f"a: {dtype}; b: {dtype}")
assert mgr.as_array().dtype == dtype
# NOTE(review): parametrize arguments for test_interleave_dtype; the
# decorator opener and list brackets are missing.
"mgr_string, dtype",
("a: category", "i8"),
("a: category; b: category", "i8"),
("a: category; b: category2", "object"),
("a: category2", "object"),
("a: category2; b: category2", "object"),
("a: f8", "f8"),
("a: f8; b: i8", "f8"),
("a: f4; b: i8", "f8"),
("a: f4; b: i8; d: object", "object"),
("a: bool; b: i8", "object"),
("a: complex", "complex"),
("a: f8; b: category", "object"),
("a: M8[ns]; b: category", "object"),
("a: M8[ns]; b: bool", "object"),
("a: M8[ns]; b: i8", "object"),
("a: m8[ns]; b: bool", "object"),
("a: m8[ns]; b: i8", "object"),
("a: M8[ns]; b: m8[ns]", "object"),
# Resulting dtype of as_array() for various dtype combinations.
# NOTE(review): the body never uses `mgr_string`/`dtype`; it repeats
# hard-coded cases instead. The hard-coded "a: f8; b: category" case expects
# "f8" while the parameter row above lists "object" -- confirm which is
# intended.
def test_interleave_dtype(self, mgr_string, dtype):
# will be converted according the actual dtype of the underlying
mgr = create_mgr("a: category")
assert mgr.as_array().dtype == "i8"
mgr = create_mgr("a: category; b: category2")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: category2")
assert mgr.as_array().dtype == "object"
# combinations
mgr = create_mgr("a: f8")
assert mgr.as_array().dtype == "f8"
mgr = create_mgr("a: f8; b: i8")
assert mgr.as_array().dtype == "f8"
mgr = create_mgr("a: f4; b: i8")
assert mgr.as_array().dtype == "f8"
mgr = create_mgr("a: f4; b: i8; d: object")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: bool; b: i8")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: complex")
assert mgr.as_array().dtype == "complex"
mgr = create_mgr("a: f8; b: category")
assert mgr.as_array().dtype == "f8"
mgr = create_mgr("a: M8[ns]; b: category")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: M8[ns]; b: bool")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: M8[ns]; b: i8")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: m8[ns]; b: bool")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: m8[ns]; b: i8")
assert mgr.as_array().dtype == "object"
mgr = create_mgr("a: M8[ns]; b: m8[ns]")
assert mgr.as_array().dtype == "object"
# After overwriting several items with floats, consolidation yields 4
# blocks, and the numeric subset consolidates to a single block.
# NOTE(review): the assertion-call opener around the final placement
# comparison is missing.
def test_consolidate_ordering_issues(self, mgr):
mgr.iset(mgr.items.get_loc("f"), tm.randn(N))
mgr.iset(mgr.items.get_loc("d"), tm.randn(N))
mgr.iset(mgr.items.get_loc("b"), tm.randn(N))
mgr.iset(mgr.items.get_loc("g"), tm.randn(N))
mgr.iset(mgr.items.get_loc("h"), tm.randn(N))
# we have datetime/tz blocks in mgr
cons = mgr.consolidate()
assert cons.nblocks == 4
cons = mgr.consolidate().get_numeric_data()
assert cons.nblocks == 1
assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64)
# reindex_axis on the items axis selects and reorders items.
# NOTE(review): the four value comparisons have lost their assertion-call
# openers.
def test_reindex_items(self):
# mgr is not consolidated, f8 & f8-2 blocks
mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2")
reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0)
assert reindexed.nblocks == 2
tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"]))
mgr.iget(6).internal_values(), reindexed.iget(0).internal_values()
mgr.iget(2).internal_values(), reindexed.iget(1).internal_values()
mgr.iget(0).internal_values(), reindexed.iget(2).internal_values()
mgr.iget(3).internal_values(), reindexed.iget(3).internal_values()
# get_numeric_data() keeps int/float/complex/bool items; the "Check
# sharing" part contrasts the default result with copy=True.
# NOTE(review): several call openers/closers are missing in this method.
def test_get_numeric_data(self):
mgr = create_mgr(
"int: int; float: float; complex: complex;"
"str: object; bool: bool; obj: object; dt: datetime",
mgr.iset(5, np.array([1, 2, 3], dtype=np.object_))
numeric = mgr.get_numeric_data()
numeric.items, pd.Index(["int", "float", "complex", "bool"])
# Check sharing
numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
np.array([100.0, 200.0, 300.0]),
numeric2 = mgr.get_numeric_data(copy=True)
numeric.items, pd.Index(["int", "float", "complex", "bool"])
numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
np.array([100.0, 200.0, 300.0]),
# get_bool_data() keeps only the bool item, even when an object item holds
# booleans.
# NOTE(review): several call openers/closers are missing in this method.
def test_get_bool_data(self):
mgr = create_mgr(
"int: int; float: float; complex: complex;"
"str: object; bool: bool; obj: object; dt: datetime",
mgr.iset(6, np.array([True, False, True], dtype=np.object_))
bools = mgr.get_bool_data()
tm.assert_index_equal(bools.items, pd.Index(["bool"]))
bools.iset(0, np.array([True, False, True]))
np.array([True, False, True]),
# Check sharing
bools2 = mgr.get_bool_data(copy=True)
bools2.iset(0, np.array([False, True, False]))
np.array([True, False, True]),
# repr() of a manager with a non-ASCII item name must not raise.
def test_unicode_repr_doesnt_raise(self):
repr(create_mgr("b,\u05d0: object"))
# NOTE(review): parametrize arguments for test_equals; decorator opener
# missing.
"mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"]
# equals() ignores the order in which the blocks are stored.
def test_equals(self, mgr_string):
# unique items
bm1 = create_mgr(mgr_string)
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
assert bm1.equals(bm2)
# NOTE(review): parametrize values for the next test; decorator opener and
# list brackets missing.
"a:i8;b:f8", # basic case
"a:i8;b:f8;c:c8;d:b", # many types
"a:i8;e:dt;f:td;g:string", # more types
"a:i8;b:category;c:category2", # categories
"c:sparse;d:sparse_na;b:f8", # sparse
# equals() holds in both directions for every permutation of the blocks.
def test_equals_block_order_different_dtypes(self, mgr_string):
# GH 9330
bm = create_mgr(mgr_string)
block_perms = itertools.permutations(bm.blocks)
for bm_perm in block_perms:
bm_this = BlockManager(bm_perm, bm.axes)
assert bm.equals(bm_this)
assert bm_this.equals(bm)
# A 5-row f8 SingleBlockManager holds 0.0 .. 4.0.
def test_single_mgr_ctor(self):
mgr = create_single_mgr("f8", num_rows=5)
assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0]
# replace_list() rejects non-bool `inplace` with a ValueError.
# NOTE(review): the closing parenthesis of `msg = (` is missing.
@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
def test_validate_bool_args(self, value):
bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2")
msg = (
'For argument "inplace" expected type bool, '
f"received type {type(value).__name__}."
with pytest.raises(ValueError, match=msg):
bm1.replace_list([1], [2], inplace=value)
# Data-driven indexing tests: each routine is applied to a set of managers
# and compared with the same operation on the manager's ndarray form.
# NOTE(review): this copy of the class has lost its indentation and many
# lines (the `MANAGERS = [` opener, closing brackets, and the openers of the
# helper/assertion calls); restore from upstream before running.
class TestIndexing:
# Nosetests-style data-driven tests.
# This test applies different indexing routines to block managers and
# compares the outcome to the result of same operations on np.ndarray.
# NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
# and are disabled.
# Entries of the manager list referenced below as MANAGERS (1-dim first).
create_single_mgr("f8", N),
create_single_mgr("i8", N),
# 2-dim
create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)),
create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)),
create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)),
create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)),
# get_slice() along every axis with slices, boolean masks and lists.
@pytest.mark.parametrize("mgr", MANAGERS)
def test_get_slice(self, mgr):
# Helper: slice the manager and its ndarray form and compare values/axes.
def assert_slice_ok(mgr, axis, slobj):
mat = mgr.as_array()
# we maybe using an ndarray to test slicing and
# might not be the full length of the axis
if isinstance(slobj, np.ndarray):
ax = mgr.axes[axis]
if len(ax) and len(slobj) and len(slobj) != len(ax):
# pad a short boolean mask with False up to the axis length
slobj = np.concatenate(
[slobj, np.zeros(len(ax) - len(slobj), dtype=bool)]
sliced = mgr.get_slice(slobj, axis=axis)
mat_slobj = (slice(None),) * axis + (slobj,)
mat[mat_slobj], sliced.as_array(), check_dtype=False
tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])
assert mgr.ndim <= 2, mgr.ndim
for ax in range(mgr.ndim):
# slice
assert_slice_ok(mgr, ax, slice(None))
assert_slice_ok(mgr, ax, slice(3))
assert_slice_ok(mgr, ax, slice(100))
assert_slice_ok(mgr, ax, slice(1, 4))
assert_slice_ok(mgr, ax, slice(3, 0, -2))
# boolean mask
assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))
if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(mgr, ax, np.array([True, True, False], dtype=np.bool_))
# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, list(range(mgr.shape[ax])))
if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])
# take() along every axis, compared with np.take on the ndarray form.
@pytest.mark.parametrize("mgr", MANAGERS)
def test_take(self, mgr):
def assert_take_ok(mgr, axis, indexer):
mat = mgr.as_array()
taken = mgr.take(indexer, axis)
np.take(mat, indexer, axis), taken.as_array(), check_dtype=False
tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis])
for ax in range(mgr.ndim):
# take/fancy indexer
assert_take_ok(mgr, ax, indexer=[])
assert_take_ok(mgr, ax, indexer=[0, 0, 0])
assert_take_ok(mgr, ax, indexer=list(range(mgr.shape[ax])))
if mgr.shape[ax] >= 3:
assert_take_ok(mgr, ax, indexer=[0, 1, 2])
assert_take_ok(mgr, ax, indexer=[-1, -2, -3])
# reindex_axis() with new labels, compared with algos.take_nd using the
# indexer computed from the old axis.
@pytest.mark.parametrize("mgr", MANAGERS)
@pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
def test_reindex_axis(self, fill_value, mgr):
def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
mat = mgr.as_array()
indexer = mgr.axes[axis].get_indexer_for(new_labels)
reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
algos.take_nd(mat, indexer, axis, fill_value=fill_value),
tm.assert_index_equal(reindexed.axes[axis], new_labels)
for ax in range(mgr.ndim):
assert_reindex_axis_is_ok(mgr, ax, pd.Index([]), fill_value)
assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value)
assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value)
mgr, ax, pd.Index(["foo", "bar", "baz"]), fill_value
mgr, ax, pd.Index(["foo", mgr.axes[ax][0], "baz"]), fill_value
if mgr.shape[ax] >= 3:
assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][:-3], fill_value)
assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][-3::-1], fill_value)
mgr, ax, mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value
# reindex_indexer() with explicit labels and indexer, compared with
# algos.take_nd on the ndarray form.
@pytest.mark.parametrize("mgr", MANAGERS)
@pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
def test_reindex_indexer(self, fill_value, mgr):
def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
mat = mgr.as_array()
reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value)
reindexed = mgr.reindex_indexer(
new_labels, indexer, axis, fill_value=fill_value
reindexed_mat, reindexed.as_array(), check_dtype=False
tm.assert_index_equal(reindexed.axes[axis], new_labels)
for ax in range(mgr.ndim):
assert_reindex_indexer_is_ok(mgr, ax, pd.Index([]), [], fill_value)
mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value
pd.Index(["foo"] * mgr.shape[ax]),
mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value,
mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value,
mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value
mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value,
pd.Index(["foo", mgr.axes[ax][0], "baz"]),
[-1, -1, -1],
if mgr.shape[ax] >= 3:
mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value,
class TestBlockPlacement:
    """
    Tests for BlockPlacement: length, validation of slices, conversion
    between slice and array form, iteration and ``add``.

    NOTE(review): the parametrize decorator openers and the statements inside
    the ``pytest.raises`` blocks were missing from the truncated source and
    have been reconstructed from the surviving arguments and signatures.
    """

    @pytest.mark.parametrize(
        "slc, expected",
        [
            (slice(0, 4), 4),
            (slice(0, 4, 2), 2),
            (slice(0, 3, 2), 2),
            (slice(0, 1, 2), 1),
            (slice(1, 0, -1), 1),
        ],
    )
    def test_slice_len(self, slc, expected):
        assert len(BlockPlacement(slc)) == expected

    @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)])
    def test_zero_step_raises(self, slc):
        msg = "slice step cannot be zero"
        with pytest.raises(ValueError, match=msg):
            BlockPlacement(slc)

    @pytest.mark.parametrize(
        "slc",
        [
            slice(None, None),
            slice(10, None),
            slice(None, None, -1),
            slice(None, 10, -1),
            # These are "unbounded" because negative index will
            #  change depending on container shape.
            slice(-1, None),
            slice(None, -1),
            slice(-1, -1),
            slice(-1, None, -1),
            slice(None, -1, -1),
            slice(-1, -1, -1),
        ],
    )
    def test_unbounded_slice_raises(self, slc):
        msg = "unbounded slice"
        with pytest.raises(ValueError, match=msg):
            BlockPlacement(slc)

    @pytest.mark.parametrize(
        "slc",
        [
            slice(0, 0),
            slice(100, 0),
            slice(100, 100),
            slice(100, 100, -1),
            slice(0, 100, -1),
        ],
    )
    def test_not_slice_like_slices(self, slc):
        assert not BlockPlacement(slc).is_slice_like

    @pytest.mark.parametrize(
        "arr, slc",
        [
            ([0], slice(0, 1, 1)),
            ([100], slice(100, 101, 1)),
            ([0, 1, 2], slice(0, 3, 1)),
            ([0, 5, 10], slice(0, 15, 5)),
            ([0, 100], slice(0, 200, 100)),
            ([2, 1], slice(2, 0, -1)),
        ],
    )
    def test_array_to_slice_conversion(self, arr, slc):
        assert BlockPlacement(arr).as_slice == slc

    @pytest.mark.parametrize(
        "arr",
        [
            [-1, -2, -3],
            [-1, 0, 1, 2],
            [-2, 0, 2, 4],
            [1, 0, -1],
            [1, 1, 1],
        ],
    )
    def test_not_slice_like_arrays(self, arr):
        assert not BlockPlacement(arr).is_slice_like

    @pytest.mark.parametrize(
        "slc, expected",
        [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])],
    )
    def test_slice_iter(self, slc, expected):
        assert list(BlockPlacement(slc)) == expected

    @pytest.mark.parametrize(
        "slc, arr",
        [
            (slice(0, 3), [0, 1, 2]),
            (slice(0, 0), []),
            (slice(3, 0), []),
            (slice(3, 0, -1), [3, 2, 1]),
        ],
    )
    def test_slice_to_array_conversion(self, slc, arr):
        tm.assert_numpy_array_equal(
            BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64)
        )

    def test_blockplacement_add(self):
        bpl = BlockPlacement(slice(0, 5))
        assert bpl.add(1).as_slice == slice(1, 6, 1)
        assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
        assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]

    @pytest.mark.parametrize(
        "val, inc, expected",
        [
            (slice(0, 0), 0, []),
            (slice(1, 4), 0, [1, 2, 3]),
            (slice(3, 0, -1), 0, [3, 2, 1]),
            ([1, 2, 4], 0, [1, 2, 4]),
            (slice(0, 0), 10, []),
            (slice(1, 4), 10, [11, 12, 13]),
            (slice(3, 0, -1), 10, [13, 12, 11]),
            ([1, 2, 4], 10, [11, 12, 14]),
            (slice(0, 0), -1, []),
            (slice(1, 4), -1, [0, 1, 2]),
            ([1, 2, 4], -1, [0, 1, 3]),
        ],
    )
    def test_blockplacement_add_int(self, val, inc, expected):
        assert list(BlockPlacement(val).add(inc)) == expected

    @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]])
    def test_blockplacement_add_int_raises(self, val):
        msg = "iadd causes length change"
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): increment reconstructed; it must push every
            # location below zero -- confirm the exact value upstream.
            BlockPlacement(val).add(-10)
class DummyElement:
    """
    Minimal array-like stand-in: a value paired with a NumPy dtype.

    Provides only the small surface (``__array__``, ``astype``, ``view``,
    ``any``) exercised by the binary-operation tests.
    """

    def __init__(self, value, dtype):
        self.value = value
        self.dtype = np.dtype(dtype)

    def __array__(self):
        # Materialise the wrapped value using the currently stored dtype.
        return np.array(self.value, dtype=self.dtype)

    def __str__(self) -> str:
        return f"DummyElement({self.value}, {self.dtype})"

    def __repr__(self) -> str:
        return self.__str__()

    def astype(self, dtype, copy=False):
        # Mutates in place and returns the same object; ``copy`` is ignored
        # and ``dtype`` is stored exactly as given.
        self.dtype = dtype
        return self

    def view(self, dtype):
        # Wrap the reinterpreted value in a new instance of the same class.
        reinterpreted = self.value.view(dtype)
        cls = type(self)
        return cls(reinterpreted, dtype)

    def any(self, axis=None):
        # ``axis`` is accepted for signature compatibility but unused.
        return True if self.value else False
# Tests relating Block._can_hold_element to array __setitem__, plus binary
# operations between a single-column DataFrame and a scalar.
# NOTE(review): this copy of the class has lost its indentation and several
# lines (closing brackets, the parametrize decorator openers including the
# list of operators, and parts of the conditional `msg` expression); restore
# from upstream before running.
class TestCanHoldElement:
# A datetime block accepts None, np.datetime64 and datetime values but not
# datetime.date; assignment into the matching array behaves the same way.
def test_datetime_block_can_hold_element(self):
block = create_block("datetime", [0])
# We will check that block._can_hold_element iff arr.__setitem__ works
arr = pd.array(block.values.ravel())
# coerce None
assert block._can_hold_element(None)
arr[0] = None
assert arr[0] is pd.NaT
# coerce different types of datetime objects
vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)]
for val in vals:
assert block._can_hold_element(val)
arr[0] = val
val = date(2010, 10, 10)
assert not block._can_hold_element(val)
msg = (
"'value' should be a 'Timestamp', 'NaT', "
"or array of those. Got 'date' instead."
with pytest.raises(TypeError, match=msg):
arr[0] = val
# NOTE(review): parametrize arguments for test_binop_other (value/dtype
# pairs, then the `ids` of a second decorator whose operator list is missing).
"value, dtype",
(1, "i8"),
(1.0, "f8"),
(2 ** 63, "f8"),
(1j, "complex128"),
(2 ** 63, "complex128"),
(True, "bool"),
(np.timedelta64(20, "ns"), "<m8[ns]"),
(np.datetime64(20, "ns"), "<M8[ns]"),
ids=lambda x: x.__name__,
# Apply `op` between a two-row DataFrame of `value` and the scalar itself:
# skipped combinations are not run, invalid ones must raise TypeError, the
# rest compare result dtypes.
def test_binop_other(self, op, value, dtype):
# (op, dtype) pairs that are skipped outright
skip = {
(operator.add, "bool"),
(operator.sub, "bool"),
(operator.mul, "bool"),
(operator.truediv, "bool"),
(operator.mod, "i8"),
(operator.mod, "complex128"),
(operator.pow, "bool"),
if (op, dtype) in skip:
pytest.skip(f"Invalid combination {op},{dtype}")
e = DummyElement(value, dtype)
s = pd.DataFrame({"A": [e.value, e.value]}, dtype=e.dtype)
# (op, dtype) pairs expected to raise TypeError
invalid = {
(operator.pow, "<M8[ns]"),
(operator.mod, "<M8[ns]"),
(operator.truediv, "<M8[ns]"),
(operator.mul, "<M8[ns]"),
(operator.add, "<M8[ns]"),
(operator.pow, "<m8[ns]"),
(operator.mul, "<m8[ns]"),
if (op, dtype) in invalid:
# NOTE(review): the value used when the condition below is true is missing
# from this copy of the conditional expression.
msg = (
if (dtype == "<M8[ns]" and op == operator.add)
or (dtype == "<m8[ns]" and op == operator.mul)
else (
f"cannot perform __{op.__name__}__ with this "
"index type: (DatetimeArray|TimedeltaArray)"
with pytest.raises(TypeError, match=msg):
op(s, e.value)
# FIXME: Since dispatching to Series, this test no longer
# asserts anything meaningful
result = op(s, e.value).dtypes
expected = op(s, value).dtypes
tm.assert_series_equal(result, expected)
class TestShouldStore:
    """Tests for ``Block.should_store``."""

    def test_should_store_categorical(self):
        cat = pd.Categorical(["A", "B", "C"])
        df = pd.DataFrame(cat)
        blk = df._mgr.blocks[0]

        # matching dtype
        assert blk.should_store(cat)
        assert blk.should_store(cat[:-1])

        # different dtype
        assert not blk.should_store(cat.as_ordered())

        # ndarray instead of Categorical
        assert not blk.should_store(np.asarray(cat))


# NOTE(review): the decorator opener and list brackets were missing from the
# truncated source and are reconstructed around the surviving parameter rows.
@pytest.mark.parametrize(
    "typestr, holder",
    [
        ("category", Categorical),
        ("M8[ns]", DatetimeArray),
        ("M8[ns, US/Central]", DatetimeArray),
        ("m8[ns]", TimedeltaArray),
        ("sparse", SparseArray),
    ],
)
def test_holder(typestr, holder):
    """Each extension-backed block reports the matching array class as its holder."""
    blk = create_block(typestr, [1])
    assert blk._holder is holder
def test_validate_ndim():
    """make_block rejects 1-D values when ``ndim=2`` is requested."""
    one_dim_values = np.array([1.0, 2.0])
    locs = slice(2)
    expected_msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with pytest.raises(ValueError, match=expected_msg):
        make_block(one_dim_values, locs, ndim=2)
def test_block_shape():
    """Reindexed plain and categorical Series report the same block indexer."""
    new_index = pd.Index([0, 1, 2, 3, 4])
    plain = pd.Series([1, 2, 3]).reindex(new_index)
    categorical = pd.Series(pd.Categorical([1, 2, 3])).reindex(new_index)

    plain_indexer = plain._mgr.blocks[0].mgr_locs.indexer
    categorical_indexer = categorical._mgr.blocks[0].mgr_locs.indexer
    assert plain_indexer == categorical_indexer
def test_make_block_no_pandas_array():
    """make_block yields a plain integer block for PandasArray/PandasDtype input."""
    # https://github.com/pandas-dev/pandas/pull/24866
    wrapped = pd.arrays.PandasArray(np.array([1, 2]))

    def _assert_plain_integer_block(blk):
        assert blk.is_integer is True
        assert blk.is_extension is False

    # PandasArray, no dtype
    _assert_plain_integer_block(make_block(wrapped, slice(len(wrapped))))

    # PandasArray, PandasDtype
    _assert_plain_integer_block(
        make_block(wrapped, slice(len(wrapped)), dtype=wrapped.dtype)
    )

    # ndarray, PandasDtype
    _assert_plain_integer_block(
        make_block(wrapped.to_numpy(), slice(len(wrapped)), dtype=wrapped.dtype)
    )
def test_dataframe_not_equal():
    """Frames whose columns swap int and str data must not compare equal."""
    # see GH28839
    ints_then_strs = {"a": [1, 2], "b": ["s", "d"]}
    strs_then_ints = {"a": ["s", "d"], "b": [1, 2]}

    left = pd.DataFrame(ints_then_strs)
    right = pd.DataFrame(strs_then_ints)
    assert left.equals(right) is False
def test_missing_unicode_key():
    """Looking up a missing non-ASCII column raises KeyError naming the key."""
    frame = DataFrame({"a": [1]})
    missing = "\u05d0"
    with pytest.raises(KeyError, match=missing):
        frame.loc[:, missing]  # should not raise UnicodeEncodeError
def test_set_change_dtype_slice():
    """
    Assigning a float result to an int column under MultiIndex columns moves
    that column into the float64 block.
    """
    # GH#8850
    cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")])
    df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols)
    df["2nd"] = df["2nd"] * 2.0

    blocks = df._to_dict_of_blocks()
    assert sorted(blocks.keys()) == ["float64", "int64"]
    # The opener of this comparison was missing from the truncated source;
    # it mirrors the int64 check below.
    tm.assert_frame_equal(
        blocks["float64"], DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=cols[:2])
    )
    tm.assert_frame_equal(blocks["int64"], DataFrame([[3], [6]], columns=cols[2:]))
def test_interleave_non_unique_cols():
    """
    ``.values`` of a frame with duplicated column names matches the same
    frame after the columns are renamed to be unique.
    """
    df = DataFrame(
        [[pd.Timestamp("20130101"), 3.5], [pd.Timestamp("20130102"), 4.5]],
        columns=["x", "x"],
        index=[1, 2],
    )

    df_unique = df.copy()
    df_unique.columns = ["x", "y"]
    assert df_unique.values.shape == df.values.shape
    tm.assert_numpy_array_equal(df_unique.values[0], df.values[0])
    tm.assert_numpy_array_equal(df_unique.values[1], df.values[1])
def test_single_block_manager_fastpath_deprecated():
    """Passing ``fastpath`` to SingleBlockManager emits a FutureWarning."""
    # GH#33092
    series = pd.Series(range(3))
    first_block = series._data.blocks[0]

    with tm.assert_produces_warning(FutureWarning):
        SingleBlockManager(first_block, series.index, fastpath=True)