Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
ea0bff8
API: New copy / view semantics using Copy-on-Write
jorisvandenbossche May 6, 2022
74981f9
fix more tests
jorisvandenbossche May 7, 2022
36faac9
Handle CoW in BM.iset
jorisvandenbossche May 9, 2022
ffa24a4
Handle CoW in xs
jorisvandenbossche May 9, 2022
8c00bbd
add bunch of todo comments and usage warnings
jorisvandenbossche May 9, 2022
af5a02c
Insert None ref in BM.insert
jorisvandenbossche May 9, 2022
a64f310
Ensure to not assume copy_on_write is set in case of ArrayManager
jorisvandenbossche May 9, 2022
e6c0baa
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche May 10, 2022
977cbb8
Handle refs in BM._combine / test CoW in select_dtypes
jorisvandenbossche May 10, 2022
cc367f5
handle get_numeric_data for single block manager
jorisvandenbossche May 10, 2022
53a8273
fix test_internals (get_numeric_data now uses CoW)
jorisvandenbossche May 11, 2022
d3d26f4
handle refs in consolidation
jorisvandenbossche May 20, 2022
5360e57
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche May 20, 2022
e431bc4
fix deep=None for ArrayManager
jorisvandenbossche May 20, 2022
a2d8fde
update copy/view tests from other PR
jorisvandenbossche May 20, 2022
81f6eae
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche May 23, 2022
b2a1428
clean-up fast_xs workarounds now it returns a SingleBlockManager
jorisvandenbossche May 23, 2022
4b1ccf6
tracks refs in to_frame
jorisvandenbossche May 23, 2022
40f2b24
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Jun 8, 2022
9339f15
fixup after updata main and column_setitem + iloc inplace setitem cha…
jorisvandenbossche Jul 16, 2022
358deaf
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Jul 16, 2022
085d15b
fix inplace fillna + fixup new tests
jorisvandenbossche Jul 16, 2022
377f789
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Jul 17, 2022
7dcefe8
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Jul 21, 2022
773f03f
address comments + update some todo comments
jorisvandenbossche Jul 21, 2022
bfc2117
Update pandas/core/internals/managers.py
jorisvandenbossche Jul 21, 2022
89c4fff
fixup linting
jorisvandenbossche Jul 21, 2022
b2e56ea
update new copy_view tests to use get_array helper
jorisvandenbossche Jul 21, 2022
fdeb154
add comment to setitem
jorisvandenbossche Jul 21, 2022
c521161
switch default to False, ensure CoW copies only happen when enabled +…
jorisvandenbossche Jul 21, 2022
fb297df
update type annotations
jorisvandenbossche Jul 21, 2022
9ab9df5
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Aug 10, 2022
14f753e
Fix stata issue to avoid SettingWithCopyWarning in read_stata
jorisvandenbossche Aug 10, 2022
f7389b1
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Aug 17, 2022
764838f
update type + option comment
jorisvandenbossche Aug 17, 2022
efde1bf
Merge remote-tracking branch 'upstream/main' into blockmanager-cow
jorisvandenbossche Aug 18, 2022
3d1377b
fixup new rename test
jorisvandenbossche Aug 19, 2022
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Handle CoW in BM.iset
  • Loading branch information
jorisvandenbossche committed May 9, 2022
commit 36faac9b3f8f35a40edde3fa0e1ff336b2b8838f
11 changes: 9 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,17 +829,22 @@ def _slice(

return self.values[slicer]

def set_inplace(self, locs, values: ArrayLike) -> None:
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
"""
Modify block values in-place with new item value.

If copy=True, first copy the underlying values in place before modifying
(for Copy-on-Write).

Notes
-----
`set_inplace` never creates a new array or new Block, whereas `setitem`
_may_ create a new array and always creates a new Block.

Caller is responsible for checking values.dtype == self.dtype.
"""
if copy:
self.values = self.values.copy()
self.values[locs] = values

def take_nd(
Expand Down Expand Up @@ -1653,9 +1658,11 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
raise IndexError(f"{self} only contains one item")
return self.values

def set_inplace(self, locs, values: ArrayLike) -> None:
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
# When an ndarray, we should have locs.tolist() == [0]
# When a BlockPlacement we should have list(locs) == [0]
if copy:
self.values = self.values.copy()
self.values[:] = values

def _maybe_squeeze_arg(self, arg):
Expand Down
45 changes: 39 additions & 6 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,20 @@ def _has_no_reference(self, i: int) -> bool:
Returns True if the columns has no references.
"""
blkno = self.blknos[i]
return self._has_no_reference_block(blkno)

def _has_no_reference_block(self, blkno: int) -> bool:
"""
Check for column `i` if has references.
(whether it references another array or is itself being referenced)
Returns True if the columns has no references.
"""
# TODO include `or self.refs[blkno]() is None` ?
return (
self.refs is None or self.refs[blkno] is None
) and weakref.getweakrefcount(self.blocks[blkno]) == 0

def _clear_reference(self, blkno: int) -> None:
def _clear_reference_block(self, blkno: int) -> None:
"""
Clear any reference for column `i`.
"""
Expand Down Expand Up @@ -369,7 +377,7 @@ def setitem(self: T, indexer, value) -> T:
# if being referenced -> perform Copy-on-Write and clear the reference
# self.blocks = tuple([self.blocks[0].copy()])
self = self.copy()
# self._clear_reference(blkno)
# self._clear_reference_block(blkno)

return self.apply("setitem", indexer=indexer, value=value)

Expand Down Expand Up @@ -1179,7 +1187,12 @@ def value_getitem(placement):
blk = self.blocks[blkno_l]
blk_locs = blklocs[val_locs.indexer]
if inplace and blk.should_store(value):
blk.set_inplace(blk_locs, value_getitem(val_locs))
# Updating inplace -> check if we need to do Copy-on-Write
if not self._has_no_reference_block(blkno_l):
blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
self._clear_reference_block(blkno_l)
else:
blk.set_inplace(blk_locs, value_getitem(val_locs))
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
unfit_val_locs.append(val_locs)
Expand All @@ -1194,9 +1207,11 @@ def value_getitem(placement):
)
self.blocks = blocks_tup
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
# blk.delete gives a copy, so we can remove a possible reference
self._clear_reference_block(blkno_l)

if len(removed_blknos):
# Remove blocks & update blknos accordingly
# Remove blocks & update blknos and refs accordingly
is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
is_deleted[removed_blknos] = True

Expand All @@ -1207,6 +1222,12 @@ def value_getitem(placement):
self.blocks = tuple(
blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
)
if self.refs is not None:
self.refs = [
ref
for i, ref in enumerate(self.refs)
if i not in set(removed_blknos)
]

if unfit_val_locs:
unfit_idxr = np.concatenate(unfit_mgr_locs)
Expand Down Expand Up @@ -1243,6 +1264,10 @@ def value_getitem(placement):
self._blklocs[unfit_idxr] = np.arange(unfit_count)

self.blocks += tuple(new_blocks)
# TODO(CoW) is this always correct to assume that the new_blocks
# are not referencing anything else?
if self.refs is not None:
self.refs = list(self.refs) + [None] * len(new_blocks)

# Newly created block's dtype may already be present.
self._known_consolidated = False
Expand All @@ -1260,14 +1285,20 @@ def _iset_single(
# Caller is responsible for verifying value.shape

if inplace and blk.should_store(value):
copy = False
if not self._has_no_reference_block(blkno):
# perform Copy-on-Write and clear the reference
copy = True
self._clear_reference_block(blkno)
iloc = self.blklocs[loc]
blk.set_inplace(slice(iloc, iloc + 1), value)
blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
return

nb = new_block_2d(value, placement=blk._mgr_locs)
old_blocks = self.blocks
new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
self.blocks = new_blocks
self._clear_reference_block(blkno)
return

def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
Expand All @@ -1280,6 +1311,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
item : hashable
value : np.ndarray or ExtensionArray
"""
# TODO handle CoW (insert into refs as well)

# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)

Expand Down Expand Up @@ -1370,7 +1403,7 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value):
blocks = list(self.blocks)
blocks[blkno] = blocks[blkno].copy()
self.blocks = tuple(blocks)
self._clear_reference(blkno)
self._clear_reference_block(blkno)

col_mgr = self.iget(loc)
new_mgr = col_mgr.setitem((idx,), value)
Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/indexing/test_copy_on_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ def test_subset_row_slice(using_copy_on_write):
# with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset.iloc[0, 0] = 0

subset._mgr._verify_integrity()

expected = pd.DataFrame(
{"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)
)
Expand Down Expand Up @@ -333,13 +335,64 @@ def test_subset_set_column(using_copy_on_write):
with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset["a"] = np.array([10, 11], dtype="int64")

subset._mgr._verify_integrity()
expected = pd.DataFrame(
{"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)
)
tm.assert_frame_equal(subset, expected)
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_column_with_loc(using_copy_on_write, dtype):
# Case: setting a single column with loc on a viewing subset
# -> subset.loc[:, col] = value
df = pd.DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
)
df_orig = df.copy()
subset = df[1:3]

if using_copy_on_write:
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")

subset._mgr._verify_integrity()
expected = pd.DataFrame(
{"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)},
index=range(1, 3),
)
tm.assert_frame_equal(subset, expected)
tm.assert_frame_equal(df, df_orig)


def test_subset_set_column_with_loc2(using_copy_on_write):
# Case: setting a single column with loc on a viewing subset
# -> subset.loc[:, col] = value
# separate test for case of DataFrame of a single column -> takes a separate
# code path
df = pd.DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()
subset = df[1:3]

if using_copy_on_write:
subset.loc[:, "a"] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset.loc[:, "a"] = 0

subset._mgr._verify_integrity()
expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3))
tm.assert_frame_equal(subset, expected)
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
Expand All @@ -359,6 +412,10 @@ def test_subset_set_columns(using_copy_on_write, dtype):
with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset[["a", "c"]] = 0

subset._mgr._verify_integrity()
if using_copy_on_write:
# first and third column should certainly have no references anymore
assert all(subset._mgr._has_no_reference(i) for i in [0, 2])
expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
tm.assert_frame_equal(subset, expected)
tm.assert_frame_equal(df, df_orig)
Expand All @@ -383,6 +440,7 @@ def test_subset_set_with_column_indexer(indexer, using_copy_on_write):
with tm.assert_produces_warning(com.SettingWithCopyWarning):
subset.loc[:, indexer] = 0

subset._mgr._verify_integrity()
expected = pd.DataFrame(
{"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)
)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,8 +863,12 @@ def test_identity_slice_returns_new_object(
assert np.shares_memory(original_df["a"], sliced_df["a"])

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
# depending on CoW
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()
if using_copy_on_write:
assert (sliced_df["a"] == [1, 2, 3]).all()
else:
assert (sliced_df["a"] == 4).all()

original_series = Series([1, 2, 3, 4, 5, 6])
sliced_series = original_series.iloc[:]
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,8 +1069,12 @@ def test_identity_slice_returns_new_object(
assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values)

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
# depending on CoW
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()
if using_copy_on_write:
assert (sliced_df["a"] == [1, 2, 3]).all()
else:
assert (sliced_df["a"] == 4).all()

# These should not return copies
assert original_df is original_df.loc[:, :]
Expand Down