initial commit

This commit is contained in:
Gokul
2026-05-11 12:36:20 +05:30
commit 384cbe8019
15377 changed files with 2360544 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
import numpy as np
import pytest
from pandas import (
DatetimeIndex,
Series,
Timestamp,
array,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# Module-level pytest mark: for the tests in this module, ignore FutureWarning
# messages that start with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize("box", [lambda x: x, DatetimeIndex])
def test_datetimeindex(box):
dt = date_range("2019-12-31", periods=3, freq="D")
ser = Series(dt)
idx = box(DatetimeIndex(ser))
expected = idx.copy(deep=True)
ser.iloc[0] = Timestamp("2020-12-31")
tm.assert_index_equal(idx, expected)
def test_datetimeindex_tz_convert():
dt = date_range("2019-12-31", periods=3, freq="D", tz="Europe/Berlin")
ser = Series(dt)
idx = DatetimeIndex(ser).tz_convert("US/Eastern")
expected = idx.copy(deep=True)
ser.iloc[0] = Timestamp("2020-12-31", tz="Europe/Berlin")
tm.assert_index_equal(idx, expected)
def test_datetimeindex_tz_localize():
    """A tz-localized index derived from a Series ignores later Series writes."""
    dates = date_range("2019-12-31", periods=3, freq="D")
    source = Series(dates)
    index = DatetimeIndex(source).tz_localize("Europe/Berlin")
    snapshot = index.copy(deep=True)
    # Write a new naive timestamp into the source Series.
    source.iloc[0] = Timestamp("2020-12-31")
    tm.assert_index_equal(index, snapshot)
def test_datetimeindex_isocalendar():
dt = date_range("2019-12-31", periods=3, freq="D")
ser = Series(dt)
df = DatetimeIndex(ser).isocalendar()
expected = df.index.copy(deep=True)
ser.iloc[0] = Timestamp("2020-12-31")
tm.assert_index_equal(df.index, expected)
def test_index_values():
idx = date_range("2019-12-31", periods=3, freq="D")
result = idx.values
assert result.flags.writeable is False
def test_constructor_copy_input_datetime_ndarray_default():
# GH 63388
arr = np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
idx = DatetimeIndex(arr)
assert not np.shares_memory(arr, get_array(idx))
def test_constructor_copy_input_datetime_ea_default():
# GH 63388
arr = array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
idx = DatetimeIndex(arr)
assert not tm.shares_memory(arr, idx.array)
def test_series_from_temporary_datetimeindex_readonly_data():
    """A Series built from a temporary DatetimeIndex over read-only data is settable."""
    # GH 63388
    raw = np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
    raw.flags.writeable = False
    result = Series(DatetimeIndex(raw))
    # The Series must not alias the read-only input buffer.
    assert not np.shares_memory(raw, get_array(result))
    # Setting a value must succeed even though the input was read-only.
    result.iloc[0] = Timestamp("2020-01-01")
    wanted = Series(
        [Timestamp("2020-01-01"), Timestamp("2020-01-02")], dtype="datetime64[ns]"
    )
    tm.assert_series_equal(result, wanted)

View File

@@ -0,0 +1,177 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
array,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def index_view(index_data):
    """Return ``(idx, view)`` for a two-column frame built from ``index_data``.

    ``view`` is a ``[:]`` slice of the frame taken before column "a" is moved
    into the index; ``idx`` is the index of the frame after ``set_index``.
    """
    frame = DataFrame({"a": index_data, "b": 1.5})
    view = frame[:]
    # Rebind the same name so the pre-set_index frame is only held by ``view``.
    frame = frame.set_index("a", drop=True)
    index = frame.index
    return index, view
def test_set_index_update_column():
df = DataFrame({"a": [1, 2], "b": 1})
df = df.set_index("a", drop=False)
expected = df.index.copy(deep=True)
df.iloc[0, 0] = 100
tm.assert_index_equal(df.index, expected)
def test_set_index_drop_update_column():
    """Writing through an earlier slice does not change an index set afterwards."""
    frame = DataFrame({"a": [1, 2], "b": 1.5})
    earlier_slice = frame[:]
    frame = frame.set_index("a", drop=True)
    snapshot = frame.index.copy(deep=True)
    # Modify column "a" via the slice taken before set_index.
    earlier_slice.iloc[0, 0] = 100
    tm.assert_index_equal(frame.index, snapshot)
def test_set_index_series():
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df = df.set_index(ser)
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
tm.assert_index_equal(df.index, expected)
def test_assign_index_as_series():
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df.index = ser
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
tm.assert_index_equal(df.index, expected)
def test_assign_index_as_index():
    """An index assigned from ``Index(ser)`` ignores later writes to ``ser``.

    The local name holding the intermediate Index is deliberately reset
    before the Series is mutated.
    """
    df = DataFrame({"a": [1, 2], "b": 1.5})
    ser = Series([10, 11])
    rhs_index = Index(ser)
    df.index = rhs_index
    rhs_index = None  # overwrite to clear reference
    expected = df.index.copy(deep=True)
    ser.iloc[0] = 100
    tm.assert_index_equal(df.index, expected)
def test_index_from_series():
ser = Series([1, 2])
idx = Index(ser)
expected = idx.copy(deep=True)
ser.iloc[0] = 100
tm.assert_index_equal(idx, expected)
def test_index_from_series_copy():
    """After ``Index(ser, copy=True)``, writing to ``ser`` keeps its buffer."""
    ser = Series([1, 2])
    # The Index is kept alive on purpose but never read (hence the noqa).
    idx = Index(ser, copy=True)  # noqa: F841
    arr = get_array(ser)
    ser.iloc[0] = 100
    # The Series array after the write still shares memory with the one
    # captured before the write.
    assert np.shares_memory(get_array(ser), arr)
def test_index_from_index():
    """An Index built from an Index built from a Series ignores Series writes."""
    ser = Series([1, 2])
    idx = Index(ser)
    # Rebinding ``idx`` drops the local reference to the intermediate Index.
    idx = Index(idx)
    expected = idx.copy(deep=True)
    ser.iloc[0] = 100
    tm.assert_index_equal(idx, expected)
@pytest.mark.parametrize(
    "func",
    [
        lambda x: x._shallow_copy(x._values),
        lambda x: x.view(),
        lambda x: x.take([0, 1]),
        lambda x: x.repeat([1, 1]),
        lambda x: x[slice(0, 2)],
        lambda x: x[[0, 1]],
        lambda x: x._getitem_slice(slice(0, 2)),
        lambda x: x.delete([]),
        lambda x: x.rename("b"),
        lambda x: x.astype("Int64", copy=False),
    ],
    ids=[
        "_shallow_copy",
        "view",
        "take",
        "repeat",
        "getitem_slice",
        "getitem_list",
        "_getitem_slice",
        "delete",
        "rename",
        "astype",
    ],
)
def test_index_ops(func, request):
    """An Index produced by ``func`` is unchanged after writing via the view.

    ``func`` is one of the parametrized Index operations; the index and the
    frame view come from ``index_view``.
    """
    idx, view_ = index_view([1, 2])
    expected = idx.copy(deep=True)
    if "astype" in request.node.callspec.id:
        # The astype case converts to "Int64", so convert the expectation too.
        expected = expected.astype("Int64")
    idx = func(idx)
    view_.iloc[0, 0] = 100
    # Names are not compared (the "rename" case gives the result another name).
    tm.assert_index_equal(idx, expected, check_names=False)
def test_infer_objects():
    """``infer_objects(copy=False)`` result is unchanged after writing via the view."""
    idx, view_ = index_view(["a", "b"])
    expected = idx.copy(deep=True)
    idx = idx.infer_objects(copy=False)
    view_.iloc[0, 0] = "aaaa"
    # Index names are not compared.
    tm.assert_index_equal(idx, expected, check_names=False)
def test_index_to_frame():
    """``to_frame`` shares memory with the Index, yet frame writes leave it intact."""
    idx = Index([1, 2, 3], name="a")
    expected = idx.copy(deep=True)
    df = idx.to_frame()
    # Column "a" initially shares memory with the index values ...
    assert np.shares_memory(get_array(df, "a"), idx._values)
    # ... and the frame's manager does not report block 0 as reference-free.
    assert not df._mgr._has_no_reference(0)
    df.iloc[0, 0] = 100
    tm.assert_index_equal(idx, expected)
def test_index_values():
idx = Index([1, 2, 3])
result = idx.values
assert result.flags.writeable is False
def test_constructor_copy_input_ndarray_default():
arr = np.array([0, 1])
idx = Index(arr)
assert not np.shares_memory(arr, get_array(idx))
def test_constructor_copy_input_ea_default():
arr = array([0, 1], dtype="Int64")
idx = Index(arr)
assert not tm.shares_memory(arr, idx.array)
def test_series_from_temporary_index_readonly_data():
    """A Series built from a temporary Index over read-only data is settable."""
    # GH 63370
    arr = np.array([0, 1], dtype=np.dtype(np.int8))
    arr.flags.writeable = False
    ser = Series(Index(arr))
    # The Series must not alias the read-only input buffer ...
    assert not np.shares_memory(arr, get_array(ser))
    # ... and its manager reports block 0 as having no reference.
    assert ser._mgr._has_no_reference(0)
    # Boolean-mask assignment must succeed on the resulting Series.
    ser[[False, True]] = np.array([0, 2], dtype=np.dtype(np.int8))
    expected = Series([0, 2], dtype=np.dtype(np.int8))
    tm.assert_series_equal(ser, expected)

View File

@@ -0,0 +1,29 @@
import numpy as np
from pandas import (
Interval,
IntervalIndex,
Series,
array,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_constructor_copy_input_interval_ea_default():
# GH 63388
arr = array([Interval(0, 1), Interval(1, 2)])
idx = IntervalIndex(arr)
assert not tm.shares_memory(arr, idx.array)
def test_series_from_temporary_intervalindex_readonly_data():
    """A Series from a temporary IntervalIndex over read-only data is settable."""
    # GH 63388
    arr = array([Interval(0, 1), Interval(1, 2)])
    # Mark both underlying bound arrays as read-only.
    arr._left.flags.writeable = False
    arr._right.flags.writeable = False
    ser = Series(IntervalIndex(arr))
    # The Series' left bounds must not alias the input's left bounds.
    assert not np.shares_memory(arr._left, get_array(ser)._left)
    ser.iloc[0] = Interval(5, 6)
    expected = Series([Interval(5, 6), Interval(1, 2)], dtype="interval[int64, right]")
    tm.assert_series_equal(ser, expected)

View File

@@ -0,0 +1,47 @@
import numpy as np
import pytest
from pandas import (
Period,
PeriodIndex,
Series,
array,
period_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# Module-level pytest mark: for the tests in this module, ignore FutureWarning
# messages that start with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize("box", [lambda x: x, PeriodIndex])
def test_periodindex(box):
dt = period_range("2019-12-31", periods=3, freq="D")
ser = Series(dt)
idx = box(PeriodIndex(ser))
expected = idx.copy(deep=True)
ser.iloc[0] = Period("2020-12-31")
tm.assert_index_equal(idx, expected)
def test_constructor_copy_input_period_ea_default():
# GH 63388
arr = array(["2020-01-01", "2020-01-02"], dtype="period[D]")
idx = PeriodIndex(arr)
assert not tm.shares_memory(arr, idx.array)
def test_series_from_temporary_periodindex_readonly_data():
    """A Series from a temporary PeriodIndex over read-only data is settable."""
    # GH 63388
    arr = array(["2020-01-01", "2020-01-02"], dtype="period[D]")
    # Mark the array's backing ndarray as read-only.
    arr._ndarray.flags.writeable = False
    ser = Series(PeriodIndex(arr))
    assert not np.shares_memory(arr._ndarray, get_array(ser))
    ser.iloc[0] = Period("2022-01-01", freq="D")
    expected = Series(
        [Period("2022-01-01", freq="D"), Period("2020-01-02", freq="D")],
        dtype="period[D]",
    )
    tm.assert_series_equal(ser, expected)

View File

@@ -0,0 +1,59 @@
import numpy as np
import pytest
from pandas import (
Series,
Timedelta,
TimedeltaIndex,
array,
timedelta_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# Module-level pytest mark: for the tests in this module, ignore FutureWarning
# messages that start with "Setting a value on a view".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Setting a value on a view:FutureWarning"
)
@pytest.mark.parametrize(
"cons",
[
lambda x: TimedeltaIndex(x),
lambda x: TimedeltaIndex(TimedeltaIndex(x)),
],
)
def test_timedeltaindex(cons):
dt = timedelta_range("1 day", periods=3)
ser = Series(dt)
idx = cons(ser)
expected = idx.copy(deep=True)
ser.iloc[0] = Timedelta("5 days")
tm.assert_index_equal(idx, expected)
def test_constructor_copy_input_timedelta_ndarray_default():
# GH 63388
arr = np.array([1, 2], dtype="timedelta64[ns]")
idx = TimedeltaIndex(arr)
assert not np.shares_memory(arr, get_array(idx))
def test_constructor_copy_input_timedelta_ea_default():
# GH 63388
arr = array([1, 2], dtype="timedelta64[ns]")
idx = TimedeltaIndex(arr)
assert not tm.shares_memory(arr, idx.array)
def test_series_from_temporary_timedeltaindex_readonly_data():
    """A Series from a temporary TimedeltaIndex over read-only data is settable."""
    # GH 63388
    arr = np.array([1, 2], dtype="timedelta64[ns]")
    arr.flags.writeable = False
    ser = Series(TimedeltaIndex(arr))
    # The Series must not alias the read-only input buffer.
    assert not np.shares_memory(arr, get_array(ser))
    ser.iloc[0] = Timedelta(days=1)
    expected = Series(
        [Timedelta(days=1), Timedelta(nanoseconds=2)], dtype="timedelta64[ns]"
    )
    tm.assert_series_equal(ser, expected)

View File

@@ -0,0 +1,229 @@
import numpy as np
import pytest
from pandas.compat.numpy import np_version_gt2
from pandas import (
DataFrame,
Series,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for accessing underlying array of Series/DataFrame
@pytest.mark.parametrize(
    "method",
    [
        lambda ser: ser.values,
        lambda ser: np.asarray(ser.array),
        lambda ser: np.asarray(ser),
        lambda ser: np.array(ser, copy=False),
    ],
    ids=["values", "array", "np.asarray", "np.array"],
)
def test_series_values(request, method):
    """Check writeability of the ndarray obtained from a Series via ``method``.

    The "array" case is expected to be writeable and to update the Series;
    every other case must be a read-only view.
    """
    ser = Series([1, 2, 3], name="name")
    ser_orig = ser.copy()
    arr = method(ser)
    if request.node.callspec.id == "array":
        # https://github.com/pandas-dev/pandas/issues/63099
        # .array for now does not return a read-only view
        assert arr.flags.writeable is True
        # updating the array updates the series
        arr[0] = 0
        assert ser.iloc[0] == 0
        return
    # .values still gives a view but is read-only
    assert np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is False
    # mutating series through arr therefore doesn't work
    with pytest.raises(ValueError, match="read-only"):
        arr[0] = 0
    tm.assert_series_equal(ser, ser_orig)
    # mutating the series itself still works
    ser.iloc[0] = 0
    assert ser.values[0] == 0
@pytest.mark.parametrize(
"method",
[
lambda df: df.values,
lambda df: np.asarray(df),
lambda ser: np.array(ser, copy=False),
],
ids=["values", "asarray", "array"],
)
def test_dataframe_values(method):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_orig = df.copy()
arr = method(df)
# .values still gives a view but is read-only
assert np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is False
# mutating series through arr therefore doesn't work
with pytest.raises(ValueError, match="read-only"):
arr[0, 0] = 0
tm.assert_frame_equal(df, df_orig)
# mutating the series itself still works
df.iloc[0, 0] = 0
assert df.values[0, 0] == 0
def test_series_to_numpy():
ser = Series([1, 2, 3], name="name")
ser_orig = ser.copy()
# default: copy=False, no dtype or NAs
arr = ser.to_numpy()
# to_numpy still gives a view but is read-only
assert np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is False
# mutating series through arr therefore doesn't work
with pytest.raises(ValueError, match="read-only"):
arr[0] = 0
tm.assert_series_equal(ser, ser_orig)
# mutating the series itself still works
ser.iloc[0] = 0
assert ser.values[0] == 0
# specify copy=True gives a writeable array
ser = Series([1, 2, 3], name="name")
arr = ser.to_numpy(copy=True)
assert not np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is True
# specifying a dtype that already causes a copy also gives a writeable array
ser = Series([1, 2, 3], name="name")
arr = ser.to_numpy(dtype="float64")
assert not np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is True
@pytest.mark.parametrize(
    "method",
    [
        lambda ser: np.asarray(ser.values),
        lambda ser: np.asarray(ser.array),
        lambda ser: np.asarray(ser),
        lambda ser: np.asarray(ser, dtype="int64"),
        lambda ser: np.array(ser, copy=False),
    ],
    ids=["values", "array", "np.asarray", "np.asarray-dtype", "np.array"],
)
def test_series_values_ea_dtypes(request, method):
    """Check writeability of the ndarray obtained from an Int64 Series.

    The "values" and "array" cases are expected to be writeable and to update
    the Series; every other case must be a read-only view.
    """
    ser = Series([1, 2, 3], dtype="Int64")
    ser_orig = ser.copy()
    arr = method(ser)
    if request.node.callspec.id in ("values", "array"):
        # https://github.com/pandas-dev/pandas/issues/63099
        # .array/values for now does not return a read-only view
        assert arr.flags.writeable is True
        # updating the array updates the series
        arr[0] = 0
        assert ser.iloc[0] == 0
        return
    # conversion to ndarray gives a view but is read-only
    assert np.shares_memory(arr, get_array(ser))
    assert arr.flags.writeable is False
    # mutating series through arr therefore doesn't work
    with pytest.raises(ValueError, match="read-only"):
        arr[0] = 0
    tm.assert_series_equal(ser, ser_orig)
    # mutating the series itself still works
    ser.iloc[0] = 0
    assert ser.values[0] == 0
@pytest.mark.parametrize(
"method",
[
lambda df: df.values,
lambda df: np.asarray(df),
lambda df: np.asarray(df, dtype="int64"),
lambda df: np.array(df, copy=False),
],
ids=["values", "np.asarray", "np.asarray-dtype", "np.array"],
)
def test_dataframe_array_ea_dtypes(method):
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
arr = method(df)
assert np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is False
def test_dataframe_array_string_dtype():
df = DataFrame({"a": ["a", "b"]}, dtype="string[python]")
arr = np.asarray(df)
assert np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is False
def test_series_array_string_dtype(any_string_dtype):
    """``np.asarray`` of a string Series: writeable copy for pyarrow, else a view."""
    ser = Series(["a", "b"], dtype=any_string_dtype)
    arr = np.asarray(ser)
    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
        # for pyarrow strings, the numpy array is not a view, so it also does
        # not need to be read-only (https://github.com/pandas-dev/pandas/pull/64035)
        assert not np.shares_memory(arr, get_array(ser))
        assert arr.flags.writeable is True
    else:
        # otherwise the array shares memory with the Series and is read-only
        assert np.shares_memory(arr, get_array(ser))
        assert arr.flags.writeable is False
def test_dataframe_multiple_numpy_dtypes():
df = DataFrame({"a": [1, 2, 3], "b": 1.5})
arr = np.asarray(df)
assert not np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is True
if np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
arr = np.array(df, copy=False)
arr = np.array(df, copy=True)
assert arr.flags.writeable is True
def test_dataframe_single_block_copy_true():
    """``np.array(df, copy=True)`` gives a writeable array not shared with ``df``."""
    # the copy=False/None cases are tested above in test_dataframe_values
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    values = np.array(frame, copy=True)
    assert not np.shares_memory(values, get_array(frame, "a"))
    assert values.flags.writeable is True
def test_values_is_ea():
df = DataFrame({"a": date_range("2012-01-01", periods=3)})
arr = np.asarray(df)
assert arr.flags.writeable is False
def test_empty_dataframe():
    """``np.asarray`` of an empty DataFrame is writeable."""
    frame = DataFrame()
    values = np.asarray(frame)
    assert values.flags.writeable is True

View File

@@ -0,0 +1,230 @@
import pickle
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_astype_single_dtype():
    """``astype("float64")`` shares the float column and copies the int column."""
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
    df_orig = df.copy()
    df2 = df.astype("float64")
    # "c" is already float64 -> shared; "a" is cast from int -> not shared
    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # mutating df2 triggers a copy-on-write for that column/block
    df2.iloc[0, 2] = 5.5
    assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    tm.assert_frame_equal(df, df_orig)
    # mutating parent also doesn't update result
    df2 = df.astype("float64")
    df.iloc[0, 2] = 5.5
    tm.assert_frame_equal(df2, df_orig.astype("float64"))
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(dtype, new_dtype):
    """Casting between these 64-bit integer dtypes shares memory until a write."""
    if new_dtype == "int64[pyarrow]":
        # skip the pyarrow-backed case when pyarrow is not installed
        pytest.importorskip("pyarrow")
    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    df_orig = df.copy()
    df2 = df.astype(new_dtype)
    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # mutating df2 triggers a copy-on-write for that column/block
    df2.iloc[0, 0] = 10
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    tm.assert_frame_equal(df, df_orig)
    # mutating parent also doesn't update result
    df2 = df.astype(new_dtype)
    df.iloc[0, 0] = 100
    tm.assert_frame_equal(df2, df_orig.astype(new_dtype))
@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(dtype):
    """Casting an integer frame to ``dtype`` yields an unshared result."""
    if dtype == "int32[pyarrow]":
        # skip the pyarrow-backed case when pyarrow is not installed
        pytest.importorskip("pyarrow")
    df = DataFrame({"a": [1, 2, 3]})
    df_orig = df.copy()
    df2 = df.astype(dtype)
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # the result's manager reports block 0 as having no reference
    assert df2._mgr._has_no_reference(0)
    df2.iloc[0, 0] = 5
    tm.assert_frame_equal(df, df_orig)
    # mutating parent also doesn't update result
    df2 = df.astype(dtype)
    df.iloc[0, 0] = 100
    tm.assert_frame_equal(df2, df_orig.astype(dtype))
def test_astype_numpy_to_ea():
ser = Series([1, 2, 3])
result = ser.astype("Int64")
assert np.shares_memory(get_array(ser), get_array(result))
@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
)
def test_astype_string_and_object(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
df2.iloc[0, 0] = "x"
tm.assert_frame_equal(df, df_orig)
@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
)
def test_astype_string_and_object_update_original(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df2 = df.astype(new_dtype)
df_orig = df2.copy()
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
df.iloc[0, 0] = "x"
tm.assert_frame_equal(df2, df_orig)
def test_astype_str_copy_on_pickle_roundrip():
    """``astype(str)`` on a pickle round-tripped Series leaves its data as is."""
    # TODO(infer_string) this test can be removed after 3.0 (once str is the default)
    # https://github.com/pandas-dev/pandas/issues/54654
    # ensure_string_array may alter array inplace
    base = Series(np.array([(1, 2), None, 1], dtype="object"))
    base_copy = pickle.loads(pickle.dumps(base))
    # The astype result is discarded; only side effects on base_copy matter.
    base_copy.astype(str)
    tm.assert_series_equal(base, base_copy)
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
    """Casting a pickle round-tripped Series to a string dtype leaves it as is."""
    # https://github.com/pandas-dev/pandas/issues/54654
    # ensure_string_array may alter array inplace
    base = Series(np.array([(1, 2), None, 1], dtype="object"))
    base_copy = pickle.loads(pickle.dumps(base))
    # The astype result is discarded; only side effects on base_copy matter.
    base_copy.astype(any_string_dtype)
    tm.assert_series_equal(base, base_copy)
def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
    """Same as the copy variant, but with the Series' values marked read-only."""
    # https://github.com/pandas-dev/pandas/issues/54654
    # ensure_string_array may alter read-only array inplace
    base = Series(np.array([(1, 2), None, 1], dtype="object"))
    base_copy = pickle.loads(pickle.dumps(base))
    base_copy._values.flags.writeable = False
    # The astype result is discarded; only side effects on base_copy matter.
    base_copy.astype(any_string_dtype)
    tm.assert_series_equal(base, base_copy)
def test_astype_dict_dtypes():
    """``astype`` with a dtype mapping only un-shares the column that is cast."""
    df = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
    )
    df_orig = df.copy()
    df2 = df.astype({"a": "float64", "c": "float64"})
    # "c" (already float64) and "b" (not in the mapping) stay shared;
    # "a" (int -> float64) does not
    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # mutating df2 triggers a copy-on-write for that column/block
    df2.iloc[0, 2] = 5.5
    assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    df2.iloc[0, 1] = 10
    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
    tm.assert_frame_equal(df, df_orig)
def test_astype_different_datetime_resos():
    """Casting to "datetime64[ms]" yields an unshared, reference-free result."""
    frame = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
    converted = frame.astype("datetime64[ms]")
    assert not np.shares_memory(get_array(frame, "a"), get_array(converted, "a"))
    assert converted._mgr._has_no_reference(0)
def test_astype_different_timezones():
df = DataFrame(
{"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific", unit="ns")}
)
result = df.astype("datetime64[ns, Europe/Berlin]")
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
def test_astype_different_timezones_different_reso():
    """Casting to another timezone and unit yields an unshared result."""
    frame = DataFrame(
        {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific", unit="ns")}
    )
    converted = frame.astype("datetime64[ms, Europe/Berlin]")
    assert converted._mgr._has_no_reference(0)
    assert not np.shares_memory(get_array(frame, "a"), get_array(converted, "a"))
def test_astype_arrow_timestamp():
    """Casting M8[ns] to "timestamp[ns][pyarrow]" keeps the data shared."""
    # skip when pyarrow is not installed
    pytest.importorskip("pyarrow")
    df = DataFrame(
        {
            "a": [
                Timestamp("2020-01-01 01:01:01.000001"),
                Timestamp("2020-01-01 01:01:01.000001"),
            ]
        },
        dtype="M8[ns]",
    )
    result = df.astype("timestamp[ns][pyarrow]")
    assert not result._mgr._has_no_reference(0)
    # compare against the result array's ``_pa_array`` attribute
    assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
def test_convert_dtypes_infer_objects():
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
result = ser.convert_dtypes(
convert_integer=False,
convert_boolean=False,
convert_floating=False,
convert_string=False,
)
assert tm.shares_memory(get_array(ser), get_array(result))
result.iloc[0] = "x"
tm.assert_series_equal(ser, ser_orig)
def test_convert_dtypes(using_infer_string):
    """``convert_dtypes`` shares memory per column; writing the result is isolated."""
    df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
    df_orig = df.copy()
    df2 = df.convert_dtypes()
    if using_infer_string:
        # String column is already Arrow-backed, so memory is shared
        assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    else:
        # String column converts from object to Arrow, no memory sharing
        assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # the bool, int and float columns share memory with the original
    assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
    assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    # writing to the converted frame must leave the original untouched
    df2.iloc[0, 0] = "x"
    df2.iloc[0, 1] = 10
    tm.assert_frame_equal(df, df_orig)

View File

@@ -0,0 +1,104 @@
import numpy as np
import pytest
from pandas.compat import CHAINED_WARNING_DISABLED
from pandas.errors import ChainedAssignmentError
from pandas import DataFrame
import pandas._testing as tm
@pytest.mark.parametrize(
    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer):
    """``df["a"][indexer] = ...`` emits exactly one ChainedAssignmentError warning."""
    # ensure we only get a single warning for those typical cases of chained
    # assignment
    df = DataFrame({"a": [1, 2, 3], "b": 1})
    # using custom check instead of tm.assert_produces_warning because that doesn't
    # fail if multiple warnings are raised
    if CHAINED_WARNING_DISABLED:
        # nothing to check when the flag says the warning is disabled
        return
    with pytest.warns() as record:  # noqa: TID251
        df["a"][indexer] = 0
    assert len(record) == 1
    assert record[0].category == ChainedAssignmentError
@pytest.mark.parametrize(
"indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])]
)
def test_frame_setitem(indexer):
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
with tm.raises_chained_assignment_error():
df[0:3][indexer] = 10
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_iloc_setitem(indexer):
df = DataFrame({"a": [1, 2, 3], "b": 1})
with tm.raises_chained_assignment_error():
df["a"].iloc[indexer] = 0
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_frame_iloc_setitem(indexer):
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
with tm.raises_chained_assignment_error():
df[0:3].iloc[indexer] = 10
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_loc_setitem(indexer):
df = DataFrame({"a": [1, 2, 3], "b": 1})
with tm.raises_chained_assignment_error():
df["a"].loc[indexer] = 0
@pytest.mark.parametrize(
"indexer", [0, [0, 1], (0, "a"), slice(0, 2), np.array([True, False, True])]
)
def test_frame_loc_setitem(indexer):
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
with tm.raises_chained_assignment_error():
df[0:3].loc[indexer] = 10
def test_series_at_setitem():
df = DataFrame({"a": [1, 2, 3], "b": 1})
with tm.raises_chained_assignment_error():
df["a"].at[0] = 0
def test_frame_at_setitem():
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
with tm.raises_chained_assignment_error():
df[0:3].at[0, "a"] = 10
def test_series_iat_setitem():
df = DataFrame({"a": [1, 2, 3], "b": 1})
with tm.raises_chained_assignment_error():
df["a"].iat[0] = 0
def test_frame_iat_setitem():
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
with tm.raises_chained_assignment_error():
df[0:3].iat[0, 0] = 10

View File

@@ -0,0 +1,72 @@
import numpy as np
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_clip_inplace_reference():
    """In-place clip that changes values, while a ``[:]`` view of the frame exists."""
    df = DataFrame({"a": [1.5, 2, 3]})
    df_copy = df.copy()
    arr_a = get_array(df, "a")
    view = df[:]
    df.clip(lower=2, inplace=True)
    # the frame no longer uses the array it had before the clip
    assert not np.shares_memory(get_array(df, "a"), arr_a)
    # afterwards neither manager reports a reference for block 0
    assert df._mgr._has_no_reference(0)
    assert view._mgr._has_no_reference(0)
    # the view still holds the pre-clip values
    tm.assert_frame_equal(df_copy, view)
def test_clip_inplace_reference_no_op():
    """In-place clip with ``lower=0`` (below all values) while a view exists."""
    df = DataFrame({"a": [1.5, 2, 3]})
    df_copy = df.copy()
    arr_a = get_array(df, "a")
    view = df[:]
    df.clip(lower=0, inplace=True)
    # the frame keeps using the same array ...
    assert np.shares_memory(get_array(df, "a"), arr_a)
    # ... and both managers still do not report block 0 as reference-free
    assert not df._mgr._has_no_reference(0)
    assert not view._mgr._has_no_reference(0)
    tm.assert_frame_equal(df_copy, view)
def test_clip_inplace():
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
df.clip(lower=2, inplace=True)
assert np.shares_memory(get_array(df, "a"), arr_a)
assert df._mgr._has_no_reference(0)
def test_clip():
    """A clip that changes values returns unshared data and leaves the input as is."""
    frame = DataFrame({"a": [1.5, 2, 3]})
    pristine = frame.copy()
    clipped = frame.clip(lower=2)
    assert not np.shares_memory(get_array(clipped, "a"), get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
    tm.assert_frame_equal(pristine, frame)
def test_clip_no_op():
df = DataFrame({"a": [1.5, 2, 3]})
df2 = df.clip(lower=0)
assert not df._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
def test_clip_chained_inplace():
    """In-place clip on a selection is flagged and leaves the parent frame as is."""
    df = DataFrame({"a": [1, 4, 2], "b": 1})
    df_orig = df.copy()
    # single-column selection (Series)
    with tm.raises_chained_assignment_error():
        df["a"].clip(1, 2, inplace=True)
    tm.assert_frame_equal(df, df_orig)
    # list selection (DataFrame)
    with tm.raises_chained_assignment_error():
        df[["a"]].clip(1, 2, inplace=True)
    tm.assert_frame_equal(df, df_orig)

View File

@@ -0,0 +1,382 @@
import numpy as np
import pytest
from pandas._config import using_string_dtype
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
Period,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for Series / DataFrame constructors
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype):
    """Series(ser) shares memory lazily; ``copy=True`` copies up front."""
    # Case: constructing a Series from another Series object follows CoW rules:
    # a new object is returned and thus mutations are not propagated
    ser = Series([1, 2, 3], name="name")
    # default is copy=False -> new Series is a shallow copy / view of original
    result = Series(ser, dtype=dtype)
    # the shallow copy still shares memory
    assert np.shares_memory(get_array(ser), get_array(result))
    assert result._mgr.blocks[0].refs.has_reference()
    # mutating new series copy doesn't mutate original
    result.iloc[0] = 0
    assert ser.iloc[0] == 1
    # mutating triggered a copy-on-write -> no longer shares memory
    assert not np.shares_memory(get_array(ser), get_array(result))
    # the same when modifying the parent
    result = Series(ser, dtype=dtype)
    # mutating original doesn't mutate new series
    ser.iloc[0] = 0
    assert result.iloc[0] == 1
    # forcing copy=False still gives a CoW shallow copy
    result = Series(ser, dtype=dtype, copy=False)
    assert np.shares_memory(get_array(ser), get_array(result))
    assert result._mgr.blocks[0].refs.has_reference()
    # forcing copy=True still results in an actual hard copy up front
    result = Series(ser, dtype=dtype, copy=True)
    assert not np.shares_memory(get_array(ser), get_array(result))
    assert ser._mgr._has_no_reference(0)
def test_series_from_series_with_reindex():
    """Series(ser, index=...) shares memory unless a real reindex is required."""
    # Case: constructing a Series from another Series with specifying an index
    # that potentially requires a reindex of the values
    ser = Series([1, 2, 3], name="name")
    # passing an index that doesn't actually require a reindex of the values
    # -> still getting a CoW shallow copy
    for index in [
        ser.index,
        ser.index.copy(),
        list(ser.index),
        ser.index.rename("idx"),
    ]:
        result = Series(ser, index=index)
        assert np.shares_memory(ser.values, result.values)
        result.iloc[0] = 0
        assert ser.iloc[0] == 1
    # forcing copy=True still results in an actual hard copy up front
    result = Series(ser, index=index, copy=True)
    assert not np.shares_memory(ser.values, result.values)
    assert not result._mgr.blocks[0].refs.has_reference()
    # ensure that if an actual reindex is needed, we don't have any refs
    # (mutating the result wouldn't trigger CoW)
    result = Series(ser, index=[0, 1, 2, 3])
    assert not np.shares_memory(ser.values, result.values)
    assert not result._mgr.blocks[0].refs.has_reference()
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
    "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(idx, dtype, arr):
    """Series(arr) does not share the input by default; ``copy=False`` does."""
    ser = Series(arr, dtype=dtype, index=idx)
    ser_orig = ser.copy()
    # use the ``_data`` attribute when the input has one, else the input itself
    data = getattr(arr, "_data", arr)
    assert not np.shares_memory(get_array(ser), data)
    # writing to the input must not change the Series
    arr[0] = 100
    tm.assert_series_equal(ser, ser_orig)
    # if the user explicitly passes copy=False, we get an actual view
    # not protected by CoW
    ser = Series(arr, dtype=dtype, index=idx, copy=False)
    assert np.shares_memory(get_array(ser), data)
    arr[0] = 50
    assert ser.iloc[0] == 50
@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(copy):
    """Building an int32 Series from an int64 array never shares memory."""
    raw = np.array([1, 2, 3], dtype="int64")
    result = Series(raw, dtype="int32", copy=copy)
    assert not np.shares_memory(get_array(result), raw)
@pytest.mark.parametrize(
    "idx",
    [
        Index([1, 2]),
        DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
        PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
        TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
    ],
)
def test_series_from_index(idx):
    """Series(idx) shares memory with the Index, yet Series writes leave it intact."""
    ser = Series(idx)
    expected = idx.copy(deep=True)
    assert np.shares_memory(get_array(ser), get_array(idx))
    assert not ser._mgr._has_no_reference(0)
    # overwrite the first element with the second; the Index must not change
    ser.iloc[0] = ser.iloc[1]
    tm.assert_index_equal(idx, expected)
    # forcing copy=False still gives a CoW shallow copy
    ser = Series(idx, copy=False)
    assert np.shares_memory(get_array(ser), get_array(idx))
    assert not ser._mgr._has_no_reference(0)
    ser.iloc[0] = ser.iloc[1]
    tm.assert_index_equal(idx, expected)
    # forcing copy=True still results in a copy
    ser = Series(idx, copy=True)
    assert not np.shares_memory(get_array(ser), get_array(idx))
    assert ser._mgr._has_no_reference(0)
@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_index_different_dtypes(copy):
    """A Series built from an Index with a different dtype owns its data.

    The ``copy`` value under test is passed to the ``Series`` constructor
    (the object being tested), mirroring
    ``test_series_from_array_different_dtype``. Whatever its value, the
    result must not share memory with the index and must not hold a
    reference to it.
    """
    idx = Index([1, 2, 3], dtype="int64")
    ser = Series(idx, dtype="int32", copy=copy)
    assert not np.shares_memory(get_array(ser), get_array(idx))
    assert ser._mgr._has_no_reference(0)
def test_series_from_block_manager_different_dtype():
    """Building a Series from a manager with a new dtype warns and copies."""
    original = Series([1, 2, 3], dtype="int64")
    with tm.assert_produces_warning(
        DeprecationWarning, match="Passing a SingleBlockManager to Series"
    ):
        recast = Series(original._mgr, dtype="int32")
    assert not np.shares_memory(get_array(original), get_array(recast))
    assert recast._mgr._has_no_reference(0)
@pytest.mark.parametrize("use_mgr", [True, False])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(columns, use_mgr):
    """``DataFrame(df)`` / ``DataFrame(df._mgr)`` give a lazily-copied frame.

    The new frame shares memory with the source until it is written to;
    the source is unchanged afterwards. Passing the manager is expected to
    emit a ``DeprecationWarning``; passing the frame is expected not to warn.
    """
    parent = DataFrame({"a": [1, 2, 3]})
    parent_snapshot = parent.copy()

    source = parent._mgr if use_mgr else parent
    expected_warning = DeprecationWarning if use_mgr else None

    with tm.assert_produces_warning(
        expected_warning,
        match="Passing a BlockManager to DataFrame",
        check_stacklevel=False,
    ):
        child = DataFrame(source)

    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))
    child.iloc[0] = 100
    assert not np.shares_memory(get_array(parent, "a"), get_array(child, "a"))
    tm.assert_frame_equal(parent, parent_snapshot)
@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
def test_dataframe_from_dict_of_series(columns, index, dtype):
    """``DataFrame(dict of Series, copy=False)`` must behave as a lazy copy."""
    # Case: constructing a DataFrame from Series objects with copy=False
    # has to do a lazy copy following CoW rules
    # (the default for DataFrame(dict) is still to copy to ensure consolidation)
    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    left_snapshot = left.copy()
    expected = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype
    )
    ctor_kwargs = {"index": index, "columns": columns, "dtype": dtype, "copy": False}

    frame = DataFrame({"a": left, "b": right}, **ctor_kwargs)

    # the shallow copy still shares memory
    assert np.shares_memory(get_array(frame, "a"), get_array(left))

    # mutating the new dataframe doesn't mutate original
    frame.iloc[0, 0] = 10
    assert not np.shares_memory(get_array(frame, "a"), get_array(left))
    tm.assert_series_equal(left, left_snapshot)

    # the same when modifying the parent series
    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    frame = DataFrame({"a": left, "b": right}, **ctor_kwargs)
    left.iloc[0] = 10
    assert not np.shares_memory(get_array(frame, "a"), get_array(left))
    tm.assert_frame_equal(frame, expected)
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_dataframe_from_dict_of_series_with_reindex(dtype):
    """A reindexing construction must not leave stale references behind."""
    # Case: constructing a DataFrame from Series objects with copy=False
    # and passing an index that requires an actual (no-view) reindex -> need
    # to ensure the result doesn't have refs set up to unnecessarily trigger
    # a copy on write
    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    frame = DataFrame(
        {"a": left, "b": right}, index=[1, 2, 3], dtype=dtype, copy=False
    )

    # the frame should own its memory, so mutating shouldn't trigger a copy
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(left))
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize(
    "data, dtype",
    [
        ([1, 2], "int64"),
        # 1D-only EA
        ([1, 2], "Int64"),
        # the "str" case carries an xfail mark only when using_string_dtype()
        # is falsy; otherwise it runs with no marks
        pytest.param(
            ["a", "b"],
            "str",
            marks=pytest.mark.xfail(
                reason="TODO bug with infer_string=False and specifying dtype='str'"
            )
            if not using_string_dtype()
            else [],
        ),
        (["a", "b"], object),
        # 2D EA
        (
            [Timestamp("2020", tz="UTC"), Timestamp("2021", tz="UTC")],
            "datetime64[ns, UTC]",
        ),
    ],
    ids=["int", "int-ea", "str", "object", "datetime64tz"],
)
def test_dataframe_from_series_or_index(data, dtype, index_or_series):
    """Check ``DataFrame(Series-or-Index)`` memory sharing for several dtypes.

    Three constructions are checked in turn: the default, passing the same
    dtype again, and ``copy=True``. After each one the frame is written to and
    the source object must still equal its initial snapshot.
    """
    obj = index_or_series(data, dtype=dtype)
    obj_orig = obj.copy(deep=True)  # deep=True needed for Index
    # default is copy=False -> DataFrame holds a shallow copy of original Index/Series
    df = DataFrame(obj)
    assert tm.shares_memory(get_array(obj), get_array(df, 0))
    assert not df._mgr._has_no_reference(0)
    # writing to the frame must leave the source untouched
    df.iloc[0, 0] = data[-1]
    tm.assert_equal(obj, obj_orig)
    # with passing the (identical) dtype -> same
    df = DataFrame(obj, dtype=dtype)
    assert tm.shares_memory(get_array(obj), get_array(df, 0))
    assert not df._mgr._has_no_reference(0)
    df.iloc[0, 0] = data[-1]
    tm.assert_equal(obj, obj_orig)
    # forcing copy=True still results in an actual hard copy up front
    df = DataFrame(obj, copy=True)
    if not (obj.dtype == "str" and obj.dtype.storage == "pyarrow"):
        # ArrowExtensionArray deep copy still points to the same underlying data
        assert not tm.shares_memory(get_array(obj), get_array(df, 0))
    assert df._mgr._has_no_reference(0)
    df.iloc[0, 0] = data[-1]
    tm.assert_equal(obj, obj_orig)
def test_dataframe_from_series_or_index_different_dtype(index_or_series):
    """A dtype-converting ``DataFrame(obj)`` is unshared and reference-free."""
    source = index_or_series([1, 2], dtype="int64")
    frame = DataFrame(source, dtype="int32")
    assert not np.shares_memory(get_array(source), get_array(frame, 0))
    assert frame._mgr._has_no_reference(0)
def test_dataframe_from_series_dont_infer_datetime():
    """An object-dtype Series of Timestamps stays object and stays shared."""
    stamps = [Timestamp("2019-12-31"), Timestamp("2020-12-31")]
    source = Series(stamps, dtype=object)
    frame = DataFrame(source)
    # the object dtype must be kept as-is
    assert frame.dtypes.iloc[0] == np.dtype(object)
    assert np.shares_memory(get_array(source), get_array(frame, 0))
    assert not frame._mgr._has_no_reference(0)
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
def test_dataframe_from_dict_of_series_with_dtype(index):
    """A dtype-converting construction must not leave stale references."""
    # Variant of above, but now passing a dtype that causes a copy
    # -> need to ensure the result doesn't have refs set up to unnecessarily
    # trigger a copy on write
    floats = Series([1.0, 2.0, 3.0])
    ints = Series([4, 5, 6])
    frame = DataFrame(
        {"a": floats, "b": ints}, index=index, dtype="int64", copy=False
    )

    # the frame should own its memory, so mutating shouldn't trigger a copy
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(floats))
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize("copy", [False, None, True])
def test_dataframe_from_numpy_array(copy):
    """``DataFrame(ndarray)`` shares the input buffer only for ``copy=False``.

    For ``copy=None`` and ``copy=True`` the frame must not share memory with
    the input array; for ``copy=False`` it must.
    """
    arr = np.array([[1, 2], [3, 4]])
    df = DataFrame(arr, copy=copy)
    # ``copy is True`` is already covered by ``copy is not False``, so a
    # single check is enough to separate the copying cases from the view case.
    if copy is not False:
        assert not np.shares_memory(get_array(df, 0), arr)
    else:
        assert np.shares_memory(get_array(df, 0), arr)
@pytest.mark.parametrize(
    "data, dtype",
    [
        # 1D-only EA
        ([1, 2], "Int64"),
        # 2D EA
        (
            [Timestamp("2020", tz="UTC"), Timestamp("2021", tz="UTC")],
            "datetime64[ns, UTC]",
        ),
    ],
    ids=["int-ea", "datetime64tz"],
)
@pytest.mark.parametrize("copy", [False, None, True])
def test_dataframe_from_extension_array(copy, data, dtype):
    """``DataFrame(extension array)`` shares memory only for ``copy=False``."""
    ea = pd.array(data, dtype=dtype)
    frame = DataFrame(ea, copy=copy)

    # to ensure tm.shares_memory works correctly
    # TODO fix in tm.shares_memory or get_array?
    buffer = ea._data if ea.dtype == "Int64" else ea

    shared = tm.shares_memory(get_array(frame, 0), buffer)
    if copy is None or copy is True:
        assert not shared
    else:
        assert shared
def test_frame_from_dict_of_index():
    """``DataFrame({"a": Index}, copy=False)`` shares data but protects the Index."""
    labels = Index([1, 2, 3])
    labels_snapshot = labels.copy(deep=True)
    frame = DataFrame({"a": labels}, copy=False)

    assert np.shares_memory(get_array(frame, "a"), labels._values)
    assert not frame._mgr._has_no_reference(0)

    # writing to the frame must not alter the index
    frame.iloc[0, 0] = 100
    tm.assert_index_equal(labels, labels_snapshot)

View File

@@ -0,0 +1,100 @@
import pytest
from pandas.errors import Pandas4Warning
import pandas as pd
from pandas import (
concat,
merge,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "meth, kwargs",
    [
        ("truncate", {}),
        ("tz_convert", {"tz": "UTC"}),
        ("tz_localize", {"tz": "UTC"}),
        ("infer_objects", {}),
        ("astype", {"dtype": "float64"}),
        ("reindex", {"index": [2, 0, 1]}),
        ("transpose", {}),
        ("set_axis", {"labels": [1, 2, 3]}),
        ("rename", {"index": {1: 2}}),
        ("set_flags", {}),
        ("to_period", {}),
        ("to_timestamp", {}),
        ("swaplevel", {"i": 0, "j": 1}),
    ],
)
def test_copy_deprecation(meth, kwargs):
    """Passing ``copy=False`` to ``meth`` must emit a ``Pandas4Warning``.

    The method is called on a DataFrame (except for ``swaplevel``) and on the
    Series ``df.a`` (except for ``transpose``); each call is expected to warn
    with a message matching "copy".
    """
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1})
    # Give the frame the kind of index the method under test is called with.
    if meth in ("tz_convert", "tz_localize", "to_period"):
        # tz_convert gets a tz-aware index, the other two a tz-naive one
        tz = None if meth in ("tz_localize", "to_period") else "US/Eastern"
        df.index = pd.date_range("2020-01-01", freq="D", periods=len(df), tz=tz)
    elif meth == "to_timestamp":
        df.index = pd.period_range("2020-01-01", freq="D", periods=len(df))
    elif meth == "swaplevel":
        # two index levels, matching the i=0 / j=1 arguments
        df = df.set_index(["b", "c"])
    # DataFrame call (skipped for swaplevel)
    if meth != "swaplevel":
        with tm.assert_produces_warning(Pandas4Warning, match="copy"):
            getattr(df, meth)(copy=False, **kwargs)
    # Series call (skipped for transpose)
    if meth != "transpose":
        with tm.assert_produces_warning(Pandas4Warning, match="copy"):
            getattr(df.a, meth)(copy=False, **kwargs)
def test_copy_deprecation_reindex_like_align():
    """``copy=False`` on ``reindex_like`` / ``align`` warns for frames and series."""
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    calls = (
        lambda: frame.reindex_like(frame, copy=False),
        lambda: frame.a.reindex_like(frame.a, copy=False),
        lambda: frame.align(frame, copy=False),
        lambda: frame.a.align(frame.a, copy=False),
    )
    for call in calls:
        # Somehow the stack level check is incorrect here
        with tm.assert_produces_warning(
            Pandas4Warning, match="copy", check_stacklevel=False
        ):
            call()
def test_copy_deprecation_merge_concat():
    """``copy=False`` on ``merge`` (method and function) and ``concat`` warns."""
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    calls = (
        lambda: frame.merge(frame, copy=False),
        lambda: merge(frame, frame, copy=False),
        lambda: concat([frame, frame], copy=False),
    )
    for call in calls:
        with tm.assert_produces_warning(
            Pandas4Warning, match="copy", check_stacklevel=False
        ):
            call()
@pytest.mark.parametrize("value", [False, True, "warn"])
def test_copy_on_write_deprecation_option(value):
    """Setting ``mode.copy_on_write`` to any value emits a ``Pandas4Warning``."""
    # stacklevel points to contextlib due to use of context manager.
    with (
        tm.assert_produces_warning(
            Pandas4Warning,
            match="Copy-on-Write can no longer be disabled",
            check_stacklevel=False,
        ),
        pd.option_context("mode.copy_on_write", value),
    ):
        pass

View File

@@ -0,0 +1,93 @@
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_assigning_to_same_variable_removes_references():
    """Rebinding a name to a derived frame must not keep a reference alive."""
    frame = DataFrame({"a": [1, 2, 3]})
    frame = frame.reset_index()
    assert frame._mgr._has_no_reference(1)

    before = get_array(frame, "a")
    frame.iloc[0, 1] = 100  # Write into a

    # the write happened in place, so the buffer is still the same one
    assert np.shares_memory(before, get_array(frame, "a"))
def test_setitem_dont_track_unnecessary_references():
    """A column setitem must not make a later write to column "a" copy."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
    frame["b"] = 100
    before = get_array(frame, "a")

    # We split the block in setitem, if we are not careful the new blocks will
    # reference each other triggering a copy
    frame.iloc[0, 0] = 100
    assert np.shares_memory(before, get_array(frame, "a"))
def test_setitem_with_view_copies():
    """With a live view, writing after a column setitem must copy."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
    alias = frame[:]
    snapshot = frame.copy()

    frame["b"] = 100
    before = get_array(frame, "a")
    frame.iloc[0, 0] = 100  # Check that we correctly track reference

    assert not np.shares_memory(before, get_array(frame, "a"))
    tm.assert_frame_equal(alias, snapshot)
def test_setitem_with_view_invalidated_does_not_copy(request):
    """Once a view is dropped, writing to the parent should not copy.

    The final assertion is marked xfail at runtime via ``request.applymarker``.
    """
    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
    view = df[:]
    df["b"] = 100
    arr = get_array(df, "a")
    # drop the only other object that pointed at df's data
    view = None  # noqa: F841
    # TODO(CoW) block gets split because of `df["b"] = 100`
    # which introduces additional refs, even when those of `view` go out of scopes
    df.iloc[0, 0] = 100
    # Setitem split the block. Since the old block shared data with view
    # all the new blocks are referencing view and each other. When view
    # goes out of scope, they don't share data with any other block,
    # so we should not trigger a copy
    mark = pytest.mark.xfail(reason="blk.delete does not track references correctly")
    request.applymarker(mark)
    assert np.shares_memory(arr, get_array(df, "a"))
def test_out_of_scope():
    """A subset whose parent went out of scope holds no references."""

    def make_subset():
        parent = DataFrame({"a": [1, 2], "b": 1.5, "c": 1})
        # create some subset
        subset = parent[["a", "b"]]
        return subset

    subset = make_subset()
    for position in (0, 1):
        assert not subset._mgr.blocks[position].refs.has_reference()
def test_delete():
    """Deleting a column leaves the remaining blocks without references."""
    df = DataFrame(
        np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"]
    )
    del df["b"]
    # after the delete, both blocks checked here must be reference-free
    assert not df._mgr.blocks[0].refs.has_reference()
    assert not df._mgr.blocks[1].refs.has_reference()
    # rebinding df to a column subset must also end up reference-free
    df = df[["a"]]
    assert not df._mgr.blocks[0].refs.has_reference()
def test_delete_reference():
    """With a live view, blocks keep their references after a column delete."""
    values = np.random.default_rng(2).standard_normal((4, 3))
    frame = DataFrame(values, columns=["a", "b", "c"])
    alias = frame[:]
    del frame["b"]

    for block in (frame._mgr.blocks[0], frame._mgr.blocks[1], alias._mgr.blocks[0]):
        assert block.refs.has_reference()

View File

@@ -0,0 +1,332 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
concat,
merge,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_concat_frames():
    """Column-wise concat shares input memory until the result is written to."""
    left = DataFrame({"b": ["a"] * 3}, dtype=object)
    right = DataFrame({"a": ["a"] * 3}, dtype=object)
    left_snapshot = left.copy()
    combined = concat([left, right], axis=1)

    assert np.shares_memory(get_array(combined, "b"), get_array(left, "b"))
    assert np.shares_memory(get_array(combined, "a"), get_array(right, "a"))

    # writing to the first column only un-shares that column
    combined.iloc[0, 0] = "d"
    assert not np.shares_memory(get_array(combined, "b"), get_array(left, "b"))
    assert np.shares_memory(get_array(combined, "a"), get_array(right, "a"))

    combined.iloc[0, 1] = "d"
    assert not np.shares_memory(get_array(combined, "a"), get_array(right, "a"))
    tm.assert_frame_equal(left, left_snapshot)
def test_concat_frames_updating_input():
    """Writing to a concat input un-shares it and leaves the result intact."""
    left = DataFrame({"b": ["a"] * 3}, dtype=object)
    right = DataFrame({"a": ["a"] * 3}, dtype=object)
    combined = concat([left, right], axis=1)

    assert np.shares_memory(get_array(combined, "b"), get_array(left, "b"))
    assert np.shares_memory(get_array(combined, "a"), get_array(right, "a"))

    combined_snapshot = combined.copy()

    left.iloc[0, 0] = "d"
    assert not np.shares_memory(get_array(combined, "b"), get_array(left, "b"))
    assert np.shares_memory(get_array(combined, "a"), get_array(right, "a"))

    right.iloc[0, 0] = "d"
    assert not np.shares_memory(get_array(combined, "a"), get_array(right, "a"))
    tm.assert_frame_equal(combined, combined_snapshot)
def test_concat_series():
    """Column-wise concat of Series shares memory until the result is written."""
    first = Series([1, 2], name="a")
    second = Series([3, 4], name="b")
    first_snapshot = first.copy()
    second_snapshot = second.copy()
    combined = concat([first, second], axis=1)

    assert np.shares_memory(get_array(combined, "a"), first.values)
    assert np.shares_memory(get_array(combined, "b"), second.values)

    combined.iloc[0, 0] = 100
    assert not np.shares_memory(get_array(combined, "a"), first.values)
    assert np.shares_memory(get_array(combined, "b"), second.values)

    combined.iloc[0, 1] = 1000
    assert not np.shares_memory(get_array(combined, "b"), second.values)

    # neither input was modified by the writes above
    tm.assert_series_equal(first, first_snapshot)
    tm.assert_series_equal(second, second_snapshot)
def test_concat_frames_chained():
    """Nested concat still tracks references back to the original frames."""
    first = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    second = DataFrame({"c": [4, 5, 6]})
    third = DataFrame({"d": [4, 5, 6]})
    combined = concat([concat([first, second], axis=1), third], axis=1)
    combined_snapshot = combined.copy()

    for source, column in ((first, "a"), (second, "c"), (third, "d")):
        assert np.shares_memory(
            get_array(combined, column), get_array(source, column)
        )

    first.iloc[0, 0] = 100
    assert not np.shares_memory(get_array(combined, "a"), get_array(first, "a"))
    tm.assert_frame_equal(combined, combined_snapshot)
def test_concat_series_chained():
    """Nested concat of Series still tracks references to the inputs."""
    first = Series([1, 2, 3], name="a")
    second = Series([4, 5, 6], name="c")
    third = Series([4, 5, 6], name="d")
    combined = concat([concat([first, second], axis=1), third], axis=1)
    combined_snapshot = combined.copy()

    for source, column in ((first, "a"), (second, "c"), (third, "d")):
        assert np.shares_memory(
            get_array(combined, column), get_array(source, column)
        )

    first.iloc[0] = 100
    assert not np.shares_memory(get_array(combined, "a"), get_array(first, "a"))
    tm.assert_frame_equal(combined, combined_snapshot)
def test_concat_series_updating_input():
    """Writing to an input Series un-shares it without changing the result."""
    first = Series([1, 2], name="a")
    second = Series([3, 4], name="b")
    wanted = DataFrame({"a": [1, 2], "b": [3, 4]})
    combined = concat([first, second], axis=1)

    assert np.shares_memory(get_array(combined, "a"), get_array(first, "a"))
    assert np.shares_memory(get_array(combined, "b"), get_array(second, "b"))

    first.iloc[0] = 100
    assert not np.shares_memory(get_array(combined, "a"), get_array(first, "a"))
    assert np.shares_memory(get_array(combined, "b"), get_array(second, "b"))
    tm.assert_frame_equal(combined, wanted)

    second.iloc[0] = 1000
    assert not np.shares_memory(get_array(combined, "b"), get_array(second, "b"))
    tm.assert_frame_equal(combined, wanted)
def test_concat_mixed_series_frame():
    """Concat of a frame and a Series shares memory with both inputs."""
    frame = DataFrame({"a": [1, 2, 3], "c": 1})
    extra = Series([4, 5, 6], name="d")
    combined = concat([frame, extra], axis=1)
    combined_snapshot = combined.copy()

    assert np.shares_memory(get_array(combined, "a"), get_array(frame, "a"))
    assert np.shares_memory(get_array(combined, "c"), get_array(frame, "c"))
    assert np.shares_memory(get_array(combined, "d"), get_array(extra, "d"))

    # writing to either input un-shares only that input's column
    extra.iloc[0] = 100
    assert not np.shares_memory(get_array(combined, "d"), get_array(extra, "d"))

    frame.iloc[0, 0] = 100
    assert not np.shares_memory(get_array(combined, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(combined, combined_snapshot)
def test_concat_copy_keyword():
    """Column-wise concat with default arguments shares memory with inputs."""
    ints = DataFrame({"a": [1, 2]})
    floats = DataFrame({"b": [1.5, 2.5]})
    combined = concat([ints, floats], axis=1)

    assert np.shares_memory(get_array(ints, "a"), get_array(combined, "a"))
    assert np.shares_memory(get_array(floats, "b"), get_array(combined, "b"))
@pytest.mark.parametrize(
    "func",
    [
        lambda df1, df2, **kwargs: df1.merge(df2, **kwargs),
        lambda df1, df2, **kwargs: merge(df1, df2, **kwargs),
    ],
)
def test_merge_on_key(func):
    """Merging on a key shares data columns with the inputs until written to."""
    left = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
    right = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]})
    left_snapshot = left.copy()
    right_snapshot = right.copy()

    merged = func(left, right, on="key")

    assert np.shares_memory(get_array(merged, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(merged, "b"), get_array(right, "b"))
    # the key column is shared with the left input only
    assert np.shares_memory(get_array(merged, "key"), get_array(left, "key"))
    assert not np.shares_memory(get_array(merged, "key"), get_array(right, "key"))

    merged.iloc[0, 1] = 0
    assert not np.shares_memory(get_array(merged, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(merged, "b"), get_array(right, "b"))

    merged.iloc[0, 2] = 0
    assert not np.shares_memory(get_array(merged, "b"), get_array(right, "b"))

    tm.assert_frame_equal(left, left_snapshot)
    tm.assert_frame_equal(right, right_snapshot)
def test_merge_on_index():
    """Merging on the index shares columns with both inputs until written to."""
    left = DataFrame({"a": [1, 2, 3]})
    right = DataFrame({"b": [4, 5, 6]})
    left_snapshot = left.copy()
    right_snapshot = right.copy()

    merged = merge(left, right, left_index=True, right_index=True)

    assert np.shares_memory(get_array(merged, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(merged, "b"), get_array(right, "b"))

    merged.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(merged, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(merged, "b"), get_array(right, "b"))

    merged.iloc[0, 1] = 0
    assert not np.shares_memory(get_array(merged, "b"), get_array(right, "b"))

    tm.assert_frame_equal(left, left_snapshot)
    tm.assert_frame_equal(right, right_snapshot)
@pytest.mark.parametrize(
    "func, how",
    [
        (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"),
        (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
    ],
)
def test_merge_on_key_enlarging_one(func, how):
    """When one side is enlarged by the merge, only the other side is shared."""
    longer = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
    shorter = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]})
    longer_snapshot = longer.copy()
    shorter_snapshot = shorter.copy()

    merged = func(longer, shorter, how=how)

    assert np.shares_memory(get_array(merged, "a"), get_array(longer, "a"))
    assert not np.shares_memory(get_array(merged, "b"), get_array(shorter, "b"))
    assert shorter._mgr._has_no_reference(1)
    assert shorter._mgr._has_no_reference(0)
    # the key column is shared with the longer frame only for how="left"
    key_shared = np.shares_memory(get_array(merged, "key"), get_array(longer, "key"))
    assert key_shared is (how == "left")
    assert not np.shares_memory(get_array(merged, "key"), get_array(shorter, "key"))

    # write to the position this test uses for each merge direction
    target_col = 1 if how == "left" else 2
    merged.iloc[0, target_col] = 0
    assert not np.shares_memory(get_array(merged, "a"), get_array(longer, "a"))

    tm.assert_frame_equal(longer, longer_snapshot)
    tm.assert_frame_equal(shorter, shorter_snapshot)
def test_merge_copy_keyword():
    """An index-on-index merge with default arguments shares input memory."""
    ints = DataFrame({"a": [1, 2]})
    floats = DataFrame({"b": [3, 4.5]})
    merged = ints.merge(floats, left_index=True, right_index=True)

    assert np.shares_memory(get_array(ints, "a"), get_array(merged, "a"))
    assert np.shares_memory(get_array(floats, "b"), get_array(merged, "b"))
def test_merge_upcasting_no_copy():
    """A merge between int and object frames keeps the untouched column shared."""
    int_frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    int_snapshot = int_frame.copy()
    obj_frame = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}, dtype=object)

    # check both argument orders
    for first, second in ((int_frame, obj_frame), (obj_frame, int_frame)):
        merged = merge(first, second, on="a")
        assert np.shares_memory(get_array(merged, "b"), get_array(int_frame, "b"))
        assert not np.shares_memory(
            get_array(merged, "a"), get_array(int_frame, "a")
        )
        tm.assert_frame_equal(int_frame, int_snapshot)
def test_merge_indicator_no_deep_copy():
    """``indicator=True`` must not stop the merge from sharing input columns."""
    lhs = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    rhs = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]})
    merged = merge(lhs, rhs, on="a", indicator=True)

    assert np.shares_memory(get_array(merged, "b"), get_array(lhs, "b"))
    assert np.shares_memory(get_array(merged, "c"), get_array(rhs, "c"))
@pytest.mark.parametrize("dtype", [object, "str"])
def test_join_on_key(dtype):
    """``DataFrame.join(on=...)`` shares columns with both inputs until written."""
    key_index = Index(["a", "b", "c"], name="key", dtype=dtype)
    left = DataFrame({"a": [1, 2, 3]}, index=key_index.copy(deep=True))
    right = DataFrame({"b": [4, 5, 6]}, index=key_index.copy(deep=True))
    left_snapshot = left.copy()
    right_snapshot = right.copy()

    joined = left.join(right, on="key")

    assert np.shares_memory(get_array(joined, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(joined, "b"), get_array(right, "b"))
    # the result index is shared with the calling frame only
    assert tm.shares_memory(get_array(joined.index), get_array(left.index))
    assert not np.shares_memory(get_array(joined.index), get_array(right.index))

    joined.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(joined, "a"), get_array(left, "a"))
    assert np.shares_memory(get_array(joined, "b"), get_array(right, "b"))

    joined.iloc[0, 1] = 0
    assert not np.shares_memory(get_array(joined, "b"), get_array(right, "b"))

    tm.assert_frame_equal(left, left_snapshot)
    tm.assert_frame_equal(right, right_snapshot)
def test_join_multiple_dataframes_on_key():
    """Joining a list of frames shares each column with its source frame."""
    key_index = Index(["a", "b", "c"], name="key", dtype=object)
    base = DataFrame({"a": [1, 2, 3]}, index=key_index.copy(deep=True))
    others = [
        DataFrame({"b": [4, 5, 6]}, index=key_index.copy(deep=True)),
        DataFrame({"c": [7, 8, 9]}, index=key_index.copy(deep=True)),
    ]
    base_snapshot = base.copy()
    others_snapshot = [frame.copy() for frame in others]

    joined = base.join(others)

    assert np.shares_memory(get_array(joined, "a"), get_array(base, "a"))
    assert np.shares_memory(get_array(joined, "b"), get_array(others[0], "b"))
    assert np.shares_memory(get_array(joined, "c"), get_array(others[1], "c"))
    # the result index is shared with the calling frame only
    assert np.shares_memory(get_array(joined.index), get_array(base.index))
    assert not np.shares_memory(get_array(joined.index), get_array(others[0].index))
    assert not np.shares_memory(get_array(joined.index), get_array(others[1].index))

    # each write un-shares exactly one column
    joined.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(joined, "a"), get_array(base, "a"))
    assert np.shares_memory(get_array(joined, "b"), get_array(others[0], "b"))
    assert np.shares_memory(get_array(joined, "c"), get_array(others[1], "c"))

    joined.iloc[0, 1] = 0
    assert not np.shares_memory(get_array(joined, "b"), get_array(others[0], "b"))
    assert np.shares_memory(get_array(joined, "c"), get_array(others[1], "c"))

    joined.iloc[0, 2] = 0
    assert not np.shares_memory(get_array(joined, "c"), get_array(others[1], "c"))

    tm.assert_frame_equal(base, base_snapshot)
    for current, snapshot in zip(others, others_snapshot, strict=True):
        tm.assert_frame_equal(current, snapshot)

View File

@@ -0,0 +1,902 @@
import numpy as np
import pytest
from pandas.core.dtypes.common import is_float_dtype
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.fixture(params=["numpy", "nullable"])
def backend(request):
    """Provide DataFrame/Series factories for the requested dtype backend.

    Returns a 3-tuple ``(param, make_dataframe, make_series)``, where
    ``param`` is ``"numpy"`` or ``"nullable"``. The "numpy" factories call
    the constructors directly; the "nullable" factories additionally run
    ``convert_dtypes()`` and return a copy of the converted object.
    """
    if request.param == "numpy":

        def make_dataframe(*args, **kwargs):
            return DataFrame(*args, **kwargs)

        def make_series(*args, **kwargs):
            return Series(*args, **kwargs)

    elif request.param == "nullable":

        def make_dataframe(*args, **kwargs):
            df = DataFrame(*args, **kwargs)
            df_nullable = df.convert_dtypes()
            # convert_dtypes will try to cast float to int if there is no loss in
            # precision -> undo that change
            for col in df.columns:
                if is_float_dtype(df[col].dtype) and not is_float_dtype(
                    df_nullable[col].dtype
                ):
                    df_nullable[col] = df_nullable[col].astype("Float64")
            # copy final result to ensure we start with a fully self-owning DataFrame
            return df_nullable.copy()

        def make_series(*args, **kwargs):
            ser = Series(*args, **kwargs)
            return ser.convert_dtypes().copy()

    return request.param, make_dataframe, make_series
# -----------------------------------------------------------------------------
# Indexing operations taking subset + modifying the subset/parent
def test_subset_column_selection(backend):
    """Selecting columns shares memory; writing to the subset copies."""
    # Case: taking a subset of the columns of a DataFrame
    # + afterwards modifying the subset
    _, make_df, _ = backend
    parent = make_df({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_snapshot = parent.copy()

    child = parent[["a", "c"]]
    assert child.index is not parent.index

    # the subset shares memory ...
    assert np.shares_memory(get_array(child, "a"), get_array(parent, "a"))
    # ... but uses CoW when being modified
    child.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(child, "a"), get_array(parent, "a"))

    tm.assert_frame_equal(child, make_df({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}))
    tm.assert_frame_equal(parent, parent_snapshot)
def test_subset_column_selection_modify_parent(backend):
    """Selecting columns shares memory; writing to the parent copies."""
    # Case: taking a subset of the columns of a DataFrame
    # + afterwards modifying the parent
    _, make_df, _ = backend
    parent = make_df({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    child = parent[["a", "c"]]

    # the subset shares memory ...
    assert np.shares_memory(get_array(child, "a"), get_array(parent, "a"))
    # ... but the parent uses CoW when it is modified
    parent.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(child, "a"), get_array(parent, "a"))
    # different column/block still shares memory
    assert np.shares_memory(get_array(child, "c"), get_array(parent, "c"))

    tm.assert_frame_equal(child, make_df({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}))
def test_subset_row_slice(backend):
    """A row slice shares memory; writing to it copies and protects the parent."""
    # Case: taking a subset of the rows of a DataFrame using a slice
    # + afterwards modifying the subset
    _, make_df, _ = backend
    parent = make_df({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_snapshot = parent.copy()

    child = parent[1:3]
    child._mgr._verify_integrity()
    assert child.columns is not parent.columns
    assert np.shares_memory(get_array(child, "a"), get_array(parent, "a"))

    child.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(child, "a"), get_array(parent, "a"))
    child._mgr._verify_integrity()

    wanted = make_df({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3))
    tm.assert_frame_equal(child, wanted)
    # original parent dataframe is not modified (CoW)
    tm.assert_frame_equal(parent, parent_snapshot)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_column_slice(backend, dtype):
    """A column slice shares memory; writing to it copies and protects the parent."""
    # Case: taking a subset of the columns of a DataFrame using a slice
    # + afterwards modifying the subset
    _, make_df, _ = backend
    parent = make_df(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_snapshot = parent.copy()

    child = parent.iloc[:, 1:]
    child._mgr._verify_integrity()
    assert child.index is not parent.index
    assert np.shares_memory(get_array(child, "b"), get_array(parent, "b"))

    child.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(child, "b"), get_array(parent, "b"))

    wanted = make_df({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)})
    tm.assert_frame_equal(child, wanted)
    # original parent dataframe is not modified (also not for BlockManager case,
    # except for single block)
    tm.assert_frame_equal(parent, parent_snapshot)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
@pytest.mark.parametrize(
    "row_indexer",
    [slice(1, 2), np.array([False, True, True]), np.array([1, 2])],
    ids=["slice", "mask", "array"],
)
@pytest.mark.parametrize(
    "column_indexer",
    [slice("b", "c"), np.array([False, True, True]), ["b", "c"]],
    ids=["slice", "mask", "array"],
)
def test_subset_loc_rows_columns(
    backend,
    dtype,
    row_indexer,
    column_indexer,
):
    """Writing to a ``.loc[rows, cols]`` subset never changes the parent."""
    # Case: taking a subset of the rows+columns of a DataFrame using .loc
    # + afterwards modifying the subset
    # Generic test for several combinations of row/column indexers, not all
    # of those could actually return a view / need CoW (so this test is not
    # checking memory sharing, only ensuring subsequent mutation doesn't
    # affect the parent dataframe)
    _, make_df, _ = backend
    parent = make_df(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_snapshot = parent.copy()

    child = parent.loc[row_indexer, column_indexer]
    assert child.index is not parent.index
    assert child.columns is not parent.columns

    # modifying the subset never modifies the parent
    child.iloc[0, 0] = 0

    wanted = make_df(
        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
    )
    tm.assert_frame_equal(child, wanted)
    tm.assert_frame_equal(parent, parent_snapshot)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
@pytest.mark.parametrize(
    "row_indexer",
    [slice(1, 3), np.array([False, True, True]), np.array([1, 2])],
    ids=["slice", "mask", "array"],
)
@pytest.mark.parametrize(
    "column_indexer",
    [slice(1, 3), np.array([False, True, True]), [1, 2]],
    ids=["slice", "mask", "array"],
)
def test_subset_iloc_rows_columns(
    backend,
    dtype,
    row_indexer,
    column_indexer,
):
    """Writing to an ``.iloc[rows, cols]`` subset never changes the parent."""
    # Case: taking a subset of the rows+columns of a DataFrame using .iloc
    # + afterwards modifying the subset
    # Generic test for several combinations of row/column indexers, not all
    # of those could actually return a view / need CoW (so this test is not
    # checking memory sharing, only ensuring subsequent mutation doesn't
    # affect the parent dataframe)
    _, make_df, _ = backend
    parent = make_df(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_snapshot = parent.copy()

    child = parent.iloc[row_indexer, column_indexer]
    assert child.index is not parent.index
    assert child.columns is not parent.columns

    # modifying the subset never modifies the parent
    child.iloc[0, 0] = 0

    wanted = make_df(
        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
    )
    tm.assert_frame_equal(child, wanted)
    tm.assert_frame_equal(parent, parent_snapshot)
@pytest.mark.parametrize(
    "indexer",
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
def test_subset_set_with_row_indexer(backend, indexer_si, indexer):
    """Setting rows on a viewing subset must not write through to the parent."""
    # Case: setting values with a row indexer on a viewing subset
    # subset[indexer] = value and subset.iloc[indexer] = value
    _, make_df, _ = backend
    parent = make_df(
        {"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}
    )
    parent_snapshot = parent.copy()
    child = parent[1:4]

    is_int_array = isinstance(indexer, np.ndarray) and indexer.dtype == "int"
    if indexer_si is tm.setitem and is_int_array:
        pytest.skip("setitem with labels selects on columns")

    indexer_si(child)[indexer] = 0

    wanted = make_df(
        {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4)
    )
    tm.assert_frame_equal(child, wanted)
    # original parent dataframe is not modified (CoW)
    tm.assert_frame_equal(parent, parent_snapshot)
def test_subset_set_with_mask(backend):
    """Boolean-mask assignment on a viewing subset leaves the parent intact."""
    # Case: setting values with a mask on a viewing subset: subset[mask] = value
    _, make_df, _ = backend
    parent = make_df(
        {"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}
    )
    parent_snapshot = parent.copy()
    child = parent[1:4]

    above_three = child > 3
    child[above_three] = 0

    wanted = make_df(
        {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4)
    )
    tm.assert_frame_equal(child, wanted)
    tm.assert_frame_equal(parent, parent_snapshot)
def test_subset_set_column(backend):
    """Replacing one column on a viewing subset does not touch the parent.

    Covers ``subset[col] = value``; the replacement array is a NumPy int64
    array for the "numpy" backend and a nullable ``Int64`` array otherwise.
    """
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()
    subset = parent[1:3]

    new_values = (
        np.array([10, 11], dtype="int64")
        if dtype_backend == "numpy"
        else pd.array([10, 11], dtype="Int64")
    )
    subset["a"] = new_values
    # the block manager must still be internally consistent after the set
    subset._mgr._verify_integrity()

    tm.assert_frame_equal(
        subset,
        DataFrame({"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)),
    )
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_column_with_loc(backend, dtype):
    """Setting one column via ``subset.loc[:, col] = value`` keeps the parent intact.

    ``dtype`` controls column "c" so the frame is exercised both as a single
    int64 block and as mixed int64/float64 blocks.
    """
    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()
    subset = parent[1:3]

    subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
    subset._mgr._verify_integrity()

    tm.assert_frame_equal(
        subset,
        DataFrame(
            {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)},
            index=range(1, 3),
        ),
    )
    # original parent dataframe is not modified (CoW)
    tm.assert_frame_equal(parent, parent_before)
def test_subset_set_column_with_loc2(backend):
    """``subset.loc[:, col] = value`` on a single-column frame keeps the parent intact.

    Kept separate from the multi-column variant because, per the original
    note, a DataFrame with a single column takes a separate code path.
    """
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3]})
    parent_before = parent.copy()
    subset = parent[1:3]

    subset.loc[:, "a"] = 0
    subset._mgr._verify_integrity()

    tm.assert_frame_equal(subset, DataFrame({"a": [0, 0]}, index=range(1, 3)))
    # original parent dataframe is not modified (CoW)
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_columns(backend, dtype):
    """Setting several columns with ``subset[[col1, col2]] = value`` keeps the parent intact."""
    dtype_backend, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()
    subset = parent[1:3]

    subset[["a", "c"]] = 0
    subset._mgr._verify_integrity()

    # first and third column should certainly have no references anymore
    for loc in [0, 2]:
        assert subset._mgr._has_no_reference(loc)

    expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
    if dtype_backend == "nullable":
        # there is not yet a global option, so overriding a column by setting
        # a scalar defaults to numpy dtype even if original column was nullable
        for col in ("a", "c"):
            expected[col] = expected[col].astype("int64")

    tm.assert_frame_equal(subset, expected)
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "indexer",
    [slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
    ids=["slice", "mask", "array"],
)
def test_subset_set_with_column_indexer(backend, indexer):
    """Setting columns via ``subset.loc[:, indexer] = value`` keeps the parent intact.

    ``indexer`` selects columns "a" and "b" as a label slice, a boolean mask
    or a list of labels.
    """
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]})
    parent_before = parent.copy()
    subset = parent[1:3]

    subset.loc[:, indexer] = 0
    subset._mgr._verify_integrity()

    tm.assert_frame_equal(
        subset,
        DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)),
    )
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df[["a", "b"]][0:2],
        lambda df: df[0:2][["a", "b"]],
        lambda df: df[["a", "b"]].iloc[0:2],
        lambda df: df[["a", "b"]].loc[0:1],
        lambda df: df[0:2].iloc[:, 0:2],
        lambda df: df[0:2].loc[:, "a":"b"],  # type: ignore[misc]
    ],
    ids=[
        "row-getitem-slice",
        "column-getitem",
        "row-iloc-slice",
        "row-loc-slice",
        "column-iloc-slice",
        "column-loc-slice",
    ],
)
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_chained_getitem(
    request,
    backend,
    method,
    dtype,
):
    """A subset built from chained getitem calls stays isolated from its parent.

    Each ``method`` produces the first two rows of columns "a" and "b"
    through two chained indexing steps. The test checks both directions:
    mutating the subset leaves the parent alone, and mutating the parent
    leaves the subset alone.
    """
    _, DataFrame, _ = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()

    # modify subset -> don't modify parent
    subset = method(parent)
    subset.iloc[0, 0] = 0
    tm.assert_frame_equal(parent, parent_before)

    # modify parent -> don't modify subset
    subset = method(parent)
    parent.iloc[0, 0] = 0
    tm.assert_frame_equal(subset, DataFrame({"a": [1, 2], "b": [4, 5]}))
@pytest.mark.parametrize(
    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_chained_getitem_column(backend, dtype):
    """A column slice reached via ``df[:]["a"][0:2]`` stays isolated from ``df``.

    Checks both directions: writing to the Series subset does not alter the
    parent frame, and writing to the parent does not alter the subset.
    """
    _, DataFrame, Series = backend
    parent = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
    )
    parent_before = parent.copy()

    # modify subset -> don't modify parent
    subset = parent[:]["a"][0:2]
    subset.iloc[0] = 0
    tm.assert_frame_equal(parent, parent_before)

    # modify parent -> don't modify subset
    subset = parent[:]["a"][0:2]
    parent.iloc[0, 0] = 0
    tm.assert_series_equal(subset, Series([1, 2], name="a"))
@pytest.mark.parametrize(
    "method",
    [
        lambda s: s["a":"c"]["a":"b"],  # type: ignore[misc]
        lambda s: s.iloc[0:3].iloc[0:2],
        lambda s: s.loc["a":"c"].loc["a":"b"],  # type: ignore[misc]
        lambda s: s.loc["a":"c"]  # type: ignore[misc]
        .iloc[0:3]
        .iloc[0:2]
        .loc["a":"b"]  # type: ignore[misc]
        .iloc[0:1],
    ],
    ids=["getitem", "iloc", "loc", "long-chain"],
)
def test_subset_chained_getitem_series(backend, method):
    """A Series subset built from chained slicing stays isolated from its parent.

    Parameters
    ----------
    backend : tuple
        Fixture unpacked as ``(dtype_backend, DataFrame, Series)``; only the
        Series constructor is used.
    method : callable
        Takes the parent Series and returns a subset through chained
        getitem / ``iloc`` / ``loc`` slicing.
    """
    _, _, Series = backend
    s = Series([1, 2, 3], index=["a", "b", "c"])
    s_orig = s.copy()

    # modify subset -> don't modify parent
    subset = method(s)
    subset.iloc[0] = 0
    tm.assert_series_equal(s, s_orig)

    # modify parent -> don't modify subset
    # Use the parametrized ``method`` here too (previously a fixed
    # ``s.iloc[0:3].iloc[0:2]`` was used, so only one chain was ever checked
    # in this direction). The expected value is derived from the untouched
    # copy because the chains do not all select the same number of elements.
    subset = method(s)
    s.iloc[0] = 0
    expected = method(s_orig)
    tm.assert_series_equal(subset, expected)
def test_subset_chained_single_block_row():
    """A row slice reached via ``df[:].iloc[0].iloc[0:2]`` stays isolated from ``df``.

    Not parametrized over the dtype backend: per the original note, this
    explicitly targets a single-block frame.
    """
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    parent_before = parent.copy()

    # modify subset -> don't modify parent
    row_part = parent[:].iloc[0].iloc[0:2]
    row_part.iloc[0] = 0
    tm.assert_frame_equal(parent, parent_before)

    # modify parent -> don't modify subset
    row_part = parent[:].iloc[0].iloc[0:2]
    parent.iloc[0, 0] = 0
    tm.assert_series_equal(row_part, Series([1, 4], index=["a", "b"], name=0))
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df[:],
        lambda df: df.loc[:, :],
        lambda df: df.loc[:],
        lambda df: df.iloc[:, :],
        lambda df: df.iloc[:],
    ],
    ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"],
)
def test_null_slice(backend, method):
    """Every null-slice (``:``) indexing variant returns a new, CoW-protected frame."""
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    parent_before = parent.copy()

    sliced = method(parent)

    # we always return new objects (shallow copy), regardless of CoW or not
    assert sliced is not parent
    assert sliced.index is not parent.index
    assert sliced.columns is not parent.columns

    # and those trigger CoW when mutated
    sliced.iloc[0, 0] = 0
    tm.assert_frame_equal(parent, parent_before)
@pytest.mark.parametrize(
    "method",
    [
        lambda s: s[:],
        lambda s: s.loc[:],
        lambda s: s.iloc[:],
    ],
    ids=["getitem", "loc", "iloc"],
)
def test_null_slice_series(backend, method):
    """Every null-slice (``:``) indexing variant on a Series returns a new object."""
    _, _, Series = backend
    parent = Series([1, 2, 3], index=["a", "b", "c"])
    parent_before = parent.copy()

    sliced = method(parent)

    # we always return new objects, regardless of CoW or not
    assert sliced is not parent
    assert sliced.index is not parent.index

    # and those trigger CoW when mutated
    sliced.iloc[0] = 0
    tm.assert_series_equal(parent, parent_before)
# TODO add more tests modifying the parent
# -----------------------------------------------------------------------------
# Series -- Indexing operations taking subset + modifying the subset/parent
def test_series_getitem_slice(backend):
    """``s[:]`` shares memory with ``s`` until the slice is written to."""
    _, _, Series = backend
    parent = Series([1, 2, 3], index=["a", "b", "c"])
    parent_before = parent.copy()

    subset = parent[:]
    assert np.shares_memory(get_array(subset), get_array(parent))
    assert subset.index is not parent.index

    # the first write detaches the subset from the parent's buffer
    subset.iloc[0] = 0
    assert not np.shares_memory(get_array(subset), get_array(parent))

    tm.assert_series_equal(subset, Series([0, 2, 3], index=["a", "b", "c"]))
    # original parent series is not modified (CoW)
    tm.assert_series_equal(parent, parent_before)
def test_series_getitem_ellipsis():
    """``s[...]`` shares memory with ``s`` until the result is written to."""
    parent = Series([1, 2, 3])
    parent_before = parent.copy()

    subset = parent[...]
    assert np.shares_memory(get_array(subset), get_array(parent))
    assert subset.index is not parent.index

    # the first write detaches the subset from the parent's buffer
    subset.iloc[0] = 0
    assert not np.shares_memory(get_array(subset), get_array(parent))

    tm.assert_series_equal(subset, Series([0, 2, 3]))
    # original parent series is not modified (CoW)
    tm.assert_series_equal(parent, parent_before)
@pytest.mark.parametrize(
    "indexer",
    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
    ids=["slice", "mask", "array"],
)
def test_series_subset_set_with_indexer(backend, indexer_si, indexer):
    """Setting values on a viewing Series through an indexer keeps the parent intact.

    For plain setitem with an integer array the assignment is expected to
    raise ``KeyError`` instead, and the test ends there.
    """
    _, _, Series = backend
    parent = Series([1, 2, 3], index=["a", "b", "c"])
    parent_before = parent.copy()
    subset = parent[:]

    integer_labels_via_setitem = (
        indexer_si is tm.setitem
        and isinstance(indexer, np.ndarray)
        and indexer.dtype.kind == "i"
    )
    if integer_labels_via_setitem:
        # In 3.0 we treat integers as always-labels
        with pytest.raises(KeyError):
            indexer_si(subset)[indexer] = 0
        return

    indexer_si(subset)[indexer] = 0
    tm.assert_series_equal(subset, Series([0, 0, 3], index=["a", "b", "c"]))
    tm.assert_series_equal(parent, parent_before)
# -----------------------------------------------------------------------------
# del operator
def test_del_frame(backend):
    """``del child[col]`` on a viewing child keeps the parent and the shared data intact.

    After the deletion, column "a" is still shared between parent and child;
    later writes on either side must not leak into the other.
    """
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()
    child = parent[:]

    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))

    del child["b"]

    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))
    tm.assert_frame_equal(parent, parent_before)
    tm.assert_frame_equal(child, parent_before[["a", "c"]])
    child._mgr._verify_integrity()

    # writing to the parent's "b" leaves the shared "a" column shared
    parent.loc[0, "b"] = 200
    assert np.shares_memory(get_array(parent, "a"), get_array(child, "a"))
    parent_before = parent.copy()

    child.loc[0, "a"] = 100
    # modifying child after deleting a column still doesn't update parent
    tm.assert_frame_equal(parent, parent_before)
def test_del_series(backend):
    """``del child[label]`` on a viewing Series detaches it from the parent's data."""
    _, _, Series = backend
    parent = Series([1, 2, 3], index=["a", "b", "c"])
    parent_before = parent.copy()
    child = parent[:]

    assert np.shares_memory(get_array(parent), get_array(child))

    del child["a"]

    assert not np.shares_memory(get_array(parent), get_array(child))
    tm.assert_series_equal(parent, parent_before)
    tm.assert_series_equal(child, parent_before[["b", "c"]])

    # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by
    # new array), so the write is visible through the array grabbed before it
    backing = child.values
    child.loc["b"] = 100
    assert backing[0] == 100
# -----------------------------------------------------------------------------
# Accessing column as Series
def test_column_as_series(backend):
    """A column selected with ``df["a"]`` is protected by Copy-on-Write."""
    _, DataFrame, Series = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()

    column = parent["a"]
    assert column.index is not parent.index
    assert np.shares_memory(get_array(column, "a"), get_array(parent, "a"))

    column[0] = 0

    tm.assert_series_equal(column, Series([0, 2, 3], name="a"))
    # assert not np.shares_memory(s.values, get_array(df, "a"))
    tm.assert_frame_equal(parent, parent_before)
    # ensure cached series on getitem is not the changed series
    tm.assert_series_equal(parent["a"], parent_before["a"])
def test_column_as_series_set_with_upcast(backend):
    """Setting an incompatible value on a selected column raises ``TypeError``.

    For the "nullable" backend the test additionally verifies that neither
    the selected Series nor the parent frame changed.
    """
    dtype_backend, DataFrame, Series = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()
    column = parent["a"]

    # both backends reject the string with the same error
    with pytest.raises(TypeError, match="Invalid value"):
        column[0] = "foo"

    if dtype_backend != "nullable":
        return

    tm.assert_series_equal(column, Series([1, 2, 3], name="a"))
    tm.assert_frame_equal(parent, parent_before)
    # ensure cached series on getitem is not the changed series
    tm.assert_series_equal(parent["a"], parent_before["a"])
@pytest.mark.parametrize(
    "method",
    [
        lambda df: df["a"],
        lambda df: df.loc[:, "a"],
        lambda df: df.iloc[:, 0],
    ],
    ids=["getitem", "loc", "iloc"],
)
def test_column_as_series_no_item_cache(request, backend, method):
    """Selecting the same column twice yields two distinct, independent Series.

    I.e. column selection does not hand out a cached object: writing to the
    first result affects neither the second result nor the parent.
    """
    _, DataFrame, _ = backend
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    parent_before = parent.copy()

    first = method(parent)
    second = method(parent)

    assert first is not second
    assert first.index is not parent.index
    assert first.index is not second.index

    first.iloc[0] = 0

    tm.assert_series_equal(second, parent_before["a"])
    tm.assert_frame_equal(parent, parent_before)
# TODO add tests for other indexing methods on the Series
def test_dataframe_add_column_from_series(backend):
    """Adding a Series as a new column shares its data until one side is written to."""
    _, DataFrame, Series = backend
    frame = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    source = Series([10, 11, 12])

    frame["new"] = source
    # the copy is delayed: the new column and the series share one buffer
    assert np.shares_memory(get_array(frame, "new"), get_array(source))

    # editing series -> doesn't modify column in frame
    source[0] = 0
    tm.assert_frame_equal(
        frame,
        DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}),
    )
@pytest.mark.parametrize("val", [100, "a"])
@pytest.mark.parametrize(
    "indexer_func, indexer",
    [
        (tm.loc, (0, "a")),
        (tm.iloc, (0, 0)),
        (tm.loc, ([0], "a")),
        (tm.iloc, ([0], 0)),
        (tm.loc, (slice(None), "a")),
        (tm.iloc, (slice(None), 0)),
    ],
)
@pytest.mark.parametrize(
    "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"]
)
def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col):
    """An in-place set copies only the modified column, not its whole block.

    With ``val == "a"`` the assignment is expected to raise ``TypeError`` and
    nothing further is checked.
    """
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
    frame_before = frame.copy()
    view = frame[:]

    if val == "a":
        with pytest.raises(TypeError, match="Invalid value"):
            indexer_func(frame)[indexer] = val
        return

    indexer_func(frame)[indexer] = val

    # untouched column "b" is still shared, written column "a" was copied
    assert np.shares_memory(get_array(frame, "b"), get_array(view, "b"))
    assert not np.shares_memory(get_array(frame, "a"), get_array(view, "a"))
    tm.assert_frame_equal(view, frame_before)
def test_series_midx_slice():
    """Selecting a first-level key of a MultiIndex Series gives a CoW-protected view."""
    index = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])
    parent = Series([1, 2, 3], index=index)
    parent_before = parent.copy()

    selected = parent[1]
    assert np.shares_memory(get_array(parent), get_array(selected))

    selected.iloc[0] = 100
    tm.assert_series_equal(parent, parent_before)
def test_getitem_midx_slice():
    """Selecting a top-level column key of a MultiIndex frame gives a CoW-protected view."""
    parent = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2})
    parent_before = parent.copy()

    selected = parent[("a",)]

    # the result still references the parent's data ...
    assert not selected._mgr._has_no_reference(0)
    assert np.shares_memory(
        get_array(parent, ("a", "x")), get_array(selected, "x")
    )

    # ... so writing to it must not change the parent
    selected.iloc[0, 0] = 100
    tm.assert_frame_equal(parent_before, parent)
def test_series_midx_tuples_slice():
    """Selecting a tuple-valued first-level key gives a CoW-protected view."""
    tuples = [((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]
    parent = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(tuples))

    selected = parent[(1, 2)]
    assert np.shares_memory(get_array(parent), get_array(selected))

    selected.iloc[0] = 100

    # the parent keeps its original values
    unchanged = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(tuples))
    tm.assert_series_equal(parent, unchanged)
def test_midx_read_only_bool_indexer():
    """A boolean Series inside ``pd.IndexSlice`` works with ``.loc`` and is left unchanged.

    Regression test for GH#56635.
    """

    def _labels(prefix, count):
        # e.g. ("A", 2) -> ["A0", "A1"]
        return [f"{prefix}{i}" for i in range(count)]

    row_index = pd.MultiIndex.from_product(
        [_labels("A", 4), _labels("B", 2), _labels("C", 4), _labels("D", 2)]
    )
    col_index = pd.MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )
    frame = (
        DataFrame(1, index=row_index, columns=col_index)
        .sort_index()
        .sort_index(axis=1)
    )

    # every value is 1, so the mask is all-True
    mask = frame[("a", "foo")] == 1
    mask_before = mask.copy()

    result = frame.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :]
    expected = frame.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :]

    tm.assert_frame_equal(result, expected)
    tm.assert_series_equal(mask, mask_before)
def test_loc_enlarging_with_dataframe():
    """Enlarging via ``df.loc[:, new_cols] = other_frame`` shares data under CoW.

    The new columns share memory with the right-hand frame, and a later
    write to the enlarged frame must not leak back into it.
    """
    target = DataFrame({"a": [1, 2, 3]})
    source = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]})
    source_before = source.copy()

    target.loc[:, ["b", "c"]] = source

    for name in ("b", "c"):
        assert np.shares_memory(get_array(target, name), get_array(source, name))
    assert not target._mgr._has_no_reference(1)

    target.iloc[0, 1] = 100
    tm.assert_frame_equal(source, source_before)

View File

@@ -0,0 +1,112 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
def test_consolidate():
    """Consolidating a viewing subset updates block references on both sides."""
    # create unconsolidated DataFrame
    parent = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    parent["c"] = [4, 5, 6]

    # take a viewing subset
    subset = parent[:]

    # each block of subset references a block of df
    for blk in subset._mgr.blocks:
        assert blk.refs.has_reference()

    # consolidate the two int64 blocks
    subset._consolidate_inplace()
    subset_blocks = subset._mgr.blocks

    # the float64 block still references the parent one because it is still
    # a view
    assert subset_blocks[0].refs.has_reference()
    # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values)
    # but avoids caching df["b"]
    assert np.shares_memory(get_array(parent, "b"), get_array(subset, "b"))

    # the new consolidated int64 block does not reference another
    assert not subset_blocks[1].refs.has_reference()

    # the parent dataframe now also only is linked for the float column
    assert not parent._mgr.blocks[0].refs.has_reference()
    assert parent._mgr.blocks[1].refs.has_reference()
    assert not parent._mgr.blocks[2].refs.has_reference()

    # and modifying subset still doesn't modify parent
    subset.iloc[0, 1] = 0.0
    assert not parent._mgr.blocks[1].refs.has_reference()
    assert parent.loc[0, "b"] == 0.1
@pytest.mark.parametrize("dtype", [np.intp, np.int8])
@pytest.mark.parametrize(
    "locs, arr",
    [
        ([0], np.array([-1, -2, -3])),
        ([1], np.array([-1, -2, -3])),
        ([5], np.array([-1, -2, -3])),
        ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
        ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
    ],
)
def test_iset_splits_blocks_inplace(locs, arr, dtype):
    """``_mgr.iset(..., inplace=True)`` on a shallow copy only detaches the set columns.

    Per the original note, nothing currently calls ``iset`` with more than
    one loc and ``inplace=True``; this test makes sure it works anyway.
    """
    parent = DataFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
            "c": [7, 8, 9],
            "d": [10, 11, 12],
            "e": [13, 14, 15],
            "f": Series(["a", "b", "c"], dtype=object),
        },
    )
    arr = arr.astype(dtype)
    parent_before = parent.copy()
    shallow = parent.copy(deep=False)  # Trigger a CoW (if enabled, otherwise makes copy)

    shallow._mgr.iset(locs, arr, inplace=True)

    tm.assert_frame_equal(parent, parent_before)
    # every column that was not overwritten must still be shared
    for position, name in enumerate(parent.columns):
        if position in locs:
            continue
        assert np.shares_memory(get_array(parent, name), get_array(shallow, name))
def test_exponential_backoff():
    """Check how the block reference tracker grows and shrinks ``clear_counter``.

    Regression test for GH#55518. The asserted numbers are the tracker's
    ``referenced_blocks`` length and ``clear_counter`` after given numbers
    of shallow copies.
    """
    frame = DataFrame({"a": [1, 2, 3]})
    for _ in range(490):
        frame.copy(deep=False)

    assert len(frame._mgr.blocks[0].refs.referenced_blocks) == 491

    frame = DataFrame({"a": [1, 2, 3]})
    # these copies are kept alive on purpose
    alive = [frame.copy(deep=False) for _ in range(510)]

    for _ in range(20):
        frame.copy(deep=False)
    assert len(frame._mgr.blocks[0].refs.referenced_blocks) == 531
    assert frame._mgr.blocks[0].refs.clear_counter == 1000

    for _ in range(500):
        frame.copy(deep=False)

    # Don't reduce since we still have over 500 objects alive
    assert frame._mgr.blocks[0].refs.clear_counter == 1000

    alive = alive[:300]
    for _ in range(500):
        frame.copy(deep=False)

    # Reduce since there are less than 500 objects alive
    assert frame._mgr.blocks[0].refs.clear_counter == 500

View File

@@ -0,0 +1,307 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
Interval,
NaT,
Series,
Timestamp,
interval_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
def test_interpolate_no_op(method):
    """Interpolating a frame with nothing to fill returns a lazily-copied result.

    ``method="pad"`` is expected to raise ``ValueError`` instead; nothing
    else is checked in that case.
    """
    frame = DataFrame({"a": [1, 2]})
    frame_before = frame.copy()

    if method == "pad":
        msg = f"Can not interpolate with method={method}"
        with pytest.raises(ValueError, match=msg):
            frame.interpolate(method=method)
        return

    result = frame.interpolate(method=method)
    assert np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    assert result.index is not frame.index
    assert result.columns is not frame.columns

    # the first write detaches the result from the original's data
    result.iloc[0, 0] = 100
    assert not np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(frame, frame_before)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_interp_fill_functions(func):
    """``ffill``/``bfill`` with nothing to fill return a lazily-copied result."""
    # Check that these takes the same code paths as interpolate
    frame = DataFrame({"a": [1, 2]})
    frame_before = frame.copy()

    result = getattr(frame, func)()
    assert np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    assert result.index is not frame.index
    assert result.columns is not frame.columns

    # the first write detaches the result from the original's data
    result.iloc[0, 0] = 100
    assert not np.shares_memory(get_array(result, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(frame, frame_before)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_triggers_copy(vals, func):
    """Filling a column that has a missing value yields new, unreferenced data."""
    frame = DataFrame({"a": vals})

    filled = getattr(frame, func)()

    assert not np.shares_memory(get_array(filled, "a"), get_array(frame, "a"))
    # Check that we don't have references when triggering a copy
    assert filled._mgr._has_no_reference(0)
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_no_reference_no_copy(vals):
    """In-place interpolation on an unreferenced frame reuses the existing buffer."""
    frame = DataFrame({"a": vals})
    buffer_before = get_array(frame, "a")

    frame.interpolate(method="linear", inplace=True)

    assert np.shares_memory(buffer_before, get_array(frame, "a"))
    # the column must not be tracked as referenced afterwards
    assert frame._mgr._has_no_reference(0)
@pytest.mark.parametrize(
    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
)
def test_interpolate_inplace_with_refs(vals):
    """In-place interpolation on a frame that has a live view must copy first.

    Parameters
    ----------
    vals : list
        Column values containing one missing entry (numeric or datetime).
    """
    # Build the frame from the parametrized values; it was previously
    # hard-coded to [1, np.nan, 2], which left the datetime case untested.
    df = DataFrame({"a": vals})
    df_orig = df.copy()
    arr = get_array(df, "a")
    view = df[:]

    df.interpolate(method="linear", inplace=True)

    # Check that copy was triggered in interpolate and that we don't
    # have any references left
    assert not np.shares_memory(arr, get_array(df, "a"))
    tm.assert_frame_equal(df_orig, view)
    assert df._mgr._has_no_reference(0)
    assert view._mgr._has_no_reference(0)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
@pytest.mark.parametrize("dtype", ["float64", "Float64"])
def test_interp_fill_functions_inplace(func, dtype):
    """In-place ``ffill``/``bfill`` on a frame that has a live view must copy first."""
    # Check that these takes the same code paths as interpolate
    frame = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype)
    frame_before = frame.copy()
    buffer_before = get_array(frame, "a")
    view = frame[:]

    getattr(frame, func)(inplace=True)

    # Check that copy was triggered in interpolate and that we don't
    # have any references left
    assert not np.shares_memory(buffer_before, get_array(frame, "a"))
    tm.assert_frame_equal(frame_before, view)
    assert frame._mgr._has_no_reference(0)
    assert view._mgr._has_no_reference(0)
def test_interpolate_cannot_with_object_dtype():
    """``DataFrame.interpolate`` raises ``TypeError`` when a column is object dtype."""
    frame = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
    # force plain object dtype for the string column
    frame["a"] = frame["a"].astype(object)

    with pytest.raises(
        TypeError, match="DataFrame cannot interpolate with object dtype"
    ):
        frame.interpolate()
def test_interpolate_object_convert_no_op():
    """Check reference state and buffer identity of an object column.

    NOTE(review): despite the name, no ``interpolate`` call is made in this
    test as written; it only inspects the frame right after construction.
    """
    frame = DataFrame({"a": ["a", "b", "c"], "b": 1})
    frame["a"] = frame["a"].astype(object)
    buffer_before = get_array(frame, "a")

    # Now CoW makes a copy, it should not!
    assert frame._mgr._has_no_reference(0)
    assert np.shares_memory(buffer_before, get_array(frame, "a"))
def test_interpolate_object_convert_copies():
    """A rejected in-place ``interpolate(method="pad")`` leaves the frame untouched.

    The call is expected to raise ``ValueError``; afterwards column "a" must
    still be unreferenced and backed by the same array as before.
    """
    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    arr_a = get_array(df, "a")

    msg = "Can not interpolate with method=pad"
    with pytest.raises(ValueError, match=msg):
        df.interpolate(method="pad", inplace=True)

    # These checks live outside the ``pytest.raises`` block on purpose:
    # statements placed after the raising call inside it would never run.
    assert df._mgr._has_no_reference(0)
    assert np.shares_memory(arr_a, get_array(df, "a"))
def test_interpolate_downcast_reference_triggers_copy():
    """A rejected in-place ``interpolate(method="pad")`` modifies neither frame nor view.

    The call is expected to raise ``ValueError``. Because it is rejected, no
    copy is expected to happen: the frame keeps its original buffer, it is
    still referenced by the view, and both still equal the original data.

    NOTE(review): the test name predates the call raising; the assertions
    below describe the state after the failed call and assume the error is
    raised before any data is touched.
    """
    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
    df_orig = df.copy()
    arr_a = get_array(df, "a")
    view = df[:]

    msg = "Can not interpolate with method=pad"
    with pytest.raises(ValueError, match=msg):
        df.interpolate(method="pad", inplace=True)

    # The previous assertions (no reference left, buffer replaced) could only
    # pass as unreachable code after the raising call inside the
    # ``pytest.raises`` block. Check the reachable post-condition instead.
    assert not df._mgr._has_no_reference(0)
    assert np.shares_memory(arr_a, get_array(df, "a"))
    tm.assert_frame_equal(df, df_orig)
    tm.assert_frame_equal(df_orig, view)
def test_fillna():
    """``fillna`` shares the column that had nothing to fill and protects it with CoW."""
    frame = DataFrame({"a": [1.5, np.nan], "b": 1})
    frame_before = frame.copy()

    filled = frame.fillna(5.5)

    assert np.shares_memory(get_array(frame, "b"), get_array(filled, "b"))
    assert filled.index is not frame.index
    assert filled.columns is not frame.columns

    # writing to the shared column of the result must not change the original
    filled.iloc[0, 1] = 100
    tm.assert_frame_equal(frame_before, frame)
def test_fillna_dict():
    """``fillna`` with a per-column dict copies only the column that gets filled."""
    frame = DataFrame({"a": [1.5, np.nan], "b": 1})
    frame_before = frame.copy()

    filled = frame.fillna({"a": 100.5})

    # "b" is not in the dict and stays shared; "a" was filled and is new data
    assert np.shares_memory(get_array(frame, "b"), get_array(filled, "b"))
    assert not np.shares_memory(get_array(frame, "a"), get_array(filled, "a"))

    filled.iloc[0, 1] = 100
    tm.assert_frame_equal(frame_before, frame)
def test_fillna_inplace():
    """In-place ``fillna`` on an unreferenced frame reuses both column buffers."""
    frame = DataFrame({"a": [1.5, np.nan], "b": 1})
    buffers_before = {name: get_array(frame, name) for name in ("a", "b")}

    frame.fillna(5.5, inplace=True)

    assert np.shares_memory(get_array(frame, "a"), buffers_before["a"])
    assert np.shares_memory(get_array(frame, "b"), buffers_before["b"])
    assert frame._mgr._has_no_reference(0)
    assert frame._mgr._has_no_reference(1)
def test_fillna_inplace_reference():
    """In-place ``fillna`` on a frame with a live view copies only the filled column."""
    frame = DataFrame({"a": [1.5, np.nan], "b": 1})
    frame_before = frame.copy()
    buffer_a = get_array(frame, "a")
    buffer_b = get_array(frame, "b")
    view = frame[:]

    frame.fillna(5.5, inplace=True)

    # "a" had a missing value and was copied; "b" is untouched and shared
    assert not np.shares_memory(get_array(frame, "a"), buffer_a)
    assert np.shares_memory(get_array(frame, "b"), buffer_b)
    assert view._mgr._has_no_reference(0)
    assert frame._mgr._has_no_reference(0)

    tm.assert_frame_equal(view, frame_before)
    tm.assert_frame_equal(frame, DataFrame({"a": [1.5, 5.5], "b": 1}))
def test_fillna_interval_inplace_reference():
    """In-place ``fillna`` on an interval Series with a live view must not alter the view."""
    # Set dtype explicitly to avoid implicit cast when setting nan
    intervals = Series(
        interval_range(start=0, end=5), name="a", dtype="interval[float64, right]"
    )
    intervals.iloc[1] = np.nan

    intervals_before = intervals.copy()
    view = intervals[:]

    intervals.fillna(value=Interval(left=0, right=5), inplace=True)

    # compare the left bounds' underlying arrays of both objects
    left_filled = get_array(intervals, "a").left.values
    left_view = get_array(view, "a").left.values
    assert not np.shares_memory(left_filled, left_view)
    tm.assert_series_equal(view, intervals_before)
def test_fillna_series_empty_arg():
    """``Series.fillna({})`` shares data with the original and is protected by CoW."""
    original = Series([1, np.nan, 2])
    original_before = original.copy()

    filled = original.fillna({})
    assert np.shares_memory(get_array(original), get_array(filled))

    # writing to the original must not show up in the result
    original.iloc[0] = 100.5
    tm.assert_series_equal(original_before, filled)
def test_fillna_series_empty_arg_inplace():
    """In-place ``Series.fillna({})`` keeps the buffer and leaves no reference behind."""
    target = Series([1, np.nan, 2])
    buffer_before = get_array(target)

    target.fillna({}, inplace=True)

    assert np.shares_memory(get_array(target), buffer_before)
    assert target._mgr._has_no_reference(0)
def test_fillna_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype):
    """``fillna`` on extension dtypes shares the column that had nothing to fill."""
    frame = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
    frame_before = frame.copy()

    filled = frame.fillna(100)

    # "a" was filled (new data); "b" is shared and tracked as referenced
    assert not np.shares_memory(get_array(frame, "a"), get_array(filled, "a"))
    assert np.shares_memory(get_array(frame, "b"), get_array(filled, "b"))
    assert not filled._mgr._has_no_reference(1)
    tm.assert_frame_equal(frame_before, frame)

    # the first write to "b" in the result splits it off from the original
    filled.iloc[0, 1] = 100
    assert not np.shares_memory(get_array(frame, "b"), get_array(filled, "b"))
    assert filled._mgr._has_no_reference(1)
    assert frame._mgr._has_no_reference(1)
    tm.assert_frame_equal(frame_before, frame)
def test_fillna_inplace_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype):
    """In-place ``fillna`` on extension dtypes with a live view keeps the view intact."""
    frame = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
    frame_before = frame.copy()
    view = frame[:]

    frame.fillna(100, inplace=True)

    # "a" was filled (new data); "b" is still shared and referenced both ways
    assert not np.shares_memory(get_array(frame, "a"), get_array(view, "a"))
    assert np.shares_memory(get_array(frame, "b"), get_array(view, "b"))
    assert not frame._mgr._has_no_reference(1)
    assert not view._mgr._has_no_reference(1)

    # a later write to the shared column must not reach the view
    frame.iloc[0, 1] = 100
    tm.assert_frame_equal(frame_before, view)
def test_fillna_chained_assignment():
    """In-place ``fillna`` on a temporary selection is flagged and changes nothing.

    Both ``df["a"]`` (Series) and ``df[["a"]]`` (DataFrame) selections are
    checked under ``tm.raises_chained_assignment_error()``.
    """
    frame = DataFrame({"a": [1, np.nan, 2], "b": 1})
    frame_before = frame.copy()

    # single-column selection -> temporary Series
    with tm.raises_chained_assignment_error():
        frame["a"].fillna(100, inplace=True)
    tm.assert_frame_equal(frame, frame_before)

    # list selection -> temporary DataFrame
    with tm.raises_chained_assignment_error():
        frame[["a"]].fillna(100, inplace=True)
    tm.assert_frame_equal(frame, frame_before)
@pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"])
def test_interpolate_chained_assignment(func):
    """In-place interpolate/ffill/bfill on a temporary selection is flagged and changes nothing."""
    frame = DataFrame({"a": [1, np.nan, 2], "b": 1})
    frame_before = frame.copy()

    # single-column selection -> temporary Series
    with tm.raises_chained_assignment_error():
        getattr(frame["a"], func)(inplace=True)
    tm.assert_frame_equal(frame, frame_before)

    # list selection -> temporary DataFrame
    with tm.raises_chained_assignment_error():
        getattr(frame[["a"]], func)(inplace=True)
    tm.assert_frame_equal(frame, frame_before)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,356 @@
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@pytest.mark.parametrize(
    "replace_kwargs",
    [
        {"to_replace": {"a": 1, "b": 4}, "value": -1},
        # Test CoW splits blocks to avoid copying unchanged columns
        {"to_replace": {"a": 1}, "value": -1},
        {"to_replace": {"b": 4}, "value": -1},
        {"to_replace": {"b": {4: 1}}},
        # TODO: Add these in a further optimization
        # We would need to see which columns got replaced in the mask
        # which could be expensive
        # {"to_replace": {"b": 1}},
        # 1
    ],
)
def test_replace(replace_kwargs):
    """``DataFrame.replace`` only copies the columns in which values were replaced."""
    original = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    original_before = original.copy()

    replaced = original.replace(**replace_kwargs)

    # "b" must still be shared whenever the replacement left it unchanged
    if (replaced["b"] == original["b"]).all():
        assert np.shares_memory(get_array(replaced, "b"), get_array(original, "b"))
    # "c" is never targeted by the parametrized replacements
    assert tm.shares_memory(get_array(replaced, "c"), get_array(original, "c"))

    # mutating the replaced frame triggers a copy-on-write for that column/block
    replaced.loc[0, "c"] = -1
    assert not np.shares_memory(get_array(replaced, "c"), get_array(original, "c"))

    if "a" in replace_kwargs["to_replace"]:
        # "a" was already copied by replace, so a write must reuse that buffer
        buffer_a = get_array(replaced, "a")
        replaced.loc[0, "a"] = 100
        assert np.shares_memory(get_array(replaced, "a"), buffer_a)

    tm.assert_frame_equal(original, original_before)
def test_replace_regex_inplace_refs():
    """In-place regex ``replace`` on a frame with a live view must copy first."""
    frame = DataFrame({"a": ["aaa", "bbb"]})
    frame_before = frame.copy()
    view = frame[:]
    buffer_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)

    assert not np.shares_memory(buffer_before, get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
    tm.assert_frame_equal(view, frame_before)
def test_replace_regex_inplace():
    """In-place regex ``replace`` reuses the buffer; the out-of-place variant copies."""
    frame = DataFrame({"a": ["aaa", "bbb"]})
    buffer_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
    assert frame._mgr._has_no_reference(0)
    assert tm.shares_memory(buffer_before, get_array(frame, "a"))

    # a matching out-of-place replace returns new data and keeps the source
    frame_before = frame.copy()
    replaced = frame.replace(to_replace=r"^b.*$", value="new", regex=True)
    tm.assert_frame_equal(frame_before, frame)
    assert not tm.shares_memory(get_array(replaced, "a"), get_array(frame, "a"))
def test_replace_regex_inplace_no_op():
    """Regex ``replace`` on an integer column shares data, in place or not."""
    frame = DataFrame({"a": [1, 2]})
    buffer_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
    assert frame._mgr._has_no_reference(0)
    assert np.shares_memory(buffer_before, get_array(frame, "a"))

    # the out-of-place variant shares the column with the source as well
    frame_before = frame.copy()
    replaced = frame.replace(to_replace=r"^x.$", value="new", regex=True)
    tm.assert_frame_equal(frame_before, frame)
    assert np.shares_memory(get_array(replaced, "a"), get_array(frame, "a"))
def test_replace_mask_all_false_second_block():
    """``replace`` shares the columns of a block in which nothing matched."""
    original = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
    original_before = original.copy()

    replaced = original.replace(to_replace=1.5, value=55.5)

    # TODO: Block splitting would allow us to avoid copying b
    assert np.shares_memory(get_array(original, "c"), get_array(replaced, "c"))
    assert not np.shares_memory(get_array(original, "a"), get_array(replaced, "a"))

    replaced.loc[0, "c"] = 1
    tm.assert_frame_equal(original, original_before)  # Original is unchanged

    # only the written column was split off; "d" is still shared
    assert not np.shares_memory(get_array(original, "c"), get_array(replaced, "c"))
    assert np.shares_memory(get_array(original, "d"), get_array(replaced, "d"))
def test_replace_coerce_single_column():
    """Replacing with a string copies the hit column; the other stays shared."""
    frame = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    snapshot = frame.copy()
    replaced = frame.replace(to_replace=1.5, value="a")

    assert np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
    assert not np.shares_memory(get_array(frame, "a"), get_array(replaced, "a"))

    # The first write into the shared column must not reach the source frame.
    replaced.loc[0, "b"] = 0.5
    tm.assert_frame_equal(frame, snapshot)  # Original is unchanged
    assert not np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
def test_replace_to_replace_wrong_dtype():
    """A string ``to_replace`` on float columns leaves every column shared."""
    frame = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    snapshot = frame.copy()
    replaced = frame.replace(to_replace="xxx", value=1.5)

    for column in ("b", "a"):
        assert np.shares_memory(get_array(frame, column), get_array(replaced, column))

    # Modifying the result afterwards copies and leaves the source alone.
    replaced.loc[0, "b"] = 0.5
    tm.assert_frame_equal(frame, snapshot)  # Original is unchanged
    assert not np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
def test_replace_list_categorical():
    """List replace on a categorical column, in place and out of place."""
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    original = get_array(frame, "a")

    # In place with no other references: the codes buffer is reused.
    frame.replace(["c"], value="a", inplace=True)
    assert np.shares_memory(original.codes, get_array(frame, "a").codes)
    assert frame._mgr._has_no_reference(0)

    snapshot = frame.copy()
    # NOTE(review): the result of this call is discarded; only the final
    # frame comparison observes whether it modified ``frame``.
    frame.replace(["b"], value="a")
    renamed = frame.apply(lambda col: col.cat.rename_categories({"b": "d"}))
    assert not np.shares_memory(original.codes, get_array(renamed, "a").codes)

    tm.assert_frame_equal(frame, snapshot)
def test_replace_list_inplace_refs_categorical():
    """In-place list replace on a categorical frame leaves a live view as it was."""
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    alias = frame[:]
    snapshot = frame.copy()

    frame.replace(["c"], value="a", inplace=True)

    # The view must still show the data from before the replace.
    tm.assert_frame_equal(snapshot, alias)
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(to_replace):
    """In-place replace with no other references keeps using the same array.

    ``to_replace`` covers a scalar, a one-element list and an empty list.
    """
    df = DataFrame({"a": [1.5, 2, 3]})
    arr_a = get_array(df, "a")
    # Pass the parametrized value through so each case is actually exercised.
    df.replace(to_replace=to_replace, value=15.5, inplace=True)

    assert np.shares_memory(get_array(df, "a"), arr_a)
    assert df._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
def test_replace_inplace_reference(to_replace):
    """In-place replace that hits a value copies when a view references the data."""
    frame = DataFrame({"a": [1.5, 2, 3]})
    original_values = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=15.5, inplace=True)

    # The frame got new data, so neither object shares a block any more.
    assert not np.shares_memory(get_array(frame, "a"), original_values)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", ["a", 100.5])
def test_replace_inplace_reference_no_op(to_replace):
    """In-place replace that matches nothing keeps sharing data with a view."""
    frame = DataFrame({"a": [1.5, 2, 3]})
    original_values = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=15.5, inplace=True)

    # No value was replaced: same buffer, and both objects still reference it.
    assert np.shares_memory(get_array(frame, "a"), original_values)
    assert not frame._mgr._has_no_reference(0)
    assert not alias._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1, [1]])
def test_replace_categorical_inplace_reference(to_replace):
    """In-place replace on a categorical column with a live view copies the codes."""
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    snapshot = frame.copy()
    original = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=1, inplace=True)

    assert not np.shares_memory(get_array(frame, "a").codes, original.codes)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
    # The view still holds the data from before the replace.
    tm.assert_frame_equal(alias, snapshot)
def test_replace_categorical_inplace():
    """In-place replace on an unreferenced categorical column reuses its codes."""
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    original = get_array(frame, "a")

    frame.replace(to_replace=1, value=1, inplace=True)

    assert np.shares_memory(get_array(frame, "a").codes, original.codes)
    assert frame._mgr._has_no_reference(0)

    # Replacing 1 with 1 leaves the values as they were.
    tm.assert_frame_equal(frame, DataFrame({"a": Categorical([1, 2, 3])}))
def test_replace_categorical():
    """Non-inplace replace on a categorical column returns independent data."""
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    snapshot = frame.copy()
    replaced = frame.replace(to_replace=1, value=1)

    # Source and result do not reference each other's blocks or share codes.
    assert frame._mgr._has_no_reference(0)
    assert replaced._mgr._has_no_reference(0)
    source_codes = get_array(frame, "a").codes
    result_codes = get_array(replaced, "a").codes
    assert not np.shares_memory(source_codes, result_codes)
    tm.assert_frame_equal(frame, snapshot)

    # Setting a value on the result afterwards happens in its existing codes.
    codes_before = get_array(replaced, "a").codes
    replaced.iloc[0, 0] = 2.0
    assert np.shares_memory(get_array(replaced, "a").codes, codes_before)
@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(method):
    """In-place ``where``/``mask`` copies when a view references the same data."""
    frame = DataFrame({"a": [1.5, 2, 3]})
    snapshot = frame.copy()
    original_values = get_array(frame, "a")
    alias = frame[:]

    # Look the method up by name and call it with a boolean condition.
    masker = getattr(frame, method)
    masker(frame["a"] > 1.6, -1, inplace=True)

    assert not np.shares_memory(get_array(frame, "a"), original_values)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
    tm.assert_frame_equal(alias, snapshot)
def test_replace_empty_list():
    """Replacing with empty lists yields a result that shares the source data."""
    frame = DataFrame({"a": [1, 2]})

    shared = frame.replace([], [])
    assert np.shares_memory(get_array(shared, "a"), get_array(frame, "a"))
    assert not frame._mgr._has_no_reference(0)

    # A further non-inplace call whose result is dropped changes nothing:
    # same buffer, and both frames still hold references.
    values_before = get_array(frame, "a")
    frame.replace([], [])
    assert np.shares_memory(get_array(frame, "a"), values_before)
    assert not frame._mgr._has_no_reference(0)
    assert not shared._mgr._has_no_reference(0)
@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(value):
    """In-place list replace on an object column keeps using the same array."""
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype=object)
    values_before = get_array(frame, "a")

    frame.replace(["c"], value, inplace=True)

    assert np.shares_memory(values_before, get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
def test_replace_list_multiple_elements_inplace():
    """In-place replace of several values at once keeps using the same array."""
    frame = DataFrame({"a": [1, 2, 3]})
    values_before = get_array(frame, "a")

    frame.replace([1, 2], 4, inplace=True)

    assert np.shares_memory(values_before, get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
def test_replace_list_none():
    """List replace with ``value=None``: copy on a hit, shared data on a miss."""
    frame = DataFrame({"a": ["a", "b", "c"]})
    snapshot = frame.copy()

    hit = frame.replace(["b"], value=None)
    tm.assert_frame_equal(frame, snapshot)
    assert not np.shares_memory(get_array(frame, "a"), get_array(hit, "a"))

    # replace multiple values that don't actually replace anything with None
    # https://github.com/pandas-dev/pandas/issues/59770
    miss = frame.replace(["d", "e", "f"], value=None)
    tm.assert_frame_equal(miss, snapshot)
    assert tm.shares_memory(get_array(frame, "a"), get_array(miss, "a"))
def test_replace_list_none_inplace_refs():
    """In-place list replace with ``None`` must not modify a live view."""
    frame = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(frame, "a")
    snapshot = frame.copy()
    alias = frame[:]

    frame.replace(["a"], value=None, inplace=True)

    assert frame._mgr._has_no_reference(0)
    assert not np.shares_memory(values_before, get_array(frame, "a"))
    # The view still shows the data from before the replace.
    tm.assert_frame_equal(snapshot, alias)
def test_replace_columnwise_no_op_inplace():
    """Column-wise in-place replace without a match keeps sharing with a view."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    alias = frame[:]
    snapshot = frame.copy()

    frame.replace({"a": 10}, 100, inplace=True)
    assert np.shares_memory(get_array(alias, "a"), get_array(frame, "a"))

    # A later write to the frame must not show up in the view.
    frame.iloc[0, 0] = 100
    tm.assert_frame_equal(alias, snapshot)
def test_replace_columnwise_no_op():
    """Column-wise replace without a match returns a result sharing the data."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    snapshot = frame.copy()

    result = frame.replace({"a": 10}, 100)
    assert np.shares_memory(get_array(result, "a"), get_array(frame, "a"))

    # Writing to the result must leave the source frame unchanged.
    result.iloc[0, 0] = 100
    tm.assert_frame_equal(frame, snapshot)
def test_replace_chained_assignment():
df = DataFrame({"a": [1, np.nan, 2], "b": 1})
df_orig = df.copy()
with tm.raises_chained_assignment_error():
df["a"].replace(1, 100, inplace=True)
tm.assert_frame_equal(df, df_orig)
with tm.raises_chained_assignment_error():
df[["a"]].replace(1, 100, inplace=True)
tm.assert_frame_equal(df, df_orig)
def test_replace_listlike():
    """List-like replace shares data when nothing matches and copies otherwise."""
    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    df_orig = df.copy()

    # None of the values occur: the result shares memory with ``df``.
    result = df.replace([200, 201], [11, 11])
    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))

    # Writing to the result must not reach the source frame. Compare against
    # the pristine copy; comparing ``df`` with itself could never fail.
    result.iloc[0, 0] = 100
    tm.assert_frame_equal(df, df_orig)

    # One of the values occurs: the result gets its own data.
    result = df.replace([200, 2], [10, 10])
    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
    tm.assert_frame_equal(df, df_orig)
def test_replace_listlike_inplace():
    """In-place list-like replace reuses the array unless a view references it."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    first_values = get_array(frame, "a")

    # No other reference: the existing array is kept.
    frame.replace([200, 2], [10, 11], inplace=True)
    assert np.shares_memory(get_array(frame, "a"), first_values)

    # With a view alive, the frame gets new data and the view stays as it was.
    alias = frame[:]
    snapshot = frame.copy()
    frame.replace([200, 3], [10, 11], inplace=True)
    assert not np.shares_memory(get_array(frame, "a"), first_values)
    tm.assert_frame_equal(alias, snapshot)

View File

@@ -0,0 +1,142 @@
import numpy as np
from pandas import (
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for the values that are set in a DataFrame
def test_set_column_with_array():
    """Assigning a NumPy array as a new column (df[col] = arr) copies the data."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    source = np.array([1, 2, 3], dtype="int64")

    frame["c"] = source

    # the column does not share memory with the array that was assigned
    assert not np.shares_memory(get_array(frame, "c"), source)

    # so a later change to the array is not visible in the DataFrame
    source[0] = 0
    expected = Series([1, 2, 3], name="c")
    tm.assert_series_equal(frame["c"], expected)
def test_set_column_with_series():
    """Assigning a Series as a new column shares memory until one side is written.

    Case: df[col] = s (the copy is delayed with CoW).
    """
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    source = Series([1, 2, 3])

    frame["c"] = source
    assert np.shares_memory(get_array(frame, "c"), get_array(source))

    # modifying the series changes the series but not the DataFrame
    source.iloc[0] = 0
    assert source.iloc[0] == 0
    expected = Series([1, 2, 3], name="c")
    tm.assert_series_equal(frame["c"], expected)
def test_set_column_with_index():
    """Assigning an Index as a new column (df[col] = idx) copies the data."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # regular Index: the column does not share memory with the index values
    plain_index = Index([1, 2, 3])
    frame["c"] = plain_index
    assert not np.shares_memory(get_array(frame, "c"), plain_index.values)

    # RangeIndex: grab its values before assigning, then compare against them
    range_index = RangeIndex(1, 4)
    range_values = range_index.values
    frame["d"] = range_index
    assert not np.shares_memory(get_array(frame, "d"), range_values)
def test_set_columns_with_dataframe():
    """Assigning a DataFrame as new columns shares memory until a write happens.

    Case: df[[cols]] = df2 (the copy is delayed with CoW).
    """
    target = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    source = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]})

    target[["c", "d"]] = source
    assert np.shares_memory(get_array(target, "c"), get_array(source, "c"))

    # modifying the assigned DataFrame does not modify the target DataFrame
    source.iloc[0, 0] = 0
    expected = Series([7, 8, 9], name="c")
    tm.assert_series_equal(target["c"], expected)
def test_setitem_series_no_copy():
    """Setting a Series as a new column can delay copying the Series data."""
    frame = DataFrame({"a": [1, 2, 3]})
    source = Series([4, 5, 6])
    source_before = source.copy()

    # adding a new column shares memory with the Series
    frame["b"] = source
    assert np.shares_memory(get_array(source), get_array(frame, "b"))

    # writing to that column afterwards leaves the Series unchanged
    frame.iloc[0, 1] = 100
    tm.assert_series_equal(source, source_before)
def test_setitem_series_no_copy_single_block():
    """Overwriting an existing column that is a single block delays the copy."""
    frame = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    source = Series([4, 5, 6])
    source_before = source.copy()

    frame["a"] = source
    assert np.shares_memory(get_array(source), get_array(frame, "a"))

    # writing to the overwritten column leaves the Series unchanged
    frame.iloc[0, 0] = 100
    tm.assert_series_equal(source, source_before)
def test_setitem_series_no_copy_split_block():
    """Overwriting a column that is part of a larger block delays the copy."""
    frame = DataFrame({"a": [1, 2, 3], "b": 1})
    source = Series([4, 5, 6])
    source_before = source.copy()

    frame["b"] = source
    assert np.shares_memory(get_array(source), get_array(frame, "b"))

    # writing to the overwritten column leaves the Series unchanged
    frame.iloc[0, 1] = 100
    tm.assert_series_equal(source, source_before)
def test_setitem_series_column_midx_broadcasting():
    """Setting a Series to multiple MultiIndex columns repeats the data.

    The data is currently copied eagerly in this case.
    """
    columns = MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]])
    frame = DataFrame([[1, 2, 3], [3, 4, 5]], columns=columns)
    source = Series([10, 11])

    # "a" selects the first two columns, so the Series is broadcast to both
    frame["a"] = source

    assert not np.shares_memory(get_array(source), frame._get_column_array(0))
    assert frame._mgr._has_no_reference(0)
def test_set_column_with_inplace_operator():
    """Augmented assignment on a column, chained and via a separate Series."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # chained form: this should not raise any warning
    with tm.assert_produces_warning(None):
        frame["a"] += 1

    # NOTE(review): the original comment here said the non-chained form
    # "should produce a warning", but nothing below asserts one; this part
    # only checks that the operation runs.
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    column = frame["a"]
    column += 1

View File

@@ -0,0 +1,14 @@
import numpy as np
from pandas import DataFrame
from pandas.tests.copy_view.util import get_array
def test_get_array_numpy():
    """Two lookups of the same default-dtype column share memory."""
    frame = DataFrame({"a": [1, 2, 3]})
    first = get_array(frame, "a")
    second = get_array(frame, "a")
    assert np.shares_memory(first, second)
def test_get_array_masked():
    """Two lookups of the same "Int64" column share memory."""
    frame = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
    first = get_array(frame, "a")
    second = get_array(frame, "a")
    assert np.shares_memory(first, second)

View File

@@ -0,0 +1,30 @@
from pandas import (
Categorical,
Index,
Series,
)
from pandas.core.arrays import BaseMaskedArray
def get_array(obj, col=None):
    """
    Return the underlying array of an Index, a Series or a DataFrame column.

    Equivalent of df[col].values, but it avoids the normal getitem path,
    which triggers tracking references / CoW (and a test might be checking
    that some other operation does exactly that).

    An Index, or a Series whose name matches ``col`` (or when ``col`` is
    None), yields ``obj._values``. Anything else is treated as a DataFrame
    and ``col`` must name one of its columns. Masked arrays are unwrapped to
    their ``_data``, Categoricals are returned as they are, and any other
    array is replaced by its ``_ndarray`` attribute when it has one.
    """
    is_matching_series = isinstance(obj, Series) and (
        col is None or obj.name == col
    )
    if isinstance(obj, Index) or is_matching_series:
        values = obj._values
    else:
        assert col is not None
        position = obj.columns.get_loc(col)
        # get_loc must resolve to a single integer position here
        assert isinstance(position, int)
        values = obj._get_column_array(position)

    if isinstance(values, BaseMaskedArray):
        return values._data
    if isinstance(values, Categorical):
        return values
    return getattr(values, "_ndarray", values)