Skip to content

Commit

Permalink
In-place updates with loc or iloc don't work correctly when the LHS h…
Browse files Browse the repository at this point in the history
…as more than one column (NVIDIA#9918)

Fixes: rapidsai/cudf#7377

This PR enables to `setitem` using a scalar value, dataframe  or  array/list iterable in both `DataframeLocIndexer `and  `DataFrameIlocIndexer `. Only the following cases are currently supported in cudf:
- Scalar value: follows the original code path, assigns column- values via specified  key (row-label)
- Dataframe : checks for column-alignment in LHS and RHS, then uses a scatter map of the indices to assign column-values accordingly. Substitute NA for columns not found in the RHS
- All other cases (array, list, range value, etc) :  first conversion to cupy array followed by special handling:
   * If 2d array:  If the inner dimension is 1, it's broadcastable to all columns of the dataframe.
   * Otherwise the value must be a 1d array (scalar values are handled in case 1 above), there are 2 subcases:
     * If the key on column axis is a scalar, meaning the user is indexing a single column; Therefore 1d value should assign along the columns.
     * Otherwise, the key on column axis is a 1d array. In this case, the key on row axis can be a scalar or 1d and in both cases of row key, the ith element in value corresponds to the ith row in the indexed object. If the key is 1d, a broadcast will happen.

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Michael Wang (https://github.com/isVoid)

URL: rapidsai/cudf#9918
  • Loading branch information
skirui-source authored May 4, 2022
1 parent dd68db3 commit 1a457ef
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 61 deletions.
123 changes: 100 additions & 23 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@
}


def _shape_mismatch_error(x, y):
raise ValueError(
f"shape mismatch: value array of shape {x} "
f"could not be broadcast to indexing result of "
f"shape {y}"
)


class _DataFrameIndexer(_FrameIndexer):
def __getitem__(self, arg):
if (
Expand Down Expand Up @@ -342,28 +350,58 @@ def _setitem_tuple_arg(self, key, value):
)
self._frame._data.insert(key[1], new_col)
else:
if isinstance(value, (cupy.ndarray, np.ndarray)):
value_df = DataFrame(value)
if value_df.shape[1] != columns_df.shape[1]:
if value_df.shape[1] == 1:
value_cols = (
value_df._data.columns * columns_df.shape[1]
)
else:
raise ValueError(
f"shape mismatch: value array of shape "
f"{value_df.shape} could not be "
f"broadcast to indexing result of shape "
f"{columns_df.shape}"
)
else:
value_cols = value_df._data.columns
for i, col in enumerate(columns_df._column_names):
self._frame[col].loc[key[0]] = value_cols[i]
else:
if is_scalar(value):
for col in columns_df._column_names:
self._frame[col].loc[key[0]] = value

elif isinstance(value, cudf.DataFrame):
if value.shape != self._frame.loc[key[0]].shape:
_shape_mismatch_error(
value.shape,
self._frame.loc[key[0]].shape,
)
value_column_names = set(value._column_names)
scatter_map = _indices_from_labels(self._frame, key[0])
for col in columns_df._column_names:
columns_df[col][scatter_map] = (
value._data[col]
if col in value_column_names
else cudf.NA
)

else:
value = cupy.asarray(value)
if cupy.ndim(value) == 2:
# If the inner dimension is 1, it's broadcastable to
# all columns of the dataframe.
indexed_shape = columns_df.loc[key[0]].shape
if value.shape[1] == 1:
if value.shape[0] != indexed_shape[0]:
_shape_mismatch_error(value.shape, indexed_shape)
for i, col in enumerate(columns_df._column_names):
self._frame[col].loc[key[0]] = value[:, 0]
else:
if value.shape != indexed_shape:
_shape_mismatch_error(value.shape, indexed_shape)
for i, col in enumerate(columns_df._column_names):
self._frame[col].loc[key[0]] = value[:, i]
else:
# handle cases where value is 1d object:
# If the key on column axis is a scalar, we indexed
# a single column; The 1d value should assign along
# the columns.
if is_scalar(key[1]):
for col in columns_df._column_names:
self._frame[col].loc[key[0]] = value
# Otherwise, there are two situations. The key on row axis
# can be a scalar or 1d. In either of the situation, the
# ith element in value corresponds to the ith row in
# the indexed object.
# If the key is 1d, a broadcast will happen.
else:
for i, col in enumerate(columns_df._column_names):
self._frame[col].loc[key[0]] = value[i]


class _DataFrameIlocIndexer(_DataFrameIndexer):
"""
Expand Down Expand Up @@ -424,10 +462,49 @@ def _getitem_tuple_arg(self, arg):

@_cudf_nvtx_annotate
def _setitem_tuple_arg(self, key, value):
# TODO: Determine if this usage is prevalent enough to expose this
# selection logic at a higher level than ColumnAccessor.
for col in self._frame._data.get_labels_by_index(key[1]):
self._frame[col].iloc[key[0]] = value
columns_df = self._frame._from_data(
self._frame._data.select_by_index(key[1]), self._frame._index
)

if is_scalar(value):
for col in columns_df._column_names:
self._frame[col].iloc[key[0]] = value

elif isinstance(value, cudf.DataFrame):
if value.shape != self._frame.iloc[key[0]].shape:
_shape_mismatch_error(
value.shape,
self._frame.loc[key[0]].shape,
)
value_column_names = set(value._column_names)
for col in columns_df._column_names:
columns_df[col][key[0]] = (
value._data[col] if col in value_column_names else cudf.NA
)

else:
# TODO: consolidate code path with identical counterpart
# in `_DataFrameLocIndexer._setitem_tuple_arg`
value = cupy.asarray(value)
if cupy.ndim(value) == 2:
indexed_shape = columns_df.iloc[key[0]].shape
if value.shape[1] == 1:
if value.shape[0] != indexed_shape[0]:
_shape_mismatch_error(value.shape, indexed_shape)
for i, col in enumerate(columns_df._column_names):
self._frame[col].iloc[key[0]] = value[:, 0]
else:
if value.shape != indexed_shape:
_shape_mismatch_error(value.shape, indexed_shape)
for i, col in enumerate(columns_df._column_names):
self._frame._data[col][key[0]] = value[:, i]
else:
if is_scalar(key[1]):
for col in columns_df._column_names:
self._frame[col].iloc[key[0]] = value
else:
for i, col in enumerate(columns_df._column_names):
self._frame[col].iloc[key[0]] = value[i]

def _getitem_scalar(self, arg):
col = self._frame.columns[arg[1]]
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str):


def _indices_from_labels(obj, labels):

if not isinstance(labels, cudf.MultiIndex):
labels = cudf.core.column.as_column(labels)

Expand Down
37 changes: 0 additions & 37 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8697,43 +8697,6 @@ def test_frame_series_where():
assert_eq(expected, actual)


@pytest.mark.parametrize(
"array,is_error",
[
(cupy.arange(20, 40).reshape(-1, 2), False),
(cupy.arange(20, 50).reshape(-1, 3), True),
(np.arange(20, 40).reshape(-1, 2), False),
(np.arange(20, 30).reshape(-1, 1), False),
(cupy.arange(20, 30).reshape(-1, 1), False),
],
)
def test_dataframe_indexing_setitem_np_cp_array(array, is_error):
gdf = cudf.DataFrame({"a": range(10), "b": range(10)})
pdf = gdf.to_pandas()
if not is_error:
gdf.loc[:, ["a", "b"]] = array
pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array)

assert_eq(gdf, pdf)
else:
assert_exceptions_equal(
lfunc=pdf.loc.__setitem__,
rfunc=gdf.loc.__setitem__,
lfunc_args_and_kwargs=(
[(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)],
{},
),
rfunc_args_and_kwargs=(
[(slice(None, None, None), ["a", "b"]), array],
{},
),
compare_error_message=False,
expected_error_message="shape mismatch: value array of shape "
"(10, 3) could not be broadcast to indexing "
"result of shape (10, 2)",
)


@pytest.mark.parametrize(
"data",
[{"a": [1, 2, 3], "b": [1, 1, 0]}],
Expand Down
186 changes: 186 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1486,3 +1486,189 @@ def test_iloc_decimal():
["4.00", "3.00", "2.00", "1.00"],
).astype(cudf.Decimal64Dtype(scale=2, precision=3))
assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True))


@pytest.mark.parametrize(
("key, value"),
[
(
([0], ["x", "y"]),
[10, 20],
),
(
([0, 2], ["x", "y"]),
[[10, 30], [20, 40]],
),
(
(0, ["x", "y"]),
[10, 20],
),
(
([0, 2], "x"),
[10, 20],
),
],
)
def test_dataframe_loc_inplace_update(key, value):
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
pdf = gdf.to_pandas()

actual = gdf.loc[key] = value
expected = pdf.loc[key] = value

assert_eq(expected, actual)


def test_dataframe_loc_inplace_update_string_index():
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=list("abc"))
pdf = gdf.to_pandas()

actual = gdf.loc[["a"], ["x", "y"]] = [10, 20]
expected = pdf.loc[["a"], ["x", "y"]] = [10, 20]

assert_eq(expected, actual)


@pytest.mark.parametrize(
("key, value"),
[
([0], [10, 20]),
([0, 2], [[10, 30], [20, 40]]),
(([0, 2], [0, 1]), [[10, 30], [20, 40]]),
(([0, 2], 0), [10, 30]),
((0, [0, 1]), [20, 40]),
],
)
def test_dataframe_iloc_inplace_update(key, value):
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
pdf = gdf.to_pandas()

actual = gdf.iloc[key] = value
expected = pdf.iloc[key] = value

assert_eq(expected, actual)


@pytest.mark.parametrize(
"loc_key",
[([0, 2], ["x", "y"])],
)
@pytest.mark.parametrize(
"iloc_key",
[[0, 2]],
)
@pytest.mark.parametrize(
("data, index"),
[
(
{"x": [10, 20], "y": [30, 40]},
[0, 2],
)
],
)
def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe(
loc_key, iloc_key, data, index
):
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
pdf = gdf.to_pandas()

actual = gdf.loc[loc_key] = cudf.DataFrame(data, index=cudf.Index(index))
expected = pdf.loc[loc_key] = pd.DataFrame(data, index=pd.Index(index))
assert_eq(expected, actual)

actual = gdf.iloc[iloc_key] = cudf.DataFrame(data, index=cudf.Index(index))
expected = pdf.iloc[iloc_key] = pd.DataFrame(data, index=pd.Index(index))
assert_eq(expected, actual)


def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns():
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
pdf = gdf.to_pandas()

actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame(
{"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2])
)
expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame(
{"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2])
)

assert_eq(expected, actual)


@pytest.mark.parametrize(
("key, value"),
[
(([0, 2], ["x", "y"]), [[10, 30, 50], [20, 40, 60]]),
(([0], ["x", "y"]), [[10], [20]]),
],
)
def test_dataframe_loc_inplace_update_shape_mismatch(key, value):
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
with pytest.raises(ValueError, match="shape mismatch:"):
gdf.loc[key] = value


@pytest.mark.parametrize(
("key, value"),
[
([0, 2], [[10, 30, 50], [20, 40, 60]]),
([0], [[10], [20]]),
],
)
def test_dataframe_iloc_inplace_update_shape_mismatch(key, value):
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
with pytest.raises(ValueError, match="shape mismatch:"):
gdf.iloc[key] = value


def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df():
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
with pytest.raises(ValueError, match="shape mismatch:"):
gdf.loc[([0, 2], ["x", "y"])] = cudf.DataFrame(
{"x": [10, 20]}, index=cudf.Index([0, 2])
)


def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df():
gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
with pytest.raises(ValueError, match="shape mismatch:"):
gdf.iloc[[0, 2]] = cudf.DataFrame(
{"x": [10, 20]}, index=cudf.Index([0, 2])
)


@pytest.mark.parametrize(
"array,is_error",
[
(cupy.arange(20, 40).reshape(-1, 2), False),
(cupy.arange(20, 50).reshape(-1, 3), True),
(np.arange(20, 40).reshape(-1, 2), False),
(np.arange(20, 30).reshape(-1, 1), False),
(cupy.arange(20, 30).reshape(-1, 1), False),
],
)
def test_dataframe_indexing_setitem_np_cp_array(array, is_error):
gdf = cudf.DataFrame({"a": range(10), "b": range(10)})
pdf = gdf.to_pandas()
if not is_error:
gdf.loc[:, ["a", "b"]] = array
pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array)

assert_eq(gdf, pdf)
else:
assert_exceptions_equal(
lfunc=pdf.loc.__setitem__,
rfunc=gdf.loc.__setitem__,
lfunc_args_and_kwargs=(
[(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)],
{},
),
rfunc_args_and_kwargs=(
[(slice(None, None, None), ["a", "b"]), array],
{},
),
compare_error_message=False,
expected_error_message="shape mismatch: value array of shape "
"(10, 3) could not be broadcast to indexing "
"result of shape (10, 2)",
)

0 comments on commit 1a457ef

Please sign in to comment.