From 1a457efc019ee06bf11d350485cee12087db9d6e Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Wed, 4 May 2022 14:42:42 -0700 Subject: [PATCH] In-place updates with loc or iloc don't work correctly when the LHS has more than one column (#9918) Fixes: https://github.com/rapidsai/cudf/issues/7377 This PR enables to `setitem` using a scalar value, dataframe or array/list iterable in both `DataframeLocIndexer `and `DataFrameIlocIndexer `. Only the following cases are currently supported in cudf: - Scalar value: follows the original code path, assigns column- values via specified key (row-label) - Dataframe : checks for column-alignment in LHS and RHS, then uses a scatter map of the indices to assign column-values accordingly. Substitute NA for columns not found in the RHS - All other cases (array, list, range value, etc) : first conversion to cupy array followed by special handling: * If 2d array: If the inner dimension is 1, it's broadcastable to all columns of the dataframe. * Otherwise the value must be a 1d array (scalar values are handled in case 1 above), there are 2 subcases: * If the key on column axis is a scalar, meaning the user is indexing a single column; Therefore 1d value should assign along the columns. * Otherwise, the key on column axis is a 1d array. In this case, the key on row axis can be a scalar or 1d and in both cases of row key, the ith element in value corresponds to the ith row in the indexed object. If the key is 1d, a broadcast will happen. Authors: - Sheilah Kirui (https://github.com/skirui-source) - Michael Wang (https://github.com/isVoid) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9918 --- python/cudf/cudf/core/dataframe.py | 123 ++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 1 - python/cudf/cudf/tests/test_dataframe.py | 37 ----- python/cudf/cudf/tests/test_indexing.py | 186 +++++++++++++++++++++++ 4 files changed, 286 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8c459e855c1..036ef890696 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -111,6 +111,14 @@ } +def _shape_mismatch_error(x, y): + raise ValueError( + f"shape mismatch: value array of shape {x} " + f"could not be broadcast to indexing result of " + f"shape {y}" + ) + + class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): if ( @@ -342,28 +350,58 @@ def _setitem_tuple_arg(self, key, value): ) self._frame._data.insert(key[1], new_col) else: - if isinstance(value, (cupy.ndarray, np.ndarray)): - value_df = DataFrame(value) - if value_df.shape[1] != columns_df.shape[1]: - if value_df.shape[1] == 1: - value_cols = ( - value_df._data.columns * columns_df.shape[1] - ) - else: - raise ValueError( - f"shape mismatch: value array of shape " - f"{value_df.shape} could not be " - f"broadcast to indexing result of shape " - f"{columns_df.shape}" - ) - else: - value_cols = value_df._data.columns - for i, col in enumerate(columns_df._column_names): - self._frame[col].loc[key[0]] = value_cols[i] - else: + if is_scalar(value): for col in columns_df._column_names: self._frame[col].loc[key[0]] = value + elif isinstance(value, cudf.DataFrame): + if value.shape != self._frame.loc[key[0]].shape: + _shape_mismatch_error( + value.shape, + self._frame.loc[key[0]].shape, + ) + value_column_names = set(value._column_names) + scatter_map = _indices_from_labels(self._frame, key[0]) + for col in columns_df._column_names: + columns_df[col][scatter_map] = ( + value._data[col] + if col in value_column_names + else cudf.NA + ) + + else: + value = cupy.asarray(value) + if cupy.ndim(value) == 2: + # If the inner dimension is 1, it's broadcastable to + # all columns of the dataframe. + indexed_shape = columns_df.loc[key[0]].shape + if value.shape[1] == 1: + if value.shape[0] != indexed_shape[0]: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[:, 0] + else: + if value.shape != indexed_shape: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[:, i] + else: + # handle cases where value is 1d object: + # If the key on column axis is a scalar, we indexed + # a single column; The 1d value should assign along + # the columns. + if is_scalar(key[1]): + for col in columns_df._column_names: + self._frame[col].loc[key[0]] = value + # Otherwise, there are two situations. The key on row axis + # can be a scalar or 1d. In either of the situation, the + # ith element in value corresponds to the ith row in + # the indexed object. + # If the key is 1d, a broadcast will happen. + else: + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[i] + class _DataFrameIlocIndexer(_DataFrameIndexer): """ @@ -424,10 +462,49 @@ def _getitem_tuple_arg(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - # TODO: Determine if this usage is prevalent enough to expose this - # selection logic at a higher level than ColumnAccessor. - for col in self._frame._data.get_labels_by_index(key[1]): - self._frame[col].iloc[key[0]] = value + columns_df = self._frame._from_data( + self._frame._data.select_by_index(key[1]), self._frame._index + ) + + if is_scalar(value): + for col in columns_df._column_names: + self._frame[col].iloc[key[0]] = value + + elif isinstance(value, cudf.DataFrame): + if value.shape != self._frame.iloc[key[0]].shape: + _shape_mismatch_error( + value.shape, + self._frame.loc[key[0]].shape, + ) + value_column_names = set(value._column_names) + for col in columns_df._column_names: + columns_df[col][key[0]] = ( + value._data[col] if col in value_column_names else cudf.NA + ) + + else: + # TODO: consolidate code path with identical counterpart + # in `_DataFrameLocIndexer._setitem_tuple_arg` + value = cupy.asarray(value) + if cupy.ndim(value) == 2: + indexed_shape = columns_df.iloc[key[0]].shape + if value.shape[1] == 1: + if value.shape[0] != indexed_shape[0]: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].iloc[key[0]] = value[:, 0] + else: + if value.shape != indexed_shape: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame._data[col][key[0]] = value[:, i] + else: + if is_scalar(key[1]): + for col in columns_df._column_names: + self._frame[col].iloc[key[0]] = value + else: + for i, col in enumerate(columns_df._column_names): + self._frame[col].iloc[key[0]] = value[i] def _getitem_scalar(self, arg): col = self._frame.columns[arg[1]] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1361fc56fa0..f4dcf9f59ca 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -144,7 +144,6 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): def _indices_from_labels(obj, labels): - if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9f2a3d45778..7f482c0e776 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8697,43 +8697,6 @@ def test_frame_series_where(): assert_eq(expected, actual) -@pytest.mark.parametrize( - "array,is_error", - [ - (cupy.arange(20, 40).reshape(-1, 2), False), - (cupy.arange(20, 50).reshape(-1, 3), True), - (np.arange(20, 40).reshape(-1, 2), False), - (np.arange(20, 30).reshape(-1, 1), False), - (cupy.arange(20, 30).reshape(-1, 1), False), - ], -) -def test_dataframe_indexing_setitem_np_cp_array(array, is_error): - gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) - pdf = gdf.to_pandas() - if not is_error: - gdf.loc[:, ["a", "b"]] = array - pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) - - assert_eq(gdf, pdf) - else: - assert_exceptions_equal( - lfunc=pdf.loc.__setitem__, - rfunc=gdf.loc.__setitem__, - lfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], - {}, - ), - rfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), array], - {}, - ), - compare_error_message=False, - expected_error_message="shape mismatch: value array of shape " - "(10, 3) could not be broadcast to indexing " - "result of shape (10, 2)", - ) - - @pytest.mark.parametrize( "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 225aa0cd6bc..790fbd0d3f8 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1486,3 +1486,189 @@ def test_iloc_decimal(): ["4.00", "3.00", "2.00", "1.00"], ).astype(cudf.Decimal64Dtype(scale=2, precision=3)) assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True)) + + +@pytest.mark.parametrize( + ("key, value"), + [ + ( + ([0], ["x", "y"]), + [10, 20], + ), + ( + ([0, 2], ["x", "y"]), + [[10, 30], [20, 40]], + ), + ( + (0, ["x", "y"]), + [10, 20], + ), + ( + ([0, 2], "x"), + [10, 20], + ), + ], +) +def test_dataframe_loc_inplace_update(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[key] = value + expected = pdf.loc[key] = value + + assert_eq(expected, actual) + + +def test_dataframe_loc_inplace_update_string_index(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=list("abc")) + pdf = gdf.to_pandas() + + actual = gdf.loc[["a"], ["x", "y"]] = [10, 20] + expected = pdf.loc[["a"], ["x", "y"]] = [10, 20] + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + ("key, value"), + [ + ([0], [10, 20]), + ([0, 2], [[10, 30], [20, 40]]), + (([0, 2], [0, 1]), [[10, 30], [20, 40]]), + (([0, 2], 0), [10, 30]), + ((0, [0, 1]), [20, 40]), + ], +) +def test_dataframe_iloc_inplace_update(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.iloc[key] = value + expected = pdf.iloc[key] = value + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "loc_key", + [([0, 2], ["x", "y"])], +) +@pytest.mark.parametrize( + "iloc_key", + [[0, 2]], +) +@pytest.mark.parametrize( + ("data, index"), + [ + ( + {"x": [10, 20], "y": [30, 40]}, + [0, 2], + ) + ], +) +def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( + loc_key, iloc_key, data, index +): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[loc_key] = cudf.DataFrame(data, index=cudf.Index(index)) + expected = pdf.loc[loc_key] = pd.DataFrame(data, index=pd.Index(index)) + assert_eq(expected, actual) + + actual = gdf.iloc[iloc_key] = cudf.DataFrame(data, index=cudf.Index(index)) + expected = pdf.iloc[iloc_key] = pd.DataFrame(data, index=pd.Index(index)) + assert_eq(expected, actual) + + +def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2]) + ) + expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + ("key, value"), + [ + (([0, 2], ["x", "y"]), [[10, 30, 50], [20, 40, 60]]), + (([0], ["x", "y"]), [[10], [20]]), + ], +) +def test_dataframe_loc_inplace_update_shape_mismatch(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.loc[key] = value + + +@pytest.mark.parametrize( + ("key, value"), + [ + ([0, 2], [[10, 30, 50], [20, 40, 60]]), + ([0], [[10], [20]]), + ], +) +def test_dataframe_iloc_inplace_update_shape_mismatch(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.iloc[key] = value + + +def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.loc[([0, 2], ["x", "y"])] = cudf.DataFrame( + {"x": [10, 20]}, index=cudf.Index([0, 2]) + ) + + +def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.iloc[[0, 2]] = cudf.DataFrame( + {"x": [10, 20]}, index=cudf.Index([0, 2]) + ) + + +@pytest.mark.parametrize( + "array,is_error", + [ + (cupy.arange(20, 40).reshape(-1, 2), False), + (cupy.arange(20, 50).reshape(-1, 3), True), + (np.arange(20, 40).reshape(-1, 2), False), + (np.arange(20, 30).reshape(-1, 1), False), + (cupy.arange(20, 30).reshape(-1, 1), False), + ], +) +def test_dataframe_indexing_setitem_np_cp_array(array, is_error): + gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = gdf.to_pandas() + if not is_error: + gdf.loc[:, ["a", "b"]] = array + pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) + + assert_eq(gdf, pdf) + else: + assert_exceptions_equal( + lfunc=pdf.loc.__setitem__, + rfunc=gdf.loc.__setitem__, + lfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], + {}, + ), + rfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), array], + {}, + ), + compare_error_message=False, + expected_error_message="shape mismatch: value array of shape " + "(10, 3) could not be broadcast to indexing " + "result of shape (10, 2)", + )