From 10fac9e4a1af17fe6297fcfd6262262a965a8963 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Feb 2022 15:29:55 -0800 Subject: [PATCH 1/5] Mark DataFrame.columns as external only and remove all usage required for dataframe,series,index,and multiindex tests to pass. --- python/cudf/cudf/core/_base_index.py | 4 +- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/dataframe.py | 140 ++++++++++++---------- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 13 +- python/cudf/cudf/core/indexed_frame.py | 25 ++-- python/cudf/cudf/core/multiindex.py | 9 +- python/cudf/cudf/core/reshape.py | 6 +- 8 files changed, 105 insertions(+), 96 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7a9a17631a9..16fb0cf99c1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -957,7 +957,9 @@ def _union(self, other, sort=None): self_df["order"] = self_df.index other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") - res = res.sort_values(by=res.columns[1:], ignore_index=True) + res = res.sort_values( + by=res._data.to_pandas_index()[1:], ignore_index=True + ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) if sort is None and len(other): diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6c94a84fd37..40fa8a5de0b 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -264,7 +264,7 @@ def where( ) # Setting `frame` column names to `cond` # as `cond` has no column names. - cond.columns = frame.columns + cond.columns = frame._data.to_pandas_index() (source_df, others,) = _normalize_columns_and_scalars_type( frame, other diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1c672aacd86..162f7fd3c92 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -75,7 +75,7 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import GetAttrGetItemMixin +from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api T = TypeVar("T", bound="DataFrame") @@ -94,8 +94,9 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - if isinstance(self._frame.index, MultiIndex) or isinstance( - self._frame.columns, MultiIndex + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex ): # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. @@ -164,7 +165,7 @@ def _downcast_to_series(self, df, arg): # determine the axis along which the Series is taken: if nrows == 1 and ncols == 1: if is_scalar(arg[0]) and is_scalar(arg[1]): - return df[df.columns[0]].iloc[0] + return df[df._column_names[0]].iloc[0] elif not is_scalar(arg[0]): axis = 1 else: @@ -285,8 +286,9 @@ def _getitem_tuple_arg(self, arg): @annotate("LOC_SETITEM", color="blue", domain="cudf_python") def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, MultiIndex) or isinstance( - self._frame.columns, pd.MultiIndex + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex ): raise NotImplementedError( "Setting values using df.loc[] not supported on " @@ -546,7 +548,6 @@ def __init__( ) else: self._data = data._data - self.columns = data.columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -595,7 +596,6 @@ def __init__( self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: @@ -605,7 +605,6 @@ def __init__( new_df = self.from_records(data, index=index, columns=columns) self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): @@ -617,7 +616,6 @@ def __init__( self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -719,8 +717,9 @@ def _init_from_series_list(self, data, columns, index): else: concat_df = cudf.concat(data, axis=1) - if concat_df.columns.dtype == "object": - concat_df.columns = concat_df.columns.astype("str") + cols = concat_df._data.to_pandas_index() + if cols.dtype == "object": + concat_df.columns = cols.astype("str") transpose = concat_df.T @@ -972,7 +971,9 @@ def __dir__(self): o = set(dir(type(self))) o.update(self.__dict__) o.update( - c for c in self.columns if isinstance(c, str) and c.isidentifier() + c + for c in self._column_names + if isinstance(c, str) and c.isidentifier() ) return list(o) @@ -983,7 +984,6 @@ def __setattr__(self, key, col): # properties, and we must call object.__getattribute__ to bypass # the `__getitem__` behavior inherited from `GetAttrGetItemMixin`. object.__getattribute__(self, key) - super().__setattr__(key, col) except AttributeError: if key not in self._PROTECTED_KEYS: try: @@ -998,6 +998,16 @@ def __setattr__(self, key, col): # Set a new attribute that is not already a column. super().__setattr__(key, col) + except RuntimeError as e: + # Need to allow setting properties that are marked as forbidden for + # internal usage. + # TODO: Check if there are alternatives that could be used instead. + if "External-only API" not in str(e): + raise + super().__setattr__(key, col) + else: + super().__setattr__(key, col) + @annotate("DATAFRAME_GETITEM", color="blue", domain="cudf_python") def __getitem__(self, arg): """ @@ -1304,7 +1314,7 @@ def _slice(self: T, arg: slice) -> T: # Adding index of type RangeIndex back to # result result.index = self.index[start:stop] - result.columns = self.columns + result.columns = self._data.to_pandas_index() return result @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") @@ -1606,7 +1616,7 @@ def _concat( ) # Reassign index and column names - if isinstance(objs[0].columns, pd.MultiIndex): + if objs[0]._data.multiindex: out.columns = objs[0].columns else: out.columns = names @@ -1930,12 +1940,14 @@ def _prep_for_binop( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS and ( - not lhs.columns.equals(rhs.columns) + not lhs._data.to_pandas_index().equals( + rhs._data.to_pandas_index() + ) or not lhs.index.equals(rhs.index) ) ): raise ValueError( - "Can only compare identically-labeled " "DataFrame objects" + "Can only compare identically-labeled DataFrame objects" ) lhs, rhs = _align_indices(lhs, rhs) @@ -2074,8 +2086,9 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - if not self.columns.equals(other.columns): - other = other.reindex(self.columns, axis=1) + self_cols = self._data.to_pandas_index() + if not self_cols.equals(other._data.to_pandas_index()): + other = other.reindex(self_cols, axis=1) if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) @@ -2104,7 +2117,7 @@ def update( @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): - return iter(self.columns) + return iter(self._column_names) @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): @@ -2150,6 +2163,7 @@ def at(self): return self.loc @property # type: ignore + @_external_only_api @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): """Returns a tuple of columns""" @@ -2451,7 +2465,7 @@ def set_index( for col in keys: # Is column label if is_scalar(col) or isinstance(col, tuple): - if col in self.columns: + if col in self._column_names: columns_to_add.append(self[col]) names.append(col) if drop: @@ -2747,7 +2761,7 @@ def diff(self, periods=1, axis=0): df = cudf.DataFrame._from_data( { name: column_empty(len(self), dtype=dtype, masked=True) - for name, dtype in zip(self.columns, self.dtypes) + for name, dtype in zip(self._column_names, self.dtypes) } ) return df @@ -3460,7 +3474,7 @@ def transpose(self): # Never transpose a MultiIndex - remove the existing columns and # replace with a RangeIndex. Afterward, reassign. columns = self.index.copy(deep=False) - index = self.columns.copy(deep=False) + index = self._data.to_pandas_index() if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) # Set the old column names as the new index @@ -4313,13 +4327,13 @@ def info( ) lines.append(index_summary) - if len(self.columns) == 0: + if len(self._data) == 0: lines.append(f"Empty {type(self).__name__}") cudf.utils.ioutils.buffer_write_lines(buf, lines) return - cols = self.columns - col_count = len(self.columns) + cols = self._column_names + col_count = len(cols) if max_cols is None: max_cols = pd.options.display.max_info_columns @@ -4337,7 +4351,7 @@ def _put_str(s, space): return str(s)[:space].ljust(space) def _verbose_repr(): - lines.append(f"Data columns (total {len(self.columns)} columns):") + lines.append(f"Data columns (total {col_count} columns):") id_head = " # " column_head = "Column" @@ -4357,10 +4371,10 @@ def _verbose_repr(): ) if show_counts: counts = self.count().to_pandas().tolist() - if len(cols) != len(counts): + if col_count != len(counts): raise AssertionError( f"Columns must equal " - f"counts ({len(cols)} != {len(counts)})" + f"counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) @@ -4393,7 +4407,7 @@ def _verbose_repr(): + _put_str("-" * len_dtype, space_dtype).rstrip() ) - for i, col in enumerate(self.columns): + for i, col in enumerate(self._column_names): dtype = self.dtypes.iloc[i] col = pprint_thing(col) @@ -4410,13 +4424,11 @@ def _verbose_repr(): ) def _non_verbose_repr(): - if len(self.columns) > 0: - entries_summary = f", {self.columns[0]} to {self.columns[-1]}" + if col_count > 0: + entries_summary = f", {cols[0]} to {cols[-1]}" else: entries_summary = "" - columns_summary = ( - f"Columns: {len(self.columns)} entries{entries_summary}" - ) + columns_summary = f"Columns: {col_count} entries{entries_summary}" lines.append(columns_summary) def _sizeof_fmt(num, size_qualifier): @@ -4479,7 +4491,7 @@ def describe( if datetime_is_numeric: default_include.append("datetime") data_to_describe = self.select_dtypes(include=default_include) - if len(data_to_describe.columns) == 0: + if data_to_describe._num_columns == 0: data_to_describe = self elif include == "all": @@ -4497,7 +4509,7 @@ def describe( describe_series_list = [ data_to_describe[col].describe(percentiles=percentiles) - for col in data_to_describe.columns + for col in data_to_describe._column_names ] if len(describe_series_list) == 1: return describe_series_list[0].to_frame() @@ -4587,26 +4599,14 @@ def to_pandas(self, nullable=False, **kwargs): out_data = {} out_index = self.index.to_pandas() - if not isinstance(self.columns, pd.Index): - out_columns = self.columns.to_pandas() - else: - out_columns = self.columns - for i, col_key in enumerate(self._data): out_data[i] = self._data[col_key].to_pandas( index=out_index, nullable=nullable ) - if isinstance(self.columns, BaseIndex): - out_columns = self.columns.to_pandas() - if isinstance(self.columns, MultiIndex): - if self.columns.names is not None: - out_columns.names = self.columns.names - else: - out_columns.name = self.columns.name - out_df = pd.DataFrame(out_data, index=out_index) - out_df.columns = out_columns + out_df.columns = self._data.to_pandas_index() + return out_df @classmethod @@ -5105,7 +5105,7 @@ def quantile( if isinstance(q, numbers.Number) and numeric_only: result = result.fillna(np.nan) result = result.iloc[0] - result.index = as_index(data_df.columns) + result.index = data_df._data.to_pandas_index() result.name = q return result else: @@ -5285,6 +5285,7 @@ def make_false_column_like_self(): f"'{type(values).__name__}'" ) + # TODO: Update this logic to properly preserve MultiIndex columns. return DataFrame._from_data(result, self.index) # @@ -5521,7 +5522,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): if isinstance(df, Series): df = df.to_frame() - df.columns = data_df.columns + df.columns = data_df._data.to_pandas_index() return df @@ -5651,7 +5652,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): return Series(result, index=self.index, dtype=result_dtype,) else: result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns + result_df.columns = prepared._data.to_pandas_index() return result_df @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") @@ -5926,16 +5927,20 @@ def cov(self, **kwargs): cov : DataFrame """ cov = cupy.cov(self.values, rowvar=False) - df = DataFrame(cupy.asfortranarray(cov)).set_index(self.columns) - df.columns = self.columns + # TODO: Why are we setting this for both index and columns? + cols = self._data.to_pandas_index() + df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df.columns = cols return df @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) - df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns) - df.columns = self.columns + # TODO: Why are we setting this for both index and columns? + cols = self._data.to_pandas_index() + df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df.columns = cols return df @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") @@ -6001,7 +6006,7 @@ def keys(self): >>> df.keys() Int64Index([0, 1, 2, 3], dtype='int64') """ - return self.columns + return self._data.to_pandas_index() def itertuples(self, index=True, name="Pandas"): raise TypeError( @@ -6134,7 +6139,7 @@ def append( "or if the Series has a name" ) - current_cols = self.columns + current_cols = self._data.to_pandas_index() combined_columns = other.index.to_pandas() if len(current_cols): @@ -6162,8 +6167,11 @@ def append( pass elif not isinstance(other[0], DataFrame): other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) + cols = self._data.to_pandas_index() + if ( + cols.get_indexer(other._data.to_pandas_index()) >= 0 + ).all(): + other = other.reindex(columns=cols) if is_list_like(other): to_concat = [self, *other] @@ -6622,7 +6630,7 @@ def _setitem_with_dataframe( if input_cols is None: input_cols = input_df.columns - if len(input_cols) != len(replace_df.columns): + if len(input_cols) != len(replace_df._column_names): raise ValueError( "Number of Input Columns must be same replacement Dataframe" ) @@ -6634,8 +6642,8 @@ def _setitem_with_dataframe( ): replace_df = replace_df.reindex(input_df.index) - for col_1, col_2 in zip(input_cols, replace_df.columns): - if col_1 in input_df.columns: + for col_1, col_2 in zip(input_cols, replace_df._column_names): + if col_1 in input_df._column_names: if mask is not None: input_df._data[col_1][mask] = column.as_column( replace_df[col_2] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d478baf2d69..50ede4faea4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6822,7 +6822,7 @@ def _drop_rows_by_labels( return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, - columns=obj.columns, + columns=obj._data.to_pandas_index(), ) else: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b90f857ce84..5e3a88193ac 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -7,7 +7,6 @@ from functools import cached_property import numpy as np -import pandas as pd from nvtx import annotate import cudf @@ -905,7 +904,7 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - _cols = self.grouping.values.columns.tolist() + _cols = self.grouping.values._data.to_pandas_index().tolist() len_cols = len(_cols) new_df_data = {} @@ -1048,7 +1047,7 @@ def cov(self, min_periods=0, ddof=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs used in the covariance calculation # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values.columns.tolist() + column_names = self.grouping.values._column_names num_cols = len(column_names) column_pair_structs = {} @@ -1616,11 +1615,9 @@ def agg(self, func): return result.iloc[:, 0] # drop the first level if we have a multiindex - if ( - isinstance(result.columns, pd.MultiIndex) - and result.columns.nlevels > 1 - ): - result.columns = result.columns.droplevel(0) + cols = result._data.to_pandas_index() + if result._data.multiindex and cols.nlevels > 1: + result.columns = cols.droplevel(0) return result diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index bc7337d0a42..d423bb430b3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -444,12 +444,11 @@ def sort_index( out = self._gather(inds) # TODO: frame factory function should handle multilevel column # names - if isinstance( - self, cudf.core.dataframe.DataFrame - ) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out.columns = self._data.to_pandas_index() elif (ascending and idx.is_monotonic_increasing) or ( not ascending and idx.is_monotonic_decreasing ): @@ -459,12 +458,11 @@ def sort_index( ascending=ascending, na_position=na_position ) out = self._gather(inds) - if isinstance( - self, cudf.core.dataframe.DataFrame - ) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out.columns = self._data.to_pandas_index() else: labels = sorted(self._data.names, reverse=not ascending) out = self[labels] @@ -938,10 +936,11 @@ def sort_values( ), keep_index=not ignore_index, ) - if isinstance(self, cudf.core.dataframe.DataFrame) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out.columns = self._data.to_pandas_index() return out def _n_largest_or_smallest(self, largest, n, columns, keep): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 5e0cd2ca8cb..dc34054a0d7 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -110,7 +110,7 @@ def __init__( levels = [cudf.Series(level) for level in levels] - if len(levels) != len(codes.columns): + if len(levels) != len(codes._data): raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" @@ -960,12 +960,12 @@ def _concat(cls, objs): # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: - colnames = source_data[0].columns + colnames = source_data[0]._data.to_pandas_index() for obj in source_data[1:]: obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) - names = [None for x in source_data.columns] + names = [None] * source_data._num_columns objs = list(filter(lambda o: o.names is not None, objs)) for o in range(len(objs)): for i, name in enumerate(objs[o].names): @@ -1682,7 +1682,8 @@ def _union(self, other, sort=None): result_df = self_df.merge(other_df, on=col_names, how="outer") result_df = result_df.sort_values( - by=result_df.columns[self.nlevels :], ignore_index=True + by=result_df._data.to_pandas_index()[self.nlevels :], + ignore_index=True, ) midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 43c7d0343fd..f791a9f5ed3 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -334,8 +334,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: df[name] = col - result_columns = objs[0].columns.append( - [obj.columns for obj in objs[1:]] + result_columns = ( + objs[0] + ._data.to_pandas_index() + .append([obj._data.to_pandas_index() for obj in objs[1:]]) ) if ignore_index: From 68bb6021ec6e557bf4a3bde5a7cac613ba47a301 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 9 Feb 2022 16:33:38 -0800 Subject: [PATCH 2/5] Add new internal API for setting columns and use it where possible. --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/dataframe.py | 33 ++++++++++++++--------- python/cudf/cudf/core/indexed_frame.py | 4 +-- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 40fa8a5de0b..b7a8c67ff30 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -264,7 +264,7 @@ def where( ) # Setting `frame` column names to `cond` # as `cond` has no column names. - cond.columns = frame._data.to_pandas_index() + cond._set_column_names_like(frame) (source_df, others,) = _normalize_columns_and_scalars_type( frame, other diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 162f7fd3c92..d7dcfef0a84 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -999,9 +999,10 @@ def __setattr__(self, key, col): super().__setattr__(key, col) except RuntimeError as e: - # Need to allow setting properties that are marked as forbidden for - # internal usage. - # TODO: Check if there are alternatives that could be used instead. + # TODO: This allows setting properties that are marked as forbidden + # for internal usage. It is necesary because the __getattribute__ + # call in the try block will trigger the error. We should see if + # setting these variables can also always be disabled if "External-only API" not in str(e): raise super().__setattr__(key, col) @@ -1314,7 +1315,7 @@ def _slice(self: T, arg: slice) -> T: # Adding index of type RangeIndex back to # result result.index = self.index[start:stop] - result.columns = self._data.to_pandas_index() + result._set_column_names_like(self) return result @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") @@ -2191,12 +2192,20 @@ def columns(self, columns): f"got {len(columns)} elements" ) - data = dict(zip(columns, self._data.columns)) - if len(columns) != len(data): + self._set_column_names(columns, is_multiindex, columns.names) + + def _set_column_names(self, names, multiindex=False, level_names=None): + data = dict(zip(names, self._data.columns)) + if len(names) != len(data): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, multiindex=is_multiindex, level_names=columns.names, + data, multiindex=multiindex, level_names=level_names, + ) + + def _set_column_names_like(self, other): + self._set_column_names( + other._data.names, other._data.multiindex, other._data.level_names ) @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") @@ -5522,7 +5531,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): if isinstance(df, Series): df = df.to_frame() - df.columns = data_df._data.to_pandas_index() + df._set_column_names_like(data_df) return df @@ -5652,7 +5661,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): return Series(result, index=self.index, dtype=result_dtype,) else: result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared._data.to_pandas_index() + result_df._set_column_names_like(prepared) return result_df @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") @@ -5927,20 +5936,18 @@ def cov(self, **kwargs): cov : DataFrame """ cov = cupy.cov(self.values, rowvar=False) - # TODO: Why are we setting this for both index and columns? cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) - df.columns = cols + df._set_column_names_like(self) return df @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) - # TODO: Why are we setting this for both index and columns? cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) - df.columns = cols + df._set_column_names_like(self) return df @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d423bb430b3..8ff3e39519c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -448,7 +448,7 @@ def sort_index( isinstance(self, cudf.core.dataframe.DataFrame) and self._data.multiindex ): - out.columns = self._data.to_pandas_index() + out._set_column_names_like(self) elif (ascending and idx.is_monotonic_increasing) or ( not ascending and idx.is_monotonic_decreasing ): @@ -462,7 +462,7 @@ def sort_index( isinstance(self, cudf.core.dataframe.DataFrame) and self._data.multiindex ): - out.columns = self._data.to_pandas_index() + out._set_column_names_like(self) else: labels = sorted(self._data.names, reverse=not ascending) out = self[labels] From ccb08a1f038f272104208f0f3deb1ddc939b0f2d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 10 Feb 2022 12:23:34 -0800 Subject: [PATCH 3/5] Specify alternatives for DataFrame.columns and fix copyrights. --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/dataframe.py | 6 +++++- python/cudf/cudf/core/reshape.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index b7a8c67ff30..8bfcad4c8f4 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import warnings from typing import Any, Optional, Tuple, Union, cast diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d7dcfef0a84..afbd00bb00c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2164,7 +2164,11 @@ def at(self): return self.loc @property # type: ignore - @_external_only_api + @_external_only_api( + "Use _column_names instead, or _data.to_pandas_index() if a pandas " + "index is absolutely necessary. For checking if the columns are a " + "MultiIndex, use _data.multiindex." + ) @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): """Returns a tuple of columns""" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f791a9f5ed3..cc4a6b1adc6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import itertools from typing import Dict, Optional From 102ccef02618dfe4af614e2dd4f04d0012d31d41 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Feb 2022 17:21:44 -0800 Subject: [PATCH 4/5] Fix remaining internal uses of columns API throughout codebase. --- python/cudf/cudf/core/dataframe.py | 8 +++--- python/cudf/cudf/core/df_protocol.py | 6 +++-- python/cudf/cudf/core/reshape.py | 15 ++++++----- python/cudf/cudf/core/window/rolling.py | 30 +++++++++++++--------- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/testing/testing.py | 12 ++++----- python/cudf/cudf/tests/test_df_protocol.py | 4 ++- 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index afbd00bb00c..6b5f3809c98 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1618,7 +1618,7 @@ def _concat( # Reassign index and column names if objs[0]._data.multiindex: - out.columns = objs[0].columns + out._set_column_names_like(objs[0]) else: out.columns = names if not ignore_index: @@ -6606,10 +6606,10 @@ def _align_indices(lhs, rhs): df = df.sort_index() lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) - common = set(lhs.columns) & set(rhs.columns) + common = set(lhs._column_names) & set(rhs._column_names) common_x = {f"{x}_x" for x in common} common_y = {f"{x}_y" for x in common} - for col in df.columns: + for col in df._column_names: if col in common_x: lhs_out[col[:-2]] = df[col] elif col in common_y: @@ -6639,7 +6639,7 @@ def _setitem_with_dataframe( """ if input_cols is None: - input_cols = input_df.columns + input_cols = input_df._column_names if len(input_cols) != len(replace_df._column_names): raise ValueError( diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 8f258ce27b2..8f00289afcb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import collections import enum from typing import ( @@ -535,7 +537,7 @@ def metadata(self): return {"cudf.index": self._df.index} def num_columns(self) -> int: - return len(self._df.columns) + return len(self._df._column_names) def num_rows(self) -> int: return len(self._df) @@ -544,7 +546,7 @@ def num_chunks(self) -> int: return 1 def column_names(self) -> Iterable[str]: - return self._df.columns.tolist() + return self._df._column_names def get_column(self, i: int) -> _CuDFColumn: return _CuDFColumn( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index cc4a6b1adc6..5aa7f616e35 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -494,7 +494,7 @@ def melt( if not isinstance(id_vars, collections.abc.Sequence): id_vars = [id_vars] id_vars = list(id_vars) - missing = set(id_vars) - set(frame.columns) + missing = set(id_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'id_vars' are not present" @@ -508,7 +508,7 @@ def melt( if not isinstance(value_vars, collections.abc.Sequence): value_vars = [value_vars] value_vars = list(value_vars) - missing = set(value_vars) - set(frame.columns) + missing = set(value_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'value_vars' are not present" @@ -516,8 +516,7 @@ def melt( ) else: # then all remaining columns in frame - value_vars = frame.columns.drop(id_vars) - value_vars = list(value_vars) + value_vars = list(set(frame._column_names) - set(id_vars)) # Error for unimplemented support for datatype dtypes = [frame[col].dtype for col in id_vars + value_vars] @@ -691,7 +690,9 @@ def get_dummies( encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes(include=encode_fallback_dtypes).columns + columns = df.select_dtypes( + include=encode_fallback_dtypes + )._column_names _length_check_params(prefix, columns, "prefix") _length_check_params(prefix_sep, columns, "prefix_sep") @@ -1062,7 +1063,9 @@ def unstack(df, level, fill_value=None): ) res = df.T.stack(dropna=False) # Result's index is a multiindex - res.index.names = tuple(df.columns.names) + df.index.names + res.index.names = ( + tuple(df._data.to_pandas_index().names) + df.index.names + ) return res else: columns = df.index._poplevels(level) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8ffd75b1d76..2282a435ed3 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION import itertools @@ -198,8 +198,7 @@ def __getitem__(self, arg): center=self.center, ) - def _apply_agg_series(self, sr, agg_name): - source_column = sr._column + def _apply_agg_column(self, source_column, agg_name): min_periods = self.min_periods or 1 if isinstance(self.window, int): preceding_window = None @@ -230,7 +229,7 @@ def _apply_agg_series(self, sr, agg_name): ) window = None - result_col = libcudf.rolling.rolling( + return libcudf.rolling.rolling( source_column=source_column, pre_column_window=preceding_window, fwd_column_window=following_window, @@ -240,19 +239,26 @@ def _apply_agg_series(self, sr, agg_name): op=agg_name, agg_params=self.agg_params, ) - return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + return cudf.DataFrame._from_data( + { + col_name: self._apply_agg_column(col, agg_name) + for col_name, col in df._data.items() + }, + index=df.index, + ) def _apply_agg(self, agg_name): if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) + return cudf.Series._from_data( + { + self.obj.name: self._apply_agg_column( + self.obj._column, agg_name + ) + }, + index=self.obj.index, + ) else: return self._apply_agg_dataframe(self.obj, agg_name) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 948428de4f0..6e4e104df4d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -565,7 +565,7 @@ def to_parquet( if engine == "cudf": # Ensure that no columns dtype is 'category' - for col in df.columns: + for col in df._column_names: if partition_cols is None or col not in partition_cols: if df[col].dtype.name == "category": raise ValueError( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index b3e30fac7d5..5f7616cc75e 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -696,8 +696,8 @@ def assert_frame_equal( if PANDAS_GE_110: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -708,8 +708,8 @@ def assert_frame_equal( ) else: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -717,7 +717,7 @@ def assert_frame_equal( obj=f"{obj}.columns", ) - for col in left.columns: + for col in left._column_names: assert_column_equal( left._data[col], right._data[col], diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d24c8ca2860..de6aa0a6bf3 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + from typing import Any, Tuple import cupy as cp @@ -74,7 +76,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 - assert dfo.column_names() == list(df.columns) + assert dfo.column_names() == tuple(df.columns) for col in df.columns: assert_column_equal(dfo.get_column_by_name(col), df[col]._column) From ae168f473899cefd02304d1502774f78f5a59eaf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 23 Feb 2022 14:51:06 -0800 Subject: [PATCH 5/5] Address PR comments. --- python/cudf/cudf/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 5e3a88193ac..0ad6a9dc35a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1615,9 +1615,8 @@ def agg(self, func): return result.iloc[:, 0] # drop the first level if we have a multiindex - cols = result._data.to_pandas_index() - if result._data.multiindex and cols.nlevels > 1: - result.columns = cols.droplevel(0) + if result._data.nlevels > 1: + result.columns = result._data.to_pandas_index().droplevel(0) return result