From 10fac9e4a1af17fe6297fcfd6262262a965a8963 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 8 Feb 2022 15:29:55 -0800
Subject: [PATCH 1/5] Mark DataFrame.columns as external only and remove all
 usage required for dataframe,series,index,and multiindex tests to pass.

---
 python/cudf/cudf/core/_base_index.py      |   4 +-
 python/cudf/cudf/core/_internals/where.py |   2 +-
 python/cudf/cudf/core/dataframe.py        | 140 ++++++++++++----------
 python/cudf/cudf/core/frame.py            |   2 +-
 python/cudf/cudf/core/groupby/groupby.py  |  13 +-
 python/cudf/cudf/core/indexed_frame.py    |  25 ++--
 python/cudf/cudf/core/multiindex.py       |   9 +-
 python/cudf/cudf/core/reshape.py          |   6 +-
 8 files changed, 105 insertions(+), 96 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 7a9a17631a9..16fb0cf99c1 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -957,7 +957,9 @@ def _union(self, other, sort=None):
         self_df["order"] = self_df.index
         other_df["order"] = other_df.index
         res = self_df.merge(other_df, on=[0], how="outer")
-        res = res.sort_values(by=res.columns[1:], ignore_index=True)
+        res = res.sort_values(
+            by=res._data.to_pandas_index()[1:], ignore_index=True
+        )
         union_result = cudf.core.index._index_from_data({0: res._data[0]})
 
         if sort is None and len(other):
diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 6c94a84fd37..40fa8a5de0b 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -264,7 +264,7 @@ def where(
                 )
             # Setting `frame` column names to `cond`
             # as `cond` has no column names.
-            cond.columns = frame.columns
+            cond.columns = frame._data.to_pandas_index()
 
         (source_df, others,) = _normalize_columns_and_scalars_type(
             frame, other
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1c672aacd86..162f7fd3c92 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -75,7 +75,7 @@
     min_scalar_type,
     numeric_normalize_types,
 )
-from cudf.utils.utils import GetAttrGetItemMixin
+from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
 
 T = TypeVar("T", bound="DataFrame")
 
@@ -94,8 +94,9 @@
 
 class _DataFrameIndexer(_FrameIndexer):
     def __getitem__(self, arg):
-        if isinstance(self._frame.index, MultiIndex) or isinstance(
-            self._frame.columns, MultiIndex
+        if (
+            isinstance(self._frame.index, MultiIndex)
+            or self._frame._data.multiindex
         ):
             # This try/except block allows the use of pandas-like
             # tuple arguments into MultiIndex dataframes.
@@ -164,7 +165,7 @@ def _downcast_to_series(self, df, arg):
         # determine the axis along which the Series is taken:
         if nrows == 1 and ncols == 1:
             if is_scalar(arg[0]) and is_scalar(arg[1]):
-                return df[df.columns[0]].iloc[0]
+                return df[df._column_names[0]].iloc[0]
             elif not is_scalar(arg[0]):
                 axis = 1
             else:
@@ -285,8 +286,9 @@ def _getitem_tuple_arg(self, arg):
 
     @annotate("LOC_SETITEM", color="blue", domain="cudf_python")
     def _setitem_tuple_arg(self, key, value):
-        if isinstance(self._frame.index, MultiIndex) or isinstance(
-            self._frame.columns, pd.MultiIndex
+        if (
+            isinstance(self._frame.index, MultiIndex)
+            or self._frame._data.multiindex
         ):
             raise NotImplementedError(
                 "Setting values using df.loc[] not supported on "
@@ -546,7 +548,6 @@ def __init__(
                 )
             else:
                 self._data = data._data
-                self.columns = data.columns
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -595,7 +596,6 @@ def __init__(
 
             self._data = new_df._data
             self.index = new_df._index
-            self.columns = new_df.columns
         elif hasattr(data, "__array_interface__"):
             arr_interface = data.__array_interface__
             if len(arr_interface["descr"]) == 1:
@@ -605,7 +605,6 @@ def __init__(
                 new_df = self.from_records(data, index=index, columns=columns)
             self._data = new_df._data
             self.index = new_df._index
-            self.columns = new_df.columns
         else:
             if is_list_like(data):
                 if len(data) > 0 and is_scalar(data[0]):
@@ -617,7 +616,6 @@ def __init__(
 
                     self._data = new_df._data
                     self.index = new_df._index
-                    self.columns = new_df.columns
                 elif len(data) > 0 and isinstance(data[0], Series):
                     self._init_from_series_list(
                         data=data, columns=columns, index=index
@@ -719,8 +717,9 @@ def _init_from_series_list(self, data, columns, index):
         else:
             concat_df = cudf.concat(data, axis=1)
 
-            if concat_df.columns.dtype == "object":
-                concat_df.columns = concat_df.columns.astype("str")
+            cols = concat_df._data.to_pandas_index()
+            if cols.dtype == "object":
+                concat_df.columns = cols.astype("str")
 
             transpose = concat_df.T
 
@@ -972,7 +971,9 @@ def __dir__(self):
         o = set(dir(type(self)))
         o.update(self.__dict__)
         o.update(
-            c for c in self.columns if isinstance(c, str) and c.isidentifier()
+            c
+            for c in self._column_names
+            if isinstance(c, str) and c.isidentifier()
         )
         return list(o)
 
@@ -983,7 +984,6 @@ def __setattr__(self, key, col):
             # properties, and we must call object.__getattribute__ to bypass
             # the `__getitem__` behavior inherited from `GetAttrGetItemMixin`.
             object.__getattribute__(self, key)
-            super().__setattr__(key, col)
         except AttributeError:
             if key not in self._PROTECTED_KEYS:
                 try:
@@ -998,6 +998,16 @@ def __setattr__(self, key, col):
             # Set a new attribute that is not already a column.
             super().__setattr__(key, col)
 
+        except RuntimeError as e:
+            # Need to allow setting properties that are marked as forbidden for
+            # internal usage.
+            # TODO: Check if there are alternatives that could be used instead.
+            if "External-only API" not in str(e):
+                raise
+            super().__setattr__(key, col)
+        else:
+            super().__setattr__(key, col)
+
     @annotate("DATAFRAME_GETITEM", color="blue", domain="cudf_python")
     def __getitem__(self, arg):
         """
@@ -1304,7 +1314,7 @@ def _slice(self: T, arg: slice) -> T:
                         # Adding index of type RangeIndex back to
                         # result
                         result.index = self.index[start:stop]
-                result.columns = self.columns
+                result.columns = self._data.to_pandas_index()
                 return result
 
     @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python")
@@ -1606,7 +1616,7 @@ def _concat(
                 )
 
         # Reassign index and column names
-        if isinstance(objs[0].columns, pd.MultiIndex):
+        if objs[0]._data.multiindex:
             out.columns = objs[0].columns
         else:
             out.columns = names
@@ -1930,12 +1940,14 @@ def _prep_for_binop(
                 not can_reindex
                 and fn in cudf.utils.utils._EQUALITY_OPS
                 and (
-                    not lhs.columns.equals(rhs.columns)
+                    not lhs._data.to_pandas_index().equals(
+                        rhs._data.to_pandas_index()
+                    )
                     or not lhs.index.equals(rhs.index)
                 )
             ):
                 raise ValueError(
-                    "Can only compare identically-labeled " "DataFrame objects"
+                    "Can only compare identically-labeled DataFrame objects"
                 )
 
             lhs, rhs = _align_indices(lhs, rhs)
@@ -2074,8 +2086,9 @@ def update(
         if not isinstance(other, DataFrame):
             other = DataFrame(other)
 
-        if not self.columns.equals(other.columns):
-            other = other.reindex(self.columns, axis=1)
+        self_cols = self._data.to_pandas_index()
+        if not self_cols.equals(other._data.to_pandas_index()):
+            other = other.reindex(self_cols, axis=1)
         if not self.index.equals(other.index):
             other = other.reindex(self.index, axis=0)
 
@@ -2104,7 +2117,7 @@ def update(
 
     @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python")
     def __iter__(self):
-        return iter(self.columns)
+        return iter(self._column_names)
 
     @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python")
     def iteritems(self):
@@ -2150,6 +2163,7 @@ def at(self):
         return self.loc
 
     @property  # type: ignore
+    @_external_only_api
     @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python")
     def columns(self):
         """Returns a tuple of columns"""
@@ -2451,7 +2465,7 @@ def set_index(
         for col in keys:
             # Is column label
             if is_scalar(col) or isinstance(col, tuple):
-                if col in self.columns:
+                if col in self._column_names:
                     columns_to_add.append(self[col])
                     names.append(col)
                     if drop:
@@ -2747,7 +2761,7 @@ def diff(self, periods=1, axis=0):
             df = cudf.DataFrame._from_data(
                 {
                     name: column_empty(len(self), dtype=dtype, masked=True)
-                    for name, dtype in zip(self.columns, self.dtypes)
+                    for name, dtype in zip(self._column_names, self.dtypes)
                 }
             )
             return df
@@ -3460,7 +3474,7 @@ def transpose(self):
         # Never transpose a MultiIndex - remove the existing columns and
         # replace with a RangeIndex. Afterward, reassign.
         columns = self.index.copy(deep=False)
-        index = self.columns.copy(deep=False)
+        index = self._data.to_pandas_index()
         if self._num_columns == 0 or self._num_rows == 0:
             return DataFrame(index=index, columns=columns)
         # Set the old column names as the new index
@@ -4313,13 +4327,13 @@ def info(
         )
         lines.append(index_summary)
 
-        if len(self.columns) == 0:
+        if len(self._data) == 0:
             lines.append(f"Empty {type(self).__name__}")
             cudf.utils.ioutils.buffer_write_lines(buf, lines)
             return
 
-        cols = self.columns
-        col_count = len(self.columns)
+        cols = self._column_names
+        col_count = len(cols)
 
         if max_cols is None:
             max_cols = pd.options.display.max_info_columns
@@ -4337,7 +4351,7 @@ def _put_str(s, space):
             return str(s)[:space].ljust(space)
 
         def _verbose_repr():
-            lines.append(f"Data columns (total {len(self.columns)} columns):")
+            lines.append(f"Data columns (total {col_count} columns):")
 
             id_head = " # "
             column_head = "Column"
@@ -4357,10 +4371,10 @@ def _verbose_repr():
             )
             if show_counts:
                 counts = self.count().to_pandas().tolist()
-                if len(cols) != len(counts):
+                if col_count != len(counts):
                     raise AssertionError(
                         f"Columns must equal "
-                        f"counts ({len(cols)} != {len(counts)})"
+                        f"counts ({col_count} != {len(counts)})"
                     )
                 count_header = "Non-Null Count"
                 len_count = len(count_header)
@@ -4393,7 +4407,7 @@ def _verbose_repr():
                 + _put_str("-" * len_dtype, space_dtype).rstrip()
             )
 
-            for i, col in enumerate(self.columns):
+            for i, col in enumerate(self._column_names):
                 dtype = self.dtypes.iloc[i]
                 col = pprint_thing(col)
 
@@ -4410,13 +4424,11 @@ def _verbose_repr():
                 )
 
         def _non_verbose_repr():
-            if len(self.columns) > 0:
-                entries_summary = f", {self.columns[0]} to {self.columns[-1]}"
+            if col_count > 0:
+                entries_summary = f", {cols[0]} to {cols[-1]}"
             else:
                 entries_summary = ""
-            columns_summary = (
-                f"Columns: {len(self.columns)} entries{entries_summary}"
-            )
+            columns_summary = f"Columns: {col_count} entries{entries_summary}"
             lines.append(columns_summary)
 
         def _sizeof_fmt(num, size_qualifier):
@@ -4479,7 +4491,7 @@ def describe(
             if datetime_is_numeric:
                 default_include.append("datetime")
             data_to_describe = self.select_dtypes(include=default_include)
-            if len(data_to_describe.columns) == 0:
+            if data_to_describe._num_columns == 0:
                 data_to_describe = self
 
         elif include == "all":
@@ -4497,7 +4509,7 @@ def describe(
 
         describe_series_list = [
             data_to_describe[col].describe(percentiles=percentiles)
-            for col in data_to_describe.columns
+            for col in data_to_describe._column_names
         ]
         if len(describe_series_list) == 1:
             return describe_series_list[0].to_frame()
@@ -4587,26 +4599,14 @@ def to_pandas(self, nullable=False, **kwargs):
         out_data = {}
         out_index = self.index.to_pandas()
 
-        if not isinstance(self.columns, pd.Index):
-            out_columns = self.columns.to_pandas()
-        else:
-            out_columns = self.columns
-
         for i, col_key in enumerate(self._data):
             out_data[i] = self._data[col_key].to_pandas(
                 index=out_index, nullable=nullable
             )
 
-        if isinstance(self.columns, BaseIndex):
-            out_columns = self.columns.to_pandas()
-            if isinstance(self.columns, MultiIndex):
-                if self.columns.names is not None:
-                    out_columns.names = self.columns.names
-            else:
-                out_columns.name = self.columns.name
-
         out_df = pd.DataFrame(out_data, index=out_index)
-        out_df.columns = out_columns
+        out_df.columns = self._data.to_pandas_index()
+
         return out_df
 
     @classmethod
@@ -5105,7 +5105,7 @@ def quantile(
         if isinstance(q, numbers.Number) and numeric_only:
             result = result.fillna(np.nan)
             result = result.iloc[0]
-            result.index = as_index(data_df.columns)
+            result.index = data_df._data.to_pandas_index()
             result.name = q
             return result
         else:
@@ -5285,6 +5285,7 @@ def make_false_column_like_self():
                 f"'{type(values).__name__}'"
             )
 
+        # TODO: Update this logic to properly preserve MultiIndex columns.
         return DataFrame._from_data(result, self.index)
 
     #
@@ -5521,7 +5522,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         if isinstance(df, Series):
             df = df.to_frame()
 
-        df.columns = data_df.columns
+        df.columns = data_df._data.to_pandas_index()
 
         return df
 
@@ -5651,7 +5652,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             return Series(result, index=self.index, dtype=result_dtype,)
         else:
             result_df = DataFrame(result).set_index(self.index)
-            result_df.columns = prepared.columns
+            result_df.columns = prepared._data.to_pandas_index()
             return result_df
 
     @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python")
@@ -5926,16 +5927,20 @@ def cov(self, **kwargs):
         cov : DataFrame
         """
         cov = cupy.cov(self.values, rowvar=False)
-        df = DataFrame(cupy.asfortranarray(cov)).set_index(self.columns)
-        df.columns = self.columns
+        # TODO: Why are we setting this for both index and columns?
+        cols = self._data.to_pandas_index()
+        df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
+        df.columns = cols
         return df
 
     @annotate("DATAFRAME_CORR", color="green", domain="cudf_python")
     def corr(self):
         """Compute the correlation matrix of a DataFrame."""
         corr = cupy.corrcoef(self.values, rowvar=False)
-        df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns)
-        df.columns = self.columns
+        # TODO: Why are we setting this for both index and columns?
+        cols = self._data.to_pandas_index()
+        df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
+        df.columns = cols
         return df
 
     @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python")
@@ -6001,7 +6006,7 @@ def keys(self):
         >>> df.keys()
         Int64Index([0, 1, 2, 3], dtype='int64')
         """
-        return self.columns
+        return self._data.to_pandas_index()
 
     def itertuples(self, index=True, name="Pandas"):
         raise TypeError(
@@ -6134,7 +6139,7 @@ def append(
                     "or if the Series has a name"
                 )
 
-            current_cols = self.columns
+            current_cols = self._data.to_pandas_index()
             combined_columns = other.index.to_pandas()
             if len(current_cols):
 
@@ -6162,8 +6167,11 @@ def append(
                 pass
             elif not isinstance(other[0], DataFrame):
                 other = DataFrame(other)
-                if (self.columns.get_indexer(other.columns) >= 0).all():
-                    other = other.reindex(columns=self.columns)
+                cols = self._data.to_pandas_index()
+                if (
+                    cols.get_indexer(other._data.to_pandas_index()) >= 0
+                ).all():
+                    other = other.reindex(columns=cols)
 
         if is_list_like(other):
             to_concat = [self, *other]
@@ -6622,7 +6630,7 @@ def _setitem_with_dataframe(
     if input_cols is None:
         input_cols = input_df.columns
 
-    if len(input_cols) != len(replace_df.columns):
+    if len(input_cols) != len(replace_df._column_names):
         raise ValueError(
             "Number of Input Columns must be same replacement Dataframe"
         )
@@ -6634,8 +6642,8 @@ def _setitem_with_dataframe(
     ):
         replace_df = replace_df.reindex(input_df.index)
 
-    for col_1, col_2 in zip(input_cols, replace_df.columns):
-        if col_1 in input_df.columns:
+    for col_1, col_2 in zip(input_cols, replace_df._column_names):
+        if col_1 in input_df._column_names:
             if mask is not None:
                 input_df._data[col_1][mask] = column.as_column(
                     replace_df[col_2]
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index d478baf2d69..50ede4faea4 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6822,7 +6822,7 @@ def _drop_rows_by_labels(
             return obj.__class__._from_data(
                 join_res.iloc[:, idx_nlv:]._data,
                 index=midx,
-                columns=obj.columns,
+                columns=obj._data.to_pandas_index(),
             )
 
     else:
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b90f857ce84..5e3a88193ac 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -7,7 +7,6 @@
 from functools import cached_property
 
 import numpy as np
-import pandas as pd
 from nvtx import annotate
 
 import cudf
@@ -905,7 +904,7 @@ def corr(self, method="pearson", min_periods=1):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be correlated
         # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        _cols = self.grouping.values.columns.tolist()
+        _cols = self.grouping.values._data.to_pandas_index().tolist()
         len_cols = len(_cols)
 
         new_df_data = {}
@@ -1048,7 +1047,7 @@ def cov(self, min_periods=0, ddof=1):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs used in the covariance calculation
         # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        column_names = self.grouping.values.columns.tolist()
+        column_names = self.grouping.values._column_names
         num_cols = len(column_names)
 
         column_pair_structs = {}
@@ -1616,11 +1615,9 @@ def agg(self, func):
                 return result.iloc[:, 0]
 
         # drop the first level if we have a multiindex
-        if (
-            isinstance(result.columns, pd.MultiIndex)
-            and result.columns.nlevels > 1
-        ):
-            result.columns = result.columns.droplevel(0)
+        cols = result._data.to_pandas_index()
+        if result._data.multiindex and cols.nlevels > 1:
+            result.columns = cols.droplevel(0)
 
         return result
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index bc7337d0a42..d423bb430b3 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -444,12 +444,11 @@ def sort_index(
                 out = self._gather(inds)
                 # TODO: frame factory function should handle multilevel column
                 # names
-                if isinstance(
-                    self, cudf.core.dataframe.DataFrame
-                ) and isinstance(
-                    self.columns, pd.core.indexes.multi.MultiIndex
+                if (
+                    isinstance(self, cudf.core.dataframe.DataFrame)
+                    and self._data.multiindex
                 ):
-                    out.columns = self.columns
+                    out.columns = self._data.to_pandas_index()
             elif (ascending and idx.is_monotonic_increasing) or (
                 not ascending and idx.is_monotonic_decreasing
             ):
@@ -459,12 +458,11 @@ def sort_index(
                     ascending=ascending, na_position=na_position
                 )
                 out = self._gather(inds)
-                if isinstance(
-                    self, cudf.core.dataframe.DataFrame
-                ) and isinstance(
-                    self.columns, pd.core.indexes.multi.MultiIndex
+                if (
+                    isinstance(self, cudf.core.dataframe.DataFrame)
+                    and self._data.multiindex
                 ):
-                    out.columns = self.columns
+                    out.columns = self._data.to_pandas_index()
         else:
             labels = sorted(self._data.names, reverse=not ascending)
             out = self[labels]
@@ -938,10 +936,11 @@ def sort_values(
             ),
             keep_index=not ignore_index,
         )
-        if isinstance(self, cudf.core.dataframe.DataFrame) and isinstance(
-            self.columns, pd.core.indexes.multi.MultiIndex
+        if (
+            isinstance(self, cudf.core.dataframe.DataFrame)
+            and self._data.multiindex
         ):
-            out.columns = self.columns
+            out.columns = self._data.to_pandas_index()
         return out
 
     def _n_largest_or_smallest(self, largest, n, columns, keep):
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 5e0cd2ca8cb..dc34054a0d7 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -110,7 +110,7 @@ def __init__(
 
         levels = [cudf.Series(level) for level in levels]
 
-        if len(levels) != len(codes.columns):
+        if len(levels) != len(codes._data):
             raise ValueError(
                 "MultiIndex has unequal number of levels and "
                 "codes and is inconsistent!"
@@ -960,12 +960,12 @@ def _concat(cls, objs):
         # TODO: Verify if this is really necessary or if we can rely on
         # DataFrame._concat.
         if len(source_data) > 1:
-            colnames = source_data[0].columns
+            colnames = source_data[0]._data.to_pandas_index()
             for obj in source_data[1:]:
                 obj.columns = colnames
 
         source_data = cudf.DataFrame._concat(source_data)
-        names = [None for x in source_data.columns]
+        names = [None] * source_data._num_columns
         objs = list(filter(lambda o: o.names is not None, objs))
         for o in range(len(objs)):
             for i, name in enumerate(objs[o].names):
@@ -1682,7 +1682,8 @@ def _union(self, other, sort=None):
 
         result_df = self_df.merge(other_df, on=col_names, how="outer")
         result_df = result_df.sort_values(
-            by=result_df.columns[self.nlevels :], ignore_index=True
+            by=result_df._data.to_pandas_index()[self.nlevels :],
+            ignore_index=True,
         )
 
         midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels])
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 43c7d0343fd..f791a9f5ed3 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -334,8 +334,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 else:
                     df[name] = col
 
-        result_columns = objs[0].columns.append(
-            [obj.columns for obj in objs[1:]]
+        result_columns = (
+            objs[0]
+            ._data.to_pandas_index()
+            .append([obj._data.to_pandas_index() for obj in objs[1:]])
         )
 
         if ignore_index:

From 68bb6021ec6e557bf4a3bde5a7cac613ba47a301 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 9 Feb 2022 16:33:38 -0800
Subject: [PATCH 2/5] Add new internal API for setting columns and use it where
 possible.

---
 python/cudf/cudf/core/_internals/where.py |  2 +-
 python/cudf/cudf/core/dataframe.py        | 33 ++++++++++++++---------
 python/cudf/cudf/core/indexed_frame.py    |  4 +--
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 40fa8a5de0b..b7a8c67ff30 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -264,7 +264,7 @@ def where(
                 )
             # Setting `frame` column names to `cond`
             # as `cond` has no column names.
-            cond.columns = frame._data.to_pandas_index()
+            cond._set_column_names_like(frame)
 
         (source_df, others,) = _normalize_columns_and_scalars_type(
             frame, other
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 162f7fd3c92..d7dcfef0a84 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -999,9 +999,10 @@ def __setattr__(self, key, col):
             super().__setattr__(key, col)
 
         except RuntimeError as e:
-            # Need to allow setting properties that are marked as forbidden for
-            # internal usage.
-            # TODO: Check if there are alternatives that could be used instead.
+            # TODO: This allows setting properties that are marked as forbidden
+            # for internal usage. It is necesary because the __getattribute__
+            # call in the try block will trigger the error. We should see if
+            # setting these variables can also always be disabled
             if "External-only API" not in str(e):
                 raise
             super().__setattr__(key, col)
@@ -1314,7 +1315,7 @@ def _slice(self: T, arg: slice) -> T:
                         # Adding index of type RangeIndex back to
                         # result
                         result.index = self.index[start:stop]
-                result.columns = self._data.to_pandas_index()
+                result._set_column_names_like(self)
                 return result
 
     @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python")
@@ -2191,12 +2192,20 @@ def columns(self, columns):
                 f"got {len(columns)} elements"
             )
 
-        data = dict(zip(columns, self._data.columns))
-        if len(columns) != len(data):
+        self._set_column_names(columns, is_multiindex, columns.names)
+
+    def _set_column_names(self, names, multiindex=False, level_names=None):
+        data = dict(zip(names, self._data.columns))
+        if len(names) != len(data):
             raise ValueError("Duplicate column names are not allowed")
 
         self._data = ColumnAccessor(
-            data, multiindex=is_multiindex, level_names=columns.names,
+            data, multiindex=multiindex, level_names=level_names,
+        )
+
+    def _set_column_names_like(self, other):
+        self._set_column_names(
+            other._data.names, other._data.multiindex, other._data.level_names
         )
 
     @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python")
@@ -5522,7 +5531,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         if isinstance(df, Series):
             df = df.to_frame()
 
-        df.columns = data_df._data.to_pandas_index()
+        df._set_column_names_like(data_df)
 
         return df
 
@@ -5652,7 +5661,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             return Series(result, index=self.index, dtype=result_dtype,)
         else:
             result_df = DataFrame(result).set_index(self.index)
-            result_df.columns = prepared._data.to_pandas_index()
+            result_df._set_column_names_like(prepared)
             return result_df
 
     @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python")
@@ -5927,20 +5936,18 @@ def cov(self, **kwargs):
         cov : DataFrame
         """
         cov = cupy.cov(self.values, rowvar=False)
-        # TODO: Why are we setting this for both index and columns?
         cols = self._data.to_pandas_index()
         df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
-        df.columns = cols
+        df._set_column_names_like(self)
         return df
 
     @annotate("DATAFRAME_CORR", color="green", domain="cudf_python")
     def corr(self):
         """Compute the correlation matrix of a DataFrame."""
         corr = cupy.corrcoef(self.values, rowvar=False)
-        # TODO: Why are we setting this for both index and columns?
         cols = self._data.to_pandas_index()
         df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
-        df.columns = cols
+        df._set_column_names_like(self)
         return df
 
     @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python")
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index d423bb430b3..8ff3e39519c 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -448,7 +448,7 @@ def sort_index(
                     isinstance(self, cudf.core.dataframe.DataFrame)
                     and self._data.multiindex
                 ):
-                    out.columns = self._data.to_pandas_index()
+                    out._set_column_names_like(self)
             elif (ascending and idx.is_monotonic_increasing) or (
                 not ascending and idx.is_monotonic_decreasing
             ):
@@ -462,7 +462,7 @@ def sort_index(
                     isinstance(self, cudf.core.dataframe.DataFrame)
                     and self._data.multiindex
                 ):
-                    out.columns = self._data.to_pandas_index()
+                    out._set_column_names_like(self)
         else:
             labels = sorted(self._data.names, reverse=not ascending)
             out = self[labels]

From ccb08a1f038f272104208f0f3deb1ddc939b0f2d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 10 Feb 2022 12:23:34 -0800
Subject: [PATCH 3/5] Specify alternatives for DataFrame.columns and fix
 copyrights.

---
 python/cudf/cudf/core/_internals/where.py | 2 +-
 python/cudf/cudf/core/dataframe.py        | 6 +++++-
 python/cudf/cudf/core/reshape.py          | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index b7a8c67ff30..8bfcad4c8f4 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 
 import warnings
 from typing import Any, Optional, Tuple, Union, cast
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d7dcfef0a84..afbd00bb00c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2164,7 +2164,11 @@ def at(self):
         return self.loc
 
     @property  # type: ignore
-    @_external_only_api
+    @_external_only_api(
+        "Use _column_names instead, or _data.to_pandas_index() if a pandas "
+        "index is absolutely necessary. For checking if the columns are a "
+        "MultiIndex, use _data.multiindex."
+    )
     @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python")
     def columns(self):
         """Returns a tuple of columns"""
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index f791a9f5ed3..cc4a6b1adc6 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import itertools
 from typing import Dict, Optional

From 102ccef02618dfe4af614e2dd4f04d0012d31d41 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 16 Feb 2022 17:21:44 -0800
Subject: [PATCH 4/5] Fix remaining internal uses of columns API throughout
 codebase.

---
 python/cudf/cudf/core/dataframe.py         |  8 +++---
 python/cudf/cudf/core/df_protocol.py       |  6 +++--
 python/cudf/cudf/core/reshape.py           | 15 ++++++-----
 python/cudf/cudf/core/window/rolling.py    | 30 +++++++++++++---------
 python/cudf/cudf/io/parquet.py             |  2 +-
 python/cudf/cudf/testing/testing.py        | 12 ++++-----
 python/cudf/cudf/tests/test_df_protocol.py |  4 ++-
 7 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index afbd00bb00c..6b5f3809c98 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1618,7 +1618,7 @@ def _concat(
 
         # Reassign index and column names
         if objs[0]._data.multiindex:
-            out.columns = objs[0].columns
+            out._set_column_names_like(objs[0])
         else:
             out.columns = names
         if not ignore_index:
@@ -6606,10 +6606,10 @@ def _align_indices(lhs, rhs):
         df = df.sort_index()
         lhs_out = DataFrame(index=df.index)
         rhs_out = DataFrame(index=df.index)
-        common = set(lhs.columns) & set(rhs.columns)
+        common = set(lhs._column_names) & set(rhs._column_names)
         common_x = {f"{x}_x" for x in common}
         common_y = {f"{x}_y" for x in common}
-        for col in df.columns:
+        for col in df._column_names:
             if col in common_x:
                 lhs_out[col[:-2]] = df[col]
             elif col in common_y:
@@ -6639,7 +6639,7 @@ def _setitem_with_dataframe(
     """
 
     if input_cols is None:
-        input_cols = input_df.columns
+        input_cols = input_df._column_names
 
     if len(input_cols) != len(replace_df._column_names):
         raise ValueError(
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 8f258ce27b2..8f00289afcb 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+
 import collections
 import enum
 from typing import (
@@ -535,7 +537,7 @@ def metadata(self):
         return {"cudf.index": self._df.index}
 
     def num_columns(self) -> int:
-        return len(self._df.columns)
+        return len(self._df._column_names)
 
     def num_rows(self) -> int:
         return len(self._df)
@@ -544,7 +546,7 @@ def num_chunks(self) -> int:
         return 1
 
     def column_names(self) -> Iterable[str]:
-        return self._df.columns.tolist()
+        return self._df._column_names
 
     def get_column(self, i: int) -> _CuDFColumn:
         return _CuDFColumn(
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index cc4a6b1adc6..5aa7f616e35 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -494,7 +494,7 @@ def melt(
         if not isinstance(id_vars, collections.abc.Sequence):
             id_vars = [id_vars]
         id_vars = list(id_vars)
-        missing = set(id_vars) - set(frame.columns)
+        missing = set(id_vars) - set(frame._column_names)
         if not len(missing) == 0:
             raise KeyError(
                 f"The following 'id_vars' are not present"
@@ -508,7 +508,7 @@ def melt(
         if not isinstance(value_vars, collections.abc.Sequence):
             value_vars = [value_vars]
         value_vars = list(value_vars)
-        missing = set(value_vars) - set(frame.columns)
+        missing = set(value_vars) - set(frame._column_names)
         if not len(missing) == 0:
             raise KeyError(
                 f"The following 'value_vars' are not present"
@@ -516,8 +516,7 @@ def melt(
             )
     else:
         # then all remaining columns in frame
-        value_vars = frame.columns.drop(id_vars)
-        value_vars = list(value_vars)
+        value_vars = list(set(frame._column_names) - set(id_vars))
 
     # Error for unimplemented support for datatype
     dtypes = [frame[col].dtype for col in id_vars + value_vars]
@@ -691,7 +690,9 @@ def get_dummies(
         encode_fallback_dtypes = ["object", "category"]
 
         if columns is None or len(columns) == 0:
-            columns = df.select_dtypes(include=encode_fallback_dtypes).columns
+            columns = df.select_dtypes(
+                include=encode_fallback_dtypes
+            )._column_names
 
         _length_check_params(prefix, columns, "prefix")
         _length_check_params(prefix_sep, columns, "prefix_sep")
@@ -1062,7 +1063,9 @@ def unstack(df, level, fill_value=None):
                 )
         res = df.T.stack(dropna=False)
         # Result's index is a multiindex
-        res.index.names = tuple(df.columns.names) + df.index.names
+        res.index.names = (
+            tuple(df._data.to_pandas_index().names) + df.index.names
+        )
         return res
     else:
         columns = df.index._poplevels(level)
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 8ffd75b1d76..2282a435ed3 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION
+# Copyright (c) 2020-2022, NVIDIA CORPORATION
 
 import itertools
 
@@ -198,8 +198,7 @@ def __getitem__(self, arg):
             center=self.center,
         )
 
-    def _apply_agg_series(self, sr, agg_name):
-        source_column = sr._column
+    def _apply_agg_column(self, source_column, agg_name):
         min_periods = self.min_periods or 1
         if isinstance(self.window, int):
             preceding_window = None
@@ -230,7 +229,7 @@ def _apply_agg_series(self, sr, agg_name):
             )
             window = None
 
-        result_col = libcudf.rolling.rolling(
+        return libcudf.rolling.rolling(
             source_column=source_column,
             pre_column_window=preceding_window,
             fwd_column_window=following_window,
@@ -240,19 +239,26 @@ def _apply_agg_series(self, sr, agg_name):
             op=agg_name,
             agg_params=self.agg_params,
         )
-        return sr._from_data({sr.name: result_col}, sr._index)
 
     def _apply_agg_dataframe(self, df, agg_name):
-        result_df = cudf.DataFrame({})
-        for i, col_name in enumerate(df.columns):
-            result_col = self._apply_agg_series(df[col_name], agg_name)
-            result_df.insert(i, col_name, result_col)
-        result_df.index = df.index
-        return result_df
+        return cudf.DataFrame._from_data(
+            {
+                col_name: self._apply_agg_column(col, agg_name)
+                for col_name, col in df._data.items()
+            },
+            index=df.index,
+        )
 
     def _apply_agg(self, agg_name):
         if isinstance(self.obj, cudf.Series):
-            return self._apply_agg_series(self.obj, agg_name)
+            return cudf.Series._from_data(
+                {
+                    self.obj.name: self._apply_agg_column(
+                        self.obj._column, agg_name
+                    )
+                },
+                index=self.obj.index,
+            )
         else:
             return self._apply_agg_dataframe(self.obj, agg_name)
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 948428de4f0..6e4e104df4d 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -565,7 +565,7 @@ def to_parquet(
 
     if engine == "cudf":
         # Ensure that no columns dtype is 'category'
-        for col in df.columns:
+        for col in df._column_names:
             if partition_cols is None or col not in partition_cols:
                 if df[col].dtype.name == "category":
                     raise ValueError(
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index b3e30fac7d5..5f7616cc75e 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -696,8 +696,8 @@ def assert_frame_equal(
 
     if PANDAS_GE_110:
         pd.testing.assert_index_equal(
-            left.columns,
-            right.columns,
+            left._data.to_pandas_index(),
+            right._data.to_pandas_index(),
             exact=check_column_type,
             check_names=check_names,
             check_exact=check_exact,
@@ -708,8 +708,8 @@ def assert_frame_equal(
         )
     else:
         pd.testing.assert_index_equal(
-            left.columns,
-            right.columns,
+            left._data.to_pandas_index(),
+            right._data.to_pandas_index(),
             exact=check_column_type,
             check_names=check_names,
             check_exact=check_exact,
@@ -717,7 +717,7 @@ def assert_frame_equal(
             obj=f"{obj}.columns",
         )
 
-    for col in left.columns:
+    for col in left._column_names:
         assert_column_equal(
             left._data[col],
             right._data[col],
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index d24c8ca2860..de6aa0a6bf3 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+
 from typing import Any, Tuple
 
 import cupy as cp
@@ -74,7 +76,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
     assert dfo.num_columns() == len(df.columns)
     assert dfo.num_rows() == len(df)
     assert dfo.num_chunks() == 1
-    assert dfo.column_names() == list(df.columns)
+    assert dfo.column_names() == tuple(df.columns)
     for col in df.columns:
         assert_column_equal(dfo.get_column_by_name(col), df[col]._column)
 

From ae168f473899cefd02304d1502774f78f5a59eaf Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 23 Feb 2022 14:51:06 -0800
Subject: [PATCH 5/5] Address PR comments.

---
 python/cudf/cudf/core/groupby/groupby.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 5e3a88193ac..0ad6a9dc35a 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1615,9 +1615,8 @@ def agg(self, func):
                 return result.iloc[:, 0]
 
         # drop the first level if we have a multiindex
-        cols = result._data.to_pandas_index()
-        if result._data.multiindex and cols.nlevels > 1:
-            result.columns = cols.droplevel(0)
+        if result._data.nlevels > 1:
+            result.columns = result._data.to_pandas_index().droplevel(0)
 
         return result