From 301f5ea3cad1fe5fcb58cdc9eec3cad0ffa89bfb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 16 Jan 2024 14:47:43 -0800
Subject: [PATCH 1/8] Move _drop_na_columns, type _has_nulls

---
 python/cudf/cudf/core/frame.py             | 34 +---------------------
 python/cudf/cudf/core/groupby/groupby.py   |  4 +--
 python/cudf/cudf/core/indexed_frame.py     | 32 ++++++++++++++++++++
 python/cudf/cudf/core/udf/groupby_utils.py |  4 ++-
 4 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 5f7a86e86d8..ac5b092808a 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -95,7 +95,7 @@ def _dtypes(self):
         )
 
     @property
-    def _has_nulls(self):
+    def _has_nulls(self) -> bool:
         return any(col.has_nulls() for col in self._data.values())
 
     @_cudf_nvtx_annotate
@@ -911,38 +911,6 @@ def _drop_column(self, name):
             raise KeyError(f"column '{name}' does not exist")
         del self._data[name]
 
-    @_cudf_nvtx_annotate
-    def _drop_na_columns(self, how="any", subset=None, thresh=None):
-        """
-        Drop columns containing nulls
-        """
-        out_cols = []
-
-        if subset is None:
-            df = self
-        else:
-            df = self.take(subset)
-
-        if thresh is None:
-            if how == "all":
-                thresh = 1
-            else:
-                thresh = len(df)
-
-        for name, col in df._data.items():
-            try:
-                check_col = col.nans_to_nulls()
-            except AttributeError:
-                check_col = col
-            no_threshold_valid_count = (
-                len(col) - check_col.null_count
-            ) < thresh
-            if no_threshold_valid_count:
-                continue
-            out_cols.append(name)
-
-        return self[out_cols]
-
     @_cudf_nvtx_annotate
     def _quantile_table(
         self,
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 4e8947652ff..f5275858ba7 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1445,9 +1445,7 @@ def mult(df):
         group_names, offsets, group_keys, grouped_values = self._grouped()
 
         if engine == "auto":
-            if (not grouped_values._has_nulls) and _can_be_jitted(
-                grouped_values, function, args
-            ):
+            if _can_be_jitted(grouped_values, function, args):
                 engine = "jit"
             else:
                 engine = "cudf"
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 2a35ac0f959..17b818ada5f 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3084,6 +3084,38 @@ def dropna(
 
         return self._mimic_inplace(result, inplace=inplace)
 
+    @_cudf_nvtx_annotate
+    def _drop_na_columns(self, how="any", subset=None, thresh=None):
+        """
+        Drop columns containing nulls
+        """
+        out_cols = []
+
+        if subset is None:
+            df = self
+        else:
+            df = self.take(subset)
+
+        if thresh is None:
+            if how == "all":
+                thresh = 1
+            else:
+                thresh = len(df)
+
+        for name, col in df._data.items():
+            try:
+                check_col = col.nans_to_nulls()
+            except AttributeError:
+                check_col = col
+            no_threshold_valid_count = (
+                len(col) - check_col.null_count
+            ) < thresh
+            if no_threshold_valid_count:
+                continue
+            out_cols.append(name)
+
+        return self[out_cols]
+
     def _drop_na_rows(self, how="any", subset=None, thresh=None):
         """
         Drop null rows from `self`.
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index c82f8d2cd7b..199260e28cb 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 
 import cupy as cp
@@ -209,6 +209,8 @@ def _can_be_jitted(frame, func, args):
         # Numba requires bytecode to be present to proceed.
         # See https://github.com/numba/numba/issues/4587
         return False
+    elif frame._has_nulls:
+        return False
     np_field_types = np.dtype(
         list(
             _supported_dtypes_from_frame(

From b86aa1873097912091432543b99b07d210b538bb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 16 Jan 2024 14:59:03 -0800
Subject: [PATCH 2/8] Remove _has_nulls

---
 python/cudf/cudf/core/frame.py             | 4 ----
 python/cudf/cudf/core/groupby/groupby.py   | 4 +++-
 python/cudf/cudf/core/udf/groupby_utils.py | 4 +++-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ac5b092808a..08d6a0463a7 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -94,10 +94,6 @@ def _dtypes(self):
             zip(self._data.names, (col.dtype for col in self._data.columns))
         )
 
-    @property
-    def _has_nulls(self) -> bool:
-        return any(col.has_nulls() for col in self._data.values())
-
     @_cudf_nvtx_annotate
     def serialize(self):
         # TODO: See if self._data can be serialized outright
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index f5275858ba7..3612d4c45d8 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1252,7 +1252,9 @@ def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
         # Nulls are not yet supported
-        if self.grouping._obj._has_nulls:
+        obj = self.grouping._obj
+        any_na = obj.isna()
+        if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()):
             raise ValueError("Nulls not yet supported with groupby JIT engine")
 
         chunk_results = jit_groupby_apply(
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 199260e28cb..6b11a90f578 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -209,7 +209,9 @@ def _can_be_jitted(frame, func, args):
         # Numba requires bytecode to be present to proceed.
         # See https://github.com/numba/numba/issues/4587
         return False
-    elif frame._has_nulls:
+
+    any_na = frame.isna()
+    if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()):
         return False
     np_field_types = np.dtype(
         list(

From c175612b0327e12a72019afe003b223e2ab7eb72 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 16 Jan 2024 15:15:48 -0800
Subject: [PATCH 3/8] Add some typing to frame.py

---
 python/cudf/cudf/core/frame.py | 56 +++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 08d6a0463a7..95bcb12122f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -131,13 +131,13 @@ def deserialize(cls, header, frames):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def _from_data(cls, data: MutableMapping):
+    def _from_data(cls, data: MutableMapping) -> Self:
         obj = cls.__new__(cls)
         Frame.__init__(obj, data)
         return obj
 
     @_cudf_nvtx_annotate
-    def _from_data_like_self(self, data: MutableMapping):
+    def _from_data_like_self(self, data: MutableMapping) -> Self:
         return self._from_data(data)
 
     @classmethod
@@ -185,7 +185,7 @@ def _mimic_inplace(
 
     @property
     @_cudf_nvtx_annotate
-    def size(self):
+    def size(self) -> int:
         """
         Return the number of elements in the underlying data.
 
@@ -278,7 +278,7 @@ def memory_usage(self, deep=False):
         raise NotImplementedError
 
     @_cudf_nvtx_annotate
-    def __len__(self):
+    def __len__(self) -> int:
         return self._num_rows
 
     @_cudf_nvtx_annotate
@@ -300,7 +300,7 @@ def astype(self, dtype, copy=False, **kwargs):
         )
 
     @_cudf_nvtx_annotate
-    def equals(self, other):
+    def equals(self, other) -> bool:
         """
         Test whether two objects contain the same elements.
 
@@ -384,7 +384,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self:
 
     @property
     @_cudf_nvtx_annotate
-    def values(self):
+    def values(self) -> cupy.ndarray:
         """
         Return a CuPy representation of the DataFrame.
 
@@ -400,7 +400,7 @@ def values(self):
 
     @property
     @_cudf_nvtx_annotate
-    def values_host(self):
+    def values_host(self) -> np.ndarray:
         """
         Return a NumPy representation of the data.
 
@@ -556,7 +556,7 @@ def to_numpy(
         )
 
     @_cudf_nvtx_annotate
-    def where(self, cond, other=None, inplace=False):
+    def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
         """
         Replace values where the condition is False.
 
@@ -627,7 +627,7 @@ def where(self, cond, other=None, inplace=False):
         raise NotImplementedError
 
     @_cudf_nvtx_annotate
-    def mask(self, cond, other=None, inplace=False):
+    def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
         """
         Replace values where the condition is True.
 
@@ -737,8 +737,13 @@ def pipe(self, func, *args, **kwargs):
 
     @_cudf_nvtx_annotate
     def fillna(
-        self, value=None, method=None, axis=None, inplace=False, limit=None
-    ):
+        self,
+        value=None,
+        method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None,
+        axis=None,
+        inplace: bool = False,
+        limit=None,
+    ) -> Optional[Self]:
         """Fill null values with ``value`` or specified ``method``.
 
         Parameters
@@ -857,15 +862,15 @@ def fillna(
         if isinstance(value, cudf.Series):
             value = value.reindex(self._data.names)
         elif isinstance(value, cudf.DataFrame):
-            if not self.index.equals(value.index):
-                value = value.reindex(self.index)
+            if not self.index.equals(value.index):  # type: ignore[attr-defined]
+                value = value.reindex(self.index)  # type: ignore[attr-defined]
             else:
                 value = value
         elif not isinstance(value, abc.Mapping):
             value = {name: copy.deepcopy(value) for name in self._data.names}
         else:
             value = {
-                key: value.reindex(self.index)
+                key: value.reindex(self.index)  # type: ignore[attr-defined]
                 if isinstance(value, cudf.Series)
                 else value
                 for key, value in value.items()
@@ -910,9 +915,11 @@ def _drop_column(self, name):
     @_cudf_nvtx_annotate
     def _quantile_table(
         self,
-        q,
-        interpolation="LINEAR",
-        is_sorted=False,
+        q: float,
+        interpolation: Literal[
+            "LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST"
+        ] = "LINEAR",
+        is_sorted: bool = False,
         column_order=(),
         null_precedence=(),
     ):
@@ -940,7 +947,7 @@ def _quantile_table(
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_arrow(cls, data):
+    def from_arrow(cls, data: pa.Table) -> Self:
         """Convert from PyArrow Table to Frame
 
         Parameters
@@ -1117,7 +1124,7 @@ def to_arrow(self):
         )
 
     @_cudf_nvtx_annotate
-    def _positions_from_column_names(self, column_names):
+    def _positions_from_column_names(self, column_names) -> list[int]:
         """Map each column name into their positions in the frame.
 
         The order of indices returned corresponds to the column order in this
@@ -1519,7 +1526,12 @@ def argsort(
         ).values
 
     @_cudf_nvtx_annotate
-    def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
+    def _get_sorted_inds(
+        self,
+        by=None,
+        ascending: bool = True,
+        na_position: Literal["first", "last"] = "last",
+    ) -> ColumnBase:
         """
         Get the indices required to sort self according to the columns
         specified in by.
@@ -1535,11 +1547,11 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
 
         # If given a scalar need to construct a sequence of length # of columns
         if np.isscalar(ascending):
-            ascending = [ascending] * len(to_sort)
+            ascending_lst = [ascending] * len(to_sort)
 
         return libcudf.sort.order_by(
             to_sort,
-            ascending,
+            ascending_lst,
             na_position,
             stable=True,
         )

From 62a68c74522525c8d3ff3443c70cd4a2063f031b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 17 Jan 2024 14:16:15 -0800
Subject: [PATCH 4/8] add any

---
 python/cudf/cudf/core/udf/groupby_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 6b11a90f578..8b58252c8ad 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args):
         # See https://github.com/numba/numba/issues/4587
         return False
 
-    any_na = frame.isna()
+    any_na = frame.isna().any()
     if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()):
         return False
     np_field_types = np.dtype(

From d6824d649030b0b6e0d5977526aa90756185d97d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jan 2024 10:15:53 -0800
Subject: [PATCH 5/8] Add any

---
 python/cudf/cudf/core/groupby/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 8b6643a4170..7192ca58586 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1253,7 +1253,7 @@ def _jit_groupby_apply(
     ):
         # Nulls are not yet supported
         obj = self.grouping._obj
-        any_na = obj.isna()
+        any_na = obj.isna().any()
         if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()):
             raise ValueError("Nulls not yet supported with groupby JIT engine")
 

From 3004e1e7f6ea67141ad61a96633e2e0858c35229 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jan 2024 12:03:28 -0800
Subject: [PATCH 6/8] Just use _can_jit

---
 python/cudf/cudf/core/frame.py             | 4 ++--
 python/cudf/cudf/core/groupby/groupby.py   | 6 ------
 python/cudf/cudf/core/udf/groupby_utils.py | 3 +--
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 697e36ec55d..9f4089c60a0 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1537,11 +1537,11 @@ def _get_sorted_inds(
 
         # If given a scalar need to construct a sequence of length # of columns
         if np.isscalar(ascending):
-            ascending_lst = [ascending] * len(to_sort)
+            ascending = [ascending] * len(to_sort)  # type: ignore[assignment]
 
         return libcudf.sort.order_by(
             to_sort,
-            ascending_lst,
+            ascending,
             na_position,
             stable=True,
         )
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 7192ca58586..e28ba233c56 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1251,12 +1251,6 @@ def pipe(self, func, *args, **kwargs):
     def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
-        # Nulls are not yet supported
-        obj = self.grouping._obj
-        any_na = obj.isna().any()
-        if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()):
-            raise ValueError("Nulls not yet supported with groupby JIT engine")
-
         chunk_results = jit_groupby_apply(
             offsets, grouped_values, function, *args
         )
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 8b58252c8ad..06d9296ca0f 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -210,8 +210,7 @@ def _can_be_jitted(frame, func, args):
         # See https://github.com/numba/numba/issues/4587
         return False
 
-    any_na = frame.isna().any()
-    if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()):
+    if any(col.has_nulls() for col in frame._data.values()):
         return False
     np_field_types = np.dtype(
         list(

From 9e80bf146c54d90a205d446a579b5f4580599396 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jan 2024 12:09:25 -0800
Subject: [PATCH 7/8] stronger typing to _get_sorted_inds

---
 python/cudf/cudf/core/frame.py         | 6 ++----
 python/cudf/cudf/core/indexed_frame.py | 5 ++++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 9f4089c60a0..197e65947dc 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1535,13 +1535,11 @@ def _get_sorted_inds(
             )._columns
         ]
 
-        # If given a scalar need to construct a sequence of length # of columns
-        if np.isscalar(ascending):
-            ascending = [ascending] * len(to_sort)  # type: ignore[assignment]
+        ascending_lst = [ascending] * len(to_sort)
 
         return libcudf.sort.order_by(
             to_sort,
-            ascending,
+            ascending_lst,
             na_position,
             stable=True,
         )
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 620ba82e7f4..a5f1ee60fe4 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -14,6 +14,7 @@
     Callable,
     Dict,
     List,
+    Literal,
     MutableMapping,
     Optional,
     Tuple,
@@ -2438,7 +2439,9 @@ def sort_values(
             out.columns = self._data.to_pandas_index()
         return out
 
-    def _n_largest_or_smallest(self, largest, n, columns, keep):
+    def _n_largest_or_smallest(
+        self, largest: bool, n: int, columns, keep: Literal["first", "last"]
+    ):
         # Get column to operate on
         if isinstance(columns, str):
             columns = [columns]

From ca1e911755c71f3dbc618ad79dc14484cf79b51a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jan 2024 11:54:41 -0800
Subject: [PATCH 8/8] _get_sorted_inds accepts list-like ascending

---
 python/cudf/cudf/core/frame.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 197e65947dc..f165c329e46 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1519,7 +1519,7 @@ def argsort(
     def _get_sorted_inds(
         self,
         by=None,
-        ascending: bool = True,
+        ascending=True,
         na_position: Literal["first", "last"] = "last",
     ) -> ColumnBase:
         """
@@ -1535,7 +1535,10 @@ def _get_sorted_inds(
             )._columns
         ]
 
-        ascending_lst = [ascending] * len(to_sort)
+        if is_scalar(ascending):
+            ascending_lst = [ascending] * len(to_sort)
+        else:
+            ascending_lst = list(ascending)
 
         return libcudf.sort.order_by(
             to_sort,