Some frame.py typing, move seldom used methods in frame.py #14766

Merged
merged 13 commits on Jan 19, 2024
94 changes: 34 additions & 60 deletions python/cudf/cudf/core/frame.py
@@ -94,10 +94,6 @@ def _dtypes(self):
zip(self._data.names, (col.dtype for col in self._data.columns))
)

@property
def _has_nulls(self):
return any(col.has_nulls() for col in self._data.values())

@_cudf_nvtx_annotate
def serialize(self):
# TODO: See if self._data can be serialized outright
@@ -135,13 +131,13 @@ def deserialize(cls, header, frames):

@classmethod
@_cudf_nvtx_annotate
def _from_data(cls, data: MutableMapping):
def _from_data(cls, data: MutableMapping) -> Self:
obj = cls.__new__(cls)
Frame.__init__(obj, data)
return obj

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
def _from_data_like_self(self, data: MutableMapping) -> Self:
return self._from_data(data)

@_cudf_nvtx_annotate
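Side note on the new `-> Self` annotations above: a minimal sketch, not cudf code, of why returning `Self` from the classmethod helps. It assumes `typing_extensions` is available (or `typing.Self` on Python >= 3.11); the class names are illustrative only.

```python
# Minimal sketch, not cudf code: why "-> Self" on _from_data is useful.
# Assumes typing_extensions (or typing.Self on Python >= 3.11).
from typing import MutableMapping

from typing_extensions import Self


class Frame:
    def __init__(self, data: MutableMapping) -> None:
        self._data = data

    @classmethod
    def _from_data(cls, data: MutableMapping) -> Self:
        # "cls" is the concrete class the method is called on, so the
        # return type is that subclass rather than the base Frame.
        obj = cls.__new__(cls)
        Frame.__init__(obj, data)
        return obj


class DataFrame(Frame):
    pass


# Type checkers now infer DataFrame here instead of Frame, so call sites
# constructing from a column mapping no longer need casts.
df = DataFrame._from_data({"a": [1, 2, 3]})
```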
@@ -179,7 +175,7 @@ def _mimic_inplace(

@property
@_cudf_nvtx_annotate
def size(self):
def size(self) -> int:
"""
Return the number of elements in the underlying data.

@@ -272,7 +268,7 @@ def memory_usage(self, deep=False):
raise NotImplementedError

@_cudf_nvtx_annotate
def __len__(self):
def __len__(self) -> int:
return self._num_rows

@_cudf_nvtx_annotate
@@ -294,7 +290,7 @@ def astype(self, dtype, copy=False, **kwargs):
)

@_cudf_nvtx_annotate
def equals(self, other):
def equals(self, other) -> bool:
"""
Test whether two objects contain the same elements.

@@ -378,7 +374,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self:

@property
@_cudf_nvtx_annotate
def values(self):
def values(self) -> cupy.ndarray:
"""
Return a CuPy representation of the DataFrame.

@@ -394,7 +390,7 @@ def values(self):

@property
@_cudf_nvtx_annotate
def values_host(self):
def values_host(self) -> np.ndarray:
"""
Return a NumPy representation of the data.

@@ -550,7 +546,7 @@ def to_numpy(
)

@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
"""
Replace values where the condition is False.

@@ -621,7 +617,7 @@ def where(self, cond, other=None, inplace=False):
raise NotImplementedError

@_cudf_nvtx_annotate
def mask(self, cond, other=None, inplace=False):
def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
"""
Replace values where the condition is True.

@@ -731,8 +727,13 @@ def pipe(self, func, *args, **kwargs):

@_cudf_nvtx_annotate
def fillna(
self, value=None, method=None, axis=None, inplace=False, limit=None
):
self,
value=None,
method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None,
axis=None,
inplace: bool = False,
limit=None,
) -> Optional[Self]:
"""Fill null values with ``value`` or specified ``method``.

Parameters
@@ -851,15 +852,15 @@ def fillna(
if isinstance(value, cudf.Series):
value = value.reindex(self._data.names)
elif isinstance(value, cudf.DataFrame):
if not self.index.equals(value.index):
value = value.reindex(self.index)
if not self.index.equals(value.index): # type: ignore[attr-defined]
value = value.reindex(self.index) # type: ignore[attr-defined]
else:
value = value
elif not isinstance(value, abc.Mapping):
value = {name: copy.deepcopy(value) for name in self._data.names}
else:
value = {
key: value.reindex(self.index)
key: value.reindex(self.index) # type: ignore[attr-defined]
if isinstance(value, cudf.Series)
else value
for key, value in value.items()
@@ -901,44 +902,14 @@ def _drop_column(self, name):
raise KeyError(f"column '{name}' does not exist")
del self._data[name]

@_cudf_nvtx_annotate
def _drop_na_columns(self, how="any", subset=None, thresh=None):
"""
Drop columns containing nulls
"""
out_cols = []

if subset is None:
df = self
else:
df = self.take(subset)

if thresh is None:
if how == "all":
thresh = 1
else:
thresh = len(df)

for name, col in df._data.items():
try:
check_col = col.nans_to_nulls()
except AttributeError:
check_col = col
no_threshold_valid_count = (
len(col) - check_col.null_count
) < thresh
if no_threshold_valid_count:
continue
out_cols.append(name)

return self[out_cols]

@_cudf_nvtx_annotate
def _quantile_table(
self,
q,
interpolation="LINEAR",
is_sorted=False,
q: float,
interpolation: Literal[
"LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST"
] = "LINEAR",
is_sorted: bool = False,
column_order=(),
null_precedence=(),
):
@@ -966,7 +937,7 @@ def _quantile_table(

@classmethod
@_cudf_nvtx_annotate
def from_arrow(cls, data):
def from_arrow(cls, data: pa.Table) -> Self:
"""Convert from PyArrow Table to Frame

Parameters
@@ -1143,7 +1114,7 @@ def to_arrow(self):
)

@_cudf_nvtx_annotate
def _positions_from_column_names(self, column_names):
def _positions_from_column_names(self, column_names) -> list[int]:
"""Map each column name into their positions in the frame.

The order of indices returned corresponds to the column order in this
@@ -1545,7 +1516,7 @@ def argsort(
).values

@_cudf_nvtx_annotate
def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
def _get_sorted_inds(
self,
by=None,
ascending: bool = True,
na_position: Literal["first", "last"] = "last",
) -> ColumnBase:
"""
Get the indices required to sort self according to the columns
specified in by.
@@ -1559,13 +1535,11 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
)._columns
]

# If given a scalar need to construct a sequence of length # of columns
if np.isscalar(ascending):
ascending = [ascending] * len(to_sort)
ascending_lst = [ascending] * len(to_sort)

return libcudf.sort.order_by(
to_sort,
ascending,
ascending_lst,
na_position,
stable=True,
)
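The `where`/`mask`/`fillna` signatures above now spell out the usual in-place contract through typing: `Optional[Self]` for the return value and `Literal[...]` for the accepted `method` strings. A minimal sketch of that pattern follows; it is not cudf code, and the toy classes with list-backed columns are assumptions for illustration only.

```python
# Minimal sketch, not cudf code: the Optional[Self] + Literal pattern used
# for where/mask/fillna. With inplace=True the method mutates self and
# returns None; otherwise it returns a new instance of the caller's class.
from typing import Literal, MutableMapping, Optional

from typing_extensions import Self


class Frame:
    def __init__(self, data: MutableMapping) -> None:
        self._data = data

    def fillna(
        self,
        value,
        method: Optional[Literal["ffill", "bfill"]] = None,
        inplace: bool = False,
    ) -> Optional[Self]:
        if method is not None:
            raise NotImplementedError("sketch only handles scalar fills")
        filled = {
            name: [value if v is None else v for v in col]
            for name, col in self._data.items()
        }
        if inplace:
            self._data = filled
            return None
        return type(self)(filled)


class DataFrame(Frame):
    pass


df = DataFrame({"a": [1, None, 3]})
new_df = df.fillna(0)        # type checkers see Optional[DataFrame]
df.fillna(0, inplace=True)   # mutates df, returns None
```

Annotating the in-place result as `Optional[Self]` also lets a type checker flag code that tries to chain further calls onto the result of an `inplace=True` call without narrowing away the `None`.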
8 changes: 1 addition & 7 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -1251,10 +1251,6 @@ def pipe(self, func, *args, **kwargs):
def _jit_groupby_apply(
self, function, group_names, offsets, group_keys, grouped_values, *args
):
# Nulls are not yet supported
if self.grouping._obj._has_nulls:
raise ValueError("Nulls not yet supported with groupby JIT engine")

chunk_results = jit_groupby_apply(
offsets, grouped_values, function, *args
)
@@ -1445,9 +1441,7 @@ def mult(df):
group_names, offsets, group_keys, grouped_values = self._grouped()

if engine == "auto":
if (not grouped_values._has_nulls) and _can_be_jitted(
grouped_values, function, args
):
if _can_be_jitted(grouped_values, function, args):
engine = "jit"
else:
engine = "cudf"
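With the null check folded into `_can_be_jitted` (see the `groupby_utils.py` hunk below), the `engine="auto"` branch reduces to a single capability test. A rough, self-contained sketch of that dispatch shape, not the real cudf implementation, with toy column objects standing in for cudf columns:

```python
# Minimal sketch, not cudf code: shape of the engine="auto" dispatch after
# this change. The has_nulls guard now lives inside the capability check.
from typing import Callable, List, Optional


class FakeColumn:
    """Toy stand-in for a cudf column."""

    def __init__(self, values: List[Optional[int]]) -> None:
        self.values = values

    def has_nulls(self) -> bool:
        return any(v is None for v in self.values)


def can_be_jitted(columns: List[FakeColumn], function: Callable) -> bool:
    # Illustrative only: the real check also inspects bytecode and dtypes.
    return not any(col.has_nulls() for col in columns)


def choose_engine(
    columns: List[FakeColumn], function: Callable, engine: str = "auto"
) -> str:
    if engine == "auto":
        return "jit" if can_be_jitted(columns, function) else "cudf"
    return engine


cols = [FakeColumn([1, 2, 3]), FakeColumn([4, None, 6])]
print(choose_engine(cols, lambda df: df))  # "cudf", because a column has nulls
```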
37 changes: 36 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
@@ -14,6 +14,7 @@
Callable,
Dict,
List,
Literal,
MutableMapping,
Optional,
Tuple,
@@ -2438,7 +2439,9 @@ def sort_values(
out.columns = self._data.to_pandas_index()
return out

def _n_largest_or_smallest(self, largest, n, columns, keep):
def _n_largest_or_smallest(
self, largest: bool, n: int, columns, keep: Literal["first", "last"]
):
# Get column to operate on
if isinstance(columns, str):
columns = [columns]
@@ -3067,6 +3070,38 @@ def dropna(

return self._mimic_inplace(result, inplace=inplace)

@_cudf_nvtx_annotate
def _drop_na_columns(self, how="any", subset=None, thresh=None):
"""
Drop columns containing nulls
"""
out_cols = []

if subset is None:
df = self
else:
df = self.take(subset)

if thresh is None:
if how == "all":
thresh = 1
else:
thresh = len(df)

for name, col in df._data.items():
try:
check_col = col.nans_to_nulls()
except AttributeError:
check_col = col
no_threshold_valid_count = (
len(col) - check_col.null_count
) < thresh
if no_threshold_valid_count:
continue
out_cols.append(name)

return self[out_cols]

def _drop_na_rows(self, how="any", subset=None, thresh=None):
"""
Drop null rows from `self`.
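`_drop_na_columns` moves here from `frame.py` unchanged; its threshold rule keeps a column only if its non-null count is at least `thresh`, where `how="all"` implies `thresh=1` and `how="any"` implies `thresh=len(df)`. A small plain-Python sketch of that rule, not cudf code, with lists standing in for columns:

```python
# Minimal sketch, not cudf code: the threshold rule _drop_na_columns applies,
# with plain lists standing in for columns and None standing in for nulls.
from typing import Dict, List, Optional


def drop_na_columns(
    data: Dict[str, List], how: str = "any", thresh: Optional[int] = None
) -> Dict[str, List]:
    n_rows = max((len(col) for col in data.values()), default=0)
    if thresh is None:
        # how="all": drop only fully-null columns (keep if >= 1 non-null).
        # how="any": drop any column containing a null (keep only if all
        # n_rows values are non-null).
        thresh = 1 if how == "all" else n_rows
    return {
        name: col
        for name, col in data.items()
        if sum(v is not None for v in col) >= thresh
    }


data = {"a": [1, None, 3], "b": [None, None, None], "c": [4, 5, 6]}
print(drop_na_columns(data, how="any"))  # {'c': [4, 5, 6]}
print(drop_na_columns(data, how="all"))  # drops only 'b'
```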
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/udf/groupby_utils.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.


import cupy as cp
@@ -209,6 +209,9 @@ def _can_be_jitted(frame, func, args):
# Numba requires bytecode to be present to proceed.
# See https://github.com/numba/numba/issues/4587
return False

if any(col.has_nulls() for col in frame._data.values()):
return False
np_field_types = np.dtype(
list(
_supported_dtypes_from_frame(
Expand Down
Loading