From 301f5ea3cad1fe5fcb58cdc9eec3cad0ffa89bfb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:47:43 -0800 Subject: [PATCH 1/8] Move _drop_na_columns, type _has_nulls --- python/cudf/cudf/core/frame.py | 34 +--------------------- python/cudf/cudf/core/groupby/groupby.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 32 ++++++++++++++++++++ python/cudf/cudf/core/udf/groupby_utils.py | 4 ++- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5f7a86e86d8..ac5b092808a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -95,7 +95,7 @@ def _dtypes(self): ) @property - def _has_nulls(self): + def _has_nulls(self) -> bool: return any(col.has_nulls() for col in self._data.values()) @_cudf_nvtx_annotate @@ -911,38 +911,6 @@ def _drop_column(self, name): raise KeyError(f"column '{name}' does not exist") del self._data[name] - @_cudf_nvtx_annotate - def _drop_na_columns(self, how="any", subset=None, thresh=None): - """ - Drop columns containing nulls - """ - out_cols = [] - - if subset is None: - df = self - else: - df = self.take(subset) - - if thresh is None: - if how == "all": - thresh = 1 - else: - thresh = len(df) - - for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col - no_threshold_valid_count = ( - len(col) - check_col.null_count - ) < thresh - if no_threshold_valid_count: - continue - out_cols.append(name) - - return self[out_cols] - @_cudf_nvtx_annotate def _quantile_table( self, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4e8947652ff..f5275858ba7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1445,9 +1445,7 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "auto": - if (not grouped_values._has_nulls) and _can_be_jitted( - grouped_values, function, args - ): + if _can_be_jitted(grouped_values, function, args): engine = "jit" else: engine = "cudf" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2a35ac0f959..17b818ada5f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3084,6 +3084,38 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) + @_cudf_nvtx_annotate + def _drop_na_columns(self, how="any", subset=None, thresh=None): + """ + Drop columns containing nulls + """ + out_cols = [] + + if subset is None: + df = self + else: + df = self.take(subset) + + if thresh is None: + if how == "all": + thresh = 1 + else: + thresh = len(df) + + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col + no_threshold_valid_count = ( + len(col) - check_col.null_count + ) < thresh + if no_threshold_valid_count: + continue + out_cols.append(name) + + return self[out_cols] + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index c82f8d2cd7b..199260e28cb 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import cupy as cp @@ -209,6 +209,8 @@ def _can_be_jitted(frame, func, args): # Numba requires bytecode to be present to proceed. # See https://github.com/numba/numba/issues/4587 return False + elif frame._has_nulls: + return False np_field_types = np.dtype( list( _supported_dtypes_from_frame( From b86aa1873097912091432543b99b07d210b538bb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Jan 2024 14:59:03 -0800 Subject: [PATCH 2/8] Remove _has_nulls --- python/cudf/cudf/core/frame.py | 4 ---- python/cudf/cudf/core/groupby/groupby.py | 4 +++- python/cudf/cudf/core/udf/groupby_utils.py | 4 +++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ac5b092808a..08d6a0463a7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -94,10 +94,6 @@ def _dtypes(self): zip(self._data.names, (col.dtype for col in self._data.columns)) ) - @property - def _has_nulls(self) -> bool: - return any(col.has_nulls() for col in self._data.values()) - @_cudf_nvtx_annotate def serialize(self): # TODO: See if self._data can be serialized outright diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f5275858ba7..3612d4c45d8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1252,7 +1252,9 @@ def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): # Nulls are not yet supported - if self.grouping._obj._has_nulls: + obj = self.grouping._obj + any_na = obj.isna() + if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()): raise ValueError("Nulls not yet supported with groupby JIT engine") chunk_results = jit_groupby_apply( diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 199260e28cb..6b11a90f578 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -209,7 +209,9 @@ def _can_be_jitted(frame, func, args): # Numba requires bytecode to be present to proceed. # See https://github.com/numba/numba/issues/4587 return False - elif frame._has_nulls: + + any_na = frame.isna() + if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()): return False np_field_types = np.dtype( list( From c175612b0327e12a72019afe003b223e2ab7eb72 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:15:48 -0800 Subject: [PATCH 3/8] Add some typing to frame.py --- python/cudf/cudf/core/frame.py | 56 +++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 08d6a0463a7..95bcb12122f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -131,13 +131,13 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate - def _from_data(cls, data: MutableMapping): + def _from_data(cls, data: MutableMapping) -> Self: obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate - def _from_data_like_self(self, data: MutableMapping): + def _from_data_like_self(self, data: MutableMapping) -> Self: return self._from_data(data) @classmethod @@ -185,7 +185,7 @@ def _mimic_inplace( @property @_cudf_nvtx_annotate - def size(self): + def size(self) -> int: """ Return the number of elements in the underlying data. @@ -278,7 +278,7 @@ def memory_usage(self, deep=False): raise NotImplementedError @_cudf_nvtx_annotate - def __len__(self): + def __len__(self) -> int: return self._num_rows @_cudf_nvtx_annotate @@ -300,7 +300,7 @@ def astype(self, dtype, copy=False, **kwargs): ) @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: """ Test whether two objects contain the same elements. @@ -384,7 +384,7 @@ def _get_columns_by_label(self, labels, *, downcast=False) -> Self: @property @_cudf_nvtx_annotate - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the DataFrame. @@ -400,7 +400,7 @@ def values(self): @property @_cudf_nvtx_annotate - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a NumPy representation of the data. @@ -556,7 +556,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is False. @@ -627,7 +627,7 @@ def where(self, cond, other=None, inplace=False): raise NotImplementedError @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace=False): + def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: """ Replace values where the condition is True. @@ -737,8 +737,13 @@ def pipe(self, func, *args, **kwargs): @_cudf_nvtx_annotate def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): + self, + value=None, + method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + axis=None, + inplace: bool = False, + limit=None, + ) -> Optional[Self]: """Fill null values with ``value`` or specified ``method``. Parameters @@ -857,15 +862,15 @@ def fillna( if isinstance(value, cudf.Series): value = value.reindex(self._data.names) elif isinstance(value, cudf.DataFrame): - if not self.index.equals(value.index): - value = value.reindex(self.index) + if not self.index.equals(value.index): # type: ignore[attr-defined] + value = value.reindex(self.index) # type: ignore[attr-defined] else: value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} else: value = { - key: value.reindex(self.index) + key: value.reindex(self.index) # type: ignore[attr-defined] if isinstance(value, cudf.Series) else value for key, value in value.items() @@ -910,9 +915,11 @@ def _drop_column(self, name): @_cudf_nvtx_annotate def _quantile_table( self, - q, - interpolation="LINEAR", - is_sorted=False, + q: float, + interpolation: Literal[ + "LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST" + ] = "LINEAR", + is_sorted: bool = False, column_order=(), null_precedence=(), ): @@ -940,7 +947,7 @@ def _quantile_table( @classmethod @_cudf_nvtx_annotate - def from_arrow(cls, data): + def from_arrow(cls, data: pa.Table) -> Self: """Convert from PyArrow Table to Frame Parameters @@ -1117,7 +1124,7 @@ def to_arrow(self): ) @_cudf_nvtx_annotate - def _positions_from_column_names(self, column_names): + def _positions_from_column_names(self, column_names) -> list[int]: """Map each column name into their positions in the frame. The order of indices returned corresponds to the column order in this @@ -1519,7 +1526,12 @@ def argsort( ).values @_cudf_nvtx_annotate - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + def _get_sorted_inds( + self, + by=None, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> ColumnBase: """ Get the indices required to sort self according to the columns specified in by. @@ -1535,11 +1547,11 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * len(to_sort) + ascending_lst = [ascending] * len(to_sort) return libcudf.sort.order_by( to_sort, - ascending, + ascending_lst, na_position, stable=True, ) From 62a68c74522525c8d3ff3443c70cd4a2063f031b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jan 2024 14:16:15 -0800 Subject: [PATCH 4/8] add any --- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 6b11a90f578..8b58252c8ad 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - any_na = frame.isna() + any_na = frame.isna().any() if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()): return False np_field_types = np.dtype( From d6824d649030b0b6e0d5977526aa90756185d97d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:15:53 -0800 Subject: [PATCH 5/8] Add any --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8b6643a4170..7192ca58586 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1253,7 +1253,7 @@ def _jit_groupby_apply( ): # Nulls are not yet supported obj = self.grouping._obj - any_na = obj.isna() + any_na = obj.isna().any() if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()): raise ValueError("Nulls not yet supported with groupby JIT engine") From 3004e1e7f6ea67141ad61a96633e2e0858c35229 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:03:28 -0800 Subject: [PATCH 6/8] Just use _can_jit --- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/groupby/groupby.py | 6 ------ python/cudf/cudf/core/udf/groupby_utils.py | 3 +-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 697e36ec55d..9f4089c60a0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1537,11 +1537,11 @@ def _get_sorted_inds( # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending_lst = [ascending] * len(to_sort) + ascending = [ascending] * len(to_sort) # type: ignore[assignment] return libcudf.sort.order_by( to_sort, - ascending_lst, + ascending, na_position, stable=True, ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7192ca58586..e28ba233c56 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1251,12 +1251,6 @@ def pipe(self, func, *args, **kwargs): def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): - # Nulls are not yet supported - obj = self.grouping._obj - any_na = obj.isna().any() - if (obj.ndim == 1 and any_na) or (obj.ndim == 2 and any_na.any()): - raise ValueError("Nulls not yet supported with groupby JIT engine") - chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 8b58252c8ad..06d9296ca0f 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,8 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - any_na = frame.isna().any() - if (frame.ndim == 1 and any_na) or (frame.ndim == 2 and any_na.any()): + if any(col.has_nulls() for col in frame._data.values()): return False np_field_types = np.dtype( list( From 9e80bf146c54d90a205d446a579b5f4580599396 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:09:25 -0800 Subject: [PATCH 7/8] stronger typing to _get_sorted_inds --- python/cudf/cudf/core/frame.py | 6 ++---- python/cudf/cudf/core/indexed_frame.py | 5 ++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f4089c60a0..197e65947dc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1535,13 +1535,11 @@ def _get_sorted_inds( )._columns ] - # If given a scalar need to construct a sequence of length # of columns - if np.isscalar(ascending): - ascending = [ascending] * len(to_sort) # type: ignore[assignment] + ascending_lst = [ascending] * len(to_sort) return libcudf.sort.order_by( to_sort, - ascending, + ascending_lst, na_position, stable=True, ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 620ba82e7f4..a5f1ee60fe4 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -14,6 +14,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Tuple, @@ -2438,7 +2439,9 @@ def sort_values( out.columns = self._data.to_pandas_index() return out - def _n_largest_or_smallest(self, largest, n, columns, keep): + def _n_largest_or_smallest( + self, largest: bool, n: int, columns, keep: Literal["first", "last"] + ): # Get column to operate on if isinstance(columns, str): columns = [columns] From ca1e911755c71f3dbc618ad79dc14484cf79b51a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:54:41 -0800 Subject: [PATCH 8/8] _get_sorted_inds accepts list-like ascending --- python/cudf/cudf/core/frame.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 197e65947dc..f165c329e46 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1519,7 +1519,7 @@ def argsort( def _get_sorted_inds( self, by=None, - ascending: bool = True, + ascending=True, na_position: Literal["first", "last"] = "last", ) -> ColumnBase: """ @@ -1535,7 +1535,10 @@ def _get_sorted_inds( )._columns ] - ascending_lst = [ascending] * len(to_sort) + if is_scalar(ascending): + ascending_lst = [ascending] * len(to_sort) + else: + ascending_lst = list(ascending) return libcudf.sort.order_by( to_sort,