Refactor Frame reductions (#8944)

This PR moves the implementations of reductions out of the `Series`/`DataFrame` classes and into `Frame`. The resulting reduction code is implemented in terms of columns, which improves the performance of `DataFrame` reductions, and having a single code path makes the code easier to maintain. The `median` and `sum_of_squares` reductions, which were previously only available for `Series`, are now transparently enabled for `DataFrame` as well. This PR also explicitly disables reductions for Index objects to match the pandas Index API. Because a few reductions had previously been implemented for Index objects, removing them constitutes a breaking change.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)
  - Ashwin Srinath (https://github.com/shwina)

URL: #8944
rapidsai · Aug 7, 2021 · 115f3b6 · 115f3b6
1 parent 4b5853d
commit 115f3b6
Show file tree

Hide file tree

Showing 8 changed files with 732 additions and 1,230 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -172,11 +172,31 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
     def _null_equals(self, other: ColumnBase) -> ColumnBase:
         return self.binary_operator("NULL_EQUALS", other)
 
-    def all(self) -> bool:
-        return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))
+    def all(self, skipna: bool = True) -> bool:
+        # If all entries are null the result is True, including when the column
+        # is empty.
+        result_col = self.nans_to_nulls() if skipna else self
 
-    def any(self) -> bool:
-        return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_))
+        if result_col.null_count == result_col.size:
+            return True
+
+        if isinstance(result_col, ColumnBase):
+            return libcudf.reduce.reduce("all", result_col, dtype=np.bool_)
+        else:
+            return result_col
+
+    def any(self, skipna: bool = True) -> bool:
+        # Early exit for fast cases.
+        result_col = self.nans_to_nulls() if skipna else self
+        if not skipna and result_col.has_nulls:
+            return True
+        elif skipna and result_col.null_count == result_col.size:
+            return False
+
+        if isinstance(result_col, ColumnBase):
+            return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
+        else:
+            return result_col
 
     def __sizeof__(self) -> int:
         n = 0
@@ -911,9 +931,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
             return self.as_interval_column(dtype, **kwargs)
         elif is_decimal_dtype(dtype):
             return self.as_decimal_column(dtype, **kwargs)
-        elif np.issubdtype(dtype, np.datetime64):
+        elif np.issubdtype(cast(Any, dtype), np.datetime64):
             return self.as_datetime_column(dtype, **kwargs)
-        elif np.issubdtype(dtype, np.timedelta64):
+        elif np.issubdtype(cast(Any, dtype), np.timedelta64):
             return self.as_timedelta_column(dtype, **kwargs)
         else:
             return self.as_numerical_column(dtype, **kwargs)