Refactor sorting APIs (#9464)

This PR refactors most sorting APIs of `Frame` and its subclasses. To support these changes, it also refactors the implementation of `take`.

New Features:
- `DataFrame.nlargest`/`DataFrame.nsmallest` now accept multiple columns. Previously this would fail unexpectedly.
- `BaseIndex.sort_values` now accepts `na_position` to be consistent with other sorts.
- `DataFrame.argsort` now accepts an (optional) `by` parameter to indicate what columns to order by.

Performance:
- `DataFrame.nlargest`/`DataFrame.nsmallest` are up to 10x faster for small inputs.
- `take` is significantly faster for all classes. For instance, I see about a 2x speedup for `Series`.
- `DataFrame.sort_values` is ~10% faster for small inputs.

Deprecations/Removals/Breaking Changes:
- Passing anything other than numerical indices to `take` is deprecated. Boolean masks are deprecated and will no longer be supported in the future. This matches pandas behavior and allows us to simplify our code.
- The parameter for `take` has been renamed to `indices` from `positions` for consistency with pandas. This is a breaking change. If reviewers think it's important to still support `positions` as a kwarg we could add a backwards compatibility layer. My thinking is that this is probably not a frequently used API, and where it is used it's almost always used with a positional argument, so renaming the first argument is not a huge issue.

There's one additional note that fits under a couple of the headings. While unifying implementations of `argsort` it made sense to change the behavior of `DataFrame.argsort` to return a cupy array instead of a `Series`. There's no corresponding pandas API so we have some freedom to choose the appropriate output, and I think an array makes more sense. However, `Column.values` is not that fast (yet, I plan to optimize soon), so it's actually slower right now to return the array than to return a `Series` constructed via `_from_data`. I think this is OK for now, but if reviewers feel strongly about it I can change it back to returning a `Series`.
Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9464
rapidsai · Nov 2, 2021 · 2ecebe1 · 2ecebe1
1 parent 1c10790
commit 2ecebe1
Show file tree

Hide file tree

Showing 10 changed files with 408 additions and 458 deletions.
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -3,9 +3,9 @@
 from __future__ import annotations, division, print_function
 
 import pickle
+import warnings
 from typing import Any, Set
 
-import cupy
 import pandas as pd
 
 import cudf
@@ -499,66 +499,6 @@ def fillna(self, value, downcast=None):
 
         return super().fillna(value=value)
 
-    def take(self, indices):
-        """Gather only the specific subset of indices
-
-        Parameters
-        ----------
-        indices: An array-like that maps to values contained in this Index.
-        """
-        return self[indices]
-
-    def argsort(self, ascending=True, **kwargs):
-        """
-        Return the integer indices that would sort the index.
-
-        Parameters
-        ----------
-        ascending : bool, default True
-            If True, returns the indices for ascending order.
-            If False, returns the indices for descending order.
-
-        Returns
-        -------
-        array : A cupy array containing Integer indices that
-            would sort the index if used as an indexer.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> index = cudf.Index([10, 100, 1, 1000])
-        >>> index
-        Int64Index([10, 100, 1, 1000], dtype='int64')
-        >>> index.argsort()
-        array([2, 0, 1, 3], dtype=int32)
-
-        The order of argsort can be reversed using
-        ``ascending`` parameter, by setting it to ``False``.
-        >>> index.argsort(ascending=False)
-        array([3, 1, 0, 2], dtype=int32)
-
-        ``argsort`` on a MultiIndex:
-
-        >>> index = cudf.MultiIndex(
-        ...      levels=[[1, 3, 4, -10], [1, 11, 5]],
-        ...      codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
-        ...      names=["x", "y"],
-        ... )
-        >>> index
-        MultiIndex([(  1,  1),
-                    (  1,  5),
-                    (  3, 11),
-                    (  4, 11),
-                    (-10,  1)],
-                   names=['x', 'y'])
-        >>> index.argsort()
-        array([4, 0, 1, 2, 3], dtype=int32)
-        >>> index.argsort(ascending=False)
-        array([3, 2, 1, 0, 4], dtype=int32)
-        """
-        indices = self._values.argsort(ascending=ascending, **kwargs)
-        return cupy.asarray(indices)
-
     def to_frame(self, index=True, name=None):
         """Create a DataFrame with a column containing this Index
 
@@ -621,6 +561,10 @@ def gpu_values(self):
         """
         View the data as a numba device array object
         """
+        warnings.warn(
+            "The gpu_values property is deprecated and will be removed.",
+            FutureWarning,
+        )
         return self._values.data_array_view
 
     def append(self, other):
@@ -1025,7 +969,13 @@ def _intersection(self, other, sort=None):
             return intersection_result.sort_values()
         return intersection_result
 
-    def sort_values(self, return_indexer=False, ascending=True, key=None):
+    def sort_values(
+        self,
+        return_indexer=False,
+        ascending=True,
+        na_position="last",
+        key=None,
+    ):
         """
         Return a sorted copy of the index, and optionally return the indices
         that sorted the index itself.
@@ -1036,6 +986,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None):
             Should the indices that would sort the index be returned.
         ascending : bool, default True
             Should the index values be sorted in an ascending order.
+        na_position : {'first' or 'last'}, default 'last'
+            Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
+            the end.
         key : None, optional
             This parameter is NON-FUNCTIONAL.
 
@@ -1101,12 +1054,14 @@ def sort_values(self, return_indexer=False, ascending=True, key=None):
         """
         if key is not None:
             raise NotImplementedError("key parameter is not yet implemented.")
+        if na_position not in {"first", "last"}:
+            raise ValueError(f"invalid na_position: {na_position}")
 
-        indices = self._values.argsort(ascending=ascending)
-        index_sorted = cudf.Index(self.take(indices), name=self.name)
+        indices = self.argsort(ascending=ascending, na_position=na_position)
+        index_sorted = self.take(indices)
 
         if return_indexer:
-            return index_sorted, cupy.asarray(indices)
+            return index_sorted, indices
         else:
             return index_sorted
 

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -2557,43 +2557,11 @@ class max_speed
         if not inplace:
             return result
 
-    def take(self, positions, axis=0, keep_index=True):
-        """
-        Return a new DataFrame containing the rows specified by *positions*
-
-        Parameters
-        ----------
-        positions : array-like
-            Integer or boolean array-like specifying the rows of the output.
-            If integer, each element represents the integer index of a row.
-            If boolean, *positions* must be of the same length as *self*,
-            and represents a boolean mask.
-
-        Returns
-        -------
-        out : DataFrame
-            New DataFrame
-
-        Examples
-        --------
-        >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0],
-        ...                    'b': cudf.Series(['a', 'b', 'c'])})
-        >>> a.take([0, 2, 2])
-             a  b
-        0  1.0  a
-        2  3.0  c
-        2  3.0  c
-        >>> a.take([True, False, True])
-             a  b
-        0  1.0  a
-        2  3.0  c
-        """
+    def take(self, indices, axis=0, keep_index=None):
+        axis = self._get_axis_from_axis_arg(axis)
         if axis != 0:
             raise NotImplementedError("Only axis=0 is supported.")
-        positions = as_column(positions)
-        if is_bool_dtype(positions):
-            return self._apply_boolean_mask(positions)
-        out = self._gather(positions, keep_index=keep_index)
+        out = super().take(indices, keep_index)
         out.columns = self.columns
         return out
 
@@ -3246,127 +3214,6 @@ def _label_encoding(
         outdf.insert(len(outdf._data), newname, newcol)
         return outdf
 
-    @annotate("ARGSORT", color="yellow", domain="cudf_python")
-    def argsort(self, ascending=True, na_position="last"):
-        """
-        Sort by the values.
-
-        Parameters
-        ----------
-        ascending : bool or list of bool, default True
-            If True, sort values in ascending order, otherwise descending.
-        na_position : {‘first’ or ‘last’}, default ‘last’
-            Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs
-            at the end.
-
-        Returns
-        -------
-        out_column_inds : cuDF Column of indices sorted based on input
-
-        Notes
-        -----
-        Difference from pandas:
-
-        - Support axis='index' only.
-        - Not supporting: inplace, kind
-        - Ascending can be a list of bools to control per column
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]})
-        >>> df
-            a   b
-        0  10 -10
-        1   0  10
-        2   2   1
-        >>> inds = df.argsort()
-        >>> inds
-        0    1
-        1    2
-        2    0
-        dtype: int32
-        >>> df.take(inds)
-            a   b
-        1   0  10
-        2   2   1
-        0  10 -10
-        """
-        inds_col = self._get_sorted_inds(
-            ascending=ascending, na_position=na_position
-        )
-        return cudf.Series(inds_col)
-
-    def sort_values(
-        self,
-        by,
-        axis=0,
-        ascending=True,
-        inplace=False,
-        kind="quicksort",
-        na_position="last",
-        ignore_index=False,
-    ):
-        """
-        Sort by the values row-wise.
-
-        Parameters
-        ----------
-        by : str or list of str
-            Name or list of names to sort by.
-        ascending : bool or list of bool, default True
-            Sort ascending vs. descending. Specify list for multiple sort
-            orders. If this is a list of bools, must match the length of the
-            by.
-        na_position : {‘first’, ‘last’}, default ‘last’
-            'first' puts nulls at the beginning, 'last' puts nulls at the end
-        ignore_index : bool, default False
-            If True, index will not be sorted.
-
-        Returns
-        -------
-        sorted_obj : cuDF DataFrame
-
-        Notes
-        -----
-        Difference from pandas:
-          * Support axis='index' only.
-          * Not supporting: inplace, kind
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame()
-        >>> df['a'] = [0, 1, 2]
-        >>> df['b'] = [-3, 2, 0]
-        >>> df.sort_values('b')
-           a  b
-        0  0 -3
-        2  2  0
-        1  1  2
-        """
-        if inplace:
-            raise NotImplementedError("`inplace` not currently implemented.")
-        if kind not in {"quicksort", "mergesort", "heapsort", "stable"}:
-            raise AttributeError(
-                f"{kind} is not a valid sorting algorithm for "
-                f"'DataFrame' object"
-            )
-        elif kind != "quicksort":
-            msg = (
-                f"GPU-accelerated {kind} is currently not supported, "
-                f"now defaulting to GPU-accelerated quicksort."
-            )
-            warnings.warn(msg)
-        if axis != 0:
-            raise NotImplementedError("`axis` not currently implemented.")
-
-        # argsort the `by` column
-        return self.take(
-            self[by].argsort(ascending=ascending, na_position=na_position),
-            keep_index=not ignore_index,
-        )
-
     def agg(self, aggs, axis=None):
         """
         Aggregate using one or more operations over the specified axis.
@@ -3559,7 +3406,7 @@ def nlargest(self, n, columns, keep="first"):
         Italy     59000000  1937894      IT
         Brunei      434000    12128      BN
         """
-        return self._n_largest_or_smallest("nlargest", n, columns, keep)
+        return self._n_largest_or_smallest(True, n, columns, keep)
 
     def nsmallest(self, n, columns, keep="first"):
         """Get the rows of the DataFrame sorted by the n smallest value of *columns*
@@ -3627,26 +3474,7 @@ def nsmallest(self, n, columns, keep="first"):
         Tuvalu         11300   38      TV
         Nauru         337000  182      NR
         """
-        return self._n_largest_or_smallest("nsmallest", n, columns, keep)
-
-    def _n_largest_or_smallest(self, method, n, columns, keep):
-        # Get column to operate on
-        if not isinstance(columns, str):
-            [column] = columns
-        else:
-            column = columns
-
-        col = self[column].reset_index(drop=True)
-        # Operate
-        sorted_series = getattr(col, method)(n=n, keep=keep)
-        df = DataFrame()
-        new_positions = sorted_series.index.gpu_values
-        for k in self._data.names:
-            if k == column:
-                df[k] = sorted_series
-            else:
-                df[k] = self[k].reset_index(drop=True).take(new_positions)
-        return df.set_index(self.index.take(new_positions))
+        return self._n_largest_or_smallest(False, n, columns, keep)
 
     def transpose(self):
         """Transpose index and columns.