From 2ecebe17cf519db1bc7aa8d459a735ea9f90c199 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Nov 2021 11:52:47 -0700 Subject: [PATCH] Refactor sorting APIs (#9464) This PR refactors most sorting APIs of Frame and its subclasses. To support these changes, it also refactors the implementation of `take`. New Features: - `DataFrame.nlargest`/`nsmallest` now accept multiple columns. Previously this would fail unexpectedly. - `BaseIndex.sort_values` now accepts `na_position` to be consistent with other sorts. - `DataFrame.argsort` now accepts an optional `by` parameter to indicate which columns to order by. Performance: - `DataFrame.nlargest`/`nsmallest` are up to 10x faster for small inputs. - `take` is significantly faster for all classes. For instance, I see about a 2x speedup for Series. - `DataFrame.sort_values` is ~10% faster for small inputs. Deprecations/Removals/Breaking Changes: - Arguments to `take` other than numerical indices are deprecated. Boolean masks are deprecated and will no longer be supported in the future. This matches pandas behavior and allows us to simplify our code. - The parameter for `take` has been renamed to `indices` from `positions` for consistency with pandas. This is a breaking change. If reviewers think it's important to still support `positions` as a kwarg, we could add a backward-compatibility layer. My thinking is that this is probably not a frequently used API, and where it is used it's almost always used with a positional argument, so renaming the first argument is not a huge issue. There's one additional note that fits under a couple of the headings. While unifying implementations of `argsort`, it made sense to change the behavior of `DataFrame.argsort` to return a cupy array instead of a Series. There's no corresponding pandas API, so we have some freedom to choose the appropriate output, and I think an array makes more sense. 
However, `Column.values` is not that fast (yet, I plan to optimize soon), so it's actually slower right now to return the array than to return a Series constructed via `_from_data`. I think this is OK for now, but if reviewers feel strongly about it I can change it back to returning a Series. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9464 --- python/cudf/cudf/core/_base_index.py | 85 +++-------- python/cudf/cudf/core/dataframe.py | 182 +---------------------- python/cudf/cudf/core/frame.py | 166 +++++++++++++++++++-- python/cudf/cudf/core/index.py | 41 ++++- python/cudf/cudf/core/indexed_frame.py | 112 ++++++++++++++ python/cudf/cudf/core/multiindex.py | 61 +++----- python/cudf/cudf/core/series.py | 178 ++++++---------------- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_pickling.py | 4 +- python/cudf/cudf/tests/test_sorting.py | 33 +--- 10 files changed, 408 insertions(+), 458 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 590bff3b19d..eea8e3c418f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,9 +3,9 @@ from __future__ import annotations, division, print_function import pickle +import warnings from typing import Any, Set -import cupy import pandas as pd import cudf @@ -499,66 +499,6 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def take(self, indices): - """Gather only the specific subset of indices - - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. - """ - return self[indices] - - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. - - Parameters - ---------- - ascending : bool, default True - If True, returns the indices for ascending order. 
- If False, returns the indices for descending order. - - Returns - ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) - - ``argsort`` on a MultiIndex: - - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) - """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) - def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index @@ -621,6 +561,10 @@ def gpu_values(self): """ View the data as a numba device array object """ + warnings.warn( + "The gpu_values property is deprecated and will be removed.", + FutureWarning, + ) return self._values.data_array_view def append(self, other): @@ -1025,7 +969,13 @@ def _intersection(self, other, sort=None): return intersection_result.sort_values() return intersection_result - def sort_values(self, return_indexer=False, ascending=True, key=None): + def sort_values( + self, + return_indexer=False, + ascending=True, + na_position="last", + key=None, + ): """ Return a sorted copy of the index, and optionally return the indices that sorted the index itself. 
@@ -1036,6 +986,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. key : None, optional This parameter is NON-FUNCTIONAL. @@ -1101,12 +1054,14 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): """ if key is not None: raise NotImplementedError("key parameter is not yet implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") - indices = self._values.argsort(ascending=ascending) - index_sorted = cudf.Index(self.take(indices), name=self.name) + indices = self.argsort(ascending=ascending, na_position=na_position) + index_sorted = self.take(indices) if return_indexer: - return index_sorted, cupy.asarray(indices) + return index_sorted, indices else: return index_sorted diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8efdd55b258..42389ef6e4b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2557,43 +2557,11 @@ class max_speed if not inplace: return result - def take(self, positions, axis=0, keep_index=True): - """ - Return a new DataFrame containing the rows specified by *positions* - - Parameters - ---------- - positions : array-like - Integer or boolean array-like specifying the rows of the output. - If integer, each element represents the integer index of a row. - If boolean, *positions* must be of the same length as *self*, - and represents a boolean mask. - - Returns - ------- - out : DataFrame - New DataFrame - - Examples - -------- - >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], - ... 
'b': cudf.Series(['a', 'b', 'c'])}) - >>> a.take([0, 2, 2]) - a b - 0 1.0 a - 2 3.0 c - 2 3.0 c - >>> a.take([True, False, True]) - a b - 0 1.0 a - 2 3.0 c - """ + def take(self, indices, axis=0, keep_index=None): + axis = self._get_axis_from_axis_arg(axis) if axis != 0: raise NotImplementedError("Only axis=0 is supported.") - positions = as_column(positions) - if is_bool_dtype(positions): - return self._apply_boolean_mask(positions) - out = self._gather(positions, keep_index=keep_index) + out = super().take(indices, keep_index) out.columns = self.columns return out @@ -3246,127 +3214,6 @@ def _label_encoding( outdf.insert(len(outdf._data), newname, newcol) return outdf - @annotate("ARGSORT", color="yellow", domain="cudf_python") - def argsort(self, ascending=True, na_position="last"): - """ - Sort by the values. - - Parameters - ---------- - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {‘first’ or ‘last’}, default ‘last’ - Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs - at the end. - - Returns - ------- - out_column_inds : cuDF Column of indices sorted based on input - - Notes - ----- - Difference from pandas: - - - Support axis='index' only. - - Not supporting: inplace, kind - - Ascending can be a list of bools to control per column - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) - >>> df - a b - 0 10 -10 - 1 0 10 - 2 2 1 - >>> inds = df.argsort() - >>> inds - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> df.take(inds) - a b - 1 0 10 - 2 2 1 - 0 10 -10 - """ - inds_col = self._get_sorted_inds( - ascending=ascending, na_position=na_position - ) - return cudf.Series(inds_col) - - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - ): - """ - Sort by the values row-wise. 
- - Parameters - ---------- - by : str or list of str - Name or list of names to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. Specify list for multiple sort - orders. If this is a list of bools, must match the length of the - by. - na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end - ignore_index : bool, default False - If True, index will not be sorted. - - Returns - ------- - sorted_obj : cuDF DataFrame - - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['a'] = [0, 1, 2] - >>> df['b'] = [-3, 2, 0] - >>> df.sort_values('b') - a b - 0 0 -3 - 2 2 0 - 1 1 2 - """ - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - elif kind != "quicksort": - msg = ( - f"GPU-accelerated {kind} is currently not supported, " - f"now defaulting to GPU-accelerated quicksort." - ) - warnings.warn(msg) - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - # argsort the `by` column - return self.take( - self[by].argsort(ascending=ascending, na_position=na_position), - keep_index=not ignore_index, - ) - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. 
@@ -3559,7 +3406,7 @@ def nlargest(self, n, columns, keep="first"): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return self._n_largest_or_smallest("nlargest", n, columns, keep) + return self._n_largest_or_smallest(True, n, columns, keep) def nsmallest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n smallest value of *columns* @@ -3627,26 +3474,7 @@ def nsmallest(self, n, columns, keep="first"): Tuvalu 11300 38 TV Nauru 337000 182 NR """ - return self._n_largest_or_smallest("nsmallest", n, columns, keep) - - def _n_largest_or_smallest(self, method, n, columns, keep): - # Get column to operate on - if not isinstance(columns, str): - [column] = columns - else: - column = columns - - col = self[column].reset_index(drop=True) - # Operate - sorted_series = getattr(col, method)(n=n, keep=keep) - df = DataFrame() - new_positions = sorted_series.index.gpu_values - for k in self._data.names: - if k == column: - df[k] = sorted_series - else: - df[k] = self[k].reset_index(drop=True).take(new_positions) - return df.set_index(self.index.take(new_positions)) + return self._n_largest_or_smallest(False, n, columns, keep) def transpose(self): """Transpose index and columns. 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b14a4d91831..2c469a4ea6a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -30,6 +30,7 @@ from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_decimal_dtype, is_dict_like, is_integer_dtype, @@ -533,6 +534,7 @@ def _gather(self, gather_map, keep_index=True, nullify=False): ) result._copy_type_metadata(self, include_index=keep_index) + result._data.names = self._data.names if keep_index and self._index is not None: result._index.names = self._index.names return result @@ -2882,6 +2884,9 @@ def searchsorted( """ # Call libcudf++ search_sorted primitive + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + scalar_flag = None if is_scalar(values): scalar_flag = True @@ -2903,34 +2908,101 @@ def searchsorted( else: return result - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): - """ - Sort by the values. + @annotate("ARGSORT", color="yellow", domain="cudf_python") + def argsort( + self, + by=None, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. Parameters ---------- - by: list, optional - Labels specifying columns to sort by. By default, - sort by all columns of `self` + by : str or list of str, default None + Name or list of names to sort by. If None, sort by all columns. + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. 
ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. na_position : {‘first’ or ‘last’}, default ‘last’ Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at the end. + Returns ------- - out_column_inds : cuDF Column of indices sorted based on input + cupy.ndarray: The indices sorted based on input. - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - * Ascending can be a list of bools to control per column - """ + Examples + -------- + **Series** + + >>> import cudf + >>> s = cudf.Series([3, 1, 2]) + >>> s + 0 3 + 1 1 + 2 2 + dtype: int64 + >>> s.argsort() + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> s[s.argsort()] + 1 1 + 2 2 + 0 3 + dtype: int64 + + **DataFrame** + >>> import cudf + >>> df = cudf.DataFrame({'foo': [3, 1, 2]}) + >>> df.argsort() + array([1, 2, 0], dtype=int32) + + **Index** + >>> import cudf + >>> idx = cudf.Index([3, 1, 2]) + >>> idx.argsort() + array([1, 2, 0], dtype=int32) + """ # noqa: E501 + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + "defaulting to quicksort." + ) + + if isinstance(by, str): + by = [by] + return self._get_sorted_inds( + by=by, ascending=ascending, na_position=na_position + ).values + + def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + # Get an int64 column consisting of the indices required to sort self + # according to the columns specified in by. 
to_sort = ( self if by is None - else self._get_columns_by_label(by, downcast=False) + else self._get_columns_by_label(list(by), downcast=False) ) # If given a scalar need to construct a sequence of length # of columns @@ -2939,6 +3011,74 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + def take(self, indices, keep_index=None): + """Return a new object containing the rows specified by *positions* + + Parameters + ---------- + indices : array-like + Array of ints indicating which positions to take. + keep_index : bool, default True + Whether to retain the index in result or not. + + Returns + ------- + out : Series or DataFrame or Index + New object with desired subset of rows. + + Examples + -------- + **Series** + >>> s = cudf.Series(['a', 'b', 'c', 'd', 'e']) + >>> s.take([2, 0, 4, 3]) + 2 c + 0 a + 4 e + 3 d + dtype: object + + **DataFrame** + + >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], + ... 'b': cudf.Series(['a', 'b', 'c'])}) + >>> a.take([0, 2, 2]) + a b + 0 1.0 a + 2 3.0 c + 2 3.0 c + >>> a.take([True, False, True]) + a b + 0 1.0 a + 2 3.0 c + + **Index** + + >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) + >>> idx.take([2, 0, 4, 3]) + StringIndex(['c' 'a' 'e' 'd'], dtype='object') + """ + # TODO: When we remove keep_index we should introduce the axis + # parameter. We could also introduce is_copy, but that's already + # deprecated in pandas so it's probably unnecessary. We also need to + # introduce Index.take's allow_fill and fill_value parameters. 
+ if keep_index is not None: + warnings.warn( + "keep_index is deprecated and will be removed in the future.", + FutureWarning, + ) + else: + keep_index = True + + indices = as_column(indices) + if is_bool_dtype(indices): + warnings.warn( + "Calling take with a boolean array is deprecated and will be " + "removed in the future.", + FutureWarning, + ) + return self._apply_boolean_mask(indices) + return self._gather(indices, keep_index=keep_index) + def sin(self): """ Get Trigonometric sine, element-wise. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c003454fb59..de463269743 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1162,6 +1162,44 @@ def is_categorical(self): def is_interval(self): return False + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. + + Parameters + ---------- + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. + ascending : bool or list of bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {‘first’ or ‘last’}, default ‘last’ + Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs + at the end. + + Returns + ------- + cupy.ndarray: The indices sorted based on input. + """ # noqa: E501 + return super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. 
@@ -2371,9 +2409,6 @@ def to_pandas(self): self.to_numpy(na_value=None), name=self.name, dtype="object" ) - def take(self, indices): - return self._values[indices] - def __repr__(self): return ( f"{self.__class__.__name__}({self._values.to_array()}," diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e7066e336ab..68088cb275e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import warnings from typing import Type, TypeVar import cupy as cp @@ -377,6 +378,9 @@ def sort_index( if key is not None: raise NotImplementedError("key is not yet supported.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if axis in (0, "index"): idx = self.index if isinstance(idx, MultiIndex): @@ -414,3 +418,111 @@ def sort_index( if ignore_index is True: out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """Sort by the values along either axis. + + Parameters + ---------- + by : str or list of str + Name or list of names to sort by. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. + na_position : {‘first’, ‘last’}, default ‘last’ + 'first' puts nulls at the beginning, 'last' puts nulls at the end + ignore_index : bool, default False + If True, index will not be sorted. + + Returns + ------- + Frame : Frame with sorted values. + + Notes + ----- + Difference from pandas: + * Support axis='index' only. 
+ * Not supporting: inplace, kind + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] + >>> df.sort_values('b') + a b + 0 0 -3 + 2 2 0 + 1 1 2 + """ + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + f"defaulting to quicksort." + ) + if axis != 0: + raise NotImplementedError("`axis` not currently implemented.") + + if len(self) == 0: + return self + + # argsort the `by` column + return self.take( + self._get_columns_by_label(by)._get_sorted_inds( + ascending=ascending, na_position=na_position + ), + keep_index=not ignore_index, + ) + + def _n_largest_or_smallest(self, largest, n, columns, keep): + # Get column to operate on + if isinstance(columns, str): + columns = [columns] + + if len(self) == 0: + return self + + if keep == "first": + if n < 0: + n = 0 + + # argsort the `by` column + return self.take( + self._get_columns_by_label(columns)._get_sorted_inds( + ascending=not largest + )[:n], + keep_index=True, + ) + elif keep == "last": + indices = self._get_columns_by_label(columns)._get_sorted_inds( + ascending=largest + ) + + if n <= 0: + # Empty slice. 
+ indices = indices[0:0] + else: + indices = indices[: -n - 1 : -1] + return self.take(indices, keep_index=True) + else: + raise ValueError('keep must be either "first", "last"') diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 27edd41ed92..7c132e3fb71 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -835,22 +835,11 @@ def size(self): return self._num_rows def take(self, indices): - if isinstance(indices, (Integral, Sequence)): - indices = np.array(indices) - elif isinstance(indices, cudf.Series) and indices.has_nulls: + if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") - elif isinstance(indices, slice): - start, stop, step = indices.indices(len(self)) - indices = column.arange(start, stop, step) - result = MultiIndex.from_frame( - self.to_frame(index=False).take(indices) - ) - if self._codes is not None: - result._codes = self._codes.take(indices) - if self._levels is not None: - result._levels = self._levels - result.names = self.names - return result + obj = super().take(indices) + obj.names = self.names + return obj def serialize(self): header, frames = super().serialize() @@ -887,11 +876,26 @@ def deserialize(cls, header, frames): return obj._set_names(column_names) def __getitem__(self, index): - if isinstance(index, int): - # we are indexing into a single row of the MultiIndex, - # return that row as a tuple: - return self.take(index).to_pandas()[0] - return self.take(index) + flatten = isinstance(index, int) + + if isinstance(index, (Integral, Sequence)): + index = np.array(index) + elif isinstance(index, slice): + start, stop, step = index.indices(len(self)) + index = column.arange(start, stop, step) + result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + if flatten: + return result.to_pandas()[0] + + if 
self._codes is not None: + result._codes = self._codes.take(index) + if self._levels is not None: + result._levels = self._levels + result.names = self.names + return result def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -1364,23 +1368,6 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def argsort(self, ascending=True, **kwargs): - return self._get_sorted_inds(ascending=ascending, **kwargs).values - - def sort_values(self, return_indexer=False, ascending=True, key=None): - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = cudf.Series._from_data( - {None: self._get_sorted_inds(ascending=ascending)} - ) - index_sorted = as_index(self.take(indices), name=self.names) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - def fillna(self, value): """ Fill null values with the specified value. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a5374f1383e..0a0854ac50c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -111,8 +111,7 @@ def __getitem__(self, arg): ): return data return self._frame._from_data( - {self._frame.name: data}, - index=cudf.Index(self._frame.index.take(arg)), + {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) def __setitem__(self, key, value): @@ -1171,51 +1170,9 @@ def __setitem__(self, key, value): self.loc[key] = value def take(self, indices, axis=0, keep_index=True): - """ - Return Series by taking values from the corresponding *indices*. - - Parameters - ---------- - indices : array-like or scalar - An array/scalar like integers indicating which positions to take. - keep_index : bool, default True - Whethere to retain the index in result Series or not. 
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> series.take([0, 4]) - 0 10 - 4 14 - dtype: int64 - - If you want to drop the index, pass `keep_index=False` - - >>> series.take([0, 4], keep_index=False) - 0 10 - 1 14 - dtype: int64 - """ - axis = self._get_axis_from_axis_arg(axis) - if keep_index is True or is_scalar(indices): - return self.iloc[indices] - else: - col_inds = as_column(indices) - return self._from_data( - {self.name: self._column.take(col_inds, keep_index=False)} - ) + # Validate but don't use the axis. + _ = self._get_axis_from_axis_arg(axis) + return super().take(indices, keep_index) def __repr__(self): _, height = get_terminal_size() @@ -1950,37 +1907,6 @@ def astype(self, dtype, copy=False, errors="raise"): pass return self - def argsort(self, ascending=True, na_position="last"): - """Returns a Series of int64 index that will sort the series. - - Uses Thrust sort. - - Returns - ------- - result: Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([3, 1, 2]) - >>> s - 0 3 - 1 1 - 2 2 - dtype: int64 - >>> s.argsort() - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> s[s.argsort()] - 1 1 - 2 2 - 0 3 - dtype: int64 - """ - return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") @@ -1995,28 +1921,28 @@ def sort_values( na_position="last", ignore_index=False, ): - """ - Sort by the values. - - Sort a Series in ascending or descending order by some criterion. + """Sort by the values along either axis. Parameters ---------- - ascending : bool, default True - If True, sort values in ascending order, otherwise descending. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. 
If this is a list of bools, must match the length of the + by. na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end. + 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. Returns ------- - sorted_obj : cuDF Series + Series : Series with sorted values. Notes ----- Difference from pandas: - * Not supporting: `inplace`, `kind` + * Support axis='index' only. + * Not supporting: inplace, kind Examples -------- @@ -2030,38 +1956,15 @@ def sort_values( 1 5 dtype: int64 """ - - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - if len(self) == 0: - return self - vals, inds = self._sort(ascending=ascending, na_position=na_position) - if not ignore_index: - index = self.index.take(inds) - else: - index = self.index - return vals.set_index(index) - - def _n_largest_or_smallest(self, largest, n, keep): - direction = largest - if keep == "first": - if n < 0: - n = 0 - return self.sort_values(ascending=not direction).head(n) - elif keep == "last": - data = self.sort_values(ascending=direction) - if n <= 0: - data = data[-n:-n] - else: - data = data.tail(n) - return data.reverse() - else: - raise ValueError('keep must be either "first", "last"') + return super().sort_values( + by=self.name, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + ) def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. 
@@ -2123,7 +2026,7 @@ def nlargest(self, n=5, keep="first"): Brunei 434000 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=True) + return self._n_largest_or_smallest(True, n, [self.name], keep) def nsmallest(self, n=5, keep="first"): """ @@ -2198,22 +2101,29 @@ def nsmallest(self, n=5, keep="first"): Tuvalu 11300 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=False) + return self._n_largest_or_smallest(False, n, [self.name], keep) - def _sort(self, ascending=True, na_position="last"): - """ - Sort by values - - Returns - ------- - 2-tuple of key and index - """ - col_keys, col_inds = self._column.sort_by_values( - ascending=ascending, na_position=na_position + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + obj = self.__class__._from_data( + { + None: super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + } ) - sr_keys = self._from_data({self.name: col_keys}, self._index) - sr_inds = self._from_data({self.name: col_inds}, self._index) - return sr_keys, sr_inds + obj.name = self.name + return obj def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c1eade0fcdc..ae331a1d5ce 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8732,12 +8732,12 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), True, - cudf.Series([1, 2, 0], dtype="int32"), + cupy.array([1, 2, 0], dtype="int32"), ), ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), False, - cudf.Series([0, 2, 1], dtype="int32"), + cupy.array([0, 2, 1], dtype="int32"), ), ], ) diff --git a/python/cudf/cudf/tests/test_pickling.py 
b/python/cudf/cudf/tests/test_pickling.py index 0f8b46cee35..28e63ec41f1 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, GenericIndex, RangeIndex, Series from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq @@ -28,7 +28,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band if pickle.HIGHEST_PROTOCOL >= 5: diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 53676a47046..00cd31e7539 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -154,33 +154,16 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -def test_dataframe_nlargest(nelem, n): +@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) +@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) +def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nlargest(n, "a") - - # Check - inds = np.argsort(aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) - + aa = np.random.random(nelem) + bb = np.random.random(nelem) -@pytest.mark.parametrize("nelem,n", [(10, 5), (100, 10)]) -def test_dataframe_nsmallest(nelem, n): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nsmallest(n, "a") - - # Check - inds = np.argsort(-aa) - 
assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) + df = DataFrame({"a": aa, "b": bb}) + pdf = df.to_pandas() + assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) @pytest.mark.parametrize(