From ec44d78092f783cde106f4f4d192fb2791cd7031 Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Fri, 3 Feb 2023 22:33:21 +0700 Subject: [PATCH 01/14] Convert note --- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 98 +++++++------- python/cudf/cudf/core/dataframe.py | 156 +++++++++++++---------- python/cudf/cudf/core/frame.py | 118 +++++++++-------- python/cudf/cudf/core/groupby/groupby.py | 39 ++++-- python/cudf/cudf/core/indexed_frame.py | 87 +++++++------ python/cudf/cudf/core/series.py | 62 ++++----- python/cudf/cudf/core/tools/datetimes.py | 1 - python/cudf/cudf/core/tools/numeric.py | 18 +-- python/cudf/cudf/tests/test_dataframe.py | 79 ------------ python/dask_cudf/dask_cudf/accessors.py | 12 +- 11 files changed, 335 insertions(+), 345 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0bb9f70f851..6d37f1281ef 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -622,11 +622,6 @@ def sort_values( ------- Series or Index with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -635,6 +630,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **list.ListMethods.sort_values** + + The ``inplace`` and ``kind`` arguments are currently not supported. """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9319881669f..1b5415de1f7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -598,11 +598,6 @@ def extract( for each group. If `expand=False` and `pat` has only one capture group, then return a Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -629,6 +624,12 @@ def extract( 1 2 2 dtype: object + + .. pandas-compat:: + **StringMethods.extract** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if not _is_supported_regex_flags(flags): raise NotImplementedError( @@ -676,14 +677,6 @@ def contains( pattern is contained within the string of each element of the Series/Index. - Notes - ----- - The parameters `case` and `na` are not yet supported and will - raise a NotImplementedError if anything other than the default - value is set. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -757,6 +750,15 @@ def contains( 3 True 4 dtype: bool + + .. pandas-compat:: + **StringMethods.contains** + + The parameters `case` and `na` are not yet supported and will + raise a NotImplementedError if anything other than the default + value is set. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") @@ -955,12 +957,6 @@ def replace( A copy of the object with all matching occurrences of pat replaced by repl. - Notes - ----- - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. - Examples -------- >>> import cudf @@ -990,6 +986,13 @@ def replace( 1 fuz 2 dtype: object + + .. pandas-compat:: + **StringMethods.replace** + + The parameters `case` and `flags` are not yet supported and will raise + a `NotImplementedError` if anything other than the default value + is set. """ if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2773,11 +2776,6 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: DataFrame or MultiIndex Returns a DataFrame / MultiIndex - Notes - ----- - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. - See Also -------- rpartition @@ -2819,6 +2817,12 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: MultiIndex([('X', ' ', '123'), ('Y', ' ', '999')], ) + + .. pandas-compat:: + **StringMethods.partition** + + The parameter `expand` is not yet supported and will raise a + `NotImplementedError` if anything other than the default value is set. """ if expand is not True: raise NotImplementedError( @@ -3506,11 +3510,11 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Notes ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. Examples -------- @@ -3574,11 +3578,6 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: All non-overlapping matches of pattern or regular expression in each string of this Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -3619,6 +3618,12 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: 1 [] 2 [b, b] dtype: list + + .. pandas-compat:: + **StringMethods.findall** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U @@ -3801,11 +3806,6 @@ def endswith(self, pat: str) -> SeriesOrIndex: A Series of booleans indicating whether the given pattern matches the end of each string element. - Notes - ----- - `na` parameter is not yet supported, as cudf uses - native strings instead of Python objects. - Examples -------- >>> import cudf @@ -3822,6 +3822,12 @@ def endswith(self, pat: str) -> SeriesOrIndex: 2 False 3 dtype: bool + + .. pandas-compat:: + **StringMethods.endswith** + + `na` parameter is not yet supported, as cudf uses + native strings instead of Python objects. """ if pat is None: raise TypeError( @@ -4249,13 +4255,6 @@ def match( ------- Series or Index of boolean values. - Notes - ----- - Parameters `case` and `na` are currently not supported. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - - Examples -------- >>> import cudf @@ -4276,6 +4275,13 @@ def match( 1 True 2 True dtype: bool + + .. pandas-compat:: + **StringMethods.match** + + Parameters `case` and `na` are currently not supported. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d1c807279da..9ae121cda0e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3115,10 +3115,6 @@ def diff(self, periods=1, axis=0): DataFrame First differences of the DataFrame. - Notes - ----- - Diff currently only supports numeric dtype columns. - Examples -------- >>> import cudf @@ -3142,6 +3138,10 @@ def diff(self, periods=1, axis=0): 4 2 3 16 5 2 5 20 + .. pandas-compat:: + **DataFrame.diff** + + Diff currently only supports numeric dtype columns. """ if not is_integer(periods): if not (is_float(periods) and periods.is_integer()): @@ -3317,14 +3317,6 @@ def rename( ------- DataFrame - Notes - ----- - Difference from pandas: - * Not supporting: level - - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. - Examples -------- >>> import cudf @@ -3350,6 +3342,14 @@ def rename( 10 1 4 20 2 5 30 3 6 + + .. pandas-compat:: + **DataFrame.rename** + + * Not Supporting: level + + Rename will not overwrite column names. If a list with duplicates is + passed, column names will be postfixed with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3449,10 +3449,10 @@ def agg(self, aggs, axis=None): When ``DataFrame.agg`` is called with several aggs, ``DataFrame`` is returned. - Notes - ----- - Difference from pandas: - * Not supporting: ``axis``, ``*args``, ``**kwargs`` + .. pandas-compat:: + **DataFrame.agg** + + * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ # TODO: Remove the typecasting below once issue #6846 is fixed @@ -3585,11 +3585,6 @@ def nlargest(self, n, columns, keep="first"): The first `n` rows ordered by the given columns in descending order. - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3624,6 +3619,11 @@ def nlargest(self, n, columns, keep="first"): France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN + + .. pandas-compat:: + **DataFrame.nlargest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(True, n, columns, keep) @@ -3650,11 +3650,6 @@ def nsmallest(self, n, columns, keep="first"): ------- DataFrame - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3696,6 +3691,11 @@ def nsmallest(self, n, columns, keep="first"): Anguilla 11300 311 AI Tuvalu 11300 38 TV Nauru 337000 182 NR + + .. pandas-compat:: + **DataFrame.nsmallest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(False, n, columns, keep) @@ -3773,10 +3773,10 @@ def transpose(self): ------- a new (ncol x nrow) dataframe. self is (nrow x ncol) - Notes - ----- - Difference from pandas: - Not supporting *copy* because default and only behavior is copy=True + .. pandas-compat:: + **DataFrame.transpose, DataFrame.T** + + Not supporting *copy* because default and only behavior is copy=True """ index = self._data.to_pandas_index() @@ -3925,10 +3925,6 @@ def merge( ------- merged : DataFrame - Notes - ----- - **DataFrames merges in cuDF result in non-deterministic row ordering.** - Examples -------- >>> import cudf @@ -3964,6 +3960,11 @@ def merge( right dtype respectively. This extends to semi and anti joins. - For outer joins, the result will be the union of categories from both sides. + + .. pandas-compat:: + **DataFrame.merge** + + **DataFrames merges in cuDF result in non-deterministic row ordering.** """ if indicator: raise NotImplementedError( @@ -4034,12 +4035,11 @@ def join( ------- joined : DataFrame - Notes - ----- - Difference from pandas: + .. pandas-compat:: + **DataFrame.join** - - *other* must be a single DataFrame for now. - - *on* is not supported yet due to lack of multi-index support. + - *other* must be a single DataFrame for now. + - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") @@ -5161,11 +5161,6 @@ def from_arrow(cls, table): ------- cudf DataFrame - Notes - ----- - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. - Examples -------- >>> import cudf @@ -5176,6 +5171,12 @@ def from_arrow(cls, table): 0 1 4 1 2 5 2 3 6 + + .. pandas-compat:: + **DataFrame.from_arrow** + + - Does not support automatically setting index column(s) similar + to how ``to_pandas`` works for PyArrow Tables. """ index_col = None if isinstance(table, pa.Table) and isinstance( @@ -5514,14 +5515,6 @@ def quantile( If q is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. - .. pandas-compat:: - **DataFrame.quantile** - - One notable difference from Pandas is when DataFrame is of - non-numeric types and result is expected to be a Series in case of - Pandas. cuDF will return a DataFrame as it doesn't support mixed - types under Series. - Examples -------- >>> import cupy as cp @@ -5542,6 +5535,14 @@ def quantile( a b 0.1 1.3 3.7 0.5 2.5 55.0 + + .. pandas-compat:: + **DataFrame.quantile** + + One notable difference from Pandas is when DataFrame is of + non-numeric types and result is expected to be a Series in case of + Pandas. cuDF will return a DataFrame as it doesn't support mixed + types under Series. """ # noqa: E501 if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -5807,10 +5808,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Series For each column/row the number of non-NA/null entries. - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -5824,6 +5821,11 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Age 4 Single 5 dtype: int64 + + .. pandas-compat:: + **DataFrame.count** + + Parameters currently not supported are `axis`, `level`, `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -5971,10 +5973,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): cudf.Series.value_counts : Return the counts of values in a Series. - Notes - ----- - ``axis`` parameter is currently not supported. - Examples -------- >>> import cudf @@ -6013,6 +6011,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): legs wings 0 2 0.0 1 2.0 + + .. pandas-compat:: + **DataFrame.mode** + + ``axis`` parameter is currently not supported. """ if axis not in (0, "index"): raise NotImplementedError("Only axis=0 is currently supported") @@ -6567,7 +6570,7 @@ def to_struct(self, name=None): Notes ----- - Note that a copy of the columns is made. + Note: that a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( @@ -6672,11 +6675,6 @@ def append( ------- DataFrame - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - objects. - Notes ----- If a list of dict/series is passed and the keys are all contained in @@ -6686,7 +6684,12 @@ def append( computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet. + `verify_integrity` parameter is not supported yet + + See Also + -------- + cudf.concat : General function to concatenate DataFrame or + objects. Examples -------- @@ -7071,8 +7074,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): Specifically, `&` must be used for bitwise operators on integers, not `and`, which is specifically for the logical and between booleans. - * Only numerical types currently support all operators. - * String types currently support comparison operators. + * Only numerical types are currently supported. * Operators generally will not cast automatically. Users are responsible for casting columns to suitable types before evaluating a function. @@ -7141,6 +7143,22 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 2 3 6 9 -3 3 4 4 8 0 4 5 2 7 3 + + .. pandas-compat:: + **DataFrame.eval** + + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. """ if kwargs: raise ValueError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3b5f3306c7e..2ac863dac2c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -256,11 +256,6 @@ def empty(self): out : bool If DataFrame/Series is empty, return True, if not return False. - Notes - ----- - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example below. - Examples -------- >>> import cudf @@ -301,6 +296,12 @@ def empty(self): Series([], dtype: float64) >>> s.empty True + + .. pandas-compat:: + **DataFrame.empty, Series.empty, Frame.empty** + + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example above. """ return self.size == 0 @@ -642,6 +643,8 @@ def where(self, cond, other=None, inplace=False): dtype: int64 .. pandas-compat:: + **DataFrame.where, Series.where, Frame.where** + Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1933,10 +1936,6 @@ def min( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1945,6 +1944,11 @@ def min( a 1 b 7 dtype: int64 + + .. pandas-compat:: + **DataFrame.min, Series.min, Frame.min** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "min", @@ -1984,10 +1988,6 @@ def max( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1996,6 +1996,11 @@ def max( a 4 b 10 dtype: int64 + + .. pandas-compat:: + **DataFrame.max, Series.max, Frame.max** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "max", @@ -2040,10 +2045,6 @@ def sum( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2052,6 +2053,11 @@ def sum( a 10 b 34 dtype: int64 + + .. pandas-compat:: + **DataFrame.sum, Series.sum, Frame.sum** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "sum", @@ -2098,10 +2104,6 @@ def product( ------- Series - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2110,6 +2112,11 @@ def product( a 24 b 5040 dtype: int64 + + .. pandas-compat:: + **DataFrame.product, Series.product, Frame.product** + + Parameters currently not supported are level`, `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) return self._reduce( @@ -2204,11 +2211,6 @@ def std( ------- Series - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2217,6 +2219,12 @@ def std( a 1.290994 b 1.290994 dtype: float64 + + .. pandas-compat:: + **DataFrame.std, Series.std, Frame.std** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( @@ -2260,11 +2268,6 @@ def var( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2273,6 +2276,12 @@ def var( a 1.666667 b 1.666667 dtype: float64 + + .. pandas-compat:: + **DataFrame.var, Series.var, Frame.var** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( "var", @@ -2305,10 +2314,6 @@ def kurtosis( ------- Series or scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only` - Examples -------- **Series** @@ -2326,6 +2331,11 @@ def kurtosis( a -1.2 b -1.2 dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis, Frame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` """ if axis not in (0, "index", None): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2358,11 +2368,6 @@ def skew( ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - Examples -------- **Series** @@ -2387,6 +2392,12 @@ def skew( a 0.00000 b -0.37037 dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + Parameters currently not supported are `axis`, `level` and + `numeric_only` """ if axis not in (0, "index", None): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2418,10 +2429,6 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - Examples -------- >>> import cudf @@ -2430,6 +2437,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): a True b False dtype: bool + + .. pandas-compat:: + **DataFrame.all, Series.all, Frame.all** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "all", @@ -2457,10 +2469,6 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - Examples -------- >>> import cudf @@ -2469,6 +2477,11 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): a True b True dtype: bool + + .. pandas-compat:: + **DataFrame.any, Series.any, Frame.any** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "any", @@ -2523,10 +2536,6 @@ def median( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. - Examples -------- >>> import cudf @@ -2541,6 +2550,11 @@ def median( dtype: int64 >>> ser.median() 17.0 + + .. pandas-compat:: + **DataFrame.median, Series.median, Frame.median** + + Parameters currently not supported are `level` and `numeric_only`. """ return self._reduce( "median", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b31fca85525..927b91755a7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -626,10 +626,10 @@ def _reduce( Series or DataFrame Computed {op} of values within each group. - Notes - ----- - Difference from pandas: - * Not supporting: numeric_only, min_count + .. pandas-compat:: + **{cls}.{op}** + + The numeric_only, min_count """ if numeric_only: raise NotImplementedError( @@ -1327,7 +1327,7 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **groupby.apply** + **GroupBy.apply** cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -2151,6 +2151,28 @@ def fillna( Returns ------- DataFrame or Series + + .. pandas-compat:: + **GroupBy.fillna** + + This function may return result in different format to the method + Pandas supports. For example: + + .. code-block:: + + >>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]}) + >>> gdf = cudf.from_pandas(df) + >>> df.groupby('k').fillna({'v': 4}) # pandas + v + k + 1 0 2.0 + 1 4.0 + 2 2 4.0 + >>> gdf.groupby('k').fillna({'v': 4}) # cudf + v + 0 2.0 + 1 4.0 + 2 4.0 """ if inplace: raise NotImplementedError("Does not support inplace yet.") @@ -2210,9 +2232,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Series or DataFrame Object shifted within each group. - Notes - ----- - Parameter ``freq`` is unsupported. + .. pandas-compat:: + **GroupBy.shift** + + Parameter ``freq`` is unsupported. """ if freq is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index eba5fada7d0..f2c915d78f1 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -595,11 +595,6 @@ def replace( result : Series Series after replacement. The mask and index are preserved. - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - Examples -------- **Series** @@ -742,6 +737,12 @@ def replace( 2 2 7 c 3 3 8 d 4 4 9 e + + .. pandas-compat:: + **DataFrame.replace, Series.replace, IndexedFrame.replace** + + Parameters that are currently not supported are: `limit`, `regex`, + `method` """ if limit is not None: raise NotImplementedError("limit parameter is not implemented yet") @@ -1073,13 +1074,6 @@ def truncate(self, before=None, after=None, axis=0, copy=True): `before` and `after` may be specified as strings instead of Timestamps. - .. pandas-compat:: - **DataFrame.truncate, Series.truncate** - - The ``copy`` parameter is only present for API compatibility, but - ``copy=False`` is not supported. This method always generates a - copy. - Examples -------- **Series** @@ -1221,6 +1215,13 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:25 1 2 2021-01-01 23:45:26 1 2 2021-01-01 23:45:27 1 2 + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate, IndexedFrame.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. This method always generates a + copy. """ if not copy: raise ValueError("Truncating with copy=False is not supported.") @@ -1473,11 +1474,6 @@ def sort_index( ------- Frame or None - Notes - ----- - Difference from pandas: - * Not supporting: kind, sort_remaining=False - Examples -------- **Series** @@ -1518,6 +1514,11 @@ def sort_index( 1 2 3 3 1 2 2 3 1 + + .. pandas-compat:: + **DataFrame.sort_index, Series.sort_index, IndexedFrame.sort_index** + + * Not supporting: kind, sort_remaining=False """ if kind is not None: raise NotImplementedError("kind is not yet supported") @@ -2235,12 +2236,6 @@ def sort_values( ------- Frame : Frame with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2252,6 +2247,12 @@ def sort_values( 0 0 -3 2 2 0 1 1 2 + + .. pandas-compat:: + **DataFrame.sort_values, Series.sort_values, IndexedFrame.sort_values** + + * Support axis='index' only. + * Not supporting: inplace, kind """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") @@ -2713,13 +2714,14 @@ def resample( 2018-02-28 18.0 63.333333 - Notes - ----- - Note that the dtype of the index (or the 'on' column if using - 'on=') in the result will be of a frequency closest to the - resampled frequency. For example, if resampling from - nanoseconds to milliseconds, the index will be of dtype - 'datetime64[ms]'. + .. pandas-compat:: + **DataFrame.resample, Series.resample, IndexedFrame.resample** + + Note that the dtype of the index (or the 'on' column if using + 'on=') in the result will be of a frequency closest to the + resampled frequency. For example, if resampling from + nanoseconds to milliseconds, the index will be of dtype + 'datetime64[ms]'. """ import cudf.core.resample @@ -3153,18 +3155,6 @@ def sample( provided via the `random_state` parameter. This function will always produce the same sample given an identical `random_state`. - Notes - ----- - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. - Parameters ---------- n : int, optional @@ -3232,6 +3222,19 @@ def sample( a c 0 1 3 1 2 4 + + .. pandas-compat:: + **DataFrame.sample, Series.sample, IndexedFrame.sample** + + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. """ axis = self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b5392fcbe62..be430cc7f31 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1258,10 +1258,11 @@ def map(self, arg, na_action=None) -> "Series": 4 dtype: int64 - Notes - ----- - Please note map currently only supports fixed-width numeric - type functions. + .. pandas-compat:: + **Series.map** + + Please note map currently only supports fixed-width numeric + type functions. """ if isinstance(arg, dict): if hasattr(arg, "__missing__"): @@ -2026,12 +2027,6 @@ def sort_values( ------- Series : Series with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2043,6 +2038,12 @@ def sort_values( 3 4 1 5 dtype: int64 + + .. pandas-compat:: + **Series.sort_values** + + * Support axis='index' only. + * The inplace and kind argument is currently unsupported """ return super().sort_values( by=self.name, @@ -2487,16 +2488,17 @@ def count(self, level=None, **kwargs): int Number of non-null values in the Series. - Notes - ----- - Parameters currently not supported is `level`. - Examples -------- >>> import cudf >>> ser = cudf.Series([1, 5, 2, 4, 3]) >>> ser.count() 5 + + .. pandas-compat:: + **Series.count** + + Parameters currently not supported is `level`. """ if level is not None: @@ -2598,10 +2600,6 @@ def cov(self, other, min_periods=None): Covariance between Series and other normalized by N-1 (unbiased estimator). - Notes - ----- - `min_periods` parameter is not yet supported. - Examples -------- >>> import cudf @@ -2609,6 +2607,11 @@ def cov(self, other, min_periods=None): >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.cov(ser2) -0.015750000000000004 + + .. pandas-compat:: + **Series.cov** + + `min_periods` parameter is not yet supported. """ if min_periods is not None: @@ -3336,12 +3339,6 @@ def rename(self, index=None, copy=True): ------- Series - Notes - ----- - Difference from pandas: - - Supports scalar values only for changing name attribute - - Not supporting : inplace, level - Examples -------- >>> import cudf @@ -3360,6 +3357,12 @@ def rename(self, index=None, copy=True): Name: numeric_series, dtype: int64 >>> renamed_series.name 'numeric_series' + + .. pandas-compat:: + **Series.rename** + + - Supports scalar values only for changing name attribute + - The ``inplace`` and ``level`` is not supported """ out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @@ -4539,11 +4542,6 @@ def strftime(self, date_format, *args, **kwargs): Series Series of formatted strings. - Notes - ----- - The following date format identifiers are not yet - supported: ``%c``, ``%x``,``%X`` - Examples -------- >>> import cudf @@ -4570,6 +4568,12 @@ def strftime(self, date_format, *args, **kwargs): 1 2000 / 30 / 06 2 2000 / 30 / 09 dtype: object + + .. pandas-compat:: + **series.DatetimeProperties.strftime** + + The following date format identifiers are not yet + supported: ``%c``, ``%x``,``%X`` """ if not isinstance(date_format, str): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 0ee9f511061..351bc1e67b4 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -806,7 +806,6 @@ def date_range( '2023-12-23 08:00:00', '2025-02-23 08:00:00', '2026-04-23 08:00:00'], dtype='datetime64[ns]') - """ if tz is not None: raise NotImplementedError("tz is currently unsupported.") diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 0273227010b..609a5503040 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import warnings @@ -56,13 +56,6 @@ def to_numeric(arg, errors="raise", downcast=None): Depending on the input, if series is passed in, series is returned, otherwise ndarray - Notes - ----- - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - Examples -------- >>> s = cudf.Series(['1', '2.0', '3e3']) @@ -92,6 +85,15 @@ def to_numeric(arg, errors="raise", downcast=None): 1 1.0 2 3000.0 dtype: float64 + + .. pandas-compat:: + **cudf.to_numeric** + + An important difference from pandas is that this function does not accept + mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. + A ``TypeError`` will be raised when such input is received, regardless of + ``errors`` parameter. + """ if errors not in {"raise", "ignore", "coerce"}: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 25a17697538..338005f7578 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10068,82 +10068,3 @@ def test_dataframe_transpose_complex_types(data): actual = gdf.T assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_values_complex_types(data): - gdf = cudf.DataFrame(data) - with pytest.raises(NotImplementedError): - gdf.values - - -def test_dataframe_from_arrow_slice(): - table = pa.Table.from_pandas( - pd.DataFrame.from_dict( - {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} - ) - ) - table_slice = table.slice(3, 7) - - expected = table_slice.to_pandas() - actual = cudf.DataFrame.from_arrow(table_slice) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, - {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, - {"a": [1, 2, 3], "c": 4}, - ], -) -def test_dataframe_init_from_scalar_and_lists(data): - actual = cudf.DataFrame(data) - expected = pd.DataFrame(data) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,index", - [ - ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ([[10, 11], [12, 13]], ["a", "b", "c"]), - ], -) -def test_dataframe_init_length_error(data, index): - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - rfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - ) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 1c21fca51c8..873981b24f9 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. class StructMethods: @@ -263,11 +263,6 @@ def sort_values( ------- ListColumn with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -277,6 +272,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **ListMethods.sort_values** + + The `inplace` and `kind` argument is currently unsupported. """ return self.d_series.map_partitions( lambda s: s.list.sort_values( From 1de4d0279d3b6c8f76cc836075e98237543dca4e Mon Sep 17 00:00:00 2001 From: Pantakan Kanprawet Date: Fri, 7 Jul 2023 22:49:43 +0700 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 24 ++++++++++++------------ python/cudf/cudf/core/groupby/groupby.py | 22 ---------------------- python/cudf/cudf/core/indexed_frame.py | 12 ++++++------ 5 files changed, 20 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6d37f1281ef..1036f5ac263 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -632,7 +632,7 @@ def sort_values( dtype: list .. pandas-compat:: - **list.ListMethods.sort_values** + **ListMethods.sort_values** The ``inplace`` and ``kind`` arguments are currently not supported. """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9ae121cda0e..190e9437cbc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6570,7 +6570,7 @@ def to_struct(self, name=None): Notes ----- - Note: that a copy of the columns is made. + Note: a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2ac863dac2c..5837cc3686c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -298,7 +298,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty, Frame.empty** + **DataFrame.empty, Series.empty** If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -643,7 +643,7 @@ def where(self, cond, other=None, inplace=False): dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where, Frame.where** + **DataFrame.where, Series.where** Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1946,7 +1946,7 @@ def min( dtype: int64 .. pandas-compat:: - **DataFrame.min, Series.min, Frame.min** + **DataFrame.min, Series.min** Parameters currently not supported are `level`, `numeric_only`. """ @@ -1998,7 +1998,7 @@ def max( dtype: int64 .. pandas-compat:: - **DataFrame.max, Series.max, Frame.max** + **DataFrame.max, Series.max** Parameters currently not supported are `level`, `numeric_only`. """ @@ -2055,7 +2055,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum, Frame.sum** + **DataFrame.sum, Series.sum** Parameters currently not supported are `level`, `numeric_only`. """ @@ -2114,7 +2114,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product, Frame.product** + **DataFrame.product, Series.product** Parameters currently not supported are level`, `numeric_only`. """ @@ -2221,7 +2221,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std, Frame.std** + **DataFrame.std, Series.std** Parameters currently not supported are `level` and `numeric_only` @@ -2278,7 +2278,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var, Frame.var** + **DataFrame.var, Series.var** Parameters currently not supported are `level` and `numeric_only` @@ -2333,7 +2333,7 @@ def kurtosis( dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis, Frame.kurtosis** + **DataFrame.kurtosis** Parameters currently not supported are `level` and `numeric_only` """ @@ -2439,7 +2439,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all, Frame.all** + **DataFrame.all, Series.all** Parameters currently not supported are `axis`, `bool_only`, `level`. """ @@ -2479,7 +2479,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any, Frame.any** + **DataFrame.any, Series.any** Parameters currently not supported are `axis`, `bool_only`, `level`. """ @@ -2552,7 +2552,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median, Frame.median** + **DataFrame.median, Series.median** Parameters currently not supported are `level` and `numeric_only`. """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 927b91755a7..4859beacbf3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2151,28 +2151,6 @@ def fillna( Returns ------- DataFrame or Series - - .. pandas-compat:: - **GroupBy.fillna** - - This function may return result in different format to the method - Pandas supports. For example: - - .. code-block:: - - >>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]}) - >>> gdf = cudf.from_pandas(df) - >>> df.groupby('k').fillna({'v': 4}) # pandas - v - k - 1 0 2.0 - 1 4.0 - 2 2 4.0 - >>> gdf.groupby('k').fillna({'v': 4}) # cudf - v - 0 2.0 - 1 4.0 - 2 4.0 """ if inplace: raise NotImplementedError("Does not support inplace yet.") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1dcad1750f0..c49272b99aa 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -739,7 +739,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace, IndexedFrame.replace** + **DataFrame.replace, Series.replace** Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1217,7 +1217,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate, IndexedFrame.truncate** + **DataFrame.truncate, Series.truncate** The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -1516,7 +1516,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index, IndexedFrame.sort_index** + **DataFrame.sort_index, Series.sort_index** * Not supporting: kind, sort_remaining=False """ @@ -2249,7 +2249,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values, IndexedFrame.sort_values** + **DataFrame.sort_values, Series.sort_values** * Support axis='index' only. * Not supporting: inplace, kind @@ -2715,7 +2715,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample, IndexedFrame.resample** + **DataFrame.resample, Series.resample** Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -3224,7 +3224,7 @@ def sample( 1 2 4 .. pandas-compat:: - **DataFrame.sample, Series.sample, IndexedFrame.sample** + **DataFrame.sample, Series.sample** When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) or a cupy random From 1aabdc5e19e0af74c27565cff035f990fa69b72c Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Sat, 8 Jul 2023 00:54:11 +0700 Subject: [PATCH 03/14] add 2 more methode --- python/cudf/cudf/core/column/string.py | 17 ++++++------ python/cudf/cudf/core/dataframe.py | 38 ++++++++------------------ 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1b5415de1f7..86d06f3647f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3508,14 +3508,6 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: ------- Series or Index - Notes - ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. - Examples -------- >>> import cudf @@ -3547,6 +3539,15 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') + + .. pandas-compat:: + **StringMethods.count** + + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. """ # noqa W605 if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 190e9437cbc..4b6ce680764 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6675,17 +6675,6 @@ def append( ------- DataFrame - Notes - ----- - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better - solution is to append those rows to a list and then concatenate - the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet - See Also -------- cudf.concat : General function to concatenate DataFrame or @@ -6745,6 +6734,18 @@ def append( 2 2 3 3 4 4 + + .. pandas-compat:: + **DataFrame.append** + + If a list of dict/series is passed and the keys are all contained in + the DataFrame's index, the order of the columns in the resulting + DataFrame will be unchanged. + Iteratively appending rows to a cudf DataFrame can be more + computationally intensive than a single concatenate. A better + solution is to append those rows to a list and then concatenate + the list with the original DataFrame all at once. + `verify_integrity` parameter is not supported yet """ if isinstance(other, dict): if not ignore_index: @@ -7066,21 +7067,6 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): DataFrame if any assignment statements are included in ``expr``, or None if ``inplace=True``. - Notes - ----- - Difference from pandas: - * Additional kwargs are not supported. - * Bitwise and logical operators are not dtype-dependent. - Specifically, `&` must be used for bitwise operators on integers, - not `and`, which is specifically for the logical and between - booleans. - * Only numerical types are currently supported. - * Operators generally will not cast automatically. Users are - responsible for casting columns to suitable types before - evaluating a function. - * Multiple assignments to the same name (i.e. a sequence of - assignment statements where later statements are conditioned upon - the output of earlier statements) is not supported. Examples -------- From 42df2011ba6505d522ad95df6ad8c2e3de171140 Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Sat, 15 Jul 2023 19:38:07 +0700 Subject: [PATCH 04/14] Fix indentation --- python/cudf/cudf/core/column/string.py | 10 ++++++---- python/cudf/cudf/core/dataframe.py | 20 ++++++++++++-------- python/cudf/cudf/core/frame.py | 6 ++++-- python/cudf/cudf/core/indexed_frame.py | 19 ++++++++++--------- python/cudf/cudf/core/tools/numeric.py | 9 ++++----- 5 files changed, 36 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 86d06f3647f..945f7f12d2d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -990,9 +990,9 @@ def replace( .. pandas-compat:: **StringMethods.replace** - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. + The parameters `case` and `flags` are not yet supported and will + raise a `NotImplementedError` if anything other than the default + value is set. """ if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2822,7 +2822,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: **StringMethods.partition** The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. + `NotImplementedError` if anything other than the default + value is set. + """ if expand is not True: raise NotImplementedError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4b6ce680764..f1258eb7d4d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3348,8 +3348,9 @@ def rename( * Not Supporting: level - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. + Rename will not overwrite column names. If a list with + duplicates is passed, column names will be postfixed + with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3776,7 +3777,8 @@ def transpose(self): .. pandas-compat:: **DataFrame.transpose, DataFrame.T** - Not supporting *copy* because default and only behavior is copy=True + Not supporting *copy* because default and only behavior is + copy=True """ index = self._data.to_pandas_index() @@ -3964,7 +3966,8 @@ def merge( .. pandas-compat:: **DataFrame.merge** - **DataFrames merges in cuDF result in non-deterministic row ordering.** + DataFrames merges in cuDF result in non-deterministic row + ordering. """ if indicator: raise NotImplementedError( @@ -5825,7 +5828,8 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): .. pandas-compat:: **DataFrame.count** - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Parameters currently not supported are `axis`, `level`, + `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -6738,9 +6742,9 @@ def append( .. pandas-compat:: **DataFrame.append** - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. + If a list of dict/series is passed and the keys are all contained + in the DataFrame's index, the order of the columns in + the resulting DataFrame will be unchanged. Iteratively appending rows to a cudf DataFrame can be more computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5837cc3686c..f0d6fff7929 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2441,7 +2441,8 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): .. pandas-compat:: **DataFrame.all, Series.all** - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "all", @@ -2481,7 +2482,8 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): .. pandas-compat:: **DataFrame.any, Series.any** - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "any", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c49272b99aa..8ff48a0c103 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3226,15 +3226,16 @@ def sample( .. pandas-compat:: **DataFrame.sample, Series.sample** - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. + When sampling from ``axis=0/'index'``, ``random_state`` can be + either a numpy random state (``numpy.random.RandomState``) + or a cupy random state (``cupy.random.RandomState``). When a numpy + random state is used, the output is guaranteed to match the output + of the corresponding pandas method call, but generating the sample + maybe slow. If exact pandas equivalence is not required, using a + cupy random state will achieve better performance, + especially when sampling large number of + items. It's advised to use the matching `ndarray` type to + the random state for the `weights` array. """ axis = self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 609a5503040..529c0d18183 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -89,11 +89,10 @@ def to_numeric(arg, errors="raise", downcast=None): .. pandas-compat:: **cudf.to_numeric** - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - + An important difference from pandas is that this function does not + accept mixed numeric/non-numeric type sequences. + For example ``[1, 'a']``. A ``TypeError`` will be raised when such + input is received, regardless of ``errors`` parameter. """ if errors not in {"raise", "ignore", "coerce"}: From 09173eab6ae5c0d31bbb79f548945a768ee201dc Mon Sep 17 00:00:00 2001 From: Pantakan Kanprawet Date: Sat, 21 Oct 2023 08:55:19 +0000 Subject: [PATCH 05/14] Revert Testcase back --- python/cudf/cudf/tests/test_dataframe.py | 79 ++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 338005f7578..38fc37c7b13 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10068,3 +10068,82 @@ def test_dataframe_transpose_complex_types(data): actual = gdf.T assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, + {"a": [[{"b": 567}], None] * 10}, + {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, + ], +) +def test_dataframe_values_complex_types(data): + gdf = cudf.DataFrame(data) + with pytest.raises(NotImplementedError): + gdf.values + + +def test_dataframe_from_arrow_slice(): + table = pa.Table.from_pandas( + pd.DataFrame.from_dict( + {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} + ) + ) + table_slice = table.slice(3, 7) + + expected = table_slice.to_pandas() + actual = cudf.DataFrame.from_arrow(table_slice) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, + {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, + {"a": [1, 2, 3], "c": 4}, + ], +) +def test_dataframe_init_from_scalar_and_lists(data): + actual = cudf.DataFrame(data) + expected = pd.DataFrame(data) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,index", + [ + ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ([[10, 11], [12, 13]], ["a", "b", "c"]), + ], +) +def test_dataframe_init_length_error(data, index): + assert_exceptions_equal( + lfunc=pd.DataFrame, + rfunc=cudf.DataFrame, + lfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + rfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + ) \ No newline at end of file From 10498be824bac02e6ea1e2ca5b3de9498bcd6572 Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Fri, 3 Feb 2023 22:33:21 +0700 Subject: [PATCH 06/14] Convert note --- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 98 ++++--- python/cudf/cudf/core/dataframe.py | 156 ++++++----- python/cudf/cudf/core/frame.py | 166 ++++++++--- python/cudf/cudf/core/groupby/groupby.py | 39 ++- python/cudf/cudf/core/indexed_frame.py | 87 +++--- python/cudf/cudf/core/series.py | 62 +++-- python/cudf/cudf/core/tools/datetimes.py | 1 - python/cudf/cudf/core/tools/numeric.py | 18 +- python/cudf/cudf/tests/test_dataframe.py | 340 ----------------------- python/dask_cudf/dask_cudf/accessors.py | 12 +- 11 files changed, 396 insertions(+), 593 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0bb9f70f851..6d37f1281ef 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -622,11 +622,6 @@ def sort_values( ------- Series or Index with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -635,6 +630,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **list.ListMethods.sort_values** + + The ``inplace`` and ``kind`` arguments are currently not supported. """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fe21dc87bac..a58dd13fff6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -598,11 +598,6 @@ def extract( for each group. If `expand=False` and `pat` has only one capture group, then return a Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -629,6 +624,12 @@ def extract( 1 2 2 dtype: object + + .. pandas-compat:: + **StringMethods.extract** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if not _is_supported_regex_flags(flags): raise NotImplementedError( @@ -676,14 +677,6 @@ def contains( pattern is contained within the string of each element of the Series/Index. - Notes - ----- - The parameters `case` and `na` are not yet supported and will - raise a NotImplementedError if anything other than the default - value is set. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -757,6 +750,15 @@ def contains( 3 True 4 dtype: bool + + .. pandas-compat:: + **StringMethods.contains** + + The parameters `case` and `na` are not yet supported and will + raise a NotImplementedError if anything other than the default + value is set. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") @@ -955,12 +957,6 @@ def replace( A copy of the object with all matching occurrences of pat replaced by repl. - Notes - ----- - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. - Examples -------- >>> import cudf @@ -990,6 +986,13 @@ def replace( 1 fuz 2 dtype: object + + .. pandas-compat:: + **StringMethods.replace** + + The parameters `case` and `flags` are not yet supported and will raise + a `NotImplementedError` if anything other than the default value + is set. """ if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2773,11 +2776,6 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: DataFrame or MultiIndex Returns a DataFrame / MultiIndex - Notes - ----- - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. - See Also -------- rpartition @@ -2819,6 +2817,12 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: MultiIndex([('X', ' ', '123'), ('Y', ' ', '999')], ) + + .. pandas-compat:: + **StringMethods.partition** + + The parameter `expand` is not yet supported and will raise a + `NotImplementedError` if anything other than the default value is set. """ if expand is not True: raise NotImplementedError( @@ -3506,11 +3510,11 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Notes ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. Examples -------- @@ -3574,11 +3578,6 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: All non-overlapping matches of pattern or regular expression in each string of this Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -3619,6 +3618,12 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: 1 [] 2 [b, b] dtype: list + + .. pandas-compat:: + **StringMethods.findall** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U @@ -3801,11 +3806,6 @@ def endswith(self, pat: str) -> SeriesOrIndex: A Series of booleans indicating whether the given pattern matches the end of each string element. - Notes - ----- - `na` parameter is not yet supported, as cudf uses - native strings instead of Python objects. - Examples -------- >>> import cudf @@ -3822,6 +3822,12 @@ def endswith(self, pat: str) -> SeriesOrIndex: 2 False 3 dtype: bool + + .. pandas-compat:: + **StringMethods.endswith** + + `na` parameter is not yet supported, as cudf uses + native strings instead of Python objects. """ if pat is None: raise TypeError( @@ -4249,13 +4255,6 @@ def match( ------- Series or Index of boolean values. - Notes - ----- - Parameters `case` and `na` are currently not supported. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - - Examples -------- >>> import cudf @@ -4276,6 +4275,13 @@ def match( 1 True 2 True dtype: bool + + .. pandas-compat:: + **StringMethods.match** + + Parameters `case` and `na` are currently not supported. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b38345af83d..1aaacca6d79 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3118,10 +3118,6 @@ def diff(self, periods=1, axis=0): DataFrame First differences of the DataFrame. - Notes - ----- - Diff currently only supports numeric dtype columns. - Examples -------- >>> import cudf @@ -3145,6 +3141,10 @@ def diff(self, periods=1, axis=0): 4 2 3 16 5 2 5 20 + .. pandas-compat:: + **DataFrame.diff** + + Diff currently only supports numeric dtype columns. """ if not is_integer(periods): if not (is_float(periods) and periods.is_integer()): @@ -3320,14 +3320,6 @@ def rename( ------- DataFrame - Notes - ----- - Difference from pandas: - * Not supporting: level - - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. - Examples -------- >>> import cudf @@ -3353,6 +3345,14 @@ def rename( 10 1 4 20 2 5 30 3 6 + + .. pandas-compat:: + **DataFrame.rename** + + * Not Supporting: level + + Rename will not overwrite column names. If a list with duplicates is + passed, column names will be postfixed with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3452,10 +3452,10 @@ def agg(self, aggs, axis=None): When ``DataFrame.agg`` is called with several aggs, ``DataFrame`` is returned. - Notes - ----- - Difference from pandas: - * Not supporting: ``axis``, ``*args``, ``**kwargs`` + .. pandas-compat:: + **DataFrame.agg** + + * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ # TODO: Remove the typecasting below once issue #6846 is fixed @@ -3588,11 +3588,6 @@ def nlargest(self, n, columns, keep="first"): The first `n` rows ordered by the given columns in descending order. - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3627,6 +3622,11 @@ def nlargest(self, n, columns, keep="first"): France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN + + .. pandas-compat:: + **DataFrame.nlargest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(True, n, columns, keep) @@ -3653,11 +3653,6 @@ def nsmallest(self, n, columns, keep="first"): ------- DataFrame - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3699,6 +3694,11 @@ def nsmallest(self, n, columns, keep="first"): Anguilla 11300 311 AI Tuvalu 11300 38 TV Nauru 337000 182 NR + + .. pandas-compat:: + **DataFrame.nsmallest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(False, n, columns, keep) @@ -3776,10 +3776,10 @@ def transpose(self): ------- a new (ncol x nrow) dataframe. self is (nrow x ncol) - Notes - ----- - Difference from pandas: - Not supporting *copy* because default and only behavior is copy=True + .. pandas-compat:: + **DataFrame.transpose, DataFrame.T** + + Not supporting *copy* because default and only behavior is copy=True """ index = self._data.to_pandas_index() @@ -3928,10 +3928,6 @@ def merge( ------- merged : DataFrame - Notes - ----- - **DataFrames merges in cuDF result in non-deterministic row ordering.** - Examples -------- >>> import cudf @@ -3967,6 +3963,11 @@ def merge( right dtype respectively. This extends to semi and anti joins. - For outer joins, the result will be the union of categories from both sides. + + .. pandas-compat:: + **DataFrame.merge** + + **DataFrames merges in cuDF result in non-deterministic row ordering.** """ if indicator: raise NotImplementedError( @@ -4037,12 +4038,11 @@ def join( ------- joined : DataFrame - Notes - ----- - Difference from pandas: + .. pandas-compat:: + **DataFrame.join** - - *other* must be a single DataFrame for now. - - *on* is not supported yet due to lack of multi-index support. + - *other* must be a single DataFrame for now. + - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") @@ -5173,11 +5173,6 @@ def from_arrow(cls, table): ------- cudf DataFrame - Notes - ----- - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. - Examples -------- >>> import cudf @@ -5188,6 +5183,12 @@ def from_arrow(cls, table): 0 1 4 1 2 5 2 3 6 + + .. pandas-compat:: + **DataFrame.from_arrow** + + - Does not support automatically setting index column(s) similar + to how ``to_pandas`` works for PyArrow Tables. """ index_col = None col_index_names = None @@ -5543,14 +5544,6 @@ def quantile( If q is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. - .. pandas-compat:: - **DataFrame.quantile** - - One notable difference from Pandas is when DataFrame is of - non-numeric types and result is expected to be a Series in case of - Pandas. cuDF will return a DataFrame as it doesn't support mixed - types under Series. - Examples -------- >>> import cupy as cp @@ -5571,6 +5564,14 @@ def quantile( a b 0.1 1.3 3.7 0.5 2.5 55.0 + + .. pandas-compat:: + **DataFrame.quantile** + + One notable difference from Pandas is when DataFrame is of + non-numeric types and result is expected to be a Series in case of + Pandas. cuDF will return a DataFrame as it doesn't support mixed + types under Series. """ # noqa: E501 if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -5836,10 +5837,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Series For each column/row the number of non-NA/null entries. - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -5853,6 +5850,11 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Age 4 Single 5 dtype: int64 + + .. pandas-compat:: + **DataFrame.count** + + Parameters currently not supported are `axis`, `level`, `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -6026,10 +6028,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): cudf.Series.value_counts : Return the counts of values in a Series. - Notes - ----- - ``axis`` parameter is currently not supported. - Examples -------- >>> import cudf @@ -6068,6 +6066,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): legs wings 0 2 0.0 1 2.0 + + .. pandas-compat:: + **DataFrame.mode** + + ``axis`` parameter is currently not supported. """ if axis not in (0, "index"): raise NotImplementedError("Only axis=0 is currently supported") @@ -6836,7 +6839,7 @@ def to_struct(self, name=None): Notes ----- - Note that a copy of the columns is made. + Note: that a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( @@ -6941,11 +6944,6 @@ def append( ------- DataFrame - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - objects. - Notes ----- If a list of dict/series is passed and the keys are all contained in @@ -6955,7 +6953,12 @@ def append( computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet. + `verify_integrity` parameter is not supported yet + + See Also + -------- + cudf.concat : General function to concatenate DataFrame or + objects. Examples -------- @@ -7340,8 +7343,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): Specifically, `&` must be used for bitwise operators on integers, not `and`, which is specifically for the logical and between booleans. - * Only numerical types currently support all operators. - * String types currently support comparison operators. + * Only numerical types are currently supported. * Operators generally will not cast automatically. Users are responsible for casting columns to suitable types before evaluating a function. @@ -7410,6 +7412,22 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 2 3 6 9 -3 3 4 4 8 0 4 5 2 7 3 + + .. pandas-compat:: + **DataFrame.eval** + + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. """ if kwargs: raise ValueError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7cb78bc8d1f..ef3134378ac 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -248,6 +248,73 @@ def size(self): return self._num_columns * self._num_rows @_cudf_nvtx_annotate + @property + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + + @property + def empty(self): + """ + Indicator whether DataFrame or Series is empty. + + True if DataFrame/Series is entirely empty (no items), + meaning any of the axes are of length 0. + + Returns + ------- + out : bool + If DataFrame/Series is empty, return True, if not return False. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A' : []}) + >>> df + Empty DataFrame + Columns: [A] + Index: [] + >>> df.empty + True + + If we only have `null` values in our DataFrame, it is + not considered empty! We will need to drop + the `null`'s to make the DataFrame empty: + + >>> df = cudf.DataFrame({'A' : [None, None]}) + >>> df + A + 0 + 1 + >>> df.empty + False + >>> df.dropna().empty + True + + Non-empty and empty Series example: + + >>> s = cudf.Series([1, 2, None]) + >>> s + 0 1 + 1 2 + 2 + dtype: int64 + >>> s.empty + False + >>> s = cudf.Series([]) + >>> s + Series([], dtype: float64) + >>> s.empty + True + + .. pandas-compat:: + **DataFrame.empty, Series.empty, Frame.empty** + + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example above. + """ + return self.size == 0 + def memory_usage(self, deep=False): """Return the memory usage of an object. @@ -593,6 +660,8 @@ def where(self, cond, other=None, inplace=False): dtype: int64 .. pandas-compat:: + **DataFrame.where, Series.where, Frame.where** + Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1922,10 +1991,6 @@ def min( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1934,6 +1999,11 @@ def min( a 1 b 7 dtype: int64 + + .. pandas-compat:: + **DataFrame.min, Series.min, Frame.min** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "min", @@ -1973,10 +2043,6 @@ def max( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1985,6 +2051,11 @@ def max( a 4 b 10 dtype: int64 + + .. pandas-compat:: + **DataFrame.max, Series.max, Frame.max** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "max", @@ -2029,10 +2100,6 @@ def sum( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2041,6 +2108,11 @@ def sum( a 10 b 34 dtype: int64 + + .. pandas-compat:: + **DataFrame.sum, Series.sum, Frame.sum** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "sum", @@ -2087,10 +2159,6 @@ def product( ------- Series - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2099,6 +2167,11 @@ def product( a 24 b 5040 dtype: int64 + + .. pandas-compat:: + **DataFrame.product, Series.product, Frame.product** + + Parameters currently not supported are level`, `numeric_only`. """ return self._reduce( @@ -2198,11 +2271,6 @@ def std( ------- Series - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2211,6 +2279,12 @@ def std( a 1.290994 b 1.290994 dtype: float64 + + .. pandas-compat:: + **DataFrame.std, Series.std, Frame.std** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( @@ -2254,11 +2328,6 @@ def var( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2267,6 +2336,12 @@ def var( a 1.666667 b 1.666667 dtype: float64 + + .. pandas-compat:: + **DataFrame.var, Series.var, Frame.var** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( "var", @@ -2304,10 +2379,6 @@ def kurtosis( ------- Series or scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only` - Examples -------- **Series** @@ -2325,6 +2396,11 @@ def kurtosis( a -1.2 b -1.2 dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis, Frame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` """ if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2362,11 +2438,6 @@ def skew( ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - Examples -------- **Series** @@ -2391,6 +2462,12 @@ def skew( a 0.00000 b -0.37037 dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + Parameters currently not supported are `axis`, `level` and + `numeric_only` """ if axis not in (0, "index", None, no_default): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2443,6 +2520,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): a True b False dtype: bool + + .. pandas-compat:: + **DataFrame.all, Series.all, Frame.all** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "all", @@ -2491,6 +2573,11 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): a True b True dtype: bool + + .. pandas-compat:: + **DataFrame.any, Series.any, Frame.any** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "any", @@ -2516,10 +2603,6 @@ def median( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. - Examples -------- >>> import cudf @@ -2534,6 +2617,11 @@ def median( dtype: int64 >>> ser.median() 17.0 + + .. pandas-compat:: + **DataFrame.median, Series.median, Frame.median** + + Parameters currently not supported are `level` and `numeric_only`. """ return self._reduce( "median", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3b8f0f3824a..a7cb4c72ac1 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -658,10 +658,10 @@ def _reduce( Series or DataFrame Computed {op} of values within each group. - Notes - ----- - Difference from pandas: - * Not supporting: numeric_only, min_count + .. pandas-compat:: + **{cls}.{op}** + + The numeric_only, min_count """ if numeric_only: raise NotImplementedError( @@ -1359,7 +1359,7 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **groupby.apply** + **GroupBy.apply** cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -2199,6 +2199,28 @@ def fillna( Returns ------- DataFrame or Series + + .. pandas-compat:: + **GroupBy.fillna** + + This function may return result in different format to the method + Pandas supports. For example: + + .. code-block:: + + >>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]}) + >>> gdf = cudf.from_pandas(df) + >>> df.groupby('k').fillna({'v': 4}) # pandas + v + k + 1 0 2.0 + 1 4.0 + 2 2 4.0 + >>> gdf.groupby('k').fillna({'v': 4}) # cudf + v + 0 2.0 + 1 4.0 + 2 4.0 """ if inplace: raise NotImplementedError("Does not support inplace yet.") @@ -2258,9 +2280,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Series or DataFrame Object shifted within each group. - Notes - ----- - Parameter ``freq`` is unsupported. + .. pandas-compat:: + **GroupBy.shift** + + Parameter ``freq`` is unsupported. """ if freq is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b1fb47eb790..650ba77e101 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -661,11 +661,6 @@ def replace( result : Series Series after replacement. The mask and index are preserved. - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - Examples -------- **Series** @@ -808,6 +803,12 @@ def replace( 2 2 7 c 3 3 8 d 4 4 9 e + + .. pandas-compat:: + **DataFrame.replace, Series.replace, IndexedFrame.replace** + + Parameters that are currently not supported are: `limit`, `regex`, + `method` """ if limit is not None: raise NotImplementedError("limit parameter is not implemented yet") @@ -1148,13 +1149,6 @@ def truncate(self, before=None, after=None, axis=0, copy=True): `before` and `after` may be specified as strings instead of Timestamps. - .. pandas-compat:: - **DataFrame.truncate, Series.truncate** - - The ``copy`` parameter is only present for API compatibility, but - ``copy=False`` is not supported. This method always generates a - copy. - Examples -------- **Series** @@ -1296,6 +1290,13 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:25 1 2 2021-01-01 23:45:26 1 2 2021-01-01 23:45:27 1 2 + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate, IndexedFrame.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. This method always generates a + copy. """ if not copy: raise ValueError("Truncating with copy=False is not supported.") @@ -1550,11 +1551,6 @@ def sort_index( ------- Frame or None - Notes - ----- - Difference from pandas: - * Not supporting: kind, sort_remaining=False - Examples -------- **Series** @@ -1597,6 +1593,11 @@ def sort_index( 1 2 3 3 1 2 2 3 1 + + .. pandas-compat:: + **DataFrame.sort_index, Series.sort_index, IndexedFrame.sort_index** + + * Not supporting: kind, sort_remaining=False """ if kind is not None: raise NotImplementedError("kind is not yet supported") @@ -2415,12 +2416,6 @@ def sort_values( ------- Frame : Frame with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2432,6 +2427,12 @@ def sort_values( 0 0 -3 2 2 0 1 1 2 + + .. pandas-compat:: + **DataFrame.sort_values, Series.sort_values, IndexedFrame.sort_values** + + * Support axis='index' only. + * Not supporting: inplace, kind """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") @@ -2923,13 +2924,14 @@ def resample( 2018-02-28 18.0 63.333333 - Notes - ----- - Note that the dtype of the index (or the 'on' column if using - 'on=') in the result will be of a frequency closest to the - resampled frequency. For example, if resampling from - nanoseconds to milliseconds, the index will be of dtype - 'datetime64[ms]'. + .. pandas-compat:: + **DataFrame.resample, Series.resample, IndexedFrame.resample** + + Note that the dtype of the index (or the 'on' column if using + 'on=') in the result will be of a frequency closest to the + resampled frequency. For example, if resampling from + nanoseconds to milliseconds, the index will be of dtype + 'datetime64[ms]'. """ import cudf.core.resample @@ -3370,18 +3372,6 @@ def sample( provided via the `random_state` parameter. This function will always produce the same sample given an identical `random_state`. - Notes - ----- - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. - Parameters ---------- n : int, optional @@ -3449,6 +3439,19 @@ def sample( a c 0 1 3 1 2 4 + + .. pandas-compat:: + **DataFrame.sample, Series.sample, IndexedFrame.sample** + + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. """ axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 00ba722136e..369f11812f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1299,10 +1299,11 @@ def map(self, arg, na_action=None) -> "Series": 4 dtype: int64 - Notes - ----- - Please note map currently only supports fixed-width numeric - type functions. + .. pandas-compat:: + **Series.map** + + Please note map currently only supports fixed-width numeric + type functions. """ if isinstance(arg, dict): if hasattr(arg, "__missing__"): @@ -2121,12 +2122,6 @@ def sort_values( ------- Series : Series with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2138,6 +2133,12 @@ def sort_values( 3 4 1 5 dtype: int64 + + .. pandas-compat:: + **Series.sort_values** + + * Support axis='index' only. + * The inplace and kind argument is currently unsupported """ return super().sort_values( by=self.name, @@ -2582,16 +2583,17 @@ def count(self, level=None): int Number of non-null values in the Series. - Notes - ----- - Parameters currently not supported is `level`. - Examples -------- >>> import cudf >>> ser = cudf.Series([1, 5, 2, 4, 3]) >>> ser.count() 5 + + .. pandas-compat:: + **Series.count** + + Parameters currently not supported is `level`. """ if level is not None: @@ -2695,10 +2697,6 @@ def cov(self, other, min_periods=None): Covariance between Series and other normalized by N-1 (unbiased estimator). - Notes - ----- - `min_periods` parameter is not yet supported. - Examples -------- >>> import cudf @@ -2706,6 +2704,11 @@ def cov(self, other, min_periods=None): >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.cov(ser2) -0.015750000000000004 + + .. pandas-compat:: + **Series.cov** + + `min_periods` parameter is not yet supported. """ if min_periods is not None: @@ -3441,12 +3444,6 @@ def rename(self, index=None, copy=True): ------- Series - Notes - ----- - Difference from pandas: - - Supports scalar values only for changing name attribute - - Not supporting : inplace, level - Examples -------- >>> import cudf @@ -3465,6 +3462,12 @@ def rename(self, index=None, copy=True): Name: numeric_series, dtype: int64 >>> renamed_series.name 'numeric_series' + + .. pandas-compat:: + **Series.rename** + + - Supports scalar values only for changing name attribute + - The ``inplace`` and ``level`` is not supported """ out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @@ -4644,11 +4647,6 @@ def strftime(self, date_format, *args, **kwargs): Series Series of formatted strings. - Notes - ----- - The following date format identifiers are not yet - supported: ``%c``, ``%x``,``%X`` - Examples -------- >>> import cudf @@ -4675,6 +4673,12 @@ def strftime(self, date_format, *args, **kwargs): 1 2000 / 30 / 06 2 2000 / 30 / 09 dtype: object + + .. pandas-compat:: + **series.DatetimeProperties.strftime** + + The following date format identifiers are not yet + supported: ``%c``, ``%x``,``%X`` """ if not isinstance(date_format, str): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a3f4bacf206..c74a7087c87 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -815,7 +815,6 @@ def date_range( '2023-12-23 08:00:00', '2025-02-23 08:00:00', '2026-04-23 08:00:00'], dtype='datetime64[ns]') - """ if tz is not None: raise NotImplementedError("tz is currently unsupported.") diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 0273227010b..609a5503040 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import warnings @@ -56,13 +56,6 @@ def to_numeric(arg, errors="raise", downcast=None): Depending on the input, if series is passed in, series is returned, otherwise ndarray - Notes - ----- - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - Examples -------- >>> s = cudf.Series(['1', '2.0', '3e3']) @@ -92,6 +85,15 @@ def to_numeric(arg, errors="raise", downcast=None): 1 1.0 2 3000.0 dtype: float64 + + .. pandas-compat:: + **cudf.to_numeric** + + An important difference from pandas is that this function does not accept + mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. + A ``TypeError`` will be raised when such input is received, regardless of + ``errors`` parameter. + """ if errors not in {"raise", "ignore", "coerce"}: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c297748f7e5..36cc3ef5628 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10106,343 +10106,3 @@ def test_dataframe_transpose_complex_types(data): actual = gdf.T assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_values_complex_types(data): - gdf = cudf.DataFrame(data) - with pytest.raises(NotImplementedError): - gdf.values - - -def test_dataframe_from_arrow_slice(): - table = pa.Table.from_pandas( - pd.DataFrame.from_dict( - {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} - ) - ) - table_slice = table.slice(3, 7) - - expected = table_slice.to_pandas() - actual = cudf.DataFrame.from_arrow(table_slice) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, - {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, - {"a": [1, 2, 3], "c": 4}, - ], -) -def test_dataframe_init_from_scalar_and_lists(data): - actual = cudf.DataFrame(data) - expected = pd.DataFrame(data) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,index", - [ - ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ([[10, 11], [12, 13]], ["a", "b", "c"]), - ], -) -def test_dataframe_init_length_error(data, index): - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - rfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - ) - - -def test_dataframe_binop_with_mixed_date_types(): - df = pd.DataFrame( - np.random.rand(2, 2), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -def test_dataframe_binop_with_mixed_string_types(): - df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) - df2 = pd.DataFrame( - np.random.rand(6, 6), - columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), - ) - gdf1 = cudf.from_pandas(df1) - gdf2 = cudf.from_pandas(df2) - - expected = df2 + df1 - got = gdf2 + gdf1 - - assert_eq(expected, got) - - -def test_dataframe_binop_and_where(): - df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) - gdf = cudf.from_pandas(df) - - expected = df > 1 - got = gdf > 1 - - assert_eq(expected, got) - - expected = df[df > 1] - got = gdf[gdf > 1] - - assert_eq(expected, got) - - -def test_dataframe_binop_with_datetime_index(): - df = pd.DataFrame( - np.random.rand(2, 2), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series( - np.random.rand(2), - index=pd.Index( - [ - "2000-01-04", - "2000-01-03", - ], - dtype="datetime64[ns]", - ), - ) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "columns", - ( - [], - ["c", "a"], - ["a", "d", "b", "e", "c"], - ["a", "b", "c"], - pd.Index(["b", "a", "c"], name="custom_name"), - ), -) -@pytest.mark.parametrize("index", (None, [4, 5, 6])) -def test_dataframe_dict_like_with_columns(columns, index): - data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - expect = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(data, columns=columns, index=index) - if index is None and len(columns) == 0: - # We make an empty range index, pandas makes an empty index - expect = expect.reset_index(drop=True) - assert_eq(expect, actual) - - -def test_dataframe_init_columns_named_multiindex(): - np.random.seed(0) - data = np.random.randn(2, 2) - columns = cudf.MultiIndex.from_tuples( - [("A", "one"), ("A", "two")], names=["y", "z"] - ) - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns.to_pandas()) - - assert_eq(gdf, pdf) - - -def test_dataframe_init_columns_named_index(): - np.random.seed(0) - data = np.random.randn(2, 2) - columns = pd.Index(["a", "b"], name="custom_name") - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns) - - assert_eq(gdf, pdf) - - -def test_dataframe_from_pandas_sparse(): - pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) - with pytest.raises(NotImplementedError): - cudf.DataFrame(pdf) - - -def test_dataframe_constructor_unbounded_sequence(): - class A: - def __getitem__(self, key): - return 1 - - with pytest.raises(TypeError): - cudf.DataFrame([A()]) - - with pytest.raises(TypeError): - cudf.DataFrame({"a": A()}) - - -def test_dataframe_constructor_dataframe_list(): - df = cudf.DataFrame(range(2)) - with pytest.raises(ValueError): - cudf.DataFrame([df]) - - -def test_dataframe_constructor_from_namedtuple(): - Point1 = namedtuple("Point1", ["a", "b", "c"]) - Point2 = namedtuple("Point1", ["x", "y"]) - - data = [Point1(1, 2, 3), Point2(4, 5)] - idx = ["a", "b"] - gdf = cudf.DataFrame(data, index=idx) - pdf = pd.DataFrame(data, index=idx) - - assert_eq(gdf, pdf) - - data = [Point2(4, 5), Point1(1, 2, 3)] - with pytest.raises(ValueError): - cudf.DataFrame(data, index=idx) - with pytest.raises(ValueError): - pd.DataFrame(data, index=idx) - - -@pytest.mark.parametrize( - "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] -) -def test_dataframe_mixed_dtype_error(dtype): - pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) - with pytest.raises(TypeError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "index_data,name", - [([10, 13], "a"), ([30, 40, 20], "b"), (["ef"], "c"), ([2, 3], "Z")], -) -def test_dataframe_reindex_with_index_names(index_data, name): - gdf = cudf.DataFrame( - { - "a": [10, 12, 13], - "b": [20, 30, 40], - "c": cudf.Series(["ab", "cd", "ef"], dtype="category"), - } - ) - if name in gdf.columns: - gdf = gdf.set_index(name) - pdf = gdf.to_pandas() - - gidx = cudf.Index(index_data, name=name) - actual = gdf.reindex(gidx) - expected = pdf.reindex(gidx.to_pandas()) - - assert_eq(actual, expected) - - actual = gdf.reindex(index_data) - expected = pdf.reindex(index_data) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) -def test_dataframe_nlargest_nsmallest_str_error(attr): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - pdf = gdf.to_pandas() - - assert_exceptions_equal( - getattr(gdf, attr), - getattr(pdf, attr), - ([], {"n": 1, "columns": ["a", "b"]}), - ([], {"n": 1, "columns": ["a", "b"]}), - ) - - -@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) -def test_dataframe_round_builtin(digits): - pdf = pd.DataFrame( - { - "a": [1.2234242333234, 323432.3243423, np.nan], - "b": ["a", "b", "c"], - "c": pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), - "d": pd.Series([224.242, None, 2424.234324], dtype="category"), - "e": [ - decimal.Decimal("342.3243234234242"), - decimal.Decimal("89.32432497687622"), - None, - ], - } - ) - gdf = cudf.from_pandas(pdf, nan_as_null=False) - - expected = round(pdf, digits) - actual = round(gdf, digits) - - assert_eq(expected, actual) - - -def test_dataframe_init_from_nested_dict(): - ordered_dict = OrderedDict( - [ - ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), - ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), - ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), - ] - ) - pdf = pd.DataFrame(ordered_dict) - gdf = cudf.DataFrame(ordered_dict) - - assert_eq(pdf, gdf) - regular_dict = {key: dict(value) for key, value in ordered_dict.items()} - - pdf = pd.DataFrame(regular_dict) - gdf = cudf.DataFrame(regular_dict) - assert_eq(pdf, gdf) - - -def test_init_from_2_categoricalindex_series_diff_categories(): - s1 = cudf.Series( - [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) - ) - s2 = cudf.Series( - [2, 152, 2, 242, 150], - index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), - ) - result = cudf.DataFrame([s1, s2]) - expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) - assert_eq(result, expected, check_dtype=False) - - -def test_data_frame_values_no_cols_but_index(): - result = cudf.DataFrame(index=range(5)).values - expected = pd.DataFrame(index=range(5)).values - assert_eq(result, expected) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 1c21fca51c8..873981b24f9 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. class StructMethods: @@ -263,11 +263,6 @@ def sort_values( ------- ListColumn with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -277,6 +272,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **ListMethods.sort_values** + + The `inplace` and `kind` argument is currently unsupported. """ return self.d_series.map_partitions( lambda s: s.list.sort_values( From 95f6ac33e25f20e6fe1697bd9a4e2a985a01d472 Mon Sep 17 00:00:00 2001 From: Pantakan Kanprawet Date: Fri, 7 Jul 2023 22:49:43 +0700 Subject: [PATCH 07/14] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 24 ++++++++++++------------ python/cudf/cudf/core/groupby/groupby.py | 22 ---------------------- python/cudf/cudf/core/indexed_frame.py | 12 ++++++------ 5 files changed, 20 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6d37f1281ef..1036f5ac263 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -632,7 +632,7 @@ def sort_values( dtype: list .. pandas-compat:: - **list.ListMethods.sort_values** + **ListMethods.sort_values** The ``inplace`` and ``kind`` arguments are currently not supported. """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1aaacca6d79..cd9452a37ad 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6839,7 +6839,7 @@ def to_struct(self, name=None): Notes ----- - Note: that a copy of the columns is made. + Note: a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ef3134378ac..d151b6ee3a6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -308,7 +308,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty, Frame.empty** + **DataFrame.empty, Series.empty** If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -660,7 +660,7 @@ def where(self, cond, other=None, inplace=False): dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where, Frame.where** + **DataFrame.where, Series.where** Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -2001,7 +2001,7 @@ def min( dtype: int64 .. pandas-compat:: - **DataFrame.min, Series.min, Frame.min** + **DataFrame.min, Series.min** Parameters currently not supported are `level`, `numeric_only`. """ @@ -2053,7 +2053,7 @@ def max( dtype: int64 .. pandas-compat:: - **DataFrame.max, Series.max, Frame.max** + **DataFrame.max, Series.max** Parameters currently not supported are `level`, `numeric_only`. """ @@ -2110,7 +2110,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum, Frame.sum** + **DataFrame.sum, Series.sum** Parameters currently not supported are `level`, `numeric_only`. """ @@ -2169,7 +2169,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product, Frame.product** + **DataFrame.product, Series.product** Parameters currently not supported are level`, `numeric_only`. """ @@ -2281,7 +2281,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std, Frame.std** + **DataFrame.std, Series.std** Parameters currently not supported are `level` and `numeric_only` @@ -2338,7 +2338,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var, Frame.var** + **DataFrame.var, Series.var** Parameters currently not supported are `level` and `numeric_only` @@ -2398,7 +2398,7 @@ def kurtosis( dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis, Frame.kurtosis** + **DataFrame.kurtosis** Parameters currently not supported are `level` and `numeric_only` """ @@ -2522,7 +2522,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all, Frame.all** + **DataFrame.all, Series.all** Parameters currently not supported are `axis`, `bool_only`, `level`. """ @@ -2575,7 +2575,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any, Frame.any** + **DataFrame.any, Series.any** Parameters currently not supported are `axis`, `bool_only`, `level`. """ @@ -2619,7 +2619,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median, Frame.median** + **DataFrame.median, Series.median** Parameters currently not supported are `level` and `numeric_only`. """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a7cb4c72ac1..a8d0225455c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2199,28 +2199,6 @@ def fillna( Returns ------- DataFrame or Series - - .. pandas-compat:: - **GroupBy.fillna** - - This function may return result in different format to the method - Pandas supports. For example: - - .. code-block:: - - >>> df = pd.DataFrame({'k': [1, 1, 2], 'v': [2, None, 4]}) - >>> gdf = cudf.from_pandas(df) - >>> df.groupby('k').fillna({'v': 4}) # pandas - v - k - 1 0 2.0 - 1 4.0 - 2 2 4.0 - >>> gdf.groupby('k').fillna({'v': 4}) # cudf - v - 0 2.0 - 1 4.0 - 2 4.0 """ if inplace: raise NotImplementedError("Does not support inplace yet.") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 650ba77e101..06c195fea6f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -805,7 +805,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace, IndexedFrame.replace** + **DataFrame.replace, Series.replace** Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1292,7 +1292,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate, IndexedFrame.truncate** + **DataFrame.truncate, Series.truncate** The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -1595,7 +1595,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index, IndexedFrame.sort_index** + **DataFrame.sort_index, Series.sort_index** * Not supporting: kind, sort_remaining=False """ @@ -2429,7 +2429,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values, IndexedFrame.sort_values** + **DataFrame.sort_values, Series.sort_values** * Support axis='index' only. * Not supporting: inplace, kind @@ -2925,7 +2925,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample, IndexedFrame.resample** + **DataFrame.resample, Series.resample** Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -3441,7 +3441,7 @@ def sample( 1 2 4 .. pandas-compat:: - **DataFrame.sample, Series.sample, IndexedFrame.sample** + **DataFrame.sample, Series.sample** When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) or a cupy random From 7f6b9bd3f76366c7431d734c43bea929495dd58b Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Sat, 8 Jul 2023 00:54:11 +0700 Subject: [PATCH 08/14] add 2 more methode --- python/cudf/cudf/core/column/string.py | 17 ++++++------ python/cudf/cudf/core/dataframe.py | 38 ++++++++------------------ 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a58dd13fff6..2585ba0101a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3508,14 +3508,6 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: ------- Series or Index - Notes - ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. - Examples -------- >>> import cudf @@ -3547,6 +3539,15 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') + + .. pandas-compat:: + **StringMethods.count** + + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. """ # noqa W605 if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cd9452a37ad..4ea8c91cda6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6944,17 +6944,6 @@ def append( ------- DataFrame - Notes - ----- - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better - solution is to append those rows to a list and then concatenate - the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet - See Also -------- cudf.concat : General function to concatenate DataFrame or @@ -7014,6 +7003,18 @@ def append( 2 2 3 3 4 4 + + .. pandas-compat:: + **DataFrame.append** + + If a list of dict/series is passed and the keys are all contained in + the DataFrame's index, the order of the columns in the resulting + DataFrame will be unchanged. + Iteratively appending rows to a cudf DataFrame can be more + computationally intensive than a single concatenate. A better + solution is to append those rows to a list and then concatenate + the list with the original DataFrame all at once. + `verify_integrity` parameter is not supported yet """ if isinstance(other, dict): if not ignore_index: @@ -7335,21 +7336,6 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): DataFrame if any assignment statements are included in ``expr``, or None if ``inplace=True``. - Notes - ----- - Difference from pandas: - * Additional kwargs are not supported. - * Bitwise and logical operators are not dtype-dependent. - Specifically, `&` must be used for bitwise operators on integers, - not `and`, which is specifically for the logical and between - booleans. - * Only numerical types are currently supported. - * Operators generally will not cast automatically. Users are - responsible for casting columns to suitable types before - evaluating a function. - * Multiple assignments to the same name (i.e. a sequence of - assignment statements where later statements are conditioned upon - the output of earlier statements) is not supported. Examples -------- From 7e76dac81c9278517ecb38b2a903671cec8177d2 Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Sat, 15 Jul 2023 19:38:07 +0700 Subject: [PATCH 09/14] Fix indentation --- python/cudf/cudf/core/column/string.py | 10 ++++++---- python/cudf/cudf/core/dataframe.py | 20 ++++++++++++-------- python/cudf/cudf/core/frame.py | 6 ++++-- python/cudf/cudf/core/indexed_frame.py | 19 ++++++++++--------- python/cudf/cudf/core/tools/numeric.py | 9 ++++----- 5 files changed, 36 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2585ba0101a..18a1d3bc8af 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -990,9 +990,9 @@ def replace( .. pandas-compat:: **StringMethods.replace** - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. + The parameters `case` and `flags` are not yet supported and will + raise a `NotImplementedError` if anything other than the default + value is set. """ if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2822,7 +2822,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: **StringMethods.partition** The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. + `NotImplementedError` if anything other than the default + value is set. + """ if expand is not True: raise NotImplementedError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4ea8c91cda6..757cb0c1b3c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3351,8 +3351,9 @@ def rename( * Not Supporting: level - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. + Rename will not overwrite column names. If a list with + duplicates is passed, column names will be postfixed + with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3779,7 +3780,8 @@ def transpose(self): .. pandas-compat:: **DataFrame.transpose, DataFrame.T** - Not supporting *copy* because default and only behavior is copy=True + Not supporting *copy* because default and only behavior is + copy=True """ index = self._data.to_pandas_index() @@ -3967,7 +3969,8 @@ def merge( .. pandas-compat:: **DataFrame.merge** - **DataFrames merges in cuDF result in non-deterministic row ordering.** + DataFrames merges in cuDF result in non-deterministic row + ordering. """ if indicator: raise NotImplementedError( @@ -5854,7 +5857,8 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): .. pandas-compat:: **DataFrame.count** - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Parameters currently not supported are `axis`, `level`, + `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -7007,9 +7011,9 @@ def append( .. pandas-compat:: **DataFrame.append** - If a list of dict/series is passed and the keys are all contained in - the DataFrame's index, the order of the columns in the resulting - DataFrame will be unchanged. + If a list of dict/series is passed and the keys are all contained + in the DataFrame's index, the order of the columns in + the resulting DataFrame will be unchanged. Iteratively appending rows to a cudf DataFrame can be more computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d151b6ee3a6..6c465bed40b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2524,7 +2524,8 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): .. pandas-compat:: **DataFrame.all, Series.all** - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "all", @@ -2577,7 +2578,8 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): .. pandas-compat:: **DataFrame.any, Series.any** - Parameters currently not supported are `axis`, `bool_only`, `level`. + Parameters currently not supported are `axis`, `bool_only`, + `level`. """ return self._reduce( "any", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 06c195fea6f..d07d5c75e2b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3443,15 +3443,16 @@ def sample( .. pandas-compat:: **DataFrame.sample, Series.sample** - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. + When sampling from ``axis=0/'index'``, ``random_state`` can be + either a numpy random state (``numpy.random.RandomState``) + or a cupy random state (``cupy.random.RandomState``). When a numpy + random state is used, the output is guaranteed to match the output + of the corresponding pandas method call, but generating the sample + maybe slow. If exact pandas equivalence is not required, using a + cupy random state will achieve better performance, + especially when sampling large number of + items. It's advised to use the matching `ndarray` type to + the random state for the `weights` array. """ axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 609a5503040..529c0d18183 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -89,11 +89,10 @@ def to_numeric(arg, errors="raise", downcast=None): .. pandas-compat:: **cudf.to_numeric** - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - + An important difference from pandas is that this function does not + accept mixed numeric/non-numeric type sequences. + For example ``[1, 'a']``. A ``TypeError`` will be raised when such + input is received, regardless of ``errors`` parameter. """ if errors not in {"raise", "ignore", "coerce"}: From 46577ee35e9cf4527cd3de86893015dd8c369174 Mon Sep 17 00:00:00 2001 From: Pantakan Kanprawet Date: Sat, 21 Oct 2023 08:55:19 +0000 Subject: [PATCH 10/14] Revert Testcase back --- python/cudf/cudf/tests/test_dataframe.py | 79 ++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 36cc3ef5628..2256c14e259 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10106,3 +10106,82 @@ def test_dataframe_transpose_complex_types(data): actual = gdf.T assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, + {"a": [[{"b": 567}], None] * 10}, + {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, + ], +) +def test_dataframe_values_complex_types(data): + gdf = cudf.DataFrame(data) + with pytest.raises(NotImplementedError): + gdf.values + + +def test_dataframe_from_arrow_slice(): + table = pa.Table.from_pandas( + pd.DataFrame.from_dict( + {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} + ) + ) + table_slice = table.slice(3, 7) + + expected = table_slice.to_pandas() + actual = cudf.DataFrame.from_arrow(table_slice) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, + {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, + {"a": [1, 2, 3], "c": 4}, + ], +) +def test_dataframe_init_from_scalar_and_lists(data): + actual = cudf.DataFrame(data) + expected = pd.DataFrame(data) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,index", + [ + ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ([[10, 11], [12, 13]], ["a", "b", "c"]), + ], +) +def test_dataframe_init_length_error(data, index): + assert_exceptions_equal( + lfunc=pd.DataFrame, + rfunc=cudf.DataFrame, + lfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + rfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + ) \ No newline at end of file From d1f5e2c42b6c973c93e775867e6e375c09035851 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Jan 2024 23:04:07 +0000 Subject: [PATCH 11/14] Undo incorrect new methods --- python/cudf/cudf/core/frame.py | 68 -------------------------- python/cudf/cudf/core/indexed_frame.py | 11 +++-- 2 files changed, 6 insertions(+), 73 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6da5dbf311..cf36cdf6e30 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -251,74 +251,6 @@ def size(self) -> int: """ return self._num_columns * self._num_rows - @_cudf_nvtx_annotate - @property - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame.""" - return self._num_rows, self._num_columns - - @property - def empty(self): - """ - Indicator whether DataFrame or Series is empty. - - True if DataFrame/Series is entirely empty (no items), - meaning any of the axes are of length 0. - - Returns - ------- - out : bool - If DataFrame/Series is empty, return True, if not return False. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A' : []}) - >>> df - Empty DataFrame - Columns: [A] - Index: [] - >>> df.empty - True - - If we only have `null` values in our DataFrame, it is - not considered empty! We will need to drop - the `null`'s to make the DataFrame empty: - - >>> df = cudf.DataFrame({'A' : [None, None]}) - >>> df - A - 0 - 1 - >>> df.empty - False - >>> df.dropna().empty - True - - Non-empty and empty Series example: - - >>> s = cudf.Series([1, 2, None]) - >>> s - 0 1 - 1 2 - 2 - dtype: int64 - >>> s.empty - False - >>> s = cudf.Series([]) - >>> s - Series([], dtype: float64) - >>> s.empty - True - - .. pandas-compat:: - **DataFrame.empty, Series.empty** - - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example above. - """ - return self.size == 0 - def memory_usage(self, deep=False): """Return the memory usage of an object. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d7b6f2379ae..6c0aba34970 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -446,11 +446,6 @@ def empty(self): out : bool If DataFrame/Series is empty, return True, if not return False. - Notes - ----- - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example below. - Examples -------- >>> import cudf @@ -491,6 +486,12 @@ def empty(self): Series([], dtype: float64) >>> s.empty True + + .. pandas-compat:: + **DataFrame.empty, Series.empty** + + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example above. """ return self.size == 0 From c31df6762b0c3bd4b37657b2c450d53c5a40543d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Jan 2024 23:05:07 +0000 Subject: [PATCH 12/14] Fix a copyright --- python/dask_cudf/dask_cudf/accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 873981b24f9..47fcedfd4d7 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. class StructMethods: From e757a1c2c2dce93c8677df4d4d6dfb0a99c6e1a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Jan 2024 19:02:12 +0000 Subject: [PATCH 13/14] Clean up docstring of append --- python/cudf/cudf/core/dataframe.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 88ea4870e42..ad7ba86645e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7109,6 +7109,13 @@ def append( ------- DataFrame + Notes + ----- + Iteratively appending rows to a cudf DataFrame can be more + computationally intensive than a single concatenate. A better solution + is to append those rows to a list and then concatenate the list with + the original DataFrame all at once. + See Also -------- cudf.concat : General function to concatenate DataFrame or @@ -7172,14 +7179,10 @@ def append( .. pandas-compat:: **DataFrame.append** - If a list of dict/series is passed and the keys are all contained - in the DataFrame's index, the order of the columns in - the resulting DataFrame will be unchanged. - Iteratively appending rows to a cudf DataFrame can be more - computationally intensive than a single concatenate. A better - solution is to append those rows to a list and then concatenate - the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet + * If a list of dict/series is passed and the keys are all contained + in the DataFrame's index, the order of the columns in the + resulting DataFrame will be unchanged. + * The `verify_integrity` parameter is not supported yet. """ if isinstance(other, dict): if not ignore_index: From 86ee68967733d68d3066831a77ef5fbe4c04360b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Jan 2024 23:57:22 +0000 Subject: [PATCH 14/14] Remove change from dask (does not support the plugin) --- python/dask_cudf/dask_cudf/accessors.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 47fcedfd4d7..1c21fca51c8 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. class StructMethods: @@ -263,6 +263,11 @@ def sort_values( ------- ListColumn with each list sorted + Notes + ----- + Difference from pandas: + * Not supporting: `inplace`, `kind` + Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -272,11 +277,6 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list - - .. pandas-compat:: - **ListMethods.sort_values** - - The `inplace` and `kind` argument is currently unsupported. """ return self.d_series.map_partitions( lambda s: s.list.sort_values(