From 95059b84b9bdbfc819741b7806c63e8a26cf0ae2 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 21 Jan 2021 02:36:04 +0530 Subject: [PATCH 1/2] Add encoding and compression argument to CSV writer (#7168) This PR closes https://github.com/rapidsai/cudf/issues/7083 by adding an encoding argument to our CSV writer, it also adds compression argument to the writer. This will help address some issues with feature tool compatibility [PR](https://github.com/alteryx/featuretools/pull/1246). Authors: - Vibhu Jawa (@VibhuJawa) Approvers: - GALI PREM SAGAR (@galipremsagar) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7168 --- python/cudf/cudf/core/dataframe.py | 4 ++++ python/cudf/cudf/io/csv.py | 13 +++++++++++++ python/cudf/cudf/tests/test_csv.py | 21 +++++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 7 ++++++- 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 606d96600a1..906a71623e0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6844,6 +6844,8 @@ def to_csv( index=True, line_terminator="\n", chunksize=None, + encoding=None, + compression=None, **kwargs, ): """{docstring}""" @@ -6859,6 +6861,8 @@ def to_csv( index=index, line_terminator=line_terminator, chunksize=chunksize, + encoding=encoding, + compression=compression, **kwargs, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 0270198a7e0..e2c7ca7dca1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -115,6 +115,8 @@ def to_csv( index=True, line_terminator="\n", chunksize=None, + encoding=None, + compression=None, **kwargs, ): """{docstring}""" @@ -124,6 +126,17 @@ def to_csv( elif len(sep) > 1: raise TypeError('"sep" must be a 1-character string') + if encoding and encoding != "utf-8": + error_msg = ( + f"Encoding {encoding} is not supported. " + + "Currently, only utf-8 encoding is supported." 
+ ) + raise NotImplementedError(error_msg) + + if compression: + error_msg = "Writing compressed csv is not currently supported in cudf" + raise NotImplementedError(error_msg) + return_as_string = False if path_or_buf is None: path_or_buf = StringIO() diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0ea8b3add4b..23a950bb72d 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1955,3 +1955,24 @@ def test_csv_sep_error(): rfunc_args_and_kwargs=([], {"sep": 1}), expected_error_message='"sep" must be string, not int', ) + + +def test_to_csv_encoding_error(): + # TODO: Remove this test once following + # issue is fixed: https://github.com/rapidsai/cudf/issues/2957 + df = cudf.DataFrame({"a": ["你好", "test"]}) + encoding = "utf-8-sig" + error_message = ( + f"Encoding {encoding} is not supported. " + + "Currently, only utf-8 encoding is supported." + ) + with pytest.raises(NotImplementedError, match=re.escape(error_message)): + df.to_csv("test.csv", encoding=encoding) + + +def test_to_csv_compression_error(): + df = cudf.DataFrame({"a": ["test"]}) + compression = "snappy" + error_message = "Writing compressed csv is not currently supported in cudf" + with pytest.raises(NotImplementedError, match=re.escape(error_message)): + df.to_csv("test.csv", compression=compression) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index e71c90b1ec9..5d52d6c7da4 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -931,7 +931,12 @@ line_terminator : char, default '\\n' chunksize : int or None, default None Rows to write at a time - +encoding: str, default 'utf-8' + A string representing the encoding to use in the output file + Only ‘utf-8’ is currently supported +compression: str, None + A string representing the compression scheme to use in the output file + Compression while writing csv is not supported currently Returns ------- 
None or str From a51caa595bb84bc73e181c4fb3d4d064577f43cb Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Wed, 20 Jan 2021 15:32:57 -0600 Subject: [PATCH 2/2] Enable round in cudf for DataFrame and Series (#7022) This enables round for DataFrames and Series using the libcudf round implementation and removes the old numba round implementation. Closes #1270 Authors: - @ChrisJar Approvers: - Ashwin Srinath (@shwina) - Michael Wang (@isVoid) - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) - GALI PREM SAGAR (@galipremsagar) URL: https://github.com/rapidsai/cudf/pull/7022 --- python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/cpp/round.pxd | 19 ++++ python/cudf/cudf/_lib/round.pyx | 42 ++++++++ python/cudf/cudf/core/column/numerical.py | 16 +--- python/cudf/cudf/core/frame.py | 111 ++++++++++++++++++++++ python/cudf/cudf/core/series.py | 27 +++++- python/cudf/cudf/tests/test_dataframe.py | 111 +++++++++------------- python/cudf/cudf/tests/test_series.py | 56 ++++++++++- python/cudf/cudf/utils/cudautils.py | 23 +---- 9 files changed, 303 insertions(+), 105 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/round.pxd create mode 100644 python/cudf/cudf/_lib/round.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index e52ff707319..be2d4ef5f51 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import numpy as np from . import ( @@ -23,6 +23,7 @@ replace, reshape, rolling, + round, search, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/cpp/round.pxd b/python/cudf/cudf/_lib/cpp/round.pxd new file mode 100644 index 00000000000..78f18dcacce --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/round.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libc.stdint cimport int32_t +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view + +cdef extern from "cudf/round.hpp" namespace "cudf" nogil: + + ctypedef enum rounding_method "cudf::rounding_method": + HALF_UP "cudf::rounding_method::HALF_UP" + HALF_EVEN "cudf::rounding_method::HALF_EVEN" + + cdef unique_ptr[column] round ( + const column_view& input, + int32_t decimal_places, + rounding_method method, + ) except + diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx new file mode 100644 index 00000000000..660d6d91670 --- /dev/null +++ b/python/cudf/cudf/_lib/round.pyx @@ -0,0 +1,42 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.round cimport ( + rounding_method as cpp_rounding_method, + round as cpp_round +) + + +def round(Column input_col, int decimal_places=0): + """ + Round column values to the given number of decimal places + + Parameters + ---------- + input_col : Column whose values will be rounded + decimal_places : The number of decimal places to round to + + Returns + ------- + A Column with values rounded to the given number of decimal places + """ + + cdef column_view input_col_view = input_col.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_round( + input_col_view, + decimal_places, + cpp_rounding_method.HALF_EVEN, + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b8a25d83ea8..54a6d274843 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA 
CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from numbers import Number @@ -342,17 +342,9 @@ def corr(self, other): return cov / lhs_std / rhs_std def round(self, decimals=0): - if decimals < 0: - msg = "Decimal values < 0 are not yet supported." - raise NotImplementedError(msg) - - if np.issubdtype(self.dtype, np.integer): - return self - - data = Buffer( - cudautils.apply_round(self.data_array_view, decimals).view("|u1") - ) - return column.build_column(data=data, dtype=self.dtype, mask=self.mask) + """Round the values in the Column to the given number of decimals. + """ + return libcudf.round.round(self, decimal_places=decimals) def applymap(self, udf, out_dtype=None): """Apply an element-wise function to transform the values in the Column. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 87081cbecdb..ad4069dfb68 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1751,6 +1751,117 @@ def __arrow_array__(self, type=None): "consider using .to_arrow()" ) + def round(self, decimals=0): + """ + Round a DataFrame to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. + + Returns + ------- + DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + Examples + -------- + >>> df = cudf.DataFrame( + [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats'] + ... 
) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + + if isinstance(decimals, cudf.Series): + decimals = decimals.to_pandas() + + if isinstance(decimals, (dict, pd.Series)): + if ( + isinstance(decimals, pd.Series) + and not decimals.index.is_unique + ): + raise ValueError("Index of decimals must be unique") + + cols = { + name: col.round(decimals[name]) + if ( + name in decimals.keys() + and pd.api.types.is_numeric_dtype(col.dtype) + ) + else col.copy(deep=True) + for name, col in self._data.items() + } + elif isinstance(decimals, int): + cols = { + name: col.round(decimals) + if pd.api.types.is_numeric_dtype(col.dtype) + else col.copy(deep=True) + for name, col in self._data.items() + } + else: + raise TypeError( + "decimals must be an integer, a dict-like or a Series" + ) + + return self.__class__._from_table( + Frame( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ) + ), + index=self._index, + ) + @annotate("SAMPLE", color="orange", domain="cudf_python") def sample( self, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2b9078abed6..4da0749bddb 100644 --- a/python/cudf/cudf/core/series.py +++ 
b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. import pickle import warnings from collections import abc as abc @@ -3506,8 +3506,31 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) def round(self, decimals=0): - """Round a Series to a configurable number of decimal places. """ + Round each value in a Series to the given number of decimals. + + Parameters + ---------- + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal + point. + + Returns + ------- + Series + Rounded values of the Series. + + Examples + -------- + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + """ + return Series( self._column.round(decimals=decimals), name=self.name, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 66d25076d68..f66aeb8cf1e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3243,86 +3243,61 @@ def test_ndim(): @pytest.mark.parametrize( - "arr", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat([-0.6459412758761901], 100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - ], -) -@pytest.mark.parametrize( - "decimal", + "decimals", [ + -3, 0, - 1, - 2, - 3, - 4, 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - pytest.param( - -1, - marks=[ - pytest.mark.xfail(reason="NotImplementedError: decimals < 0") - ], - ), + pd.Series([1, 4, 3, -6], index=["w", "x", "y", "z"]), + gd.Series([-4, -2, 12], index=["x", "y", "z"]), + {"w": -1, "x": 15, "y": 2}, ], ) -def test_round(arr, decimal): - pser = pd.Series(arr) - ser = gd.Series(arr) - result = ser.round(decimal) - expected = 
pser.round(decimal) +def test_dataframe_round(decimals): + pdf = pd.DataFrame( + { + "w": np.arange(0.5, 10.5, 1), + "x": np.random.normal(-100, 100, 10), + "y": np.array( + [ + 14.123, + 2.343, + np.nan, + 0.0, + -8.302, + np.nan, + 94.313, + -112.236, + -8.029, + np.nan, + ] + ), + "z": np.repeat([-0.6459412758761901], 10), + } + ) + gdf = gd.DataFrame.from_pandas(pdf) + if isinstance(decimals, gd.Series): + pdecimals = decimals.to_pandas() + else: + pdecimals = decimals + + result = gdf.round(decimals) + expected = pdf.round(pdecimals) assert_eq(result, expected) # with nulls, maintaining existing null mask - arr = arr.astype("float64") # for pandas nulls - mask = np.random.randint(0, 2, arr.shape[0]) - arr[mask == 1] = np.nan + for c in pdf.columns: + arr = pdf[c].to_numpy().astype("float64") # for pandas nulls + arr.ravel()[np.random.choice(10, 5, replace=False)] = np.nan + pdf[c] = gdf[c] = arr - pser = pd.Series(arr) - ser = gd.Series(arr) - result = ser.round(decimal) - expected = pser.round(decimal) + result = gdf.round(decimals) + expected = pdf.round(pdecimals) assert_eq(result, expected) - np.array_equal(ser.nullmask.to_array(), result.to_array()) - - -@pytest.mark.parametrize( - "series", - [ - gd.Series([1.0, None, np.nan, 4.0], nan_as_null=False), - gd.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False), - gd.Series([1.24430, np.nan, 4.423530], nan_as_null=False), - gd.Series([-1.24430, np.nan, -4.423530], nan_as_null=False), - gd.Series(np.repeat(np.nan, 100)), - ], -) -@pytest.mark.parametrize("decimal", [0, 1, 2, 3]) -def test_round_nan_as_null_false(series, decimal): - pser = series.to_pandas() - ser = gd.Series(series) - result = ser.round(decimal) - expected = pser.round(decimal) - np.testing.assert_array_almost_equal( - result.to_pandas(), expected, decimal=10 - ) + for c in gdf.columns: + np.array_equal(gdf[c].nullmask.to_array(), result[c].to_array()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py 
b/python/cudf/cudf/tests/test_series.py index 4c6589789bf..980dcb5a13b 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import operator import re from string import ascii_letters, digits @@ -655,6 +655,60 @@ def test_series_mode(df, dropna): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize( + "arr", + [ + np.random.normal(-100, 100, 1000), + np.random.randint(-50, 50, 1000), + np.zeros(100), + np.repeat([-0.6459412758761901], 100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]), + np.arange(-100.5, 101.5, 1), + ], +) +@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12]) +def test_series_round(arr, decimals): + pser = pd.Series(arr) + ser = cudf.Series(arr) + result = ser.round(decimals) + expected = pser.round(decimals) + + assert_eq(result, expected) + + # with nulls, maintaining existing null mask + arr = arr.astype("float64") # for pandas nulls + arr.ravel()[ + np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) + ] = np.nan + + pser = pd.Series(arr) + ser = cudf.Series(arr) + result = ser.round(decimals) + expected = pser.round(decimals) + + assert_eq(result, expected) + np.array_equal(ser.nullmask.to_array(), result.to_array()) + + +@pytest.mark.parametrize( + "series", + [ + cudf.Series([1.0, None, np.nan, 4.0], nan_as_null=False), + cudf.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False), + cudf.Series([1.24430, np.nan, 4.423530], nan_as_null=False), + cudf.Series([-1.24430, np.nan, -4.423530], nan_as_null=False), + cudf.Series(np.repeat(np.nan, 100)), + ], +) +@pytest.mark.parametrize("decimal", [0, 1, 2, 3]) +def test_round_nan_as_null_false(series, decimal): + pser = series.to_pandas() + result = series.round(decimal) + expected = pser.round(decimal) + assert_eq(result, expected, atol=1e-10) + + @pytest.mark.parametrize("ps", 
_series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_isnull_isna(ps, nan_as_null): diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 6bdd251238d..fbf6d008284 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from functools import lru_cache import cupy @@ -6,7 +6,7 @@ from numba import cuda import cudf -from cudf.utils.utils import check_equals_float, check_equals_int, rint +from cudf.utils.utils import check_equals_float, check_equals_int try: # Numba >= 0.49 @@ -69,25 +69,6 @@ def gpu_diff(in_col, out_col, out_mask, N): out_mask[i] = False -@cuda.jit -def gpu_round(in_col, out_col, decimal): - i = cuda.grid(1) - f = 10 ** decimal - - if i < in_col.size: - ret = in_col[i] * f - ret = rint(ret) - tmp = ret / f - out_col[i] = tmp - - -def apply_round(data, decimal): - output_dary = cuda.device_array_like(data) - if output_dary.size > 0: - gpu_round.forall(output_dary.size)(data, output_dary, decimal) - return output_dary - - # Find segments