From 95059b84b9bdbfc819741b7806c63e8a26cf0ae2 Mon Sep 17 00:00:00 2001
From: Vibhu Jawa <vibhujawa@gmail.com>
Date: Thu, 21 Jan 2021 02:36:04 +0530
Subject: [PATCH 1/2] Add encoding and compression argument to CSV writer
 (#7168)

This PR closes https://github.com/rapidsai/cudf/issues/7083 by adding an `encoding` argument to our CSV writer. It also adds a `compression` argument to the writer.

This will help address some compatibility issues with Featuretools (see this [PR](https://github.com/alteryx/featuretools/pull/1246)).

Authors:
  - Vibhu Jawa (@VibhuJawa)

Approvers:
  - GALI PREM SAGAR (@galipremsagar)
  - Michael Wang (@isVoid)

URL: https://github.com/rapidsai/cudf/pull/7168
---
 python/cudf/cudf/core/dataframe.py |  4 ++++
 python/cudf/cudf/io/csv.py         | 13 +++++++++++++
 python/cudf/cudf/tests/test_csv.py | 21 +++++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py  |  7 ++++++-
 4 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 606d96600a1..906a71623e0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6844,6 +6844,8 @@ def to_csv(
         index=True,
         line_terminator="\n",
         chunksize=None,
+        encoding=None,
+        compression=None,
         **kwargs,
     ):
         """{docstring}"""
@@ -6859,6 +6861,8 @@ def to_csv(
             index=index,
             line_terminator=line_terminator,
             chunksize=chunksize,
+            encoding=encoding,
+            compression=compression,
             **kwargs,
         )
 
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 0270198a7e0..e2c7ca7dca1 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -115,6 +115,8 @@ def to_csv(
     index=True,
     line_terminator="\n",
     chunksize=None,
+    encoding=None,
+    compression=None,
     **kwargs,
 ):
     """{docstring}"""
@@ -124,6 +126,17 @@ def to_csv(
     elif len(sep) > 1:
         raise TypeError('"sep" must be a 1-character string')
 
+    if encoding and encoding != "utf-8":
+        error_msg = (
+            f"Encoding {encoding} is not supported. "
+            + "Currently, only utf-8 encoding is supported."
+        )
+        raise NotImplementedError(error_msg)
+
+    if compression:
+        error_msg = "Writing compressed csv is not currently supported in cudf"
+        raise NotImplementedError(error_msg)
+
     return_as_string = False
     if path_or_buf is None:
         path_or_buf = StringIO()
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0ea8b3add4b..23a950bb72d 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1955,3 +1955,24 @@ def test_csv_sep_error():
         rfunc_args_and_kwargs=([], {"sep": 1}),
         expected_error_message='"sep" must be string, not int',
     )
+
+
+def test_to_csv_encoding_error():
+    # TODO: Remove this test once the following
+    # issue is fixed: https://github.com/rapidsai/cudf/issues/2957
+    df = cudf.DataFrame({"a": ["你好", "test"]})
+    encoding = "utf-8-sig"
+    error_message = (
+        f"Encoding {encoding} is not supported. "
+        + "Currently, only utf-8 encoding is supported."
+    )
+    with pytest.raises(NotImplementedError, match=re.escape(error_message)):
+        df.to_csv("test.csv", encoding=encoding)
+
+
+def test_to_csv_compression_error():
+    df = cudf.DataFrame({"a": ["test"]})
+    compression = "snappy"
+    error_message = "Writing compressed csv is not currently supported in cudf"
+    with pytest.raises(NotImplementedError, match=re.escape(error_message)):
+        df.to_csv("test.csv", compression=compression)
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index e71c90b1ec9..5d52d6c7da4 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -931,7 +931,12 @@
 line_terminator : char, default '\\n'
 chunksize : int or None, default None
     Rows to write at a time
-
+encoding : str, default 'utf-8'
+    Encoding to use in the output file. Only 'utf-8' is currently supported.
+compression : str, default None
+    Compression scheme to use in the output file. Writing compressed csv
+    is not currently supported.
+
 Returns
 -------
 None or str

From a51caa595bb84bc73e181c4fb3d4d064577f43cb Mon Sep 17 00:00:00 2001
From: ChrisJar <chris.jarrett.0@gmail.com>
Date: Wed, 20 Jan 2021 15:32:57 -0600
Subject: [PATCH 2/2] Enable round in cudf for DataFrame and Series (#7022)

This enables round for DataFrames and Series using the libcudf round implementation and removes the old numba round implementation.

Closes #1270

Authors:
  - @ChrisJar

Approvers:
  - Ashwin Srinath (@shwina)
  - Michael Wang (@isVoid)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7022
---
 python/cudf/cudf/_lib/__init__.py         |   3 +-
 python/cudf/cudf/_lib/cpp/round.pxd       |  19 ++++
 python/cudf/cudf/_lib/round.pyx           |  42 ++++++++
 python/cudf/cudf/core/column/numerical.py |  16 +---
 python/cudf/cudf/core/frame.py            | 111 ++++++++++++++++++++++
 python/cudf/cudf/core/series.py           |  27 +++++-
 python/cudf/cudf/tests/test_dataframe.py  | 111 +++++++++-------------
 python/cudf/cudf/tests/test_series.py     |  56 ++++++++++-
 python/cudf/cudf/utils/cudautils.py       |  23 +----
 9 files changed, 303 insertions(+), 105 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/round.pxd
 create mode 100644 python/cudf/cudf/_lib/round.pyx

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index e52ff707319..be2d4ef5f51 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 import numpy as np
 
 from . import (
@@ -23,6 +23,7 @@
     replace,
     reshape,
     rolling,
+    round,
     search,
     sort,
     stream_compaction,
diff --git a/python/cudf/cudf/_lib/cpp/round.pxd b/python/cudf/cudf/_lib/cpp/round.pxd
new file mode 100644
index 00000000000..78f18dcacce
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/round.pxd
@@ -0,0 +1,19 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libc.stdint cimport int32_t
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+
+cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
+
+    ctypedef enum rounding_method "cudf::rounding_method":
+        HALF_UP "cudf::rounding_method::HALF_UP"
+        HALF_EVEN "cudf::rounding_method::HALF_EVEN"
+
+    cdef unique_ptr[column] round (
+        const column_view& input,
+        int32_t decimal_places,
+        rounding_method method,
+    ) except +
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
new file mode 100644
index 00000000000..660d6d91670
--- /dev/null
+++ b/python/cudf/cudf/_lib/round.pyx
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.round cimport (
+    rounding_method as cpp_rounding_method,
+    round as cpp_round
+)
+
+
+def round(Column input_col, int decimal_places=0):
+    """
+    Round column values to the given number of decimal places
+
+    Parameters
+    ----------
+    input_col : Column whose values will be rounded
+    decimal_places : The number of decimal places to round to
+
+    Returns
+    -------
+    A Column with values rounded to the given number of decimal places
+    """
+
+    cdef column_view input_col_view = input_col.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_round(
+                input_col_view,
+                decimal_places,
+                cpp_rounding_method.HALF_EVEN,
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b8a25d83ea8..54a6d274843 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
 from numbers import Number
 
@@ -342,17 +342,9 @@ def corr(self, other):
         return cov / lhs_std / rhs_std
 
     def round(self, decimals=0):
-        if decimals < 0:
-            msg = "Decimal values < 0 are not yet supported."
-            raise NotImplementedError(msg)
-
-        if np.issubdtype(self.dtype, np.integer):
-            return self
-
-        data = Buffer(
-            cudautils.apply_round(self.data_array_view, decimals).view("|u1")
-        )
-        return column.build_column(data=data, dtype=self.dtype, mask=self.mask)
+        """Round the values in the Column to the given number of decimals.
+        """
+        return libcudf.round.round(self, decimal_places=decimals)
 
     def applymap(self, udf, out_dtype=None):
         """Apply an element-wise function to transform the values in the Column.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 87081cbecdb..ad4069dfb68 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1751,6 +1751,117 @@ def __arrow_array__(self, type=None):
             "consider using .to_arrow()"
         )
 
+    def round(self, decimals=0):
+        """
+        Round a DataFrame to a variable number of decimal places.
+
+        Parameters
+        ----------
+        decimals : int, dict, Series
+            Number of decimal places to round each column to. If an int is
+            given, round each column to the same number of places.
+            Otherwise dict and Series round to variable numbers of places.
+            Column names should be in the keys if `decimals` is a
+            dict-like, or in the index if `decimals` is a Series. Any
+            columns not included in `decimals` will be left as is. Elements
+            of `decimals` which are not columns of the input will be
+            ignored.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with the affected columns rounded to the specified
+            number of decimal places.
+
+        Examples
+        --------
+        >>> df = cudf.DataFrame(
+        ...     [(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
+        ...     columns=['dogs', 'cats']
+        ... )
+        >>> df
+            dogs  cats
+        0  0.21  0.32
+        1  0.01  0.67
+        2  0.66  0.03
+        3  0.21  0.18
+
+        By providing an integer each column is rounded to the same number
+        of decimal places
+
+        >>> df.round(1)
+            dogs  cats
+        0   0.2   0.3
+        1   0.0   0.7
+        2   0.7   0.0
+        3   0.2   0.2
+
+        With a dict, the number of places for specific columns can be
+        specified with the column names as key and the number of decimal
+        places as value
+
+        >>> df.round({'dogs': 1, 'cats': 0})
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+
+        Using a Series, the number of places for specific columns can be
+        specified with the column names as index and the number of
+        decimal places as value
+
+        >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs'])
+        >>> df.round(decimals)
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+        """
+
+        if isinstance(decimals, cudf.Series):
+            decimals = decimals.to_pandas()
+
+        if isinstance(decimals, (dict, pd.Series)):
+            if (
+                isinstance(decimals, pd.Series)
+                and not decimals.index.is_unique
+            ):
+                raise ValueError("Index of decimals must be unique")
+
+            cols = {
+                name: col.round(decimals[name])
+                if (
+                    name in decimals.keys()
+                    and pd.api.types.is_numeric_dtype(col.dtype)
+                )
+                else col.copy(deep=True)
+                for name, col in self._data.items()
+            }
+        elif isinstance(decimals, int):
+            cols = {
+                name: col.round(decimals)
+                if pd.api.types.is_numeric_dtype(col.dtype)
+                else col.copy(deep=True)
+                for name, col in self._data.items()
+            }
+        else:
+            raise TypeError(
+                "decimals must be an integer, a dict-like or a Series"
+            )
+
+        return self.__class__._from_table(
+            Frame(
+                data=cudf.core.column_accessor.ColumnAccessor(
+                    cols,
+                    multiindex=self._data.multiindex,
+                    level_names=self._data.level_names,
+                )
+            ),
+            index=self._index,
+        )
+
     @annotate("SAMPLE", color="orange", domain="cudf_python")
     def sample(
         self,
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 2b9078abed6..4da0749bddb 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 import pickle
 import warnings
 from collections import abc as abc
@@ -3506,8 +3506,31 @@ def mode(self, dropna=True):
         return Series(val_counts.index.sort_values(), name=self.name)
 
     def round(self, decimals=0):
-        """Round a Series to a configurable number of decimal places.
         """
+        Round each value in a Series to the given number of decimals.
+
+        Parameters
+        ----------
+        decimals : int, default 0
+            Number of decimal places to round to. If decimals is negative,
+            it specifies the number of positions to the left of the decimal
+            point.
+
+        Returns
+        -------
+        Series
+            Rounded values of the Series.
+
+        Examples
+        --------
+        >>> s = cudf.Series([0.1, 1.4, 2.9])
+        >>> s.round()
+        0    0.0
+        1    1.0
+        2    3.0
+        dtype: float64
+        """
+
         return Series(
             self._column.round(decimals=decimals),
             name=self.name,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 66d25076d68..f66aeb8cf1e 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3243,86 +3243,61 @@ def test_ndim():
 
 
 @pytest.mark.parametrize(
-    "arr",
-    [
-        np.random.normal(-100, 100, 1000),
-        np.random.randint(-50, 50, 1000),
-        np.zeros(100),
-        np.repeat([-0.6459412758761901], 100),
-        np.repeat(np.nan, 100),
-        np.array([1.123, 2.343, np.nan, 0.0]),
-    ],
-)
-@pytest.mark.parametrize(
-    "decimal",
+    "decimals",
     [
+        -3,
         0,
-        1,
-        2,
-        3,
-        4,
         5,
-        6,
-        7,
-        8,
-        9,
-        10,
-        11,
-        12,
-        13,
-        14,
-        15,
-        16,
-        17,
-        pytest.param(
-            -1,
-            marks=[
-                pytest.mark.xfail(reason="NotImplementedError: decimals < 0")
-            ],
-        ),
+        pd.Series([1, 4, 3, -6], index=["w", "x", "y", "z"]),
+        gd.Series([-4, -2, 12], index=["x", "y", "z"]),
+        {"w": -1, "x": 15, "y": 2},
     ],
 )
-def test_round(arr, decimal):
-    pser = pd.Series(arr)
-    ser = gd.Series(arr)
-    result = ser.round(decimal)
-    expected = pser.round(decimal)
+def test_dataframe_round(decimals):
+    pdf = pd.DataFrame(
+        {
+            "w": np.arange(0.5, 10.5, 1),
+            "x": np.random.normal(-100, 100, 10),
+            "y": np.array(
+                [
+                    14.123,
+                    2.343,
+                    np.nan,
+                    0.0,
+                    -8.302,
+                    np.nan,
+                    94.313,
+                    -112.236,
+                    -8.029,
+                    np.nan,
+                ]
+            ),
+            "z": np.repeat([-0.6459412758761901], 10),
+        }
+    )
+    gdf = gd.DataFrame.from_pandas(pdf)
 
+    if isinstance(decimals, gd.Series):
+        pdecimals = decimals.to_pandas()
+    else:
+        pdecimals = decimals
+
+    result = gdf.round(decimals)
+    expected = pdf.round(pdecimals)
     assert_eq(result, expected)
 
     # with nulls, maintaining existing null mask
-    arr = arr.astype("float64")  # for pandas nulls
-    mask = np.random.randint(0, 2, arr.shape[0])
-    arr[mask == 1] = np.nan
+    for c in pdf.columns:
+        arr = pdf[c].to_numpy().astype("float64")  # for pandas nulls
+        arr.ravel()[np.random.choice(10, 5, replace=False)] = np.nan
+        pdf[c] = gdf[c] = arr
 
-    pser = pd.Series(arr)
-    ser = gd.Series(arr)
-    result = ser.round(decimal)
-    expected = pser.round(decimal)
+    result = gdf.round(decimals)
+    expected = pdf.round(pdecimals)
 
     assert_eq(result, expected)
-    np.array_equal(ser.nullmask.to_array(), result.to_array())
-
-
-@pytest.mark.parametrize(
-    "series",
-    [
-        gd.Series([1.0, None, np.nan, 4.0], nan_as_null=False),
-        gd.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False),
-        gd.Series([1.24430, np.nan, 4.423530], nan_as_null=False),
-        gd.Series([-1.24430, np.nan, -4.423530], nan_as_null=False),
-        gd.Series(np.repeat(np.nan, 100)),
-    ],
-)
-@pytest.mark.parametrize("decimal", [0, 1, 2, 3])
-def test_round_nan_as_null_false(series, decimal):
-    pser = series.to_pandas()
-    ser = gd.Series(series)
-    result = ser.round(decimal)
-    expected = pser.round(decimal)
-    np.testing.assert_array_almost_equal(
-        result.to_pandas(), expected, decimal=10
-    )
+    for c in gdf.columns:
+        np.array_equal(gdf[c].nullmask.to_array(), result[c].to_array())
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 4c6589789bf..980dcb5a13b 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 import operator
 import re
 from string import ascii_letters, digits
@@ -655,6 +655,60 @@ def test_series_mode(df, dropna):
     assert_eq(expected, actual, check_dtype=False)
 
 
+@pytest.mark.parametrize(
+    "arr",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat([-0.6459412758761901], 100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        np.arange(-100.5, 101.5, 1),
+    ],
+)
+@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12])
+def test_series_round(arr, decimals):
+    pser = pd.Series(arr)
+    ser = cudf.Series(arr)
+    result = ser.round(decimals)
+    expected = pser.round(decimals)
+
+    assert_eq(result, expected)
+
+    # with nulls, maintaining existing null mask
+    arr = arr.astype("float64")  # for pandas nulls
+    arr.ravel()[
+        np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False)
+    ] = np.nan
+
+    pser = pd.Series(arr)
+    ser = cudf.Series(arr)
+    result = ser.round(decimals)
+    expected = pser.round(decimals)
+
+    assert_eq(result, expected)
+    np.array_equal(ser.nullmask.to_array(), result.to_array())
+
+
+@pytest.mark.parametrize(
+    "series",
+    [
+        cudf.Series([1.0, None, np.nan, 4.0], nan_as_null=False),
+        cudf.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False),
+        cudf.Series([1.24430, np.nan, 4.423530], nan_as_null=False),
+        cudf.Series([-1.24430, np.nan, -4.423530], nan_as_null=False),
+        cudf.Series(np.repeat(np.nan, 100)),
+    ],
+)
+@pytest.mark.parametrize("decimal", [0, 1, 2, 3])
+def test_round_nan_as_null_false(series, decimal):
+    pser = series.to_pandas()
+    result = series.round(decimal)
+    expected = pser.round(decimal)
+    assert_eq(result, expected, atol=1e-10)
+
+
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index 6bdd251238d..fbf6d008284 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 from functools import lru_cache
 
 import cupy
@@ -6,7 +6,7 @@
 from numba import cuda
 
 import cudf
-from cudf.utils.utils import check_equals_float, check_equals_int, rint
+from cudf.utils.utils import check_equals_float, check_equals_int
 
 try:
     # Numba >= 0.49
@@ -69,25 +69,6 @@ def gpu_diff(in_col, out_col, out_mask, N):
             out_mask[i] = False
 
 
-@cuda.jit
-def gpu_round(in_col, out_col, decimal):
-    i = cuda.grid(1)
-    f = 10 ** decimal
-
-    if i < in_col.size:
-        ret = in_col[i] * f
-        ret = rint(ret)
-        tmp = ret / f
-        out_col[i] = tmp
-
-
-def apply_round(data, decimal):
-    output_dary = cuda.device_array_like(data)
-    if output_dary.size > 0:
-        gpu_round.forall(output_dary.size)(data, output_dary, decimal)
-    return output_dary
-
-
 # Find segments