Merge branch 'branch-0.18' into fea-lists-count-elements

rapidsai · Jan 20, 2021 · 0a04bb0 · 0a04bb0
2 parents b0f97a5 + a51caa5
commit 0a04bb0
Show file tree

Hide file tree

Showing 13 changed files with 347 additions and 106 deletions.
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 import numpy as np
 
 from . import (
@@ -23,6 +23,7 @@
     replace,
     reshape,
     rolling,
+    round,
     search,
     sort,
     stream_compaction,

diff --git a/python/cudf/cudf/_lib/cpp/round.pxd b/python/cudf/cudf/_lib/cpp/round.pxd
@@ -0,0 +1,19 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libc.stdint cimport int32_t
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+
+cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
+
+    ctypedef enum rounding_method "cudf::rounding_method":
+        HALF_UP "cudf::rounding_method::HALF_UP"
+        HALF_EVEN "cudf::rounding_method::HALF_EVEN"
+
+    cdef unique_ptr[column] round (
+        const column_view& input,
+        int32_t decimal_places,
+        rounding_method method,
+    ) except +
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
@@ -0,0 +1,42 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.round cimport (
+    rounding_method as cpp_rounding_method,
+    round as cpp_round
+)
+
+
+def round(Column input_col, int decimal_places=0):
+    """
+    Round column values to the given number of decimal places
+
+    Parameters
+    ----------
+    input_col : Column whose values will be rounded
+    decimal_places : The number or decimal places to round to
+
+    Returns
+    -------
+    A Column with values rounded to the given number of decimal places
+    """
+
+    cdef column_view input_col_view = input_col.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_round(
+                input_col_view,
+                decimal_places,
+                cpp_rounding_method.HALF_EVEN,
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
 from numbers import Number
 
@@ -342,17 +342,9 @@ def corr(self, other):
         return cov / lhs_std / rhs_std
 
     def round(self, decimals=0):
-        if decimals < 0:
-            msg = "Decimal values < 0 are not yet supported."
-            raise NotImplementedError(msg)
-
-        if np.issubdtype(self.dtype, np.integer):
-            return self
-
-        data = Buffer(
-            cudautils.apply_round(self.data_array_view, decimals).view("|u1")
-        )
-        return column.build_column(data=data, dtype=self.dtype, mask=self.mask)
+        """Round the values in the Column to the given number of decimals.
+        """
+        return libcudf.round.round(self, decimal_places=decimals)
 
     def applymap(self, udf, out_dtype=None):
         """Apply an element-wise function to transform the values in the Column.

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -6844,6 +6844,8 @@ def to_csv(
         index=True,
         line_terminator="\n",
         chunksize=None,
+        encoding=None,
+        compression=None,
         **kwargs,
     ):
         """{docstring}"""
@@ -6859,6 +6861,8 @@ def to_csv(
             index=index,
             line_terminator=line_terminator,
             chunksize=chunksize,
+            encoding=encoding,
+            compression=compression,
             **kwargs,
         )
 

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -1751,6 +1751,117 @@ def __arrow_array__(self, type=None):
             "consider using .to_arrow()"
         )
 
+    def round(self, decimals=0):
+        """
+        Round a DataFrame to a variable number of decimal places.
+
+        Parameters
+        ----------
+        decimals : int, dict, Series
+            Number of decimal places to round each column to. If an int is
+            given, round each column to the same number of places.
+            Otherwise dict and Series round to variable numbers of places.
+            Column names should be in the keys if `decimals` is a
+            dict-like, or in the index if `decimals` is a Series. Any
+            columns not included in `decimals` will be left as is. Elements
+            of `decimals` which are not columns of the input will be
+            ignored.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with the affected columns rounded to the specified
+            number of decimal places.
+
+        Examples
+        --------
+        >>> df = cudf.DataFrame(
+                [(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
+        ...     columns=['dogs', 'cats']
+        ... )
+        >>> df
+            dogs  cats
+        0  0.21  0.32
+        1  0.01  0.67
+        2  0.66  0.03
+        3  0.21  0.18
+
+        By providing an integer each column is rounded to the same number
+        of decimal places
+
+        >>> df.round(1)
+            dogs  cats
+        0   0.2   0.3
+        1   0.0   0.7
+        2   0.7   0.0
+        3   0.2   0.2
+
+        With a dict, the number of places for specific columns can be
+        specified with the column names as key and the number of decimal
+        places as value
+
+        >>> df.round({'dogs': 1, 'cats': 0})
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+
+        Using a Series, the number of places for specific columns can be
+        specified with the column names as index and the number of
+        decimal places as value
+
+        >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs'])
+        >>> df.round(decimals)
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+        """
+
+        if isinstance(decimals, cudf.Series):
+            decimals = decimals.to_pandas()
+
+        if isinstance(decimals, (dict, pd.Series)):
+            if (
+                isinstance(decimals, pd.Series)
+                and not decimals.index.is_unique
+            ):
+                raise ValueError("Index of decimals must be unique")
+
+            cols = {
+                name: col.round(decimals[name])
+                if (
+                    name in decimals.keys()
+                    and pd.api.types.is_numeric_dtype(col.dtype)
+                )
+                else col.copy(deep=True)
+                for name, col in self._data.items()
+            }
+        elif isinstance(decimals, int):
+            cols = {
+                name: col.round(decimals)
+                if pd.api.types.is_numeric_dtype(col.dtype)
+                else col.copy(deep=True)
+                for name, col in self._data.items()
+            }
+        else:
+            raise TypeError(
+                "decimals must be an integer, a dict-like or a Series"
+            )
+
+        return self.__class__._from_table(
+            Frame(
+                data=cudf.core.column_accessor.ColumnAccessor(
+                    cols,
+                    multiindex=self._data.multiindex,
+                    level_names=self._data.level_names,
+                )
+            ),
+            index=self._index,
+        )
+
     @annotate("SAMPLE", color="orange", domain="cudf_python")
     def sample(
         self,

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 import pickle
 import warnings
 from collections import abc as abc
@@ -3506,8 +3506,31 @@ def mode(self, dropna=True):
         return Series(val_counts.index.sort_values(), name=self.name)
 
     def round(self, decimals=0):
-        """Round a Series to a configurable number of decimal places.
         """
+        Round each value in a Series to the given number of decimals.
+
+        Parameters
+        ----------
+        decimals : int, default 0
+            Number of decimal places to round to. If decimals is negative,
+            it specifies the number of positions to the left of the decimal
+            point.
+
+        Returns
+        -------
+        Series
+            Rounded values of the Series.
+
+        Examples
+        --------
+        >>> s = cudf.Series([0.1, 1.4, 2.9])
+        >>> s.round()
+        0    0.0
+        1    1.0
+        2    3.0
+        dtype: float64
+        """
+
         return Series(
             self._column.round(decimals=decimals),
             name=self.name,

diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
@@ -115,6 +115,8 @@ def to_csv(
     index=True,
     line_terminator="\n",
     chunksize=None,
+    encoding=None,
+    compression=None,
     **kwargs,
 ):
     """{docstring}"""
@@ -124,6 +126,17 @@ def to_csv(
     elif len(sep) > 1:
         raise TypeError('"sep" must be a 1-character string')
 
+    if encoding and encoding != "utf-8":
+        error_msg = (
+            f"Encoding {encoding} is not supported. "
+            + "Currently, only utf-8 encoding is supported."
+        )
+        raise NotImplementedError(error_msg)
+
+    if compression:
+        error_msg = "Writing compressed csv is not currently supported in cudf"
+        raise NotImplementedError(error_msg)
+
     return_as_string = False
     if path_or_buf is None:
         path_or_buf = StringIO()

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -1955,3 +1955,24 @@ def test_csv_sep_error():
         rfunc_args_and_kwargs=([], {"sep": 1}),
         expected_error_message='"sep" must be string, not int',
     )
+
+
+def test_to_csv_encoding_error():
+    # TODO: Remove this test once following
+    # issue is fixed: https://github.com/rapidsai/cudf/issues/2957
+    df = cudf.DataFrame({"a": ["你好", "test"]})
+    encoding = "utf-8-sig"
+    error_message = (
+        f"Encoding {encoding} is not supported. "
+        + "Currently, only utf-8 encoding is supported."
+    )
+    with pytest.raises(NotImplementedError, match=re.escape(error_message)):
+        df.to_csv("test.csv", encoding=encoding)
+
+
+def test_to_csv_compression_error():
+    df = cudf.DataFrame({"a": ["test"]})
+    compression = "snappy"
+    error_message = "Writing compressed csv is not currently supported in cudf"
+    with pytest.raises(NotImplementedError, match=re.escape(error_message)):
+        df.to_csv("test.csv", compression=compression)