Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add python bindings to fixed-size window and groupby rolling.var, rolling.std #9097

Merged
merged 59 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from 57 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
7cc6630
initial
isVoid Jul 21, 2021
6f64691
Compiles
isVoid Jul 21, 2021
2ab88fe
style
isVoid Jul 22, 2021
2f63568
clean up
isVoid Jul 23, 2021
16cb7a8
clean up
isVoid Jul 23, 2021
ddd59f0
header cleanup
isVoid Jul 23, 2021
bcd00f0
.
isVoid Jul 23, 2021
da8b755
More cleanup
isVoid Jul 23, 2021
3ff0d8d
revert ptx changes
isVoid Jul 23, 2021
fc00d8f
Static tests
isVoid Jul 23, 2021
1259df6
undo python changes
isVoid Jul 23, 2021
3539e1b
.
isVoid Jul 23, 2021
07aa54d
docs
isVoid Jul 23, 2021
1cc6fbe
Merge branch 'rolling_std' into rolling_agg_python
isVoid Jul 23, 2021
d250115
initial
isVoid Jul 25, 2021
d2a6407
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
isVoid Jul 25, 2021
885d66e
remove count==1 restriction
isVoid Jul 25, 2021
68ab4ae
add ddof tests
isVoid Jul 25, 2021
1e1c8cd
docfix
isVoid Jul 26, 2021
e66badd
Merge branch 'rolling_std' into rolling_agg_python
isVoid Jul 26, 2021
1288a56
fixed_point fix
isVoid Jul 26, 2021
3a9e589
docs
isVoid Jul 26, 2021
490539c
docfix
isVoid Jul 26, 2021
b73abab
Merge branch 'rolling_std' into rolling_agg_python
isVoid Jul 26, 2021
394f0f0
docfix
isVoid Jul 27, 2021
f01a222
Merge branch 'rolling_std' into rolling_agg_python
isVoid Jul 27, 2021
fe2b4f4
initial testings
isVoid Jul 27, 2021
bc0920b
remove thrust::reduce
isVoid Jul 27, 2021
934f104
.
isVoid Jul 27, 2021
b215199
Update cpp/src/rolling/rolling_detail.cuh
isVoid Jul 29, 2021
a3af3e9
Update cpp/src/rolling/rolling_detail.cuh
isVoid Jul 29, 2021
ae33de0
Update cpp/src/rolling/rolling_detail.cuh
isVoid Jul 29, 2021
645a172
address review comments
isVoid Jul 29, 2021
d94b8db
Apply suggestions from code review
isVoid Jul 29, 2021
0fe4a87
Add nan tests
isVoid Jul 30, 2021
9ce41ae
Remove auto generated column test
isVoid Jul 30, 2021
b62a40d
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
isVoid Aug 19, 2021
43df716
count==0 case maps to invalid output.
isVoid Aug 21, 2021
793783d
Merge branch 'rolling_std' into rolling_agg_python
isVoid Aug 23, 2021
652020f
Detail docs over pandas vs cudf difference on uniform windows
isVoid Aug 23, 2021
d62eb00
Apply review comments: div by zero result is valid element
isVoid Aug 25, 2021
0b78ab3
Update cpp/src/rolling/rolling_detail.cuh
isVoid Aug 25, 2021
e3f89df
ddof > count situation is valid but nan
isVoid Aug 30, 2021
a504017
Merge branch 'rolling_std' of github.com:isVoid/cudf into rolling_std
isVoid Aug 30, 2021
724db10
Merge branch 'rolling_std' into rolling_agg_python
isVoid Aug 31, 2021
d3cedb1
make operator constant
isVoid Aug 31, 2021
75e8140
header cleanup
isVoid Sep 1, 2021
0317ded
Add groupby var/std tests
isVoid Sep 1, 2021
d5359ba
Update cpp/tests/rolling/rolling_test.cpp
isVoid Sep 1, 2021
a14d77e
style
isVoid Sep 1, 2021
98c158d
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
isVoid Sep 8, 2021
8e4f2c2
Merge branch 'rolling_std' into rolling_agg_python
isVoid Sep 8, 2021
9d6f614
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
isVoid Sep 8, 2021
fc6fe6a
Use custom dataframe generator
isVoid Sep 8, 2021
aadc203
styles
isVoid Sep 8, 2021
b5a7f73
Fix seeds for rngs.
isVoid Sep 9, 2021
fce541e
Doc for dataframe gen
isVoid Sep 9, 2021
2ca83c6
Switch to `dataframe_generator`
isVoid Sep 10, 2021
0f6c359
style
isVoid Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/cudf/cudf/_lib/aggregation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,24 @@ cdef class RollingAggregation:
libcudf_aggregation.make_mean_aggregation[rolling_aggregation]())
return agg

@classmethod
def var(cls, ddof=1):
    """
    Build a rolling variance aggregation.

    Parameters
    ----------
    ddof : delta degrees of freedom, default 1. Passed unchanged to
        libcudf's ``make_variance_aggregation``.

    Returns
    -------
    A new instance of ``cls`` whose ``c_obj`` holds the libcudf
    rolling variance aggregation.
    """
    cdef RollingAggregation agg = cls()
    # The factory result is moved into c_obj.
    agg.c_obj = move(
        libcudf_aggregation.make_variance_aggregation[rolling_aggregation](
            ddof
        )
    )
    return agg

@classmethod
def std(cls, ddof=1):
    """
    Build a rolling standard-deviation aggregation.

    Parameters
    ----------
    ddof : delta degrees of freedom, default 1. Passed unchanged to
        libcudf's ``make_std_aggregation``.

    Returns
    -------
    A new instance of ``cls`` whose ``c_obj`` holds the libcudf
    rolling standard-deviation aggregation.
    """
    cdef RollingAggregation agg = cls()
    # The factory result is moved into c_obj.
    agg.c_obj = move(
        libcudf_aggregation.make_std_aggregation[rolling_aggregation](ddof)
    )
    return agg

@classmethod
def count(cls, dropna=True):
cdef libcudf_types.null_policy c_null_handling
Expand Down
13 changes: 10 additions & 3 deletions python/cudf/cudf/_lib/rolling.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@ from cudf._lib.cpp.rolling cimport rolling_window as cpp_rolling_window
from cudf._lib.cpp.types cimport size_type


def rolling(Column source_column, Column pre_column_window,
Column fwd_column_window, window, min_periods, center, op):
def rolling(Column source_column,
Column pre_column_window,
Column fwd_column_window,
window,
min_periods,
center,
op,
agg_params):
"""
Rolling on input executing operation within the given window for each row

Expand All @@ -33,6 +39,7 @@ def rolling(Column source_column, Column pre_column_window,
center : Set the labels at the center of the window
op : operation to be executed, as of now it supports MIN, MAX, COUNT, SUM,
MEAN, VAR, STD and UDF
agg_params : dict, parameters for the aggregation (e.g. ddof for VAR/STD)

Returns
-------
Expand All @@ -51,7 +58,7 @@ def rolling(Column source_column, Column pre_column_window,
cython_agg = make_rolling_aggregation(
op, {'dtype': source_column.dtype})
else:
cython_agg = make_rolling_aggregation(op)
cython_agg = make_rolling_aggregation(op, agg_params)

if window is None:
if center:
Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def __init__(
self.min_periods = min_periods
self.center = center
self._normalize()
self.agg_params = {}
if axis != 0:
raise NotImplementedError("axis != 0 is not supported yet.")
self.axis = axis
Expand Down Expand Up @@ -237,6 +238,7 @@ def _apply_agg_series(self, sr, agg_name):
min_periods=min_periods,
center=self.center,
op=agg_name,
agg_params=self.agg_params,
)
return sr._from_data({sr.name: result_col}, sr._index)

Expand Down Expand Up @@ -266,6 +268,14 @@ def max(self):
def mean(self):
return self._apply_agg("mean")

def var(self, ddof=1):
    """Compute the rolling variance.

    Parameters
    ----------
    ddof : int, default 1
        Delta degrees of freedom, handed to the aggregation through
        ``self.agg_params``.

    Returns
    -------
    Whatever ``self._apply_agg("var")`` returns.
    """
    # ``agg_params`` is shared by every aggregation on this object, so
    # ``ddof`` is only set for the duration of this call. Leaving it behind
    # would leak it into later aggregations (e.g. ``mean``) that take no
    # such parameter.
    self.agg_params["ddof"] = ddof
    try:
        return self._apply_agg("var")
    finally:
        self.agg_params.pop("ddof", None)

def std(self, ddof=1):
    """Compute the rolling standard deviation.

    Parameters
    ----------
    ddof : int, default 1
        Delta degrees of freedom, handed to the aggregation through
        ``self.agg_params``.

    Returns
    -------
    Whatever ``self._apply_agg("std")`` returns.
    """
    # ``agg_params`` is shared by every aggregation on this object, so
    # ``ddof`` is only set for the duration of this call. Leaving it behind
    # would leak it into later aggregations (e.g. ``mean``) that take no
    # such parameter.
    self.agg_params["ddof"] = ddof
    try:
        return self._apply_agg("std")
    finally:
        self.agg_params.pop("ddof", None)

def count(self):
return self._apply_agg("count")

Expand Down
111 changes: 102 additions & 9 deletions python/cudf/cudf/tests/test_rolling.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

import math
from decimal import Decimal

import numpy as np
import pandas as pd
Expand All @@ -20,20 +21,23 @@
([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]),
],
)
@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"])
@pytest.mark.parametrize(
"agg", ["sum", "min", "max", "mean", "count", "std", "var"]
)
@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"])
@pytest.mark.parametrize("center", [True, False])
def test_rolling_series_basic(data, index, agg, nulls, center):
rng = np.random.default_rng(1)
if PANDAS_GE_110:
kwargs = {"check_freq": False}
else:
kwargs = {}
if len(data) > 0:
if nulls == "one":
p = np.random.randint(0, len(data))
p = rng.integers(0, len(data))
data[p] = np.nan
elif nulls == "some":
p1, p2 = np.random.randint(0, len(data), (2,))
p1, p2 = rng.integers(0, len(data), (2,))
data[p1] = np.nan
data[p2] = np.nan
elif nulls == "all":
Expand Down Expand Up @@ -64,19 +68,22 @@ def test_rolling_series_basic(data, index, agg, nulls, center):
},
],
)
@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"])
@pytest.mark.parametrize(
"agg", ["sum", "min", "max", "mean", "count", "std", "var"]
)
@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"])
@pytest.mark.parametrize("center", [True, False])
def test_rolling_dataframe_basic(data, agg, nulls, center):
rng = np.random.default_rng(0)
pdf = pd.DataFrame(data)

if len(pdf) > 0:
for col_name in pdf.columns:
if nulls == "one":
p = np.random.randint(0, len(data))
p = rng.integers(0, len(data))
pdf[col_name][p] = np.nan
elif nulls == "some":
p1, p2 = np.random.randint(0, len(data), (2,))
p1, p2 = rng.integers(0, len(data), (2,))
pdf[col_name][p1] = np.nan
pdf[col_name][p2] = np.nan
elif nulls == "all":
Expand All @@ -102,6 +109,8 @@ def test_rolling_dataframe_basic(data, agg, nulls, center):
pytest.param("max"),
pytest.param("mean"),
pytest.param("count"),
pytest.param("std"),
pytest.param("var"),
],
)
def test_rolling_with_offset(agg):
Expand All @@ -124,6 +133,84 @@ def test_rolling_with_offset(agg):
)


def generate_large_dataframe_for_var(size, window_size, nulls_prob, seed):
    """Generate a random pandas DataFrame for checking rolling var/std.

    The value range is clamped to ``sqrt(type_max / window_size)`` to avoid
    overflows. Three columns are produced: ``"int"`` (``np.int64``,
    non-nullable), ``"float"`` (``np.float64``, NaN as the missing value)
    and ``"decimal"`` (``Decimal`` objects, ``None`` as the missing value).

    Parameters
    ----------
    size : int
        Number of rows to generate.
    window_size : int
        Rolling window size the frame will be used with; only used to
        derive the clamping bounds.
    nulls_prob : float
        Probability that a float/decimal entry is missing.
    seed : int
        Seed for ``np.random.default_rng`` so the data is reproducible.

    Returns
    -------
    pandas.DataFrame
    """
    rng = np.random.default_rng(seed)

    # ``Generator.integers`` is documented to take integer bounds, so the
    # square-root results are truncated to int explicitly.
    iupper_bound = int(math.sqrt(np.iinfo(np.int64).max / window_size))
    ilower_bound = int(
        -math.sqrt(abs(np.iinfo(np.int64).min) / window_size)
    )

    fupper_bound = math.sqrt(np.finfo(np.float64).max / window_size)
    flower_bound = -math.sqrt(abs(np.finfo(np.float64).min) / window_size)

    # Nullable integer type rolling agg is unsupported in pandas
    intcol = rng.integers(ilower_bound, iupper_bound, size)
    floatcol = [
        rng.uniform(flower_bound, fupper_bound)
        if rng.uniform(0, 1) > nulls_prob
        else np.nan
        for _ in range(size)
    ]
    deccol = [
        Decimal(int(rng.integers(ilower_bound, iupper_bound)))
        if rng.uniform(0, 1) > nulls_prob
        else None
        for _ in range(size)
    ]

    pdf = pd.DataFrame()
    pdf["int"] = pd.Series(intcol).astype("int64")
    pdf["float"] = pd.Series(floatcol).astype("float64")
    pdf["decimal"] = pd.Series(deccol)

    return pdf


@pytest.mark.parametrize("agg", ["std", "var"])
@pytest.mark.parametrize("ddof", [0, 1])
@pytest.mark.parametrize("center", [True, False])
@pytest.mark.parametrize("seed", [100, 1000, 10000])
@pytest.mark.parametrize("window_size", [2, 10, 100, 1000])
def test_rolling_var_std_large(agg, ddof, center, seed, window_size):
    """Compare cudf rolling std/var with pandas on a 1000-row random frame
    containing missing values, for several window sizes, seeds and ddof.
    """
    # ``check_freq`` is only passed when PANDAS_GE_110 is set.
    compare_kwargs = {"check_freq": False} if PANDAS_GE_110 else {}

    pandas_frame = generate_large_dataframe_for_var(
        1_000, window_size, nulls_prob=0.4, seed=seed
    )
    cudf_frame = cudf.from_pandas(pandas_frame)

    pandas_agg = getattr(pandas_frame.rolling(window_size, 1, center), agg)
    expect = pandas_agg(ddof=ddof)
    cudf_agg = getattr(cudf_frame.rolling(window_size, 1, center), agg)
    got = cudf_agg(ddof=ddof)

    assert_eq(expect, got, **compare_kwargs)


@pytest.mark.xfail(
    reason="pandas' online variance algorithm leaves a floating point "
    "artifact (pandas-dev/pandas#37051); cudf computes each window "
    "independently"
)
def test_rolling_var_uniform_window():
    """
    Pandas adopts an online variance calculation algorithm. This gives a
    floating point artifact.
    https://github.com/pandas-dev/pandas/issues/37051

    In cudf, each window is computed independently from the previous
    window, which gives better numeric precision. The two results are
    therefore expected to differ, hence the xfail marker.
    """

    s = pd.Series([1e8, 5, 5, 5])
    expected = s.rolling(3).var()
    got = cudf.from_pandas(s).rolling(3).var()

    assert_eq(expected, got)


def test_rolling_count_with_offset():
"""
This test covers the xfail case from test_rolling_with_offset["count"].
Expand Down Expand Up @@ -300,7 +387,9 @@ def some_func(A):
)


@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"])
@pytest.mark.parametrize(
"agg", ["sum", "min", "max", "mean", "count", "var", "std"]
)
def test_rolling_groupby_simple(agg):
pdf = pd.DataFrame(
{
Expand Down Expand Up @@ -330,7 +419,9 @@ def test_rolling_groupby_simple(agg):
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"])
@pytest.mark.parametrize(
"agg", ["sum", "min", "max", "mean", "count", "var", "std"]
)
def test_rolling_groupby_multi(agg):
pdf = pd.DataFrame(
{
Expand All @@ -351,7 +442,9 @@ def test_rolling_groupby_multi(agg):
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"])
@pytest.mark.parametrize(
"agg", ["sum", "min", "max", "mean", "count", "var", "std"]
)
@pytest.mark.parametrize(
"window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"]
)
Expand Down