Skip to content

Commit

Permalink
Merge pull request #2905 from karthikeyann/fea-median_with_null
Browse files Browse the repository at this point in the history
[REVIEW] Series median() with null support
  • Loading branch information
karthikeyann authored Nov 19, 2019
2 parents 3000d59 + beb9539 commit 6c8ce65
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 23 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# cuDF 0.11.0 (Date TBD)

## New Features

- PR #2905 Added `Series.median()` and null support for `Series.quantile()`
- PR #2930 JSON Reader: Support ARROW_RANDOM_FILE input
- PR #2956 Add `cudf::stack` and `cudf::tile`
- PR #2980 Added nvtext is_vowel/is_consonant functions
Expand Down
25 changes: 16 additions & 9 deletions cpp/src/quantiles/legacy/quantiles.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/legacy/type_dispatcher.hpp>
#include <cudf/utilities/legacy/wrapper_types.hpp>
#include <bitmask/legacy/legacy_bitmask.hpp>
#include <rmm/thrust_rmm_allocator.h>

#include <thrust/device_vector.h>
Expand Down Expand Up @@ -83,21 +84,29 @@ namespace{ // anonymous

if( ctxt->flag_sort_inplace && ctxt->flag_sorted )
{
return select_quantile(col_data,
n,
auto data_begin = ctxt->flag_null_sort_behavior == GDF_NULL_AS_LARGEST
? col_data
: col_data + col_in->null_count;
return select_quantile(data_begin,
n-col_in->null_count,
quant,
interpolation,
*ptr_res,
ctxt->flag_sorted,
stream);
}else{
// create a clone of col_data if sort is required but sort_inplace is not allowed.
rmm::device_vector<ColType> dv(n);
thrust::copy_n(rmm::exec_policy(stream)->on(stream), col_data, n, dv.begin());
rmm::device_vector<ColType> dv(n-col_in->null_count);
thrust::copy_if(rmm::exec_policy(stream)->on(stream), col_data,
col_data + n,
thrust::counting_iterator<cudf::size_type>(0), dv.begin(),
[bitmask = col_in->valid] __device__(auto i) {
return gdf_is_valid(bitmask, i);
});
ColType* clone_data = dv.data().get();

return select_quantile(clone_data,
n,
dv.size(),
quant,
interpolation,
*ptr_res,
Expand Down Expand Up @@ -172,14 +181,13 @@ gdf_error quantile_exact( gdf_column* col_in, // input column
{
GDF_REQUIRE(nullptr != col_in, GDF_DATASET_EMPTY);

if (col_in->size == 0) {
if (col_in->size == col_in->null_count) {
result->is_valid = false;
return GDF_SUCCESS;
}

GDF_REQUIRE(nullptr != col_in->data, GDF_DATASET_EMPTY);
GDF_REQUIRE(0 < col_in->size, GDF_DATASET_EMPTY);
GDF_REQUIRE(nullptr == col_in->valid || 0 == col_in->null_count, GDF_VALIDITY_UNSUPPORTED);

gdf_error ret = GDF_SUCCESS;
result->dtype = GDF_FLOAT64;
Expand All @@ -200,14 +208,13 @@ gdf_error quantile_approx( gdf_column* col_in, // input column
{
GDF_REQUIRE(nullptr != col_in, GDF_DATASET_EMPTY);

if (col_in->size == 0) {
if (col_in->size == col_in->null_count) {
result->is_valid = false;
return GDF_SUCCESS;
}

GDF_REQUIRE(nullptr != col_in->data, GDF_DATASET_EMPTY);
GDF_REQUIRE(0 < col_in->size, GDF_DATASET_EMPTY);
GDF_REQUIRE(nullptr == col_in->valid || 0 == col_in->null_count, GDF_VALIDITY_UNSUPPORTED);

gdf_error ret = GDF_SUCCESS;
result->dtype = col_in->dtype;
Expand Down
65 changes: 52 additions & 13 deletions cpp/tests/quantiles/legacy/quantiles_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
template<typename VType>
void f_quantile_tester(
gdf_column* col_in, ///< input column
std::vector<VType>& v_appox, ///< expected result for quantile_approx
bool is_sorted, ///< input column sorted?
std::vector<VType>& v_approx, ///< expected result for quantile_approx
std::vector<std::vector<double>>& v_exact, ///< expected result for quantile_exact
const gdf_error expected_error = GDF_SUCCESS) ///< expected returned state for quantiles
{
Expand All @@ -59,7 +60,7 @@ void f_quantile_tester(
size_t n_qs = qvals.size();

assert( n_methods == methods.size() );
gdf_context ctxt{0, static_cast<gdf_method>(0), 0, 1};
gdf_context ctxt{is_sorted, static_cast<gdf_method>(0), 0, 1};

for(size_t j = 0; j<n_qs; ++j)
{
Expand All @@ -68,10 +69,10 @@ void f_quantile_tester(
EXPECT_EQ( ret, expected_error) << "approx " << " returns unexpected failure\n";

if( ret == GDF_SUCCESS ){
double delta = std::abs(static_cast<double>(result_approx.value() - v_appox[j]));
double delta = std::abs(static_cast<double>(result_approx.value() - v_approx[j]));
bool flag = delta < 1.0e-8;
EXPECT_EQ( flag, true ) << " " << q << " appox quantile "
<< " val = " << result_approx.value() << ", " << v_appox[j];
EXPECT_EQ( flag, true ) << " " << q << " approx quantile "
<< " val = " << result_approx.value() << ", " << v_approx[j];
}

for(size_t i = 0;i<n_methods;++i)
Expand Down Expand Up @@ -105,7 +106,7 @@ TEST_F(gdf_quantile, DoubleVector)
{-1.01, 0.8, 0.955, 2.13, 6.8},
{-1.01, 0.8, 1.11, 2.13, 6.8}};

f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact);
f_quantile_tester<VType>(col.get(), false, v_baseline_approx, v_baseline_exact);
}

TEST_F(gdf_quantile, IntegerVector)
Expand All @@ -122,15 +123,14 @@ TEST_F(gdf_quantile, IntegerVector)
{-1.0, 1.0, 1.0, 2.0, 7.0},
{-1, 1, 1, 2, 7}};

f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact);
f_quantile_tester<VType>(col.get(), false, v_baseline_approx, v_baseline_exact);
}

TEST_F(gdf_quantile, ReportValidMaskError)
TEST_F(gdf_quantile, SortedVector)
{
using VType = int32_t;
std::vector<VType> v{7, 0, 3, 4, 2, 1, -1, 1, 6};
std::vector<cudf::valid_type> bitmask(gdf_valid_allocation_size(v.size()), 0xF3);
cudf::test::column_wrapper<VType> col(v, bitmask);
std::vector<VType> v{-1, 0, 1, 1, 2, 3, 4, 6, 7};
cudf::test::column_wrapper<VType> col(v);

std::vector<VType> v_baseline_approx{-1, 1, 1, 2, 7};
std::vector<std::vector<double>> v_baseline_exact{
Expand All @@ -139,8 +139,47 @@ TEST_F(gdf_quantile, ReportValidMaskError)
{-1, 1, 1, 2, 7},
{-1.0, 1.0, 1.0, 2.0, 7.0},
{-1, 1, 1, 2, 7}};

f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact, GDF_VALIDITY_UNSUPPORTED);

f_quantile_tester<VType>(col.get(), true, v_baseline_approx, v_baseline_exact);
}

// Quantiles of an unsorted int32 column that contains nulls: the null rows
// must be ignored and the results computed from the valid rows only.
TEST_F(gdf_quantile, VectorWithNulls)
{
  using VType = int32_t;

  // Every validity byte is 0xF3 (0b11110011), which leaves rows 2 and 3 null.
  // Resulting column: 7 0 @ @ 2 1 -1 1 6   (use col.print() to inspect)
  std::vector<VType> host_values{7, 0, 3, 4, 2, 1, -1, 1, 6};
  std::vector<gdf_valid_type> validity(
      gdf_valid_allocation_size(host_values.size()), 0xF3);
  cudf::test::column_wrapper<VType> column(host_values, validity);

  // Expected quantile_approx results, one per quantile probed by the tester.
  std::vector<VType> expected_approx{-1, 0, 0, 1, 7};

  // Expected quantile_exact results.
  // NOTE(review): presumably one row per interpolation method, in the order
  // f_quantile_tester iterates them -- confirm against the tester.
  std::vector<std::vector<double>> expected_exact{
      {-1.0, 0.5, 0.98, 1.0, 7.0},
      {-1.0, 0.0, 0.0, 1.0, 7.0},
      {-1.0, 1.0, 1.0, 1.0, 7.0},
      {-1.0, 0.5, 0.5, 1.0, 7.0},
      {-1.0, 1.0, 1.0, 1.0, 7.0}};

  const bool input_is_sorted = false;
  f_quantile_tester<VType>(column.get(), input_is_sorted, expected_approx, expected_exact);
}

// Quantiles of an int32 column flagged as already sorted, with its null rows
// at the end of the column: the null rows must be ignored.
TEST_F(gdf_quantile, SortedVectorWithNulls)
{
  using VType = int32_t;

  // Byte 0 of the mask is 0x7F (row 7 null) and byte 1 is 0xFE (row 8 null).
  // Resulting column: -1 0 1 1 2 6 7 @ @   (use col.print() to inspect)
  std::vector<VType> host_values{-1, 0, 1, 1, 2, 6, 7, 3, 4};
  std::vector<gdf_valid_type> validity(
      gdf_valid_allocation_size(host_values.size()), 0x7F);
  validity[1] = 0xFE;
  cudf::test::column_wrapper<VType> column(host_values, validity);

  // Expected quantile_approx results, one per quantile probed by the tester.
  std::vector<VType> expected_approx{-1, 0, 0, 1, 7};

  // Expected quantile_exact results.
  // NOTE(review): presumably one row per interpolation method, in the order
  // f_quantile_tester iterates them -- confirm against the tester.
  std::vector<std::vector<double>> expected_exact{
      {-1.0, 0.5, 0.98, 1.0, 7.0},
      {-1.0, 0.0, 0.0, 1.0, 7.0},
      {-1.0, 1.0, 1.0, 1.0, 7.0},
      {-1.0, 0.5, 0.5, 1.0, 7.0},
      {-1.0, 1.0, 1.0, 1.0, 7.0}};

  const bool input_is_sorted = true;
  f_quantile_tester<VType>(column.get(), input_is_sorted, expected_approx, expected_exact);
}


Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,14 @@ def var(self, ddof=1, axis=None, skipna=True):
def sum_of_squares(self, dtype=None):
return self._column.sum_of_squares(dtype=dtype)

def median(self, skipna=True):
    """Return the median of the values in this series.

    Parameters
    ----------
    skipna : bool, default True
        When true, the median is delegated to ``quantile`` regardless of
        nulls. When false and the series holds at least one null, the
        result is ``np.nan``.

    Returns
    -------
    The value returned by ``self.quantile(0.5, interpolation="linear",
    exact=True)``, or ``np.nan`` as described above.
    """
    if skipna or self.null_count <= 0:
        # Request linear interpolation explicitly so the result does not
        # depend on whatever default ``quantile`` uses.
        return self.quantile(0.5, interpolation="linear", exact=True)
    return np.nan

def round(self, decimals=0):
"""Round a Series to a configurable number of decimal places.
"""
Expand Down
28 changes: 28 additions & 0 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,34 @@ def test_skew(data, null_flag):
np.testing.assert_array_almost_equal(got, expected)


@pytest.mark.parametrize("dtype", params_dtypes)
@pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100])
def test_series_median(dtype, num_na):
    """Compare ``Series.median`` with pandas for several null counts.

    A series of 100 seeded random values is built with its first ``num_na``
    entries masked out as nulls. With ``skipna=True`` the result must match
    the pandas median of the remaining valid values; for floating dtypes the
    ``skipna=False`` result is also compared against pandas.
    """
    np.random.seed(0)
    arr = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        # Values are drawn from [0, 1); scale them so the integer cast
        # below does not truncate every element to zero.
        arr *= 100
    # True marks a valid entry, so the first ``num_na`` entries are null.
    mask = np.arange(100) >= num_na

    arr = arr.astype(dtype)
    sr = Series.from_masked_array(arr, Series(mask).as_mask())
    # pandas reference built from the valid values only.
    ps = pd.Series(arr[mask], dtype=dtype)

    actual = sr.median(skipna=True)
    desired = ps.median(skipna=True)
    np.testing.assert_approx_equal(actual, desired)

    # skipna=False is only checked for floating dtypes: the reference is
    # obtained through to_pandas(), which needs nullable-integer support
    # (e.g. pd.Int64Dtype) before integer columns with nulls can be compared.
    if np.issubdtype(dtype, np.floating):
        ps = sr.to_pandas()
        actual = sr.median(skipna=False)
        desired = ps.median(skipna=False)
        np.testing.assert_approx_equal(actual, desired)


@pytest.mark.parametrize(
"data1",
[
Expand Down

0 comments on commit 6c8ce65

Please sign in to comment.