Merge pull request #2905 from karthikeyann/fea-median_with_null

[REVIEW] Series median() with null support
rapidsai · Nov 19, 2019 · 6c8ce65 · 6c8ce65
2 parents 3000d59 + beb9539
commit 6c8ce65
Show file tree

Hide file tree

Showing 5 changed files with 105 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,7 @@
 # cuDF 0.11.0 (Date TBD)
 
 ## New Features
-
+- PR #2905 Added `Series.median()` and null support for `Series.quantile()`
 - PR #2930 JSON Reader: Support ARROW_RANDOM_FILE input
 - PR #2956 Add `cudf::stack` and `cudf::tile`
 - PR #2980 Added nvtext is_vowel/is_consonant functions

diff --git a/cpp/src/quantiles/legacy/quantiles.cu b/cpp/src/quantiles/legacy/quantiles.cu
@@ -22,6 +22,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/legacy/type_dispatcher.hpp>
 #include <cudf/utilities/legacy/wrapper_types.hpp>
+#include <bitmask/legacy/legacy_bitmask.hpp>
 #include <rmm/thrust_rmm_allocator.h>
 
 #include <thrust/device_vector.h>
@@ -83,21 +84,29 @@ namespace{ // anonymous
 
     if( ctxt->flag_sort_inplace  && ctxt->flag_sorted )
     {
-      return select_quantile(col_data,
-                             n,
+      auto data_begin = ctxt->flag_null_sort_behavior == GDF_NULL_AS_LARGEST
+                            ? col_data
+                            : col_data + col_in->null_count;
+      return select_quantile(data_begin,
+                             n-col_in->null_count,
                              quant, 
                              interpolation,
                              *ptr_res,
                              ctxt->flag_sorted,
                              stream);
     }else{
       // create a clone of col_data if sort is required but sort_inplace is not allowed.
-      rmm::device_vector<ColType> dv(n);
-      thrust::copy_n(rmm::exec_policy(stream)->on(stream), col_data, n, dv.begin());
+      rmm::device_vector<ColType> dv(n-col_in->null_count);
+      thrust::copy_if(rmm::exec_policy(stream)->on(stream), col_data,
+                      col_data + n,
+                      thrust::counting_iterator<cudf::size_type>(0), dv.begin(),
+                      [bitmask = col_in->valid] __device__(auto i) {
+                        return gdf_is_valid(bitmask, i);
+                      });
       ColType* clone_data = dv.data().get();
 
       return select_quantile(clone_data,
-                             n,
+                             dv.size(),
                              quant, 
                              interpolation,
                              *ptr_res,
@@ -172,14 +181,13 @@ gdf_error quantile_exact( gdf_column*         col_in,       // input column
 {
   GDF_REQUIRE(nullptr != col_in, GDF_DATASET_EMPTY);
 
-  if (col_in->size == 0) {
+  if (col_in->size == col_in->null_count) {
      result->is_valid = false;
      return GDF_SUCCESS;
   }
 
   GDF_REQUIRE(nullptr != col_in->data, GDF_DATASET_EMPTY);
   GDF_REQUIRE(0 < col_in->size, GDF_DATASET_EMPTY);
-  GDF_REQUIRE(nullptr == col_in->valid || 0 == col_in->null_count, GDF_VALIDITY_UNSUPPORTED);
 
   gdf_error ret = GDF_SUCCESS;
   result->dtype = GDF_FLOAT64;
@@ -200,14 +208,13 @@ gdf_error quantile_approx(	gdf_column*  col_in,       // input column
 {
   GDF_REQUIRE(nullptr != col_in, GDF_DATASET_EMPTY);
 
-  if (col_in->size == 0) {
+  if (col_in->size == col_in->null_count) {
      result->is_valid = false;
      return GDF_SUCCESS;
   }
 
   GDF_REQUIRE(nullptr != col_in->data, GDF_DATASET_EMPTY);
   GDF_REQUIRE(0 < col_in->size, GDF_DATASET_EMPTY);
-  GDF_REQUIRE(nullptr == col_in->valid || 0 == col_in->null_count, GDF_VALIDITY_UNSUPPORTED);
 
   gdf_error ret = GDF_SUCCESS;
   result->dtype = col_in->dtype;

diff --git a/cpp/tests/quantiles/legacy/quantiles_test.cu b/cpp/tests/quantiles/legacy/quantiles_test.cu
@@ -45,7 +45,8 @@
 template<typename VType>
 void f_quantile_tester(
   gdf_column* col_in,                           ///< input column
-  std::vector<VType>& v_appox,                  ///< expected result for quantile_approx
+  bool is_sorted,                               ///< input column sorted?
+  std::vector<VType>& v_approx,                 ///< expected result for quantile_approx
   std::vector<std::vector<double>>& v_exact,    ///< expected result for quantile_exact
   const gdf_error expected_error = GDF_SUCCESS) ///< expected returned state for quantiles
 {
@@ -59,7 +60,7 @@ void f_quantile_tester(
   size_t n_qs = qvals.size();
 
   assert( n_methods == methods.size() );
-  gdf_context ctxt{0, static_cast<gdf_method>(0), 0, 1};
+  gdf_context ctxt{is_sorted, static_cast<gdf_method>(0), 0, 1};
 
   for(size_t j = 0; j<n_qs; ++j)
     {
@@ -68,10 +69,10 @@ void f_quantile_tester(
       EXPECT_EQ( ret, expected_error) << "approx " << " returns unexpected failure\n";
 
       if( ret == GDF_SUCCESS ){
-        double delta = std::abs(static_cast<double>(result_approx.value() - v_appox[j]));
+        double delta = std::abs(static_cast<double>(result_approx.value() - v_approx[j]));
         bool flag = delta < 1.0e-8;
-        EXPECT_EQ( flag, true ) << " " << q << " appox quantile "
-          << " val = " << result_approx.value() << ", " <<  v_appox[j];
+        EXPECT_EQ( flag, true ) << " " << q << " approx quantile "
+          << " val = " << result_approx.value() << ", " <<  v_approx[j];
       }
 
       for(size_t i = 0;i<n_methods;++i)
@@ -105,7 +106,7 @@ TEST_F(gdf_quantile, DoubleVector)
     {-1.01,   0.8,  0.955,  2.13,   6.8},
     {-1.01,   0.8,  1.11,   2.13,   6.8}};
 
-  f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact);
+  f_quantile_tester<VType>(col.get(), false, v_baseline_approx, v_baseline_exact);
 }
 
 TEST_F(gdf_quantile, IntegerVector)
@@ -122,15 +123,14 @@ TEST_F(gdf_quantile, IntegerVector)
     {-1.0,   1.0,   1.0,   2.0,   7.0},
     {-1,     1,     1,     2,     7}};
 
-  f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact);
+  f_quantile_tester<VType>(col.get(), false, v_baseline_approx, v_baseline_exact);
 }
 
-TEST_F(gdf_quantile, ReportValidMaskError)
+TEST_F(gdf_quantile, SortedVector)
 {
   using VType = int32_t;
-  std::vector<VType> v{7, 0, 3, 4, 2, 1, -1, 1, 6};
-  std::vector<cudf::valid_type> bitmask(gdf_valid_allocation_size(v.size()), 0xF3);
-  cudf::test::column_wrapper<VType> col(v, bitmask);
+  std::vector<VType> v{-1, 0, 1, 1, 2, 3, 4, 6, 7};
+  cudf::test::column_wrapper<VType> col(v);
 
   std::vector<VType> v_baseline_approx{-1,     1,     1,     2,     7};
   std::vector<std::vector<double>> v_baseline_exact{
@@ -139,8 +139,47 @@ TEST_F(gdf_quantile, ReportValidMaskError)
     {-1,     1,     1,     2,     7},
     {-1.0,   1.0,   1.0,   2.0,   7.0},
     {-1,     1,     1,     2,     7}};
-
-  f_quantile_tester<VType>(col.get(), v_baseline_approx, v_baseline_exact, GDF_VALIDITY_UNSUPPORTED);
+
+  f_quantile_tester<VType>(col.get(), true, v_baseline_approx, v_baseline_exact);
+}
+
+// Quantiles of an unsorted column that carries a validity mask: the expected
+// values below correspond to the non-null elements only.
+TEST_F(gdf_quantile, VectorWithNulls)
+{
+  using VType = int32_t;
+  std::vector<VType> v{7, 0, 3, 4, 2, 1, -1, 1, 6};
+  // Every mask byte is 0xF3 (0b11110011); per the printed column below this
+  // marks elements 2 and 3 (values 3 and 4) as null.
+  std::vector<gdf_valid_type> bitmask(gdf_valid_allocation_size(v.size()), 0xF3);
+  cudf::test::column_wrapper<VType> col(v, bitmask);
+  //col.print(); //7 0 @ @ 2 1 -1 1 6   ('@' denotes a null element)
+
+  // Expected quantile_approx results, one entry per tested quantile.
+  std::vector<VType> v_baseline_approx{-1,     0,     0,     1,     7};
+  // Expected quantile_exact results: one row per interpolation method, one
+  // column per tested quantile (both lists are defined in f_quantile_tester).
+  std::vector<std::vector<double>> v_baseline_exact{
+    {-1.,   0.5,  0.98,  1.,   7.},
+    {-1.,   0.,   0.,    1.,   7.},
+    {-1.,   1.,   1.,    1.,   7.},
+    {-1.,   0.5,  0.5,   1.,   7.},
+    {-1.,   1.,   1.,    1.,   7.}};
+
+  // is_sorted = false: the input is passed as unsorted data.
+  f_quantile_tester<VType>(col.get(), false, v_baseline_approx, v_baseline_exact);
+}
+
+// Quantiles of a column flagged as already sorted whose nulls sit at the end:
+// the expected values below correspond to the non-null elements only.
+TEST_F(gdf_quantile, SortedVectorWithNulls)
+{
+  using VType = int32_t;
+  // The first seven values are in ascending order; the last two are nulled out
+  // by the mask below, so their values (3, 4) are irrelevant.
+  std::vector<VType> v{-1, 0, 1, 1, 2, 6, 7, 3, 4};
+  // Byte 0 = 0x7F and byte 1 = 0xFE; per the printed column below this marks
+  // elements 7 and 8 as null.
+  std::vector<gdf_valid_type> bitmask(gdf_valid_allocation_size(v.size()), 0x7F);
+  bitmask[1]=0xFE;
+  cudf::test::column_wrapper<VType> col(v, bitmask);
+  //col.print(); //-1 0 1 1 2 6 7 @ @   ('@' denotes a null element)
+
+  // Expected quantile_approx results, one entry per tested quantile.
+  std::vector<VType> v_baseline_approx{-1,     0,     0,     1,     7};
+  // Expected quantile_exact results: one row per interpolation method, one
+  // column per tested quantile (both lists are defined in f_quantile_tester).
+  std::vector<std::vector<double>> v_baseline_exact{
+    {-1.,   0.5,  0.98,  1.,   7.},
+    {-1.,   0.,   0.,    1.,   7.},
+    {-1.,   1.,   1.,    1.,   7.},
+    {-1.,   0.5,  0.5,   1.,   7.},
+    {-1.,   1.,   1.,    1.,   7.}};
+
+  // is_sorted = true: the input is passed as pre-sorted data.
+  // NOTE(review): trailing nulls presumably match the null ordering the sorted
+  // code path assumes by default -- confirm against gdf_context defaults.
+  f_quantile_tester<VType>(col.get(), true, v_baseline_approx, v_baseline_exact);
 }
 
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1882,6 +1882,14 @@ def var(self, ddof=1, axis=None, skipna=True):
     def sum_of_squares(self, dtype=None):
         return self._column.sum_of_squares(dtype=dtype)
 
+    def median(self, skipna=True):
+        """Compute the median of the series.
+
+        Parameters
+        ----------
+        skipna : bool, default True
+            If False and the series contains at least one null, ``np.nan``
+            is returned without computing anything. Otherwise the median is
+            obtained from ``quantile``.
+
+        Returns
+        -------
+        ``np.nan`` in the case described above, otherwise the result of
+        ``self.quantile(0.5, interpolation="linear", exact=True)``.
+        """
+        # With skipna disabled, a single null makes the result undefined.
+        if not skipna and self.null_count > 0:
+            return np.nan
+        # enforce linear in case the default ever changes
+        return self.quantile(0.5, interpolation="linear", exact=True)
+
     def round(self, decimals=0):
         """Round a Series to a configurable number of decimal places.
         """

diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
@@ -270,6 +270,34 @@ def test_skew(data, null_flag):
     np.testing.assert_array_almost_equal(got, expected)
 
 
+@pytest.mark.parametrize("dtype", params_dtypes)
+@pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100])
+def test_series_median(dtype, num_na):
+    np.random.seed(0)
+    arr = np.random.random(100)
+    if np.issubdtype(dtype, np.integer):
+        arr *= 100
+    mask = np.arange(100) >= num_na
+
+    arr = arr.astype(dtype)
+    sr = Series.from_masked_array(arr, Series(mask).as_mask())
+    arr2 = arr[mask]
+    ps = pd.Series(arr2, dtype=dtype)
+
+    actual = sr.median(skipna=True)
+    desired = ps.median(skipna=True)
+    print(actual, desired)
+    np.testing.assert_approx_equal(actual, desired)
+
+    # only for float until integer null supported convert to pandas in cudf
+    # eg. pd.Int64Dtype
+    if np.issubdtype(dtype, np.floating):
+        ps = sr.to_pandas()
+        actual = sr.median(skipna=False)
+        desired = ps.median(skipna=False)
+        np.testing.assert_approx_equal(actual, desired)
+
+
 @pytest.mark.parametrize(
     "data1",
     [