Merge pull request #11767 from rapidsai/branch-22.10

[gpuCI] Forward-merge branch-22.10 to branch-22.12 [skip gpuci]
rapidsai · Sep 26, 2022 · 41474af · 41474af
2 parents 59847c1 + cd60462
commit 41474af
Show file tree

Hide file tree

Showing 10 changed files with 77 additions and 10 deletions.
diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
@@ -163,6 +163,7 @@ struct column_statistics;
  */
 struct column_statistics {
   std::optional<uint64_t> number_of_values;  ///< number of statistics
+  std::optional<bool> has_null;              ///< column has any nulls
   std::variant<no_statistics,
                integer_statistics,
                double_statistics,

diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
@@ -79,8 +79,7 @@ struct find_index_fn {
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
     auto keys_view   = column_device_view::create(input.keys(), stream);
-    auto iter = thrust::equal_range(thrust::device,  // segfaults: rmm::exec_policy(stream) and
-                                                     // thrust::cuda::par.on(stream)
+    auto iter        = thrust::equal_range(rmm::exec_policy(cudf::default_stream_value),
                                     keys_view->begin<Element>(),
                                     keys_view->end<Element>(),
                                     find_key);

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
@@ -30,6 +30,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
 #include <thrust/copy.h>
@@ -153,7 +154,8 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
 
   auto count = thrust::make_counting_iterator(0);
 
-  thrust::for_each(count,
+  thrust::for_each(rmm::exec_policy(cudf::default_stream_value),
+                   count,
                    count + input.size(),
                    [in = input.begin<DeviceType>(), out = buf.data()] __device__(auto in_idx) {
                      auto const out_idx = in_idx * 2;

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
@@ -287,6 +287,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info)
 column_statistics::column_statistics(cudf::io::orc::column_statistics&& cs)
 {
   number_of_values = cs.number_of_values;
+  has_null         = cs.has_null;
   if (cs.int_stats) {
     type_specific_stats = *cs.int_stats;
   } else if (cs.double_stats) {

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
@@ -184,7 +184,8 @@ void ProtobufReader::read(column_statistics& s, size_t maxlen)
                             make_field_reader(6, s.decimal_stats),
                             make_field_reader(7, s.date_stats),
                             make_field_reader(8, s.binary_stats),
-                            make_field_reader(9, s.timestamp_stats));
+                            make_field_reader(9, s.timestamp_stats),
+                            make_field_reader(10, s.has_null));
   function_builder(s, maxlen, op);
 }
 

diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
@@ -122,7 +122,7 @@ struct column_statistics {
   std::optional<date_statistics> date_stats;
   std::optional<binary_statistics> binary_stats;
   std::optional<timestamp_statistics> timestamp_stats;
-  // TODO: hasNull (issue #7087)
+  std::optional<bool> has_null;
 };
 
 struct StripeStatistics {
@@ -423,6 +423,12 @@ inline uint8_t ProtobufReader::get<uint8_t>()
   return (m_cur < m_end) ? *m_cur++ : 0;
 };
 
+template <>
+inline bool ProtobufReader::get<bool>()
+{
+  return static_cast<bool>(get<uint8_t>());  // any nonzero byte is true; exhausted input reads as 0 (false)
+};
+
 template <>
 inline uint32_t ProtobufReader::get<uint32_t>()
 {

diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
@@ -1246,7 +1246,7 @@ TEST_F(OrcStatisticsTest, Overflow)
     not_too_small_seq, not_too_small_seq + num_rows, validity);
   table_view tbl({col1, col2, col3, col4});
 
-  auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc");
+  auto filepath = temp_env->get_temp_filepath("OrcStatsOverflow.orc");
 
   cudf_io::orc_writer_options out_opts =
     cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl);
@@ -1264,6 +1264,63 @@ TEST_F(OrcStatisticsTest, Overflow)
   check_sum_exist(3, true);
   check_sum_exist(4, true);
 }
+
+TEST_F(OrcStatisticsTest, HasNull)
+{
+  // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so
+  // we're embedding a file created using pyorc
+  //
+  // Method to create file:
+  // >>> import pyorc
+  // >>> output = open("./temp.orc", "wb")
+  // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt()))
+  // >>> writer.write((1, 3))
+  // >>> writer.write((2, 4))
+  // >>> writer.write((None, 5))
+  // >>> writer.close()
+  //
+  // Contents of file:
+  // >>> import pyarrow.orc as po
+  // >>> po.ORCFile('./temp.orc').read()
+  // pyarrow.Table
+  // a: int64
+  // b: int64
+  // ----
+  // a: [[1,2,null]]
+  // b: [[3,4,5]]
+  auto nulls_orc = std::array<uint8_t, 308>{
+    0x4F, 0x52, 0x43, 0x1D, 0x00, 0x00, 0x0A, 0x0C, 0x0A, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x04,
+    0x08, 0x03, 0x50, 0x00, 0x2C, 0x00, 0x00, 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E,
+    0x0E, 0x26, 0x21, 0x36, 0x0E, 0x26, 0x01, 0x16, 0x09, 0xB6, 0x00, 0x46, 0x00, 0x2C, 0x00, 0x00,
+    0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, 0x0E, 0x66, 0x21, 0x36, 0x0E, 0x36, 0x01,
+    0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x05, 0x00, 0x00, 0xFF, 0xC0,
+    0x07, 0x00, 0x00, 0x46, 0x01, 0x24, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x09, 0x00, 0x00, 0x46, 0x02,
+    0x68, 0xA0, 0x68, 0x00, 0x00, 0xE3, 0x62, 0xE3, 0x60, 0x13, 0x60, 0x90, 0x10, 0xE4, 0x02, 0xD1,
+    0x8C, 0x12, 0x92, 0x60, 0x9A, 0x09, 0x4C, 0x33, 0x00, 0xC5, 0x59, 0xC1, 0x34, 0x23, 0x98, 0x66,
+    0x04, 0xD2, 0x6C, 0x60, 0x3E, 0x13, 0x94, 0xCF, 0x24, 0xC1, 0x2E, 0xC4, 0x02, 0x52, 0x07, 0x24,
+    0x99, 0x60, 0xA4, 0x14, 0x73, 0x68, 0x88, 0x33, 0x00, 0x46, 0x00, 0x00, 0xE3, 0x52, 0xE2, 0x62,
+    0xE1, 0x60, 0x0E, 0x60, 0xE0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, 0x60,
+    0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0xB4,
+    0x00, 0x00, 0xE3, 0x60, 0x16, 0x98, 0xC6, 0x28, 0xC5, 0xC5, 0xC1, 0x2C, 0xE0, 0x2C, 0x21, 0xA3,
+    0x60, 0xAE, 0xC1, 0xAC, 0x24, 0xC4, 0xC1, 0x23, 0xC4, 0xC4, 0xC8, 0x24, 0xC5, 0x98, 0x28, 0xC5,
+    0x98, 0xA4, 0xC0, 0xA0, 0xC1, 0x60, 0xC0, 0xA0, 0xC4, 0xC1, 0xC1, 0x82, 0xCE, 0x32, 0x60, 0xB6,
+    0x62, 0xE1, 0x60, 0x0E, 0x60, 0xB0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91,
+    0x60, 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x87,
+    0x09, 0x7E, 0x1E, 0x8C, 0x49, 0xAC, 0x86, 0x7A, 0xE6, 0x7A, 0xA6, 0x00, 0x08, 0x5D, 0x10, 0x01,
+    0x18, 0x80, 0x80, 0x04, 0x22, 0x02, 0x00, 0x0C, 0x28, 0x26, 0x30, 0x06, 0x82, 0xF4, 0x03, 0x03,
+    0x4F, 0x52, 0x43, 0x17,
+  };
+
+  auto const stats = cudf_io::read_parsed_orc_statistics(
+    cudf_io::source_info{reinterpret_cast<char const*>(nulls_orc.data()), nulls_orc.size()});
+
+  EXPECT_EQ(stats.file_stats[1].has_null, true);
+  EXPECT_EQ(stats.file_stats[2].has_null, false);
+
+  EXPECT_EQ(stats.stripes_stats[0][1].has_null, true);
+  EXPECT_EQ(stats.stripes_stats[0][2].has_null, false);
+}
+
 struct OrcWriterTestStripes
   : public OrcWriterTest,
     public ::testing::WithParamInterface<std::tuple<size_t, cudf::size_type>> {

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -1643,7 +1643,7 @@ def isnumeric(self) -> SeriesOrIndex:
         also includes other characters that can represent
         quantities such as unicode fractions.
 
-        >>> s2 = pd.Series(['23', '³', '⅕', ''])
+        >>> s2 = pd.Series(['23', '³', '⅕', ''], dtype='str')
         >>> s2.str.isnumeric()
         0     True
         1     True

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -1057,7 +1057,7 @@ def dtypes(self):
         string              object
         dtype: object
         """
-        return pd.Series(self._dtypes)
+        return pd.Series(self._dtypes, dtype="object")
 
     @property
     def ndim(self):
@@ -6977,7 +6977,7 @@ def from_pandas(obj, nan_as_null=None):
 
     Converting a Pandas Series to cuDF Series:
 
-    >>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple')
+    >>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple', dtype='str')
     >>> psr
     0    a
     1    b

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -589,7 +589,7 @@ def from_pandas(cls, s, nan_as_null=None):
         >>> import pandas as pd
         >>> import numpy as np
         >>> data = [10, 20, 30, np.nan]
-        >>> pds = pd.Series(data)
+        >>> pds = pd.Series(data, dtype='float64')
         >>> cudf.Series.from_pandas(pds)
         0    10.0
         1    20.0