Skip to content

Commit

Permalink
Merge pull request #11767 from rapidsai/branch-22.10
Browse files Browse the repository at this point in the history
[gpuCI] Forward-merge branch-22.10 to branch-22.12 [skip gpuci]
  • Loading branch information
GPUtester authored Sep 26, 2022
2 parents 59847c1 + cd60462 commit 41474af
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 10 deletions.
1 change: 1 addition & 0 deletions cpp/include/cudf/io/orc_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ struct column_statistics;
*/
struct column_statistics {
std::optional<uint64_t> number_of_values; ///< number of statistics
std::optional<bool> has_null; ///< column has any nulls
std::variant<no_statistics,
integer_statistics,
double_statistics,
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/dictionary/search.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ struct find_index_fn {
using ScalarType = cudf::scalar_type_t<Element>;
auto find_key = static_cast<ScalarType const&>(key).value(stream);
auto keys_view = column_device_view::create(input.keys(), stream);
auto iter = thrust::equal_range(thrust::device, // segfaults: rmm::exec_policy(stream) and
// thrust::cuda::par.on(stream)
auto iter = thrust::equal_range(rmm::exec_policy(cudf::default_stream_value),
keys_view->begin<Element>(),
keys_view->end<Element>(),
find_key);
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/interop/to_arrow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <thrust/copy.h>
Expand Down Expand Up @@ -153,7 +154,8 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(

auto count = thrust::make_counting_iterator(0);

thrust::for_each(count,
thrust::for_each(rmm::exec_policy(cudf::default_stream_value),
count,
count + input.size(),
[in = input.begin<DeviceType>(), out = buf.data()] __device__(auto in_idx) {
auto const out_idx = in_idx * 2;
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info)
column_statistics::column_statistics(cudf::io::orc::column_statistics&& cs)
{
number_of_values = cs.number_of_values;
has_null = cs.has_null;
if (cs.int_stats) {
type_specific_stats = *cs.int_stats;
} else if (cs.double_stats) {
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ void ProtobufReader::read(column_statistics& s, size_t maxlen)
make_field_reader(6, s.decimal_stats),
make_field_reader(7, s.date_stats),
make_field_reader(8, s.binary_stats),
make_field_reader(9, s.timestamp_stats));
make_field_reader(9, s.timestamp_stats),
make_field_reader(10, s.has_null));
function_builder(s, maxlen, op);
}

Expand Down
8 changes: 7 additions & 1 deletion cpp/src/io/orc/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ struct column_statistics {
std::optional<date_statistics> date_stats;
std::optional<binary_statistics> binary_stats;
std::optional<timestamp_statistics> timestamp_stats;
// TODO: hasNull (issue #7087)
std::optional<bool> has_null;
};

struct StripeStatistics {
Expand Down Expand Up @@ -423,6 +423,12 @@ inline uint8_t ProtobufReader::get<uint8_t>()
return (m_cur < m_end) ? *m_cur++ : 0;
};

/**
 * @brief Specialization that reads a boolean value.
 *
 * Pulls a single byte through get<uint8_t>() and reports whether it is non-zero.
 * Because get<uint8_t>() yields 0 once the cursor reaches the end of the buffer,
 * this returns false in that situation.
 */
template <>
inline bool ProtobufReader::get<bool>()
{
  auto const raw_byte = get<uint8_t>();
  return raw_byte != 0;
};

template <>
inline uint32_t ProtobufReader::get<uint32_t>()
{
Expand Down
59 changes: 58 additions & 1 deletion cpp/tests/io/orc_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1246,7 +1246,7 @@ TEST_F(OrcStatisticsTest, Overflow)
not_too_small_seq, not_too_small_seq + num_rows, validity);
table_view tbl({col1, col2, col3, col4});

auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc");
auto filepath = temp_env->get_temp_filepath("OrcStatsOverflow.orc");

cudf_io::orc_writer_options out_opts =
cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl);
Expand All @@ -1264,6 +1264,63 @@ TEST_F(OrcStatisticsTest, Overflow)
check_sum_exist(3, true);
check_sum_exist(4, true);
}

// Checks that the has_null member of parsed ORC column statistics is populated, at both the
// file level and the stripe level, when reading from an embedded in-memory ORC file.
TEST_F(OrcStatisticsTest, HasNull)
{
// cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so
// we're embedding a file created using pyorc
//
// Method to create file:
// >>> import pyorc
// >>> output = open("./temp.orc", "wb")
// >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt()))
// >>> writer.write((1, 3))
// >>> writer.write((2, 4))
// >>> writer.write((None, 5))
// >>> writer.close()
//
// Contents of file:
// >>> import pyarrow.orc as po
// >>> po.ORCFile('temp.orc').read()
// pyarrow.Table
// a: int64
// b: int64
// ----
// a: [[1,2,null]]
// b: [[3,4,5]]
//
// Raw bytes of the ORC file described above (308 bytes).
auto nulls_orc = std::array<uint8_t, 308>{
0x4F, 0x52, 0x43, 0x1D, 0x00, 0x00, 0x0A, 0x0C, 0x0A, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x04,
0x08, 0x03, 0x50, 0x00, 0x2C, 0x00, 0x00, 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E,
0x0E, 0x26, 0x21, 0x36, 0x0E, 0x26, 0x01, 0x16, 0x09, 0xB6, 0x00, 0x46, 0x00, 0x2C, 0x00, 0x00,
0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, 0x0E, 0x66, 0x21, 0x36, 0x0E, 0x36, 0x01,
0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x05, 0x00, 0x00, 0xFF, 0xC0,
0x07, 0x00, 0x00, 0x46, 0x01, 0x24, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x09, 0x00, 0x00, 0x46, 0x02,
0x68, 0xA0, 0x68, 0x00, 0x00, 0xE3, 0x62, 0xE3, 0x60, 0x13, 0x60, 0x90, 0x10, 0xE4, 0x02, 0xD1,
0x8C, 0x12, 0x92, 0x60, 0x9A, 0x09, 0x4C, 0x33, 0x00, 0xC5, 0x59, 0xC1, 0x34, 0x23, 0x98, 0x66,
0x04, 0xD2, 0x6C, 0x60, 0x3E, 0x13, 0x94, 0xCF, 0x24, 0xC1, 0x2E, 0xC4, 0x02, 0x52, 0x07, 0x24,
0x99, 0x60, 0xA4, 0x14, 0x73, 0x68, 0x88, 0x33, 0x00, 0x46, 0x00, 0x00, 0xE3, 0x52, 0xE2, 0x62,
0xE1, 0x60, 0x0E, 0x60, 0xE0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, 0x60,
0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0xB4,
0x00, 0x00, 0xE3, 0x60, 0x16, 0x98, 0xC6, 0x28, 0xC5, 0xC5, 0xC1, 0x2C, 0xE0, 0x2C, 0x21, 0xA3,
0x60, 0xAE, 0xC1, 0xAC, 0x24, 0xC4, 0xC1, 0x23, 0xC4, 0xC4, 0xC8, 0x24, 0xC5, 0x98, 0x28, 0xC5,
0x98, 0xA4, 0xC0, 0xA0, 0xC1, 0x60, 0xC0, 0xA0, 0xC4, 0xC1, 0xC1, 0x82, 0xCE, 0x32, 0x60, 0xB6,
0x62, 0xE1, 0x60, 0x0E, 0x60, 0xB0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91,
0x60, 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x87,
0x09, 0x7E, 0x1E, 0x8C, 0x49, 0xAC, 0x86, 0x7A, 0xE6, 0x7A, 0xA6, 0x00, 0x08, 0x5D, 0x10, 0x01,
0x18, 0x80, 0x80, 0x04, 0x22, 0x02, 0x00, 0x0C, 0x28, 0x26, 0x30, 0x06, 0x82, 0xF4, 0x03, 0x03,
0x4F, 0x52, 0x43, 0x17,
};

// Parse the statistics straight from the in-memory buffer (pointer + size source_info).
auto const stats = cudf_io::read_parsed_orc_statistics(
cudf_io::source_info{reinterpret_cast<char const*>(nulls_orc.data()), nulls_orc.size()});

// File-level statistics: index 1 is expected to report nulls and index 2 none, matching
// columns `a` and `b` above. NOTE(review): presumably index 0 is the root struct column — confirm.
EXPECT_EQ(stats.file_stats[1].has_null, true);
EXPECT_EQ(stats.file_stats[2].has_null, false);

// Same expectations for the statistics of the first stripe.
EXPECT_EQ(stats.stripes_stats[0][1].has_null, true);
EXPECT_EQ(stats.stripes_stats[0][2].has_null, false);
}

struct OrcWriterTestStripes
: public OrcWriterTest,
public ::testing::WithParamInterface<std::tuple<size_t, cudf::size_type>> {
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1643,7 +1643,7 @@ def isnumeric(self) -> SeriesOrIndex:
also includes other characters that can represent
quantities such as unicode fractions.
>>> s2 = pd.Series(['23', '³', '⅕', ''])
>>> s2 = pd.Series(['23', '³', '⅕', ''], dtype='str')
>>> s2.str.isnumeric()
0 True
1 True
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ def dtypes(self):
string object
dtype: object
"""
return pd.Series(self._dtypes)
return pd.Series(self._dtypes, dtype="object")

@property
def ndim(self):
Expand Down Expand Up @@ -6977,7 +6977,7 @@ def from_pandas(obj, nan_as_null=None):
Converting a Pandas Series to cuDF Series:
>>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple')
>>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple', dtype='str')
>>> psr
0 a
1 b
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ def from_pandas(cls, s, nan_as_null=None):
>>> import pandas as pd
>>> import numpy as np
>>> data = [10, 20, 30, np.nan]
>>> pds = pd.Series(data)
>>> pds = pd.Series(data, dtype='float64')
>>> cudf.Series.from_pandas(pds)
0 10.0
1 20.0
Expand Down

0 comments on commit 41474af

Please sign in to comment.