diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index fad5c98dc76..10368f84824 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -67,6 +67,8 @@ class parquet_reader_options { bool _use_pandas_metadata = true; // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; + // Whether to store binary data as a string column + std::optional> _convert_binary_to_strings{std::nullopt}; /** * @brief Constructor from source info. @@ -118,6 +120,19 @@ class parquet_reader_options { */ [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } + /** + * @brief Returns optional vector of true/false values depending on whether binary data should be + * converted to strings or not. + * + * @return vector with ith value `true` if binary data should be converted to strings for the ith + * column. Will return std::nullopt if the user did not set this option, which defaults to all + * binary data being converted to strings. + */ + [[nodiscard]] std::optional> get_convert_binary_to_strings() const + { + return _convert_binary_to_strings; + } + /** * @brief Returns number of rows to skip from the start. * @@ -191,6 +206,17 @@ class parquet_reader_options { */ void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; } + /** + * @brief Sets to enable/disable conversion of binary to strings per column. + * + * @param val Vector of boolean values to enable/disable conversion of binary to string columns. + * Note default is to convert to string columns. + */ + void set_convert_binary_to_strings(std::vector val) + { + _convert_binary_to_strings = std::move(val); + } + /** * @brief Sets number of rows to skip. * @@ -296,6 +322,19 @@ class parquet_reader_options_builder { return *this; } + /** + * @brief Sets enable/disable conversion of binary to strings per column. + * + * @param val Vector of boolean values to enable/disable conversion of binary to string columns. + * Note default is to convert to string columns. + * @return this for chaining + */ + parquet_reader_options_builder& convert_binary_to_strings(std::vector val) + { + options._convert_binary_to_strings = std::move(val); + return *this; + } + /** * @brief Sets number of rows to skip. * diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index e9a93894f7d..6504e790677 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -386,7 +386,7 @@ class column_in_metadata { thrust::optional _nullable; bool _list_column_is_map = false; bool _use_int96_timestamp = false; - // bool _output_as_binary = false; + bool _output_as_binary = false; thrust::optional _decimal_precision; thrust::optional _parquet_field_id; std::vector children; @@ -489,6 +489,20 @@ class column_in_metadata { return *this; } + /** + * @brief Specifies whether this column should be written as binary or string data + * Only valid for the following column types: + * string + * + * @param binary True = use binary data type. False = use string data type + * @return this for chaining + */ + column_in_metadata& set_output_as_binary(bool binary) + { + _output_as_binary = binary; + return *this; + } + /** * @brief Get reference to a child of this column * @@ -581,6 +595,13 @@ class column_in_metadata { * @return The number of children of this column */ [[nodiscard]] size_type num_children() const { return children.size(); } + + /** + * @brief Get whether to encode this column as binary or string data + * + * @return Boolean indicating whether to encode this column as binary data + */ + [[nodiscard]] bool is_enabled_output_as_binary() const { return _output_as_binary; } }; /** diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index ee70bf2c271..b03ba23737e 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -148,6 +148,7 @@ struct SchemaElement { int32_t decimal_scale = 0; int32_t decimal_precision = 0; thrust::optional field_id = thrust::nullopt; + bool output_as_byte_array = false; // The following fields are filled in later during schema initialization int max_definition_level = 0; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5df4f3e2575..966730a5169 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -41,7 +41,7 @@ enum Type : int8_t { */ enum ConvertedType { UNKNOWN = -1, // No type information present - UTF8 = 0, // a BYTE_ARRAY actually contains UTF8 encoded chars + UTF8 = 0, // a BYTE_ARRAY may contain UTF8 encoded chars MAP = 1, // a map is converted as an optional field containing a repeated key/value pair MAP_KEY_VALUE = 2, // a key/value pair is converted into a group of two fields LIST = diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 230eb371441..304ab936318 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -251,6 +251,7 @@ struct parquet_column_device_view : stats_column_desc { uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is //!< nullability of parent_column. May be different from //!< col.nullable() in case of chunked writing. + bool output_as_byte_array; //!< Indicates this list column is being written as a byte array }; constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment @@ -300,15 +301,19 @@ inline uint32_t __device__ int32_logical_len(type_id id) inline size_type __device__ row_to_value_idx(size_type idx, parquet_column_device_view const& parquet_col) { - auto col = *parquet_col.parent_column; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - idx += col.offset(); - col = col.child(0); - } else { - auto list_col = cudf::detail::lists_column_device_view(col); - idx = list_col.offset_at(idx); - col = list_col.child(); + // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave + // the size at the parent level because we are writing out parent row byte arrays. + if (!parquet_col.output_as_byte_array) { + auto col = *parquet_col.parent_column; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto list_col = cudf::detail::lists_column_device_view(col); + idx = list_col.offset_at(idx); + col = list_col.child(); + } } } return idx; diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index c4efe2ce856..c2c238c5f27 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1593,6 +1593,9 @@ reader::impl::impl(std::vector>&& sources, // Strings may be returned as either string or categorical columns _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); + // Binary columns can be read as binary or strings + _force_binary_columns_as_strings = options.get_convert_binary_to_strings(); + // Select only columns required by the options std::tie(_input_columns, _output_columns, _output_column_schemas) = _metadata->select_columns(options.get_columns(), @@ -1762,10 +1765,28 @@ table_with_metadata reader::impl::read(size_type skip_rows, // decoding of column data itself decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); + auto make_output_column = [&](column_buffer& buf, column_name_info* schema_info, int i) { + auto col = make_column(buf, schema_info, _stream, _mr); + if (should_write_byte_array(i)) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + if (schema.converted_type == parquet::UNKNOWN) { + auto const num_rows = col->size(); + auto data = col->release(); + return make_lists_column( + num_rows, + std::move(data.children[strings_column_view::offsets_column_index]), + std::move(data.children[strings_column_view::chars_column_index]), + UNKNOWN_NULL_COUNT, + std::move(*data.null_mask)); + } + } + return col; + }; + // create the final output cudf columns for (size_t i = 0; i < _output_columns.size(); ++i) { column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(make_column(_output_columns[i], &col_name, _stream, _mr)); + out_columns.emplace_back(make_output_column(_output_columns[i], &col_name, i)); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index f4366cd1258..97582b8ebd7 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -188,6 +188,20 @@ class reader::impl { size_t min_row, size_t total_rows); + /** + * @brief Indicates if a column should be written as a byte array + * + * @param col column to check + * @return true if the column should be written as a byte array + * @return false if the column should be written as normal for that type + */ + bool should_write_byte_array(int col) + { + return _output_columns[col].type.id() == type_id::STRING && + _force_binary_columns_as_strings.has_value() && + !_force_binary_columns_as_strings.value()[col]; + } + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -203,6 +217,7 @@ class reader::impl { std::vector _output_column_schemas; bool _strings_to_categorical = false; + std::optional> _force_binary_columns_as_strings; data_type _timestamp_type{type_id::EMPTY}; }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e0d5bf129a7..1b842db7f37 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -337,9 +337,14 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::BYTE_ARRAY; - col_schema.converted_type = ConvertedType::UTF8; - col_schema.stats_dtype = statistics_dtype::dtype_string; + col_schema.type = Type::BYTE_ARRAY; + if (col_meta.is_enabled_output_as_binary()) { + col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + } else { + col_schema.converted_type = ConvertedType::UTF8; + col_schema.stats_dtype = statistics_dtype::dtype_string; + } } template diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 6a70a46edf3..0e46d7ece18 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -97,7 +97,7 @@ struct calculate_group_statistics_functor { */ template ::include_extrema and - (IO == detail::io_file_format::ORC or + (IO != detail::io_file_format::PARQUET or !std::is_same_v)>* = nullptr> __device__ void operator()(stats_state_s& s, uint32_t t) { diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 41bf399ad9a..0f9f55f00db 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -716,6 +716,71 @@ TEST_F(ParquetWriterTest, Strings) cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } +TEST_F(ParquetWriterTest, StringsAsBinary) +{ + std::vector unicode_strings{ + "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + std::vector ascii_strings{ + "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + + column_wrapper col0{ascii_strings.begin(), ascii_strings.end()}; + column_wrapper col1{unicode_strings.begin(), unicode_strings.end()}; + column_wrapper col2{ascii_strings.begin(), ascii_strings.end()}; + cudf::test::lists_column_wrapper col3{{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}; + cudf::test::lists_column_wrapper col4{ + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', -56, -123, 'd', 'n', -56, -123, 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}; + + std::vector> cols; + cols.push_back(col0.release()); + cols.push_back(col1.release()); + cols.push_back(col2.release()); + cols.push_back(col3.release()); + cols.push_back(col4.release()); + auto write_tbl = std::make_unique(std::move(cols)); + EXPECT_EQ(5, write_tbl->num_columns()); + + cudf_io::table_input_metadata expected_metadata(*write_tbl); + expected_metadata.column_metadata[0].set_name("col_single").set_output_as_binary(true); + expected_metadata.column_metadata[1].set_name("col_string").set_output_as_binary(true); + expected_metadata.column_metadata[2].set_name("col_another").set_output_as_binary(true); + expected_metadata.column_metadata[3].set_name("col_binary"); + expected_metadata.column_metadata[4].set_name("col_binary"); + + auto filepath = temp_env->get_temp_filepath("BinaryStrings.parquet"); + cudf_io::parquet_writer_options out_opts = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, write_tbl->view()) + .metadata(&expected_metadata); + cudf_io::write_parquet(out_opts); + + cudf_io::parquet_reader_options in_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) + .convert_binary_to_strings({false, false, false, false, false, false, false, false, false}); + auto result = cudf_io::read_parquet(in_opts); + + auto original_cols = write_tbl->release(); + original_cols[0] = std::make_unique(original_cols[3]->view()); + original_cols[2] = std::make_unique(original_cols[3]->view()); + original_cols[1] = std::make_unique(original_cols[4]->view()); + auto expected = cudf::table(std::move(original_cols)); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected.view(), result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + TEST_F(ParquetWriterTest, SlicedTable) { // This test checks for writing zero copy, offsetted views into existing cudf tables @@ -4215,4 +4280,72 @@ TEST_F(ParquetReaderTest, EmptyColumnsParam) EXPECT_EQ(result.tbl->num_rows(), 0); } +TEST_F(ParquetReaderTest, BinaryAsStrings) +{ + std::vector strings{ + "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + const auto num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = cudf::test::iterators::no_nulls(); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + + std::vector> cols; + cols.push_back(col0.release()); + cols.push_back(col1.release()); + cols.push_back(col2.release()); + auto expected = std::make_unique
(std::move(cols)); + EXPECT_EQ(3, expected->num_columns()); + + cudf_io::table_input_metadata expected_metadata(*expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string").set_output_as_binary(true); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("BinaryReadStrings.parquet"); + cudf_io::parquet_writer_options out_opts = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + .metadata(&expected_metadata); + cudf_io::write_parquet(out_opts); + + cudf_io::parquet_reader_options in_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) + .convert_binary_to_strings({true, true, true, true, true, true, true, true}); + auto result = cudf_io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + + // test default options result the same as all true + cudf_io::parquet_reader_options binary_in_default_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + result = cudf_io::read_parquet(binary_in_default_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + + // test all false results in binary + cudf_io::parquet_reader_options binary_in_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) + .convert_binary_to_strings({false, false, false, false, false, false, false, false}); + result = cudf_io::read_parquet(binary_in_opts); + + auto original_cols = expected->release(); + original_cols[1] = + cudf::test::lists_column_wrapper{{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}} + .release(); + expected = std::make_unique(std::move(original_cols)); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 471ddef81c2..d511512431b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -667,15 +667,13 @@ std::vector resolve_null_precedence(JNIEnv *env, jbooleanArray namespace { -int set_column_metadata(cudf::io::column_in_metadata &column_metadata, - std::vector &col_names, - cudf::jni::native_jbooleanArray &nullability, - cudf::jni::native_jbooleanArray &is_int96, - cudf::jni::native_jintArray &precisions, - cudf::jni::native_jbooleanArray &is_map, - cudf::jni::native_jbooleanArray &hasParquetFieldIds, - cudf::jni::native_jintArray &parquetFieldIds, - cudf::jni::native_jintArray &children, int num_children, int read_index) { +int set_column_metadata( + cudf::io::column_in_metadata &column_metadata, std::vector &col_names, + cudf::jni::native_jbooleanArray &nullability, cudf::jni::native_jbooleanArray &is_int96, + cudf::jni::native_jintArray &precisions, cudf::jni::native_jbooleanArray &is_map, + cudf::jni::native_jbooleanArray &hasParquetFieldIds, + cudf::jni::native_jintArray &parquetFieldIds, cudf::jni::native_jintArray &children, + int num_children, int read_index, cudf::jni::native_jbooleanArray &is_binary) { int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; @@ -686,6 +684,9 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } + if (!is_binary.is_null()) { + child.set_output_as_binary(is_binary[read_index]); + } if (is_map[read_index]) { child.set_list_column_as_map(); } @@ -695,9 +696,9 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, column_metadata.add_child(child); int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, - is_int96, precisions, is_map, hasParquetFieldIds, - parquetFieldIds, children, childs_children, read_index); + read_index = set_column_metadata( + column_metadata.child(write_index), col_names, nullability, is_int96, precisions, is_map, + hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index, is_binary); } } return read_index; @@ -707,7 +708,8 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam jintArray &j_children, jbooleanArray &j_col_nullability, jbooleanArray &j_is_int96, jintArray &j_precisions, jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata, - jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds) { + jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds, + jbooleanArray &j_is_binary) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); @@ -717,6 +719,7 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam cudf::jni::native_jintArray parquetFieldIds(env, j_parquetFieldIds); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); + cudf::jni::native_jbooleanArray is_binary(env, j_is_binary); auto cpp_names = col_names.as_cpp_vector(); @@ -734,6 +737,9 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } + if (!is_binary.is_null()) { + metadata.column_metadata[write_index].set_output_as_binary(is_binary[read_index]); + } if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); } @@ -742,9 +748,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam } int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = set_column_metadata( - metadata.column_metadata[write_index], cpp_names, col_nullability, is_int96, precisions, - is_map, hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index); + read_index = + set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, + is_int96, precisions, is_map, hasParquetFieldIds, parquetFieldIds, + children, childs_children, read_index, is_binary); } } } @@ -1519,12 +1526,16 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); + // temp stub + jbooleanArray j_is_binary = NULL; + using namespace cudf::io; using namespace cudf::jni; sink_info sink{data_sink.get()}; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds, + j_is_binary); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1563,11 +1574,15 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( try { cudf::jni::native_jstring output_path(env, j_output_path); + // temp stub + jbooleanArray j_is_binary = NULL; + using namespace cudf::io; using namespace cudf::jni; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds, + j_is_binary); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1695,9 +1710,12 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. jbooleanArray j_hasParquetFieldIds = NULL; jintArray j_parquetFieldIds = NULL; + // temp stub + jbooleanArray j_is_binary = NULL; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds, + j_is_binary); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1744,8 +1762,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. jbooleanArray j_hasParquetFieldIds = NULL; jintArray j_parquetFieldIds = NULL; + // temp stub + jbooleanArray j_is_binary = NULL; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds, + j_is_binary); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index abde77a0001..db3443111f5 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -72,6 +72,7 @@ cdef extern from "cudf/io/types.hpp" \ column_in_metadata& set_int96_timestamps(bool req) column_in_metadata& set_decimal_precision(uint8_t precision) column_in_metadata& child(size_type i) + column_in_metadata& set_output_as_binary(bool binary) cdef cppclass table_input_metadata: table_input_metadata() except +