From 4eed4107f55a8e25122a1befec3de3fc5d3b794b Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Tue, 24 Aug 2021 23:37:05 +0530 Subject: [PATCH] Add support for reading ORC file with no row group index (#9060) The ORC reader in cuIO was designed on the assumption that a row group index is always available, so reading a file without one failed. Changes have been made to read ORC files even when the row group index stream is not available. closes #8878 Authors: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Devavret Makkar (https://github.com/devavret) - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9060 --- cpp/src/io/orc/reader_impl.cu | 51 ++++++++++++------ .../TestOrcFile.NoIndStrm.IntWithNulls.orc | Bin 0 -> 101 bytes ...dStrm.StructAndIntWithNulls.TwoStripes.orc | Bin 0 -> 232 bytes ...rcFile.NoIndStrm.StructAndIntWithNulls.orc | Bin 0 -> 193 bytes ...estOrcFile.NoIndStrm.StructWithNoNulls.orc | Bin 0 -> 167 bytes python/cudf/cudf/tests/test_orc.py | 18 +++++++ 6 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 833c88f3788..191050d5b65 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -271,6 +271,7 @@ class aggregate_orc_metadata { size_type const num_rows; size_type const num_columns; size_type const num_stripes; + bool row_grp_idx_present = true; /** * @brief Create a 
metadata object from each element in the source vector @@ -370,6 +371,8 @@ class aggregate_orc_metadata { return per_file_metadata[source_idx].get_column_name(column_idx); } + auto is_row_grp_idx_present() const { return row_grp_idx_present; } + std::vector select_stripes( std::vector> const& user_specified_stripes, size_type& row_start, @@ -459,6 +462,7 @@ class aggregate_orc_metadata { ProtobufReader(sf_data, sf_length) .read(per_file_metadata[mapping.source_idx].stripefooters[i]); mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + if (stripe->indexLength == 0) { row_grp_idx_present = false; } } } } @@ -1140,6 +1144,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Association between each ORC column and its cudf::column _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1); std::vector nested_col; + bool is_data_empty = false; // Get a list of column data types std::vector column_types; @@ -1197,6 +1202,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, const bool use_index = (_use_index == true) && + // Do stripes have row group index + _metadata->is_row_grp_idx_present() && // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) && @@ -1244,13 +1251,21 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_info, level == 0); - CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe"); + if (total_data_size == 0) { + CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); + // In case ROW GROUP INDEX is not present and all columns are structs with no null + // stream, there is nothing to read at this level. 
+ auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; }; + CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype), + "Expected streams data within stripe"); + is_data_empty = true; + } stripe_data.emplace_back(total_data_size, stream); auto dst_base = static_cast(stripe_data.back().data()); // Coalesce consecutive streams into one read - while (stream_count < stream_info.size()) { + while (not is_data_empty and stream_count < stream_info.size()) { const auto d_dst = dst_base + stream_info[stream_count].dst_pos; const auto offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; @@ -1332,8 +1347,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (chunk.type_kind == orc::TIMESTAMP) { chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); } - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + if (not is_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } } } stripe_start_row += num_rows_per_stripe; @@ -1367,7 +1384,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata->per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, @@ -1418,19 +1435,23 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr); } - decode_stream_data(chunks, - num_dict_entries, - skip_rows, - tz_table.view(), - row_groups, - _metadata->get_row_index_stride(), - out_buffers[level], - level, - stream); + if (not is_data_empty) { + decode_stream_data(chunks, + num_dict_entries, + skip_rows, + tz_table.view(), + row_groups, + 
_metadata->get_row_index_stride(), + out_buffers[level], + level, + stream); + } // Extract information to process nested child columns if (nested_col.size()) { - scan_null_counts(chunks, null_count_prefix_sums[level], stream); + if (not is_data_empty) { + scan_null_counts(chunks, null_count_prefix_sums[level], stream); + } row_groups.device_to_host(stream, true); aggregate_child_meta(chunks, row_groups, out_buffers[level], nested_col, level); } diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc new file mode 100644 index 0000000000000000000000000000000000000000..2103e0212fcdcc9a110e0dbe550d2fcb94bb640d GIT binary patch literal 101 zcmeYda%N><_ewK`ddygwP8Y z9x!aOO~_G|w<_diX#vDJf-glFyVRpXLuUW-OJMyhLL1)EO2t6qR(=1odbJ^{h+~)G&L%z`)@k Zz$nq6pv1($qrqUn+4O~(*+0lx3;@GaM5X`$ literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc new file mode 100644 index 0000000000000000000000000000000000000000..1c6e53a0b92f68b996b4a249b7050eae715331e9 GIT binary patch literal 167 zcmeYda%N{>U}a3=E$=&YnDZ-Sh)PaGSs>4kt-x4PlN{u7zh5rdTp?f|Qb!=@QCW-OgC`Ejpf%R(Vxc42|d(gh3*9A*NH5)BGUObpx_ P3