Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Work around incompatibilities between V2 page header handling and Zstandard compression in Parquet writer #14772

Merged
merged 14 commits into from
Jan 22, 2024
Merged
14 changes: 12 additions & 2 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size;

// Wraps a position into the circular RLE buffer; presumably equivalent to
// pos % rle_buffer_size — confirm against the rolling_index helper's definition.
constexpr int rolling_idx(int pos) { return rolling_index<rle_buffer_size>(pos); }

// Maximum encoded size in bytes of a V1 data page header, rounded up to an
// 8-byte boundary (presumably so data placed after the header stays aligned —
// confirm against the buffer-layout code).
// Also valid for dictionary page headers (V1 or V2).
// NOTE(review): the raw 27-byte figure presumably comes from the worst-case
// Thrift encoding of the V1 PageHeader fields — verify against the header
// encoder before changing.
constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8);

// Maximum encoded size in bytes of a V2 data page header, rounded up to an
// 8-byte boundary. Larger than V1 because DataPageHeaderV2 carries additional
// fields (49-byte worst case — TODO confirm against the header encoder).
constexpr int MAX_V2_HDR_SIZE = util::round_up_unsafe(49, 8);

// do not truncate statistics
constexpr int32_t NO_TRUNC_STATS = 0;

Expand Down Expand Up @@ -534,6 +541,9 @@ CUDF_KERNEL void __launch_bounds__(128)
uint32_t const t = threadIdx.x;
auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE;

// Max page header size excluding statistics
auto const max_data_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE;
vuule marked this conversation as resolved.
Show resolved Hide resolved

if (t == 0) {
col_g = col_desc[blockIdx.x];
ck_g = chunks[blockIdx.y][blockIdx.x];
Expand Down Expand Up @@ -584,7 +594,7 @@ CUDF_KERNEL void __launch_bounds__(128)
page_g.chunk = &chunks[blockIdx.y][blockIdx.x];
page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x;
page_g.hdr_size = 0;
page_g.max_hdr_size = 32;
page_g.max_hdr_size = MAX_V1_HDR_SIZE;
page_g.max_data_size = ck_g.uniq_data_size;
page_g.start_row = cur_row;
page_g.num_rows = ck_g.num_dict_entries;
Expand Down Expand Up @@ -684,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128)
page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x;
page_g.page_type = data_page_type;
page_g.hdr_size = 0;
page_g.max_hdr_size = 32; // Max size excluding statistics
page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics
if (ck_g.stats) {
uint32_t stats_hdr_len = 16;
if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) {
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/io/parquet/writer_impl.cu
vuule marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -2220,6 +2220,10 @@ writer::impl::~impl() { close(); }

void writer::impl::init_state()
{
// See issue #14781. Can remove this check once that is fixed.
CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD),
"V2 page headers cannot be used with ZSTD compression");

_current_chunk_offset.resize(_out_sink.size());
// Write file header
file_header_s fhdr;
Expand Down
15 changes: 15 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,21 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
EXPECT_EQ(max_value, std::string(max_val));
}

// Zstandard compression is currently incompatible with V2 page headers due to
// buffer alignment issues (see #14772), so the writer is expected to reject
// the combination up front with a cudf::logic_error.
// TODO: Remove this test when #14781 is closed.
TEST_F(ParquetWriterTest, ZstdWithV2Header)
{
  table_view const empty_table{};

  cudf::io::parquet_writer_options const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, empty_table)
      .write_v2_headers(true)
      .compression(cudf::io::compression_type::ZSTD);

  // The invalid option combination must surface as an exception, not a write.
  EXPECT_THROW(cudf::io::write_parquet(opts), cudf::logic_error);
}

// custom mem mapped data sink that supports device writes
template <bool supports_device_writes>
class custom_test_memmap_sink : public cudf::io::data_sink {
Expand Down