Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic validation in reader benchmarks #14647

Merged
merged 17 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions cpp/benchmarks/io/csv/csv_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,17 @@ void csv_read_common(DataType const& data_types,

auto const mem_stats_logger = cudf::memory_stats_logger(); // init stats logger
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache(); // Drop L3 cache for accurate measurement

timer.start();
cudf::io::read_csv(read_options);
timer.stop();
});
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache(); // Drop L3 cache for accurate measurement

timer.start();
auto const result = cudf::io::read_csv(read_options);
timer.stop();

CUDF_EXPECTS(result.tbl->num_columns() == view.num_columns(), "Unexpected number of columns");
CUDF_EXPECTS(result.tbl->num_rows() == view.num_rows(), "Unexpected number of rows");
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
Expand Down
43 changes: 25 additions & 18 deletions cpp/benchmarks/io/csv/csv_reader_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/detail/utilities/default_stream.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

Expand All @@ -39,8 +40,9 @@ void BM_csv_read_varying_options(
static_cast<int32_t>(data_type::DURATION),
static_cast<int32_t>(data_type::STRING)}),
ColSelection);
auto const cols_to_read = select_column_indexes(data_types.size(), ColSelection);
auto const num_chunks = state.get_int64("num_chunks");
auto const cols_to_read = select_column_indexes(data_types.size(), ColSelection);
cudf::size_type const expected_num_cols = cols_to_read.size();
size_t const num_chunks = state.get_int64("num_chunks");

auto const tbl = create_random_table(data_types, table_size_bytes{data_size});
auto const view = tbl->view();
Expand All @@ -60,43 +62,48 @@ void BM_csv_read_varying_options(
.comment('#')
.prefix("BM_");

size_t const chunk_size = source_sink.size() / num_chunks;
cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
auto const mem_stats_logger = cudf::memory_stats_logger();
size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks);
auto const chunk_row_cnt =
cudf::util::div_rounding_up_safe(view.num_rows(), static_cast<cudf::size_type>(num_chunks));
Comment on lines +65 to +67
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old approach rounded down and lost some rows; adding the check uncovered the issue.
Also, some of the logic in the loop was simplified by rounding up here.

auto const mem_stats_logger = cudf::memory_stats_logger();
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache(); // Drop L3 cache for accurate measurement

cudf::size_type num_rows_read = 0;
timer.start();
for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
// only read the header in the first chunk
read_options.set_header(chunk == 0 ? 0 : -1);

auto const is_last_chunk = chunk == (num_chunks - 1);
for (auto chunk = 0u; chunk < num_chunks; ++chunk) {
switch (RowSelection) {
case row_selection::ALL: break;
case row_selection::BYTE_RANGE:
// only read the header in the first chunk
read_options.set_header(chunk == 0 ? 0 : -1);
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
read_options.set_byte_range_offset(chunk * chunk_size);
read_options.set_byte_range_size(chunk_size);
if (is_last_chunk) read_options.set_byte_range_size(0);
break;
case row_selection::NROWS:
read_options.set_skiprows(chunk * chunk_row_cnt);
read_options.set_nrows(chunk_row_cnt);
if (is_last_chunk) read_options.set_nrows(-1);
break;
case row_selection::SKIPFOOTER:
case row_selection::SKIPFOOTER: {
read_options.set_skiprows(chunk * chunk_row_cnt);
read_options.set_skipfooter(view.num_rows() - (chunk + 1) * chunk_row_cnt);
if (is_last_chunk) read_options.set_skipfooter(0);
cudf::size_type const next_chunk_start = (chunk + 1) * chunk_row_cnt;
auto const skip_footer =
view.num_rows() > next_chunk_start ? view.num_rows() - next_chunk_start : 0;
read_options.set_skipfooter(skip_footer);
break;
}
default: CUDF_FAIL("Unsupported row selection method");
}

cudf::io::read_csv(read_options);
auto const result = cudf::io::read_csv(read_options);

num_rows_read += result.tbl->num_rows();
CUDF_EXPECTS(result.tbl->num_columns() == expected_num_cols,
"Unexpected number of columns");
}
timer.stop();
CUDF_EXPECTS(num_rows_read == view.num_rows(), "Unexpected number of rows");
});

auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
Expand Down
91 changes: 43 additions & 48 deletions cpp/benchmarks/io/json/json_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,26 @@
constexpr size_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state)
void json_read_common(cuio_source_sink_pair& source_sink,
cudf::size_type num_rows_to_read,
nvbench::state& state)
{
cudf::io::json_reader_options read_opts =
cudf::io::json_reader_options::builder(source_sink.make_source_info());

auto mem_stats_logger = cudf::memory_stats_logger();
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();

timer.start();
cudf::io::read_json(read_opts);
timer.stop();
});
timer.start();
auto const result = cudf::io::read_json(read_opts);
timer.stop();

CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
Expand All @@ -52,55 +57,45 @@ void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state)
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

/**
 * @brief Generates random benchmark input and writes it to `sink` as JSON.
 *
 * A random table is created with `num_cols` columns whose types are taken from
 * `dtypes` via `cycle_dtypes`, sized by `table_size_bytes{data_size}` and using a
 * default-constructed `data_profile_builder`. The table is then written with the
 * JSON writer, using "null" as the null representation and 100'000 rows per chunk.
 *
 * @param sink Destination the JSON data is written to
 * @param dtypes Type ids passed to `cycle_dtypes` to build the column type list
 * @return Number of rows in the generated table, so that callers can compare it
 * against the row count produced by the reader
 */
cudf::size_type json_write_bm_data(cudf::io::sink_info sink,
                                   std::vector<cudf::type_id> const& dtypes)
{
  // Random input table; owned here and released when the function returns.
  auto const input_table = create_random_table(
    cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder());
  auto const input_view = input_table->view();

  // Configure and run the JSON writer on the generated data.
  cudf::io::json_writer_options const writer_options =
    cudf::io::json_writer_options::builder(sink, input_view)
      .na_rep("null")
      .rows_per_chunk(100'000);
  cudf::io::write_json(writer_options);

  return input_view.num_rows();
}

template <cudf::io::io_type IO>
void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
static_cast<int32_t>(data_type::STRING),
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

auto const source_type = IO;
cuio_source_sink_pair source_sink(source_type);

{
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder());
auto const view = tbl->view();

cudf::io::json_writer_options const write_opts =
cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
.na_rep("null")
.rows_per_chunk(100'000);
cudf::io::write_json(write_opts);
}

json_read_common(source_sink, state);
cuio_source_sink_pair source_sink(IO);
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
static_cast<int32_t>(data_type::STRING),
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});
auto const num_rows = json_write_bm_data(source_sink.make_sink_info(), d_type);

json_read_common(source_sink, num_rows, state);
}

template <data_type DataType, cudf::io::io_type IO>
void BM_json_read_data_type(
nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const source_type = IO;
cuio_source_sink_pair source_sink(source_type);
{
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder());
auto const view = tbl->view();

cudf::io::json_writer_options const write_opts =
cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
.na_rep("null")
.rows_per_chunk(100'000);
cudf::io::write_json(write_opts);
}
json_read_common(source_sink, state);
cuio_source_sink_pair source_sink(IO);
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const num_rows = json_write_bm_data(source_sink.make_sink_info(), d_type);

json_read_common(source_sink, num_rows, state);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
Expand Down
69 changes: 39 additions & 30 deletions cpp/benchmarks/io/orc/orc_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,26 @@
constexpr int64_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

void orc_read_common(cudf::io::orc_writer_options const& opts,
void orc_read_common(cudf::size_type num_rows_to_read,
cuio_source_sink_pair& source_sink,
nvbench::state& state)
{
cudf::io::write_orc(opts);

cudf::io::orc_reader_options read_opts =
cudf::io::orc_reader_options::builder(source_sink.make_source_info());

auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();

timer.start();
auto const result = cudf::io::read_orc(read_opts);
timer.stop();

timer.start();
cudf::io::read_orc(read_opts);
timer.stop();
});
CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
Expand All @@ -63,18 +64,22 @@ void BM_orc_read_data(nvbench::state& state,
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cuio_source_sink_pair source_sink(IOType);

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();
auto const num_rows = [&]() {
vuule marked this conversation as resolved.
Show resolved Hide resolved
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cudf::io::orc_writer_options opts =
cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view);
cudf::io::orc_writer_options opts =
cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view);
cudf::io::write_orc(opts);
return view.num_rows();
}();

orc_read_common(opts, source_sink, state);
orc_read_common(num_rows, source_sink, state);
}

template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
Expand All @@ -92,19 +97,23 @@ void BM_orc_read_io_compression(

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cudf::io::orc_writer_options opts =
cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view)
.compression(Compression);

orc_read_common(opts, source_sink, state);
auto const num_rows = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::orc_writer_options opts =
cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view)
.compression(Compression);
cudf::io::write_orc(opts);
return view.num_rows();
}();

orc_read_common(num_rows, source_sink, state);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
Expand Down
12 changes: 8 additions & 4 deletions cpp/benchmarks/io/orc/orc_reader_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ void BM_orc_read_varying_options(nvbench::state& state,

auto const cols_to_read =
select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection);
cudf::size_type const expected_num_cols = cols_to_read.size();
cudf::io::orc_reader_options read_options =
cudf::io::orc_reader_options::builder(source_sink.make_source_info())
.columns(cols_to_read)
Expand All @@ -96,9 +97,8 @@ void BM_orc_read_varying_options(nvbench::state& state,
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();

cudf::size_type num_rows_read = 0;
timer.start();
cudf::size_type rows_read = 0;
for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
switch (RowSelection) {
case row_selection::ALL: break;
Expand All @@ -112,11 +112,15 @@ void BM_orc_read_varying_options(nvbench::state& state,
default: CUDF_FAIL("Unsupported row selection method");
}

rows_read += cudf::io::read_orc(read_options).tbl->num_rows();
auto const result = cudf::io::read_orc(read_options);

num_rows_read += result.tbl->num_rows();
CUDF_EXPECTS(result.tbl->num_columns() == expected_num_cols,
"Unexpected number of columns");
}

CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table");
timer.stop();
CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
});

auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
Expand Down
Loading