Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix out of bound write handling for get_json_object_multiple_paths #2285

Merged
merged 5 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 33 additions & 50 deletions src/main/cpp/src/get_json_object.cu
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,13 @@ struct json_path_processing_data {
*
* The number of warps processing each row is computed as `ceil(num_paths / warp_size)`.
*
* We explicitly set a value for `min_block_per_sm` parameter in the launch bounds to avoid
* spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
* the maximum number of registers used by a kernel and the parallelism of the kernel.
* If lots of registers are used the parallelism may suffer. But in our case NVCC gets this wrong
* and we want to avoid spilling all the time or else the performance is really bad. This
* essentially tells NVCC to prefer using lots of registers over spilling.
*
* @param input The input JSON strings stored in a strings column
* @param path_data Array containing all path data
* @param num_threads_per_row Number of threads processing each input row
Expand Down Expand Up @@ -848,51 +855,27 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL

/**
* @brief A utility class to launch the main kernel.
*
* It caches the kernel launch parameters to reuse multiple times.
*/
class kernel_launcher {
public:
explicit kernel_launcher(cudf::size_type _input_size, std::size_t _path_size)
: input_size{_input_size},
path_size{_path_size},
num_threads_per_row{cudf::util::div_rounding_up_safe(
path_size, static_cast<std::size_t>(cudf::detail::warp_size)) *
cudf::detail::warp_size},
num_blocks{cudf::util::div_rounding_up_safe(num_threads_per_row * input_size,
static_cast<std::size_t>(block_size))}
struct kernel_launcher {
static void exec(cudf::column_device_view const& input,
cudf::device_span<json_path_processing_data> path_data,
rmm::cuda_stream_view stream)
{
}

void exec(cudf::column_device_view const& input,
cudf::device_span<json_path_processing_data> path_data,
rmm::cuda_stream_view stream) const
{
CUDF_EXPECTS(input.size() == input_size && path_data.size() == path_size,
"Unexpected data sizes upon launching kernel.");
// The optimal values for block_size and min_block_per_sm were found through testing,
// which are either 128-8 or 256-4. The pair 128-8 seems a bit better.
static constexpr int block_size = 128;
static constexpr int min_block_per_sm = 8;

// The number of threads for processing one input row is at least one warp.
auto const num_threads_per_row =
cudf::util::div_rounding_up_safe(path_data.size(),
static_cast<std::size_t>(cudf::detail::warp_size)) *
cudf::detail::warp_size;
auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(),
static_cast<std::size_t>(block_size));
get_json_object_kernel<block_size, min_block_per_sm>
<<<num_blocks, block_size, 0, stream.value()>>>(input, path_data, num_threads_per_row);
}

private:
// We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid
// spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
// the maximum number of registers used by a kernel and the parallelism of the kernel.
// If lots of registers are used the parallelism may suffer. But in our case
// NVCC gets this wrong and we want to avoid spilling all the time or else
// the performance is really bad. This essentially tells NVCC to prefer using lots
// of registers over spilling.
//
// The optimal values for block_size and min_block_per_sm were found through testing,
// which are 128-8 or 256-4.
static constexpr int block_size = 128;
static constexpr int min_block_per_sm = 8;

cudf::size_type const input_size;
std::size_t const path_size;

std::size_t const num_threads_per_row;
std::size_t const num_blocks;
};

/**
Expand Down Expand Up @@ -1038,9 +1021,7 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
h_path_data, stream, rmm::mr::get_current_device_resource());
thrust::uninitialized_fill(
rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0);

auto const kernel = kernel_launcher{input.size(), json_paths.size()};
kernel.exec(*d_input_ptr, d_path_data, stream);
kernel_launcher::exec(*d_input_ptr, d_path_data, stream);

// Do not use parallel check since we do not have many elements.
auto h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream);
Expand Down Expand Up @@ -1112,7 +1093,7 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
h_path_data, stream, rmm::mr::get_current_device_resource());
thrust::uninitialized_fill(
rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0);
kernel.exec(*d_input_ptr, d_path_data, stream);
kernel_launcher::exec(*d_input_ptr, d_path_data, stream);
Copy link
Collaborator Author

@ttnghia ttnghia Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the kernel is launched for the second time, the number of paths differs from the number used in the first launch, so the kernel launch parameters (threads per row, number of blocks) must be recomputed rather than reused from a cached launcher.


// Check out of bound again to make sure everything looks right.
h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream);
Expand All @@ -1123,12 +1104,14 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
// If OOB is still detected, there must be something wrong happened.
CUDF_EXPECTS(has_no_oob, "Unexpected out-of-bound write in get_json_object kernel.");

for (auto const idx : oob_indices) {
output[idx] = cudf::make_strings_column(input.size(),
std::move(out_offsets_and_sizes[idx].first),
out_char_buffers[idx].release(),
out_null_masks_and_null_counts[idx].second,
std::move(out_null_masks_and_null_counts[idx].first));
for (std::size_t idx = 0; idx < oob_indices.size(); ++idx) {
auto const out_idx = oob_indices[idx];
output[out_idx] =
cudf::make_strings_column(input.size(),
std::move(out_offsets_and_sizes[idx].first),
out_char_buffers[idx].release(),
out_null_masks_and_null_counts[idx].second,
std::move(out_null_masks_and_null_counts[idx].first));
Comment on lines +1107 to +1114
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output index `out_idx` (looked up from `oob_indices`) and the retry-buffer index `idx` (the position within the per-retry arrays) are distinct values. The previous code used the same value for both, which wrote the retried results into the wrong output columns. That is incorrect.

}
return output;
}
Expand Down
23 changes: 23 additions & 0 deletions src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,29 @@ void getJsonObjectMultiplePathsTest() {
}
}

@Test
// Regression test for the out-of-bound retry path: uses three paths (two named
// lookups "k0"/"k1" plus an empty path, which selects the whole document) over
// two JSON rows. The second row's many escaped newlines make the root-path
// output larger than the initial sizing, so the native side detects an
// out-of-bound write and launches the kernel a second time — presumably the
// scenario fixed by this change (see the kernel_launcher rework in
// get_json_object.cu); confirm against the native OOB handling.
void getJsonObjectMultiplePathsTest_JNIKernelCalledTwice() {
List<JSONUtils.PathInstructionJni> path0 = Arrays.asList(namedPath("k0"));
List<JSONUtils.PathInstructionJni> path1 = Arrays.asList(namedPath("k1"));
// Empty instruction list: queries the root of the JSON document.
List<JSONUtils.PathInstructionJni> path2 = Arrays.asList();
List<List<JSONUtils.PathInstructionJni>> paths = Arrays.asList(path0, path1, path2);
try (ColumnVector jsonCv = ColumnVector.fromStrings("{\"k0\": \"v0\", \"k1\": \"v1\"}", "['\n\n\n\n\n\n\n\n\n\n']");
ColumnVector expected0 = ColumnVector.fromStrings("v0", null);
ColumnVector expected1 = ColumnVector.fromStrings("v1", null);
ColumnVector expected2 = ColumnVector.fromStrings("{\"k0\":\"v0\",\"k1\":\"v1\"}", "[\"\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"]")) {
ColumnVector[] output = JSONUtils.getJsonObjectMultiplePaths(jsonCv, paths);
try {
assertColumnsAreEqual(expected0, output[0]);
assertColumnsAreEqual(expected1, output[1]);
assertColumnsAreEqual(expected2, output[2]);
} finally {
// The output columns are owned by this test and must be released
// regardless of assertion outcome.
for (ColumnVector cv : output) {
cv.close();
}
}
}
}

/**
 * Builds a wildcard path instruction ([*]); the name argument is unused for
 * this instruction type and the index is the sentinel -1.
 */
private JSONUtils.PathInstructionJni wildcardPath() {
  final JSONUtils.PathInstructionType kind = JSONUtils.PathInstructionType.WILDCARD;
  return new JSONUtils.PathInstructionJni(kind, "", -1);
}
Expand Down
Loading