diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index ce0b4c10b1b..0dfc8a89825 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -341,6 +341,10 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_EXPECTS(trie.size() < multistate::max_segment_value, "delimiter contains too many total tokens to produce a deterministic result."); + // Empty byte ranges could still produce output if the beginning overlapped with the beginning of + // a field with the logic below. Best disallow them instead. + CUDF_EXPECTS(byte_range.size() > 0, "byte range cannot be empty"); + auto concurrency = 2; // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles @@ -383,16 +387,29 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source stream, streams); + // String offsets point to the first character of a field + // This finds the first field whose first character starts inside or after the byte range auto relevant_offsets_begin = thrust::lower_bound(rmm::exec_policy(stream), string_offsets.begin(), string_offsets.end() - 1, byte_range.offset()); - auto relevant_offsets_end = thrust::upper_bound(rmm::exec_policy(stream), - string_offsets.begin(), - string_offsets.end() - 1, - byte_range.offset() + byte_range.size()) + - 1; + // This finds the first field beginning after the byte range. + // We shift it by 1 to also copy this last offset + auto relevant_offsets_end = 1 + thrust::lower_bound(rmm::exec_policy(stream), + string_offsets.begin(), + string_offsets.end() - 1, + byte_range.offset() + byte_range.size()); + + // The above logic works if there are no duplicate string_offsets entries. + // The only way we can get duplicates is if the input ends with a delimiter. + // relevant_offsets_end should then point to the last entry, not the second-to-last, which can + // happen if byte_range.offset() + byte_range.size() matches the input size exactly. + // Without this adjustment, string_offsets_out would be missing the last element. + bool last_field_empty = string_offsets.size() >= 2 && + string_offsets.element(string_offsets.size() - 2, stream) == bytes_total; + bool byte_range_exact_end = byte_range.offset() + byte_range.size() == bytes_total; + if (last_field_empty && byte_range_exact_end) { ++relevant_offsets_end; } auto string_offsets_out_size = relevant_offsets_end - relevant_offsets_begin; diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index cfd1a16f19a..80f78d43985 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -36,6 +36,19 @@ using namespace test; struct MultibyteSplitTest : public BaseFixture { }; +TEST_F(MultibyteSplitTest, Simple) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abc:def"); + + auto expected = strings_column_wrapper{"abc:", "def"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, NondeterministicMatching) { auto delimiter = std::string("abac"); @@ -62,6 +75,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } +TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg:"); + + auto expected = strings_column_wrapper{"abcdefg:", ""}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split( + *source, + delimiter, + cudf::io::text::byte_range_info{0, static_cast(host_input.size())}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + TEST_F(MultibyteSplitTest, LargeInput) { auto host_input = std::string(); @@ -169,4 +198,36 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) +{ + using namespace cudf::io::text; + + auto host_input = std::string(); + for (auto i = 0; i < 5; i++) { + host_input += "::"; + } + + auto delimiter = std::string("::"); + auto source = make_source(host_input); + + // for all possible ways to split the input, check that each field is only output once + int size = static_cast(host_input.size()); + for (int split1 = 1; split1 < size; split1++) { + SCOPED_TRACE(split1); + for (int split2 = split1 + 1; split2 < size; split2++) { + SCOPED_TRACE(split2); + auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); + auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); + auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + + auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); + } + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 5582e16b384..2adace565d5 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -39,7 +39,7 @@ def read_text(path, chunksize="256 MiB", **kwargs): kwargs1 = kwargs.copy() kwargs1["byte_range"] = ( start, - chunksize - 1, + chunksize, ) # specify which chunk of the file we care about dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1)