Skip to content

Commit

Permalink
Fix read_text when byte_range is aligned with field (NVIDIA#11371)
Browse files Browse the repository at this point in the history
Currently, if the beginning of a field coincides with either the beginning (inclusive) or end (exclusive) of a byte range, the field will be part of the output. This PR fixes the resulting field duplication if we concatenate the results from a partition of the input into byte ranges.

The issue stems from the fact that we use lower_bound to determine the beginning of a field, but upper_bound to determine its end, so if the end of the byte range coincides with the beginning of a field, the result from the range [a,b) doesn't fit exactly onto the result from the range [b,c).

To keep the previous behavior of emitting an empty field if the input ends with a delimiter, I needed to add a small fix that differentiates between byte ranges whose size matches the input size exactly, and ones that overrun the input size (which is the default behavior).

Authors:
  - Tobias Ribizel (https://github.com/upsj)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai/cudf#11371
  • Loading branch information
upsj authored Aug 2, 2022
1 parent 797215b commit 2dc5c3f
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 6 deletions.
27 changes: 22 additions & 5 deletions cpp/src/io/text/multibyte_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
CUDF_EXPECTS(trie.size() < multistate::max_segment_value,
"delimiter contains too many total tokens to produce a deterministic result.");

// Empty byte ranges could still produce output if the beginning overlapped with the beginning of
// a field with the logic below. Best disallow them instead.
CUDF_EXPECTS(byte_range.size() > 0, "byte range cannot be empty");

auto concurrency = 2;
// must be at least 32 when using warp-reduce on partials
// must be at least 1 more than max possible concurrent tiles
Expand Down Expand Up @@ -383,16 +387,29 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
stream,
streams);

// String offsets point to the first character of a field
// This finds the first field whose first character starts inside or after the byte range
auto relevant_offsets_begin = thrust::lower_bound(rmm::exec_policy(stream),
string_offsets.begin(),
string_offsets.end() - 1,
byte_range.offset());

auto relevant_offsets_end = thrust::upper_bound(rmm::exec_policy(stream),
string_offsets.begin(),
string_offsets.end() - 1,
byte_range.offset() + byte_range.size()) +
1;
// This finds the first field beginning after the byte range.
// We shift it by 1 to also copy this last offset
auto relevant_offsets_end = 1 + thrust::lower_bound(rmm::exec_policy(stream),
string_offsets.begin(),
string_offsets.end() - 1,
byte_range.offset() + byte_range.size());

// The above logic works if there are no duplicate string_offsets entries.
// The only way we can get duplicates is if the input ends with a delimiter.
// relevant_offsets_end should then point to the last entry, not the second-to-last, which can
// happen if byte_range.offset() + byte_range.size() matches the input size exactly.
// Without this adjustment, string_offsets_out would be missing the last element.
bool last_field_empty = string_offsets.size() >= 2 &&
string_offsets.element(string_offsets.size() - 2, stream) == bytes_total;
bool byte_range_exact_end = byte_range.offset() + byte_range.size() == bytes_total;
if (last_field_empty && byte_range_exact_end) { ++relevant_offsets_end; }

auto string_offsets_out_size = relevant_offsets_end - relevant_offsets_begin;

Expand Down
61 changes: 61 additions & 0 deletions cpp/tests/io/text/multibyte_split_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ using namespace test;
struct MultibyteSplitTest : public BaseFixture {
};

TEST_F(MultibyteSplitTest, Simple)
{
auto delimiter = std::string(":");
auto host_input = std::string("abc:def");

auto expected = strings_column_wrapper{"abc:", "def"};

auto source = cudf::io::text::make_source(host_input);
auto out = cudf::io::text::multibyte_split(*source, delimiter);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
}

TEST_F(MultibyteSplitTest, NondeterministicMatching)
{
auto delimiter = std::string("abac");
Expand All @@ -62,6 +75,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
}

TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
{
auto delimiter = std::string(":");
auto host_input = std::string("abcdefg:");

auto expected = strings_column_wrapper{"abcdefg:", ""};

auto source = cudf::io::text::make_source(host_input);
auto out = cudf::io::text::multibyte_split(
*source,
delimiter,
cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
}

TEST_F(MultibyteSplitTest, LargeInput)
{
auto host_input = std::string();
Expand Down Expand Up @@ -169,4 +198,36 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS);
}

TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges)
{
using namespace cudf::io::text;

auto host_input = std::string();
for (auto i = 0; i < 5; i++) {
host_input += "::";
}

auto delimiter = std::string("::");
auto source = make_source(host_input);

// for all possible ways to split the input, check that each field is only output once
int size = static_cast<int>(host_input.size());
for (int split1 = 1; split1 < size; split1++) {
SCOPED_TRACE(split1);
for (int split2 = split1 + 1; split2 < size; split2++) {
SCOPED_TRACE(split2);
auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});

auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
auto out = cudf::concatenate(out_views);

auto expected = multibyte_split(*source, delimiter);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS);
}
}
}

CUDF_TEST_PROGRAM_MAIN()
2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/io/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def read_text(path, chunksize="256 MiB", **kwargs):
kwargs1 = kwargs.copy()
kwargs1["byte_range"] = (
start,
chunksize - 1,
chunksize,
) # specify which chunk of the file we care about

dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1)
Expand Down

0 comments on commit 2dc5c3f

Please sign in to comment.