Fix read_text when byte_range is aligned with field (NVIDIA#11371)

Currently, if the beginning of a field coincides with either the beginning (inclusive) or end (exclusive) of a byte range, the field will be part of the output. This PR fixes the resulting field duplication if we concatenate the results from a partition of the input into byte ranges. The issue stems from the fact that we use lower_bound to determine the beginning of a field, but upper_bound to determine its end, so if the end of the byte range coincides with the beginning of a field, the result from the range [a,b) doesn't fit exactly onto the result from the range [b,c). To keep the previous behavior of emitting an empty field if the input ends with a delimiter, I needed to add a small fix that differentiates between byte ranges whose size matches the input size exactly, and ones that overrun the input size (which is the default behavior). Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Christopher Harris (https://github.com/cwharris) - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) - Charles Blackmon-Luca (https://github.com/charlesbluca) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: rapidsai/cudf#11371
tgravescs · Aug 2, 2022 · 2dc5c3f · 2dc5c3f
1 parent 797215b
commit 2dc5c3f
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 6 deletions.
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
@@ -341,6 +341,10 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   CUDF_EXPECTS(trie.size() < multistate::max_segment_value,
                "delimiter contains too many total tokens to produce a deterministic result.");
 
+  // Empty byte ranges could still produce output if the beginning overlapped with the beginning of
+  // a field with the logic below. Best disallow them instead.
+  CUDF_EXPECTS(byte_range.size() > 0, "byte range cannot be empty");
+
   auto concurrency = 2;
   // must be at least 32 when using warp-reduce on partials
   // must be at least 1 more than max possible concurrent tiles
@@ -383,16 +387,29 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
     stream,
     streams);
 
+  // String offsets point to the first character of a field
+  // This finds the first field whose first character starts inside or after the byte range
   auto relevant_offsets_begin = thrust::lower_bound(rmm::exec_policy(stream),
                                                     string_offsets.begin(),
                                                     string_offsets.end() - 1,
                                                     byte_range.offset());
 
-  auto relevant_offsets_end = thrust::upper_bound(rmm::exec_policy(stream),
-                                                  string_offsets.begin(),
-                                                  string_offsets.end() - 1,
-                                                  byte_range.offset() + byte_range.size()) +
-                              1;
+  // This finds the first field beginning after the byte range.
+  // We shift it by 1 to also copy this last offset
+  auto relevant_offsets_end = 1 + thrust::lower_bound(rmm::exec_policy(stream),
+                                                      string_offsets.begin(),
+                                                      string_offsets.end() - 1,
+                                                      byte_range.offset() + byte_range.size());
+
+  // The above logic works if there are no duplicate string_offsets entries.
+  // The only way we can get duplicates is if the input ends with a delimiter.
+  // relevant_offsets_end should then point to the last entry, not the second-to-last, which can
+  // happen if byte_range.offset() + byte_range.size() matches the input size exactly.
+  // Without this adjustment, string_offsets_out would be missing the last element.
+  bool last_field_empty = string_offsets.size() >= 2 &&
+                          string_offsets.element(string_offsets.size() - 2, stream) == bytes_total;
+  bool byte_range_exact_end = byte_range.offset() + byte_range.size() == bytes_total;
+  if (last_field_empty && byte_range_exact_end) { ++relevant_offsets_end; }
 
   auto string_offsets_out_size = relevant_offsets_end - relevant_offsets_begin;
 

diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -36,6 +36,19 @@ using namespace test;
 struct MultibyteSplitTest : public BaseFixture {
 };
 
+TEST_F(MultibyteSplitTest, Simple)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abc:def");
+
+  auto expected = strings_column_wrapper{"abc:", "def"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, NondeterministicMatching)
 {
   auto delimiter  = std::string("abac");
@@ -62,6 +75,22 @@ TEST_F(MultibyteSplitTest, DelimiterAtEnd)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
 
+TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abcdefg:");
+
+  auto expected = strings_column_wrapper{"abcdefg:", ""};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(
+    *source,
+    delimiter,
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
+}
+
 TEST_F(MultibyteSplitTest, LargeInput)
 {
   auto host_input    = std::string();
@@ -169,4 +198,36 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS);
 }
 
+TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges)
+{
+  using namespace cudf::io::text;
+
+  auto host_input = std::string();
+  for (auto i = 0; i < 5; i++) {
+    host_input += "::";
+  }
+
+  auto delimiter = std::string("::");
+  auto source    = make_source(host_input);
+
+  // for all possible ways to split the input, check that each field is only output once
+  int size = static_cast<int>(host_input.size());
+  for (int split1 = 1; split1 < size; split1++) {
+    SCOPED_TRACE(split1);
+    for (int split2 = split1 + 1; split2 < size; split2++) {
+      SCOPED_TRACE(split2);
+      auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
+      auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
+      auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
+
+      auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
+      auto out       = cudf::concatenate(out_views);
+
+      auto expected = multibyte_split(*source, delimiter);
+
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS);
+    }
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
@@ -39,7 +39,7 @@ def read_text(path, chunksize="256 MiB", **kwargs):
                 kwargs1 = kwargs.copy()
                 kwargs1["byte_range"] = (
                     start,
-                    chunksize - 1,
+                    chunksize,
                 )  # specify which chunk of the file we care about
 
                 dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1)