Skip to content

Commit

Permalink
Merge branch 'cleanup' of https://github.com/galipremsagar/cudf into …
Browse files Browse the repository at this point in the history
…cleanup
  • Loading branch information
galipremsagar committed Nov 14, 2023
2 parents 083e328 + f9dc31a commit bf7a199
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 11 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/text/vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)

auto const vocab_col = [] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
__syncwarp();

for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
// add one if at the edge of a token or at the string's end
count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
// add one if at the edge of a token or if at the string's end
if (*itr) {
count += !(*(itr - 1));
} else {
count += (itr + 1 == d_output_end);
}
}
__syncwarp();

Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/text/tokenize_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)

TEST_F(TextTokenizeTest, VocabularyLongStrings)
{
cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose
cudf::test::strings_column_wrapper vocabulary(
{"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

std::vector<std::string> h_strings(
4,
"the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
"jumped over the mouse house with the dog");
"jumped over the mousé house with the dog ");
cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
auto input_view = cudf::strings_column_view(input);
auto delimiter = cudf::string_scalar(" ");
Expand All @@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)

using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
// clang-format off
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar


cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:

    # Opaque handle to the preprocessed merge-pairs table; built once by
    # load_merge_pairs and reused across byte_pair_encoding calls.
    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
        pass

    # Builds the merge-pairs table from a strings column; presumably each
    # row holds one space-separated pair (e.g. "e n") — confirm against the
    # nvtext header documentation.
    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
        const column_view &merge_pairs
    ) except +

    # Applies byte-pair encoding to each row of `strings`, inserting
    # `separator` between the resulting sub-tokens.
    cdef unique_ptr[column] byte_pair_encoding(
        const column_view &strings,
        const bpe_merge_pairs &merge_pairs,
        const string_scalar &separator
    ) except +
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
# =============================================================================

set(cython_sources
edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
50 changes: 50 additions & 0 deletions python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (c) 2023, NVIDIA CORPORATION.


from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
bpe_merge_pairs as cpp_bpe_merge_pairs,
byte_pair_encoding as cpp_byte_pair_encoding,
load_merge_pairs as cpp_load_merge_pairs,
)
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.scalar cimport DeviceScalar


cdef class BPEMergePairs:
    """Owns the libcudf ``nvtext::bpe_merge_pairs`` table produced by
    ``load_merge_pairs``; construct once and reuse for every encode call."""
    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj

    def __cinit__(self, Column merge_pairs):
        # Build the device-side merge-pairs table at construction time so
        # repeated byte_pair_encoding calls do not re-parse the pairs column.
        cdef column_view c_pairs = merge_pairs.view()
        with nogil:
            self.c_obj = move(cpp_load_merge_pairs(c_pairs))


@acquire_spill_lock()
def byte_pair_encoding(
    Column strings,
    BPEMergePairs merge_pairs,
    object separator
):
    """Encode each string in `strings` using the prebuilt `merge_pairs`
    table, joining sub-tokens with `separator`; returns a strings Column.
    `separator` must expose a ``device_value`` (a cudf string scalar)."""
    cdef column_view c_strings = strings.view()
    cdef DeviceScalar d_separator = separator.device_value
    # Raw pointer access to the underlying libcudf string_scalar; the
    # DeviceScalar above keeps it alive for the duration of the call.
    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
        .get_raw_ptr()
    cdef unique_ptr[column] c_result
    with nogil:
        c_result = move(
            cpp_byte_pair_encoding(
                c_strings,
                merge_pairs.c_obj.get()[0],
                c_separator[0]
            )
        )

    return Column.from_unique_ptr(move(c_result))
59 changes: 59 additions & 0 deletions python/cudf/cudf/core/byte_pair_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from __future__ import annotations

import cudf
from cudf._lib.nvtext.byte_pair_encode import (
BPEMergePairs as cpp_merge_pairs,
byte_pair_encoding as cpp_byte_pair_encoding,
)


class BytePairEncoder:
    """
    Given a merge pairs strings series, performs byte pair encoding on
    a strings series using the provided separator.

    Parameters
    ----------
    merges_pair : cudf.Series
        Strings column of merge pairs

    Returns
    -------
    BytePairEncoder
    """

    def __init__(self, merges_pair: "cudf.Series"):
        # Preprocess the merge pairs once; the resulting table is reused
        # for every subsequent __call__.
        self.merge_pairs = cpp_merge_pairs(merges_pair._column)

    def __call__(self, text, separator: str = " "):
        """
        Parameters
        ----------
        text : cudf string series
            The strings to be encoded.
        separator : str
            String inserted between the sub-tokens of each encoded
            string. Default is a single space.

        Returns
        -------
        Encoded strings

        Examples
        --------
        >>> import cudf
        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
        ...                    "c e", "es t", "en ce", "T h", "Th is",
        ...                    "t est", "s ent", "t h", "th is"])
        >>> bpe = BytePairEncoder(mps)
        >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
        >>> bpe(str_series)
        0    This is a sent ence
        1             this is it
        dtype: object
        """
        sep = cudf.Scalar(separator, dtype="str")
        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)

        return cudf.Series(result)
41 changes: 41 additions & 0 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest

import cudf
from cudf.core.byte_pair_encoding import BytePairEncoder
from cudf.core.tokenize_vocabulary import TokenizeVocabulary
from cudf.testing._utils import assert_eq

Expand Down Expand Up @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings():

actual = str1.str.jaccard_index(str2, jaccard_width)
assert_eq(expected, actual)


@pytest.mark.parametrize(
    "separator, input_text, results",
    [
        (" ", "thetestsentence", "the test sent ence"),
        ("_", "sentenceistest", "sent_ence_is_test"),
        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
    ],
)
def test_byte_pair_encoding(separator, input_text, results):
    """BytePairEncoder splits strings per the learned merge pairs and joins
    the sub-tokens with the given separator; nulls and empty strings must
    pass through unchanged."""
    # Merge-pair vocabulary: each row is one space-separated pair to merge.
    pairs_table = cudf.Series(
        [
            "t he",
            "h e",
            "e n",
            "i t",
            "i s",
            "e s",
            "en t",
            "c e",
            "es t",
            "en ce",
            "t h",
            "h i",
            "th is",
            "t est",
            "s i",
            "s ent",
        ]
    )
    encoder = BytePairEncoder(pairs_table)

    # Include None and "" to verify null and empty rows are preserved.
    strings = cudf.Series([input_text, None, "", input_text])

    expected = cudf.Series([results, None, "", results])

    actual = encoder(strings, separator)
    assert type(expected) == type(actual)
    assert_eq(expected, actual)

0 comments on commit bf7a199

Please sign in to comment.