Skip to content

Commit

Permalink
Merge branch 'cleanup' of https://github.com/galipremsagar/cudf into …
Browse files Browse the repository at this point in the history
…cleanup
  • Loading branch information
galipremsagar committed Nov 14, 2023
2 parents 083e328 + f9dc31a commit bf7a199
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 11 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/text/vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)

auto const vocab_col = [] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
__syncwarp();

for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
// add one if at the edge of a token or at the string's end
count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
// add one if at the edge of a token or if at the string's end
if (*itr) {
count += !(*(itr - 1));
} else {
count += (itr + 1 == d_output_end);
}
}
__syncwarp();

Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/text/tokenize_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)

TEST_F(TextTokenizeTest, VocabularyLongStrings)
{
cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose
cudf::test::strings_column_wrapper vocabulary(
{"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

std::vector<std::string> h_strings(
4,
"the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
"jumped over the mouse house with the dog");
"jumped over the mousé house with the dog ");
cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
auto input_view = cudf::strings_column_view(input);
auto delimiter = cudf::string_scalar(" ");
Expand All @@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)

using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
// clang-format off
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar


cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:

    # Opaque handle to the preprocessed merge-pairs table; built once by
    # load_merge_pairs and reused across byte_pair_encoding calls.
    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
        pass

    # Builds the merge-pairs table from a strings column; presumably each
    # row holds one space-separated pair (e.g. "e n") — confirm against the
    # nvtext header documentation.
    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
        const column_view &merge_pairs
    ) except +

    # Applies byte-pair encoding to each row of `strings`, inserting
    # `separator` between the resulting sub-tokens.
    cdef unique_ptr[column] byte_pair_encoding(
        const column_view &strings,
        const bpe_merge_pairs &merge_pairs,
        const string_scalar &separator
    ) except +
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
# =============================================================================

set(cython_sources
edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
50 changes: 50 additions & 0 deletions python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (c) 2023, NVIDIA CORPORATION.


from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
bpe_merge_pairs as cpp_bpe_merge_pairs,
byte_pair_encoding as cpp_byte_pair_encoding,
load_merge_pairs as cpp_load_merge_pairs,
)
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.scalar cimport DeviceScalar


cdef class BPEMergePairs:
    """Owns the libcudf ``nvtext::bpe_merge_pairs`` table produced by
    ``load_merge_pairs``; construct once and reuse for every encode call."""
    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj

    def __cinit__(self, Column merge_pairs):
        # Build the device-side merge-pairs table at construction time so
        # repeated byte_pair_encoding calls do not re-parse the pairs column.
        cdef column_view c_pairs = merge_pairs.view()
        with nogil:
            self.c_obj = move(cpp_load_merge_pairs(c_pairs))


@acquire_spill_lock()
def byte_pair_encoding(
    Column strings,
    BPEMergePairs merge_pairs,
    object separator
):
    """Encode each string in `strings` using the prebuilt `merge_pairs`
    table, joining sub-tokens with `separator`; returns a strings Column.
    `separator` must expose a ``device_value`` (a cudf string scalar)."""
    cdef column_view c_strings = strings.view()
    cdef DeviceScalar d_separator = separator.device_value
    # Raw pointer access to the underlying libcudf string_scalar; the
    # DeviceScalar above keeps it alive for the duration of the call.
    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
        .get_raw_ptr()
    cdef unique_ptr[column] c_result
    with nogil:
        c_result = move(
            cpp_byte_pair_encoding(
                c_strings,
                merge_pairs.c_obj.get()[0],
                c_separator[0]
            )
        )

    return Column.from_unique_ptr(move(c_result))
59 changes: 59 additions & 0 deletions python/cudf/cudf/core/byte_pair_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from __future__ import annotations

import cudf
from cudf._lib.nvtext.byte_pair_encode import (
BPEMergePairs as cpp_merge_pairs,
byte_pair_encoding as cpp_byte_pair_encoding,
)


class BytePairEncoder:
    """
    Given a merge pairs strings series, performs byte pair encoding on
    a strings series using the provided separator.

    Parameters
    ----------
    merges_pair : cudf.Series
        Strings column of merge pairs

    Returns
    -------
    BytePairEncoder
    """

    def __init__(self, merges_pair: "cudf.Series"):
        # Preprocess the merge pairs once; the resulting table is reused
        # for every subsequent __call__.
        self.merge_pairs = cpp_merge_pairs(merges_pair._column)

    def __call__(self, text, separator: str = " "):
        """
        Parameters
        ----------
        text : cudf string series
            The strings to be encoded.
        separator : str
            String inserted between the sub-tokens of each encoded
            string. Default is a single space.

        Returns
        -------
        Encoded strings

        Examples
        --------
        >>> import cudf
        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
        ...                    "c e", "es t", "en ce", "T h", "Th is",
        ...                    "t est", "s ent", "t h", "th is"])
        >>> bpe = BytePairEncoder(mps)
        >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
        >>> bpe(str_series)
        0    This is a sent ence
        1             this is it
        dtype: object
        """
        sep = cudf.Scalar(separator, dtype="str")
        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)

        return cudf.Series(result)
41 changes: 41 additions & 0 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest

import cudf
from cudf.core.byte_pair_encoding import BytePairEncoder
from cudf.core.tokenize_vocabulary import TokenizeVocabulary
from cudf.testing._utils import assert_eq

Expand Down Expand Up @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings():

actual = str1.str.jaccard_index(str2, jaccard_width)
assert_eq(expected, actual)


@pytest.mark.parametrize(
    "separator, input_text, results",
    [
        (" ", "thetestsentence", "the test sent ence"),
        ("_", "sentenceistest", "sent_ence_is_test"),
        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
    ],
)
def test_byte_pair_encoding(separator, input_text, results):
    """BytePairEncoder splits strings per the learned merge pairs and joins
    the sub-tokens with the given separator; nulls and empty strings must
    pass through unchanged."""
    # Merge-pair vocabulary: each row is one space-separated pair to merge.
    pairs_table = cudf.Series(
        [
            "t he",
            "h e",
            "e n",
            "i t",
            "i s",
            "e s",
            "en t",
            "c e",
            "es t",
            "en ce",
            "t h",
            "h i",
            "th is",
            "t est",
            "s i",
            "s ent",
        ]
    )
    encoder = BytePairEncoder(pairs_table)

    # Include None and "" to verify null and empty rows are preserved.
    strings = cudf.Series([input_text, None, "", input_text])

    expected = cudf.Series([results, None, "", results])

    actual = encoder(strings, separator)
    assert type(expected) == type(actual)
    assert_eq(expected, actual)

0 comments on commit bf7a199

Please sign in to comment.