Fix lang id example (NVIDIA#37)
* Fix lang id example

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add classifier unit tests

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add test for failure

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Remove failure test

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

---------

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
Signed-off-by: Nicole Luo <nluo@nvidia.com>
ryantwolf authored and nicoleeeluo committed May 20, 2024
1 parent f2b3904 commit b192e92
Showing 3 changed files with 78 additions and 2 deletions.
2 changes: 1 addition & 1 deletion examples/identify_languages_and_fix_unicode.py
@@ -60,7 +60,7 @@ def main(args):

     # Remove the language score
     filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(
-        lambda score: score[1]
+        lambda score: score[1], meta=(None, str)
     )

     # Split the dataset by language
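Note on the fix above: Dask cannot reliably infer the output dtype of an element-wise apply, so meta=(None, str) declares up front that the result is an unnamed string Series (the language label pulled out of each [score, language] pair). A minimal standalone sketch of the same pattern, using hypothetical data and column names rather than the curator pipeline itself:

import dask
import dask.dataframe as dd
import pandas as pd

# Keep list-valued object columns as Python lists; this mirrors the
# constructor change in classifier_filter.py below.
dask.config.set({"dataframe.convert-string": False})

# Hypothetical data: each value is a [score, language] pair.
pdf = pd.DataFrame({"language": [[0.9, "EN"], [0.4, "PT"]]})
ddf = dd.from_pandas(pdf, npartitions=1)

# meta=(None, str) tells Dask the apply yields an unnamed string Series,
# so it does not have to sample the data to guess the dtype.
ddf["language"] = ddf["language"].apply(lambda score: score[1], meta=(None, str))
print(ddf.compute())  # the column now holds just "EN" and "PT"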
6 changes: 6 additions & 0 deletions nemo_curator/filters/classifier_filter.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import dask
 import fasttext
 import numpy as np
 import pandas as pd
@@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"

+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": False})
+
     @batched
     def score_document(self, df):
         model_attr = f"{self._name}_{self._model_path}"
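For background on the new constructor lines: dataframe.convert-string controls whether Dask eagerly backs object columns with its pyarrow string dtype. In the Dask versions this commit targets, that coercion also caught the list-valued [score, label] pairs produced by the language filter, turning each pair into a single string and breaking score[1]-style indexing (the linked issue NVIDIA/NeMo-Curator#33). A small sketch of the guard, with made-up scores standing in for real fastText output:

import dask
import dask.dataframe as dd
import pandas as pd

# Keep object columns as real Python objects instead of pyarrow strings.
dask.config.set({"dataframe.convert-string": False})

# Made-up [score, label] pairs in the shape the filter produces.
ddf = dd.from_pandas(
    pd.DataFrame({"lang_id": [[0.7, "EN"], [0.2, "PT"]]}), npartitions=1
)

# The pairs stay indexable, so the label can still be pulled out downstream.
print(ddf["lang_id"].apply(lambda pair: pair[1], meta=(None, str)).compute())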
72 changes: 71 additions & 1 deletion tests/test_filters.py
@@ -14,6 +14,8 @@

 import os

+import dask
+import numpy as np
 import pandas as pd
 import pytest
 from dask import dataframe as dd
@@ -508,7 +510,7 @@ def test_repeatedparagraphschar(self):
     def test_repeatingtopngrams(self):
         dataset = list_to_dataset(
             [
-                "this is a totally fine sentence with no repeating ngrams so we are ok",
+                "this is a totally fine sentence with no repeat ngrams so we are ok",
                 "a b . a b",
                 "a a a a a a",
                 "totally fine small dupe a b a b",
@@ -756,3 +758,71 @@ def test_per_extension_filter(self):
         assert all_equal(
             expected_data, filtered_data
         ), f"Expected {expected_data} but got {filtered_data}"
+
+
+class FakeQualityFilter(DocumentFilter):
+    """
+    Emulates FastTextQualityFilter without a model
+    """
+
+    def __init__(self, alpha=3, seed=42):
+        super().__init__()
+        self._alpha = alpha
+        self._seed = np.random.seed(seed)
+
+    @batched
+    def score_document(self, df):
+        return pd.Series(np.arange(len(df)) / len(df))
+
+    @batched
+    def keep_document(self, df):
+        return np.random.pareto(self._alpha, size=len(df)) > 1 - df
+
+
+class FakeLangId(DocumentFilter):
+    """
+    Emulates FastTextLangId without a model
+    """
+
+    def __init__(self, min_langid_score=0.3, convert_string=False):
+        super().__init__()
+        self._cutoff = min_langid_score
+
+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": convert_string})
+
+    @batched
+    def score_document(self, df):
+        scores = [[0.5, "EN"], [0.7, "HI"], [0.2, "PT"]]
+        scores = scores * len(df)
+        scores = scores[: len(df)]
+        return pd.Series(scores)
+
+    def keep_document(self, score):
+        return score[0] >= self._cutoff
+
+
+class TestClassifierFilters:
+    def test_fake_quality_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeQualityFilter())
+        filtered_data = filters(dataset)
+
+        expected_indices = [1, 2, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"
+
+    def test_fake_langid_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeLangId())
+        filtered_data = filters(dataset)
+
+        expected_indices = [0, 1, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"
