From b192e92e7966573d816e9f76a3ee9352e0f9f572 Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Fri, 3 May 2024 08:38:47 -0700
Subject: [PATCH] Fix lang id example (#37)

* Fix lang id example

Signed-off-by: Ryan Wolf

* Add classifier unit tests

Signed-off-by: Ryan Wolf

* Add test for failure

Signed-off-by: Ryan Wolf

* Remove failure test

Signed-off-by: Ryan Wolf

---------

Signed-off-by: Ryan Wolf
Signed-off-by: Nicole Luo
---
 .../identify_languages_and_fix_unicode.py |  2 +-
 nemo_curator/filters/classifier_filter.py |  6 ++
 tests/test_filters.py                     | 72 ++++++++++++++++++-
 3 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py
index 933c6c23..a95dc690 100644
--- a/examples/identify_languages_and_fix_unicode.py
+++ b/examples/identify_languages_and_fix_unicode.py
@@ -60,7 +60,7 @@ def main(args):
 
     # Remove the language score
     filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(
-        lambda score: score[1]
+        lambda score: score[1], meta=(None, str)
     )
 
     # Split the dataset by language
diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py
index f32e2ff5..3ade004e 100644
--- a/nemo_curator/filters/classifier_filter.py
+++ b/nemo_curator/filters/classifier_filter.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import dask
 import fasttext
 import numpy as np
 import pandas as pd
@@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"
 
+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": False})
+
     @batched
     def score_document(self, df):
         model_attr = f"{self._name}_{self._model_path}"
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 4ab11c21..50676f38 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -14,6 +14,8 @@
 
 import os
 
+import dask
+import numpy as np
 import pandas as pd
 import pytest
 from dask import dataframe as dd
@@ -508,7 +510,7 @@ def test_repeatedparagraphschar(self):
     def test_repeatingtopngrams(self):
         dataset = list_to_dataset(
             [
-                "this is a totally fine sentence with no repeating ngrams so we are ok",
+                "this is a totally fine sentence with no repeat ngrams so we are ok",
                 "a b . a b",
                 "a a a a a a",
                 "totally fine small dupe a b a b",
@@ -756,3 +758,71 @@ def test_per_extension_filter(self):
         assert all_equal(
             expected_data, filtered_data
         ), f"Expected {expected_data} but got {filtered_data}"
+
+
+class FakeQualityFilter(DocumentFilter):
+    """
+    Emulates FastTextQualityFilter without a model
+    """
+
+    def __init__(self, alpha=3, seed=42):
+        super().__init__()
+        self._alpha = alpha
+        self._seed = np.random.seed(seed)
+
+    @batched
+    def score_document(self, df):
+        return pd.Series(np.arange(len(df)) / len(df))
+
+    @batched
+    def keep_document(self, df):
+        return np.random.pareto(self._alpha, size=len(df)) > 1 - df
+
+
+class FakeLangId(DocumentFilter):
+    """
+    Emulates FastTextLangId without a model
+    """
+
+    def __init__(self, min_langid_score=0.3, convert_string=False):
+        super().__init__()
+        self._cutoff = min_langid_score
+
+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": convert_string})
+
+    @batched
+    def score_document(self, df):
+        scores = [[0.5, "EN"], [0.7, "HI"], [0.2, "PT"]]
+        scores = scores * len(df)
+        scores = scores[: len(df)]
+        return pd.Series(scores)
+
+    def keep_document(self, score):
+        return score[0] >= self._cutoff
+
+
+class TestClassifierFilters:
+    def test_fake_quality_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeQualityFilter())
+        filtered_data = filters(dataset)
+
+        expected_indices = [1, 2, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"
+
+    def test_fake_langid_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeLangId())
+        filtered_data = filters(dataset)
+
+        expected_indices = [0, 1, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"