From b192e92e7966573d816e9f76a3ee9352e0f9f572 Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Fri, 3 May 2024 08:38:47 -0700
Subject: [PATCH] Fix lang id example (#37)

* Fix lang id example

Signed-off-by: Ryan Wolf

* Add classifier unit tests

Signed-off-by: Ryan Wolf

* Add test for failure

Signed-off-by: Ryan Wolf

* Remove failure test

Signed-off-by: Ryan Wolf

---------

Signed-off-by: Ryan Wolf
Signed-off-by: Nicole Luo
---
 .../identify_languages_and_fix_unicode.py |  2 +-
 nemo_curator/filters/classifier_filter.py |  6 ++
 tests/test_filters.py                     | 72 ++++++++++++++++++-
 3 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py
index 933c6c23..a95dc690 100644
--- a/examples/identify_languages_and_fix_unicode.py
+++ b/examples/identify_languages_and_fix_unicode.py
@@ -60,7 +60,7 @@ def main(args):
 
     # Remove the language score
     filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(
-        lambda score: score[1]
+        lambda score: score[1], meta=(None, str)
     )
 
     # Split the dataset by language
diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py
index f32e2ff5..3ade004e 100644
--- a/nemo_curator/filters/classifier_filter.py
+++ b/nemo_curator/filters/classifier_filter.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import dask
 import fasttext
 import numpy as np
 import pandas as pd
@@ -75,6 +76,11 @@ def __init__(self, model_path=None, min_langid_score=0.3):
         self._cutoff = min_langid_score
         self._name = "lang_id"
 
+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": False})
+
     @batched
     def score_document(self, df):
         model_attr = f"{self._name}_{self._model_path}"
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 4ab11c21..50676f38 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -14,6 +14,8 @@
 
 import os
 
+import dask
+import numpy as np
 import pandas as pd
 import pytest
 from dask import dataframe as dd
@@ -508,7 +510,7 @@ def test_repeatedparagraphschar(self):
     def test_repeatingtopngrams(self):
         dataset = list_to_dataset(
             [
-                "this is a totally fine sentence with no repeating ngrams so we are ok",
+                "this is a totally fine sentence with no repeat ngrams so we are ok",
                 "a b . a b",
                 "a a a a a a",
                 "totally fine small dupe a b a b",
@@ -756,3 +758,71 @@ def test_per_extension_filter(self):
         assert all_equal(
             expected_data, filtered_data
         ), f"Expected {expected_data} but got {filtered_data}"
+
+
+class FakeQualityFilter(DocumentFilter):
+    """
+    Emulates FastTextQualityFilter without a model
+    """
+
+    def __init__(self, alpha=3, seed=42):
+        super().__init__()
+        self._alpha = alpha
+        self._seed = np.random.seed(seed)
+
+    @batched
+    def score_document(self, df):
+        return pd.Series(np.arange(len(df)) / len(df))
+
+    @batched
+    def keep_document(self, df):
+        return np.random.pareto(self._alpha, size=len(df)) > 1 - df
+
+
+class FakeLangId(DocumentFilter):
+    """
+    Emulates FastTextLangId without a model
+    """
+
+    def __init__(self, min_langid_score=0.3, convert_string=False):
+        super().__init__()
+        self._cutoff = min_langid_score
+
+        # Dask will automatically convert the list score type
+        # to a string without this option.
+        # See https://github.com/NVIDIA/NeMo-Curator/issues/33
+        dask.config.set({"dataframe.convert-string": convert_string})
+
+    @batched
+    def score_document(self, df):
+        scores = [[0.5, "EN"], [0.7, "HI"], [0.2, "PT"]]
+        scores = scores * len(df)
+        scores = scores[: len(df)]
+        return pd.Series(scores)
+
+    def keep_document(self, score):
+        return score[0] >= self._cutoff
+
+
+class TestClassifierFilters:
+    def test_fake_quality_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeQualityFilter())
+        filtered_data = filters(dataset)
+
+        expected_indices = [1, 2, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"
+
+    def test_fake_langid_filter(self):
+        dataset = list_to_dataset(["a", "b", "c", "d"], npartitions=1)
+        filters = ScoreFilter(FakeLangId())
+        filtered_data = filters(dataset)
+
+        expected_indices = [0, 1, 3]
+        expected_data = DocumentDataset(dataset.df.loc[expected_indices])
+        assert all_equal(
+            expected_data, filtered_data
+        ), f"Expected {expected_data} but got {filtered_data}"