Fix metadata inference with pandas and dask (NVIDIA#35)

* Fix metadata inference with pandas and dask Signed-off-by: Ryan Wolf <rywolf@nvidia.com> * Fix datatypes for task decontamination Signed-off-by: Ryan Wolf <rywolf@nvidia.com> * Use targeted import Signed-off-by: Ryan Wolf <rywolf@nvidia.com> --------- Signed-off-by: Ryan Wolf <rywolf@nvidia.com> Signed-off-by: Nicole Luo <nluo@nvidia.com>
nicoleeeluo · May 20, 2024 · 462a1a3 · 462a1a3
1 parent ec26f9f
commit 462a1a3
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 2 deletions.
diff --git a/nemo_curator/modules/filter.py b/nemo_curator/modules/filter.py
@@ -11,12 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import pandas as pd
+from dask.dataframe.extensions import make_array_nonempty
 from dask.typing import no_default
 
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.utils.module_utils import is_batched
 
+# Override so that pd.NA is not passed during the metadata inference
+make_array_nonempty.register(
+    pd.StringDtype,
+    lambda x: pd.array(["a", "b"], dtype=x),
+)
+
 
 class Score:
     def __init__(self, score_fn, score_field, text_field="text", score_type=None):

diff --git a/nemo_curator/modules/task.py b/nemo_curator/modules/task.py
@@ -302,6 +302,8 @@ def _threshold_ngram_count(self, matched_ngrams: dict) -> set:
         return filtered_ngrams
 
     def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted):
+        text_type = partition[self.text_field].dtype
+
         document_fn = partial(
             self._remove_ngrams,
             task_ngrams=task_ngrams,
@@ -318,7 +320,15 @@ def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted):
 
         partition[self.text_field] = split_text
         filtered_partition = partition[valid_documents_mask]
-        return filtered_partition.explode(self.text_field, ignore_index=True)
+        exploded_partition = filtered_partition.explode(
+            self.text_field, ignore_index=True
+        )
+        # After exploding, the string datatype can become an "object" type
+        exploded_partition[self.text_field] = exploded_partition[
+            self.text_field
+        ].astype(text_type)
+
+        return exploded_partition
 
     def _remove_ngrams(self, document, task_ngrams, ngrams_freq_sorted):
         """