From 4264bac4ac05cabceb38c2bad545a4423badce00 Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 13 Jun 2024 10:48:24 -0700 Subject: [PATCH] Shuffle CC result on group before writing out (#110) Signed-off-by: Ayush Dattagupta --- nemo_curator/modules/fuzzy_dedup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py index f7c0ba0f..6694dd42 100644 --- a/nemo_curator/modules/fuzzy_dedup.py +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -1296,6 +1296,8 @@ def _run_connected_components( assert num_nodes == len(labels_df) print(f"assert num_nodes:{num_nodes}==labels_df:{len(labels_df)} passed") + # Ensure all docs in the same group are in the same partition + labels_df = labels_df.shuffle(on=["group"], ignore_index=True) labels_df.to_parquet(output_path, write_index=False) Comms.destroy()