Skip to content

Commit

Permalink
sort duplicate cluster by size
Browse files Browse the repository at this point in the history
Signed-off-by: Yang Yu <yayu@yayu-mlt.client.nvidia.com>
  • Loading branch information
Yang Yu committed Oct 4, 2024
1 parent a2df246 commit ea271b3
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 316 deletions.
12 changes: 12 additions & 0 deletions tutorials/pretraining-data-curation/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,15 @@ def convert_str_id_to_int(df, id_column="id"):
df["doc_id"] = dx[1].astype("int64").values
df["dataset_id"] = dx[0].hash_values()
return df


def get_dataframe_complement(original_df, filtered_df):
    """Return the rows of ``original_df`` whose index is not in ``filtered_df``.

    The comparison is made partition by partition: each partition of
    ``original_df`` is checked only against the partition of ``filtered_df``
    that has the same partition number.

    NOTE(review): this presumably requires both frames to share the same
    partitioning (``filtered_df`` derived from ``original_df`` without a
    repartition or shuffle) -- confirm against callers.

    Args:
        original_df: Partitioned dataframe exposing ``map_partitions``.
        filtered_df: Partitioned dataframe exposing ``get_partition``; its
            index values mark the rows to exclude.

    Returns:
        The result of ``original_df.map_partitions`` applied with the
        per-partition complement function.
    """

    def partition_complement(part_original_df, partition_info=None):
        # With no partition info there is no partition number to look up,
        # so the partition is handed back untouched.
        if not partition_info:
            return part_original_df

        partition_number = partition_info["number"]
        matching_filtered_part = filtered_df.get_partition(partition_number)
        filtered_index = matching_filtered_part.index.persist()

        # Flag rows that survived the filter, then keep only the others.
        is_in_filtered = part_original_df.index.isin(filtered_index)
        return part_original_df[~is_in_filtered]

    return original_df.map_partitions(partition_complement)
Loading

0 comments on commit ea271b3

Please sign in to comment.