High-level fuzzy duplicates module (#46)
* Initial pass at fuzzy dedup API

* Update deprecated shuffle arg

* dask_cuda GPU-only import

* Move fuzzy_dedup imports to optional

* More tests

* Move FuzzyDeDupConfig to its own class

* Add example script and config file, fix typo

* Remove Slurm examples for GPU dedup

* Add config module

* Rename FuzzyDeDupConfig and minhash_length to FuzzyDuplicatesConfig and num_hashes

* Add comments and update example

* Write to same format as input in fuzzy dedup example

---------

Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>
ayushdg authored May 3, 2024
1 parent f59a799 commit 52270ea
Showing 19 changed files with 670 additions and 477 deletions.
16 changes: 16 additions & 0 deletions config/fuzzy_dedup_config.yaml
@@ -0,0 +1,16 @@
cache_dir: "./fuzzy_dedup_cache"
# Optional params below with default values
# profile_dir: null
# id_field: "id"
# text_field: "text"

# seed: 42
# char_ngrams: 5
# num_buckets: 20
# hashes_per_bucket: 13
# use_64_bit_hash: false
# buckets_per_shuffle: 1

# false_positive_check: true
# num_anchors: 2
# jaccard_threshold: 0.8
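
For reference, a minimal sketch of consuming this file programmatically, assuming the config module added in this commit gives the config class a from_yaml helper (not confirmed by this diff; otherwise the same keys can be passed to the FuzzyDuplicatesConfig constructor directly):

from nemo_curator import FuzzyDuplicatesConfig

# Load the YAML above; commented-out keys fall back to their defaults.
# (from_yaml is an assumption here, not shown in this diff.)
config = FuzzyDuplicatesConfig.from_yaml("config/fuzzy_dedup_config.yaml")
print(config.cache_dir, config.num_buckets, config.hashes_per_bucket)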
109 changes: 109 additions & 0 deletions examples/fuzzy_deduplication.py
@@ -0,0 +1,109 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time

import dask
from dask import dataframe as dd

from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client, write_to_disk
from nemo_curator.utils.script_utils import add_distributed_args


def pre_imports():
    import cudf  # noqa: F401


def main(args):

    dataset_dir = "/path/to/dataset"
    log_dir = "./"
    cache_dir = "./fuzzy_cache"
    output_dir = "./output"
    dataset_id_field = "id"
    dataset_text_field = "text"

    filetype = "parquet"

    # Fuzzy dup calculation only supports the cuDF/GPU backend
    backend = "cudf"
    assert args.device == "gpu"

    with dask.config.set({"dataframe.backend": backend}):
        client = get_client(args, args.device)
        client.run(pre_imports)

        t0 = time.time()
        if filetype == "parquet":
            input_dataset = DocumentDataset(
                dd.read_parquet(
                    dataset_dir,
                    columns=[dataset_id_field, dataset_text_field],
                    blocksize="256MiB",
                    aggregate_files=True,
                )
            )
        elif filetype == "jsonl":
            input_dataset = DocumentDataset.read_json(
                dataset_dir,
                backend=backend,
            )
        else:
            raise ValueError(f"Unsupported filetype: {filetype}")

        fuzzy_dedup_config = FuzzyDuplicatesConfig(
            cache_dir=cache_dir,
            id_field=dataset_id_field,
            text_field=dataset_text_field,
            seed=42,
            char_ngrams=5,
            num_buckets=20,
            hashes_per_bucket=13,
            use_64_bit_hash=False,
            buckets_per_shuffle=5,
            false_positive_check=True,
            num_anchors=2,
            jaccard_threshold=0.8,
        )
        fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config)
        duplicates = fuzzy_dup(dataset=input_dataset)

        # By default, all duplicate IDs and the group they belong to are included in the result.
        # Keep one document from each group of duplicates and mark the rest for removal.
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
        docs_to_remove = duplicates.df.map_partitions(
            lambda x: x[x.group.duplicated(keep="first")]
        )

        # When there are few duplicates we can compute the result to a list and use `isin`.
        result = input_dataset.df[
            ~input_dataset.df[dataset_id_field].isin(
                docs_to_remove[dataset_id_field].compute()
            )
        ]
        write_to_disk(result, output_dir, output_type=filetype)
        print(f"Fuzzy deduplication took {time.time() - t0:.1f} s")


def attach_args(
    parser=argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    ),
):
    return add_distributed_args(parser)


if __name__ == "__main__":
    main(attach_args().parse_args())
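
The keep-first removal step in the example above can be illustrated with plain pandas (a toy sketch; the script applies the same logic per partition on cuDF-backed Dask dataframes):

import pandas as pd

# Toy stand-in for the duplicates frame returned by FuzzyDuplicates:
# one row per document in a duplicate cluster, labeled by group id.
duplicates = pd.DataFrame(
    {"id": ["doc1", "doc7", "doc9", "doc2"], "group": [0, 0, 3, 3]}
)

# duplicated(keep="first") flags every row after the first in each group,
# so exactly one representative per cluster survives the filter.
docs_to_remove = duplicates[duplicates.group.duplicated(keep="first")]
print(docs_to_remove["id"].tolist())  # ['doc7', 'doc2']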
29 changes: 0 additions & 29 deletions examples/gpu_deduplication_example/README.md

This file was deleted.

38 changes: 0 additions & 38 deletions examples/gpu_deduplication_example/batch.sh

This file was deleted.

This file was deleted.

This file was deleted.

52 changes: 0 additions & 52 deletions examples/gpu_deduplication_example/remove-duplicates.sh

This file was deleted.

29 changes: 0 additions & 29 deletions examples/gpu_deduplication_example/run-buckets.sh

This file was deleted.
