Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix 69 - Refactor how arguments are added to scripts #102

Merged
merged 25 commits into NVIDIA:main from miguelusque-fix-69
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e0adc47
Refactor arguments usage
miguelusque Jun 10, 2024
816e7fb
Refactor arguments usage
miguelusque Jun 10, 2024
052e25a
Refactor arguments usage
miguelusque Jun 10, 2024
21c71e1
Refactor arguments usage
miguelusque Jun 10, 2024
66be5f7
Refactor arguments usage
miguelusque Jun 10, 2024
38ca0ca
Refactor arguments usage
miguelusque Jun 10, 2024
e74aa04
Refactor arguments usage
miguelusque Jun 10, 2024
812f777
Add missing parser
miguelusque Jun 14, 2024
742406d
Add missing parser
miguelusque Jun 14, 2024
93c8a29
Fix missing default parameter
miguelusque Jun 18, 2024
3fb851f
Move unique arguments to their corresponding scripts
miguelusque Jun 18, 2024
843149f
Merge branch 'NVIDIA:main' into miguelusque-fix-69
miguelusque Jun 18, 2024
990c1c0
Update help message
miguelusque Jun 25, 2024
1bd0ef4
Improve help wording
miguelusque Jun 25, 2024
ed542b8
Update help wording
miguelusque Jun 25, 2024
653db8e
Update nemo_curator/scripts/text_cleaning.py
miguelusque Jun 25, 2024
7d59d65
Update nemo_curator/scripts/make_data_shards.py
miguelusque Jun 25, 2024
63f0744
Fix help wording typo
miguelusque Jun 25, 2024
4ba9280
Improve help wording for output-data-dir argument
miguelusque Jun 25, 2024
d9407b2
Remove unused arguments
miguelusque Jun 25, 2024
3a1bd09
Fix help wording
miguelusque Jun 25, 2024
35e47a4
Remove unneeded print
miguelusque Jun 25, 2024
d86308c
Fix help string for output-data-dir argument
miguelusque Jun 25, 2024
2980420
Improve argument passing
miguelusque Jun 25, 2024
1a36735
Revert changes
miguelusque Jun 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/blend_and_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import nemo_curator as nc
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def main(args):
Expand All @@ -28,7 +28,7 @@ def main(args):
output_path = "/path/to/output"

# Set up Dask client
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

# Blend the datasets
datasets = [DocumentDataset.read_json(path) for path in dataset_paths]
Expand All @@ -46,7 +46,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/classifier_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from nemo_curator.modifiers import FastTextLabelModifier
from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
from nemo_curator.utils.file_utils import get_all_files_paths_under
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def load_dataset(input_data_dir):
Expand Down Expand Up @@ -55,7 +55,7 @@ def main(args):
filtered_output = "/path/to/output"

# Prepare samples for the classifier
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))
low_quality_samples = create_samples(
low_quality_data_path, "__label__lq", num_low_quality_samples
)
Expand Down Expand Up @@ -100,7 +100,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
64 changes: 14 additions & 50 deletions examples/domain_classifier_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from nemo_curator import DomainClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def main(args):
Expand Down Expand Up @@ -60,7 +60,7 @@ def main(args):
input_file_path = "/path/to/data"
output_file_path = "./"

client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

input_dataset = DocumentDataset.read_json(
input_file_path, backend="cudf", add_filename=True
Expand Down Expand Up @@ -89,54 +89,18 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
parser.add_argument(
"--scheduler-address",
type=str,
default=None,
help="Address to the scheduler of a created dask cluster. If not provided"
"a single node LocalCUDACluster will be started.",
)
parser.add_argument(
"--scheduler-file",
type=str,
default=None,
help="Path to the scheduler file of a created dask cluster. If not provided"
" a single node LocalCUDACluster will be started.",
)
parser.add_argument(
"--nvlink-only",
action="store_true",
help="Start a local cluster with only NVLink enabled."
"Only applicable when protocol=ucx and no scheduler file/address is specified",
)
parser.add_argument(
"--protocol",
type=str,
default="ucx",
help="Protcol to use for dask cluster"
"Note: This only applies to the localCUDACluster. If providing an user created "
"cluster refer to"
"https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501
)
parser.add_argument(
"--rmm-pool-size",
type=str,
default="14GB",
help="Initial pool size to use for the RMM Pool Memory allocator"
"Note: This only applies to the localCUDACluster. If providing an user created "
"cluster refer to"
"https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501
)
parser.add_argument("--enable-spilling", action="store_true")
parser.add_argument("--set-torch-to-use-rmm", action="store_true")
parser.add_argument(
"--device",
type=str,
default="gpu",
help="Device to run the script on. Either 'cpu' or 'gpu'.",
)

return parser
argumentHelper = ArgumentHelper(parser)

argumentHelper.add_arg_device()
argumentHelper.add_arg_enable_spilling()
argumentHelper.add_arg_nvlink_only()
argumentHelper.add_arg_protocol()
argumentHelper.add_arg_rmm_pool_size()
argumentHelper.add_arg_scheduler_address()
argumentHelper.add_arg_scheduler_file()
argumentHelper.add_arg_set_torch_to_use_rmm()

return argumentHelper.parser


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/download_arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from nemo_curator.download import download_arxiv
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def main(args):
Expand All @@ -27,7 +27,7 @@ def main(args):
url_limit = 10

# Set up Dask client
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

# Download and sample data
arxiv = download_arxiv(output_directory, url_limit=url_limit)
Expand All @@ -42,7 +42,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/download_common_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from nemo_curator.download import download_common_crawl
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def main(args):
Expand All @@ -29,7 +29,7 @@ def main(args):
url_limit = 10

# Set up Dask client
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

# Download and sample data
common_crawl = download_common_crawl(
Expand All @@ -46,7 +46,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/download_wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from nemo_curator.download import download_wikipedia
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def main(args):
Expand All @@ -28,7 +28,7 @@ def main(args):
url_limit = 10

# Set up Dask client
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

# Download and sample data
wikipedia = download_wikipedia(
Expand All @@ -45,7 +45,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/exact_deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from nemo_curator.modules import ExactDuplicates
from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
from nemo_curator.utils.file_utils import get_all_files_paths_under
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def pre_imports():
Expand All @@ -33,7 +33,7 @@ def main(args):
output_dir = "./"
dataset_id_field = "id"
dataset_text_field = "text"
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))
backend = "cudf" if args.device == "gpu" else "pandas"

if args.device == "gpu":
Expand Down Expand Up @@ -79,7 +79,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/find_pii_and_deidentify.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modules.modify import Modify
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def console_script():
parser = argparse.ArgumentParser()
arguments = add_distributed_args(parser).parse_args()
_ = get_client(**parse_client_args(arguments))
args = ArgumentHelper(parser).add_distributed_args().parse_args()
_ = get_client(**ArgumentHelper.parse_client_args(args))

dataframe = pd.DataFrame(
{"text": ["Sarah and Ryan went out to play", "Jensen is the CEO of NVIDIA"]}
Expand Down
6 changes: 3 additions & 3 deletions examples/fuzzy_deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client, write_to_disk
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def pre_imports():
Expand All @@ -44,7 +44,7 @@ def main(args):
assert args.device == "gpu"

with dask.config.set({"dataframe.backend": backend}):
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))
client.run(pre_imports)

t0 = time.time()
Expand Down Expand Up @@ -102,7 +102,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/identify_languages_and_fix_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
get_all_files_paths_under,
separate_by_metadata,
)
from nemo_curator.utils.script_utils import add_distributed_args, parse_client_args
from nemo_curator.utils.script_utils import ArgumentHelper


def load_dataset(input_data_dir):
Expand All @@ -49,7 +49,7 @@ def main(args):
language_field = "language"

# Prepare samples for the classifier
client = get_client(**parse_client_args(args))
client = get_client(**ArgumentHelper.parse_client_args(args))

# Filter data
multilingual_dataset = load_dataset(multilingual_data_path)
Expand Down Expand Up @@ -88,7 +88,7 @@ def attach_args(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
),
):
return add_distributed_args(parser)
return ArgumentHelper(parser).add_distributed_args()


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions examples/k8s/create_dask_cluster.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Leave this file unchanged. ArgumentHelper isn't being used and the rest of the changes are inconsequential.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still needs to be addressed.

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from dask_kubernetes.operator.kubecluster import KubeCluster, make_cluster_spec

from nemo_curator.utils.script_utils import ArgumentHelper


def create_cluster(
name: str,
Expand Down Expand Up @@ -127,8 +129,6 @@ def parse_pvcs(specs: str) -> dict[str, str]:
help="Comma sep PVC specificiation of $pvc_name_1:$mount_path_1,$pvc_name_2:$mount_path_2. Example: foo:/foo,bar:/bar mounts pvcs named foo and bar to /foo and /bar respectively.",
)

args = parser.parse_args()

create_cluster(
**vars(args),
**vars(parser.parse_args()),
)
Loading