Skip to content

Commit

Permalink
Switch to placeholder import that raises on usage
Browse files Browse the repository at this point in the history
Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>
  • Loading branch information
ayushdg committed Apr 22, 2024
1 parent c6e2737 commit 66b9e5a
Show file tree
Hide file tree
Showing 5 changed files with 406 additions and 51 deletions.
31 changes: 15 additions & 16 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_curator.utils.import_utils import gpu_only_import_from

from .add_id import AddId
from .exact_dedup import ExactDuplicates
from .filter import Filter, Score, ScoreFilter

from .meta import Sequential
from .modify import Modify
from .task import TaskDecontamination

# GPU packages
LSH = gpu_only_import_from(".fuzzy_dedup", "LSH")
MinHash = gpu_only_import_from(".fuzzy_dedup", "MinHash")

# Pytorch related imports must come after all imports that require cugraph,
# because of context cleanup issues b/w pytorch and cugraph
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
from .distributed_data_classifier import DomainClassifier, QualityClassifier

__all__ = [
"DomainClassifier",
"ExactDuplicates",
"Filter",
"LSH",
"MinHash",
"Modify",
"QualityClassifier",
"Score",
"ScoreFilter",
"Sequential",
"TaskDecontamination",
"AddId",
]

# GPU packages
try:
from .fuzzy_dedup import LSH, MinHash

__all__ += ["LSH", "MinHash"]
except ModuleNotFoundError:
pass

# Pytorch related imports must come after all imports that require cugraph,
# because of context cleanup issues b/w pytorch and cugraph
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
from .distributed_data_classifier import DomainClassifier, QualityClassifier

__all__ += ["DomainClassifier", "QualityClassifier"]
7 changes: 3 additions & 4 deletions nemo_curator/modules/fuzzy_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@
from typing import List, Tuple, Union

import cudf
import cugraph.dask as dcg
import cugraph.dask.comms.comms as Comms
import cupy as cp
import dask_cudf
import numpy as np
from cugraph import MultiGraph
from dask import dataframe as dd
from dask.dataframe.shuffle import shuffle as dd_shuffle
from dask.utils import M
Expand Down Expand Up @@ -1104,10 +1107,6 @@ def _run_connected_components(
deduped_parsed_id_path,
output_path,
):
import cugraph.dask as dcg
import cugraph.dask.comms.comms as Comms
from cugraph import MultiGraph

Comms.initialize(p2p=True)
df = dask_cudf.read_parquet(
deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True
Expand Down
23 changes: 7 additions & 16 deletions nemo_curator/utils/distributed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@
import pandas as pd
from dask.distributed import Client, LocalCluster, get_worker, performance_report

from nemo_curator.utils.gpu_utils import (
GPU_INSTALL_STRING,
is_cudf_type,
try_dask_cudf_import_and_raise,
)
from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type
from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from

cudf = gpu_only_import("cudf")
LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster")


class DotDict:
Expand All @@ -53,13 +53,6 @@ def start_dask_gpu_local_cluster(args) -> Client:
GPUs present on the machine.
"""
try:
from dask_cuda import LocalCUDACluster
except ModuleNotFoundError:
raise ModuleNotFoundError(
f"Starting a GPU cluster requires GPU dependencies. {GPU_INSTALL_STRING}"
)

# Setting conservative defaults
# which should work across most systems
nvlink_only = getattr(args, "nvlink_only", False)
Expand Down Expand Up @@ -197,9 +190,6 @@ def read_single_partition(
A cudf DataFrame or a pandas DataFrame.
"""
if backend == "cudf":
try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages")

if filetype == "jsonl":
read_kwargs = {"lines": True}
if backend == "cudf":
Expand Down Expand Up @@ -282,7 +272,8 @@ def read_data(
"""
if backend == "cudf":
try_dask_cudf_import_and_raise("Backend=cudf requires GPU packages")
# Try using cuDF. If it is not available, this will raise an error.
test_obj = cudf.Series

if file_type == "pickle":
df = read_pandas_pickle(input_files[0], add_filename=add_filename)
Expand Down
17 changes: 2 additions & 15 deletions nemo_curator/utils/gpu_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda]`
or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda]` if installing from source"""
GPU_INSTALL_STRING = """Install GPU packages via `pip install --extra-index-url https://pypi.nvidia.com nemo_curator[cuda-12x]`
or use `pip install --extra-index-url https://pypi.nvidia.com ".[cuda-12x]"` if installing from source"""


def is_cudf_type(obj):
Expand All @@ -26,16 +26,3 @@ def is_cudf_type(obj):
str(getattr(obj, "_meta", "")),
]
return any("cudf" in obj_type for obj_type in types)


def try_dask_cudf_import_and_raise(message_prefix: str):
    """
    Check that cudf and dask-cudf are importable, raising a helpful error if not.

    Args:
        message_prefix: Text placed at the start of the error message, before
            the GPU installation instructions.

    Raises:
        ModuleNotFoundError: If either import fails because a module is missing.
            The message is ``message_prefix`` followed by ``GPU_INSTALL_STRING``.
    """
    try:
        # Imported only to probe availability; the modules are not used here.
        import cudf
        import dask_cudf
    except ModuleNotFoundError as err:
        # Chain the original error so the name of the missing module stays visible.
        raise ModuleNotFoundError(f"{message_prefix}. {GPU_INSTALL_STRING}") from err
Loading

0 comments on commit 66b9e5a

Please sign in to comment.