
Add Synthetic Data Generation Module #136

Merged
merged 69 commits into from
Jul 9, 2024
4bb6ddc
Begin implementation on OpenAI client
ryantwolf Jun 26, 2024
850403f
Fix relative import
ryantwolf Jun 26, 2024
e6cac7a
Add temperature
ryantwolf Jun 26, 2024
131b2d6
Modify client interface and begin ultrachat
ryantwolf Jun 26, 2024
fb82737
Change type annotation in openai client
ryantwolf Jun 26, 2024
f3e6309
Make imports easier
ryantwolf Jun 26, 2024
5ad683f
Reformat to match nemotron report
ryantwolf Jun 30, 2024
0d552b4
Add yaml conversion
ryantwolf Jul 1, 2024
87ebfc4
Fix index error
ryantwolf Jul 1, 2024
bb72a68
Add error handling for yaml parsing
ryantwolf Jul 1, 2024
32c7f55
Fix error
ryantwolf Jul 1, 2024
a6d306e
Add additional yaml parsing check
ryantwolf Jul 1, 2024
ece34b5
Add more yaml error handling
ryantwolf Jul 1, 2024
28d3a08
Export conversion error
ryantwolf Jul 1, 2024
8cf295e
Change variable naming
ryantwolf Jul 1, 2024
7fcd719
Make error catching more general
ryantwolf Jul 1, 2024
76ddfda
Refactor list out of nemotron
ryantwolf Jul 1, 2024
2f7a03b
Add prompt helper function
ryantwolf Jul 1, 2024
76c4bdd
Add revisions and writing prompts
ryantwolf Jul 1, 2024
2f15d89
Fix default prompt templates
ryantwolf Jul 1, 2024
cc18dfe
Add closed qa
ryantwolf Jul 1, 2024
d4755c0
Fix prompt
ryantwolf Jul 1, 2024
366fea8
Add math and coding
ryantwolf Jul 1, 2024
f563018
Add problem generation
ryantwolf Jul 1, 2024
294a390
Rename function
ryantwolf Jul 1, 2024
728d585
Add dialogue support
ryantwolf Jul 1, 2024
4c64c3a
Fix mispell
ryantwolf Jul 1, 2024
8db6019
Add two turn generation
ryantwolf Jul 1, 2024
2d13d63
Add reward model as judge
ryantwolf Jul 2, 2024
8336452
Refactor reward query
ryantwolf Jul 2, 2024
87acce0
Add error handling for non-reward models
ryantwolf Jul 2, 2024
fd1f066
Add error handling to sync client
ryantwolf Jul 2, 2024
69c431f
Add open qa pipeline
ryantwolf Jul 2, 2024
2408972
Improve docs and add writing pipeline
ryantwolf Jul 2, 2024
c8c8039
Add closed qa pipeline
ryantwolf Jul 2, 2024
babdb40
Add math pipeline
ryantwolf Jul 2, 2024
c3a9998
Add python pipeline
ryantwolf Jul 2, 2024
48665ee
Add async nemotron generator
ryantwolf Jul 2, 2024
494c141
Fix await with index
ryantwolf Jul 2, 2024
2fb48db
Add seed parameter
ryantwolf Jul 2, 2024
39acac1
Add missing await
ryantwolf Jul 2, 2024
4c888e4
Fix parameter names
ryantwolf Jul 2, 2024
4724d68
Fix subscript await issues
ryantwolf Jul 2, 2024
de27abc
Switch parsing method for reward model
ryantwolf Jul 2, 2024
8daea94
Add initial docs
ryantwolf Jul 2, 2024
6ae83b1
Add nemo deploy client
ryantwolf Jul 5, 2024
7daefb7
Add easy import
ryantwolf Jul 5, 2024
c0509f9
Move conversation formatter
ryantwolf Jul 5, 2024
e964712
Add other file
ryantwolf Jul 5, 2024
e500814
Update nemotron import
ryantwolf Jul 5, 2024
2b4d3ff
Update model client import
ryantwolf Jul 5, 2024
7acbee9
Remove model in query call
ryantwolf Jul 5, 2024
06b7310
Add extra index
ryantwolf Jul 5, 2024
f05b13a
Fix response indexing
ryantwolf Jul 5, 2024
0efc808
Add top k
ryantwolf Jul 5, 2024
c8d1419
Remove extras
ryantwolf Jul 5, 2024
2d11a8c
Add safe import for nemo deploy
ryantwolf Jul 5, 2024
20afd89
Add pandas conversions
ryantwolf Jul 5, 2024
2987c9a
Add partition default
ryantwolf Jul 5, 2024
3f8dcc8
Add no format
ryantwolf Jul 5, 2024
0926cbd
Move no format location
ryantwolf Jul 5, 2024
e2beb5b
Use top_k in nemo client
ryantwolf Jul 5, 2024
b918c14
Address vibhu's review
ryantwolf Jul 9, 2024
b79ce6b
Add logging import
ryantwolf Jul 9, 2024
8957e12
Fix import
ryantwolf Jul 9, 2024
1400a32
Fix tqdm
ryantwolf Jul 9, 2024
0926d6e
Add missing awaits
ryantwolf Jul 9, 2024
fbe9292
Standardize names
ryantwolf Jul 9, 2024
8f66396
Address Ayush nit
ryantwolf Jul 9, 2024
3 changes: 3 additions & 0 deletions docs/user-guide/index.rst
@@ -18,6 +18,9 @@
:ref:`GPU Accelerated Exact and Fuzzy Deduplication <data-curator-gpu-deduplication>`
Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF.

:ref:`Synthetic Data Generation <data-curator-syntheticdata>`
Synthetic data generation tools and example pipelines are available within NeMo Curator.

:ref:`Downstream Task Decontamination <data-curator-downstream>`
After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data. When dealing with large datasets, there is a potential for leakage of this test data into the model’s training dataset. NeMo Curator allows you to remove sections of documents in your dataset that are present in downstream tasks.

22 changes: 22 additions & 0 deletions docs/user-guide/syntheticdata.rst
@@ -0,0 +1,22 @@

.. _data-curator-syntheticdata:

======================================
Synthetic Data Generation
======================================
--------------------------------------
Background
--------------------------------------
Synthetic data generation has become increasingly useful in large language model training.
It is used in pretraining, fine-tuning, and evaluation.
Synthetically generated data can be useful for adapting an LLM to low-resource languages or domains, or for performing knowledge distillation from other models, among other purposes.
There are a variety of ways to construct synthetic data generation pipelines, with numerous LLM and classical filters.

NeMo Curator has a simple, easy-to-use set of tools that allow you to use prebuilt synthetic generation pipelines or build your own.
Any model inference service that uses the OpenAI API is compatible with the synthetic data generation module, allowing you to generate your data from any model.
NeMo Curator has prebuilt synthetic data generation pipelines for supervised fine-tuning (SFT) and preference data that were used to generate data for the training of `Nemotron-4 340B <https://research.nvidia.com/publication/2024-06_nemotron-4-340b>`_.
You can also easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator.

-----------------------------------------
Usage
-----------------------------------------
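A minimal usage sketch for the clients this PR adds (not the PR's own example): the base URL, API key, and model name below are placeholders, and running it requires a live OpenAI-API-compatible inference endpoint, so it is illustrative only. The keyword arguments follow the `LLMClient.query_model` signature defined in `nemo_curator/services/model_client.py`.

```python
from openai import OpenAI

from nemo_curator import OpenAIClient

# Placeholder endpoint and key; any OpenAI-API-compatible service works
openai_client = OpenAI(
    base_url="https://example-inference-endpoint/v1",  # hypothetical endpoint
    api_key="<your API key>",
)
client = OpenAIClient(openai_client)

responses = client.query_model(
    messages=[
        {"role": "user", "content": "Write a sentence about data curation."}
    ],
    model="<model name>",  # placeholder model identifier
    temperature=0.7,
    top_p=0.9,
    max_tokens=256,
)
print(responses[0])
```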
7 changes: 7 additions & 0 deletions nemo_curator/__init__.py
@@ -27,6 +27,13 @@


from .modules import *
from .services import (
AsyncLLMClient,
AsyncOpenAIClient,
LLMClient,
NemoDeployClient,
OpenAIClient,
)
from .utils.distributed_utils import get_client

# Dask will automatically convert the list score type
40 changes: 39 additions & 1 deletion nemo_curator/datasets/doc_dataset.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union
from typing import List, Optional, Union

import dask.dataframe as dd

@@ -130,6 +130,44 @@ def to_pickle(
):
raise NotImplementedError("DocumentDataset does not support to_pickle yet")

@classmethod
def from_pandas(
cls,
data,
npartitions: Optional[int] = 1,
chunksize: Optional[int] = None,
sort: Optional[bool] = True,
name: Optional[str] = None,
):
"""
Creates a document dataset from a pandas data frame.
For more information on the arguments see Dask's from_pandas documentation
https://docs.dask.org/en/stable/generated/dask.dataframe.from_pandas.html

Args:
data: A pandas dataframe.
npartitions: The number of partitions for the Dask dataframe.
chunksize: The number of rows per partition (mutually exclusive with npartitions).
sort: Whether to sort the resulting dataframe by its index.
name: An optional name for the resulting Dask collection.
Returns:
A document dataset with a pandas backend (on the CPU).
"""
return cls(
dd.from_pandas(
data=data,
npartitions=npartitions,
chunksize=chunksize,
sort=sort,
name=name,
)
)

def to_pandas(self):
"""
Creates a pandas dataframe from a DocumentDataset

Returns:
A pandas dataframe (on the CPU)
"""
return self.df.compute()


def _read_json_or_parquet(
input_files: Union[str, List[str]],
26 changes: 26 additions & 0 deletions nemo_curator/services/__init__.py
@@ -0,0 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .conversation_formatter import ConversationFormatter
from .model_client import AsyncLLMClient, LLMClient
from .nemo_client import NemoDeployClient
from .openai_client import AsyncOpenAIClient, OpenAIClient

__all__ = [
"AsyncLLMClient",
"LLMClient",
"AsyncOpenAIClient",
"OpenAIClient",
"NemoDeployClient",
"ConversationFormatter",
]
28 changes: 28 additions & 0 deletions nemo_curator/services/conversation_formatter.py
@@ -0,0 +1,28 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from typing import List


class ConversationFormatter(ABC):
"""
Represents a way of formatting a conversation with an LLM
such that it can respond appropriately
"""

@abstractmethod
def format_conversation(self, conv: List[dict]) -> str:
raise NotImplementedError(
"format_conversation must be implemented by subclasses"
)
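A concrete subclass makes the contract clearer. The sketch below re-declares the abstract base so it is self-contained, and `SimpleTagFormatter` is a hypothetical example, not a formatter shipped in this PR:

```python
from abc import ABC, abstractmethod
from typing import List


class ConversationFormatter(ABC):
    """Mirrors the abstract base above, re-declared for a standalone sketch."""

    @abstractmethod
    def format_conversation(self, conv: List[dict]) -> str:
        raise NotImplementedError


class SimpleTagFormatter(ConversationFormatter):
    """Hypothetical formatter: wraps each turn in role tags and
    appends an open assistant tag for the model to complete."""

    def format_conversation(self, conv: List[dict]) -> str:
        parts = [f"<{turn['role']}>{turn['content']}</{turn['role']}>" for turn in conv]
        return "\n".join(parts) + "\n<assistant>"


fmt = SimpleTagFormatter()
prompt = fmt.format_conversation([{"role": "user", "content": "Hi"}])
# prompt == "<user>Hi</user>\n<assistant>"
```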
91 changes: 91 additions & 0 deletions nemo_curator/services/model_client.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from typing import Iterable, List, Optional, Union

from nemo_curator.services.conversation_formatter import ConversationFormatter


class LLMClient(ABC):
"""
Interface representing a client connecting to an LLM inference server
and making requests synchronously
"""

@abstractmethod
def query_model(
self,
*,
messages: Iterable,
model: str,
conversation_formatter: Optional[ConversationFormatter] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = 1,
seed: Optional[int] = None,
stop: Union[Optional[str], List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
) -> List[str]:
raise NotImplementedError("Subclass of LLMClient must implement 'query_model'")

@abstractmethod
def query_reward_model(
self,
*,
messages: Iterable,
model: str,
conversation_formatter: Optional[ConversationFormatter] = None,
) -> dict:
raise NotImplementedError(
"Subclass of LLMClient must implement 'query_reward_model'"
)


class AsyncLLMClient(ABC):
"""
Interface representing a client connecting to an LLM inference server
and making requests asynchronously
"""

@abstractmethod
async def query_model(
self,
*,
messages: Iterable,
model: str,
conversation_formatter: Optional[ConversationFormatter] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = 1,
seed: Optional[int] = None,
stop: Union[Optional[str], List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
) -> List[str]:
raise NotImplementedError(
"Subclass of AsyncLLMClient must implement 'query_model'"
)

@abstractmethod
async def query_reward_model(
self,
*,
messages: Iterable,
model: str,
conversation_formatter: Optional[ConversationFormatter] = None,
) -> dict:
raise NotImplementedError(
"Subclass of AsyncLLMClient must implement 'query_reward_model'"
)
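To illustrate the keyword-only calling convention these interfaces enforce, here is a hypothetical canned-response client shaped like `LLMClient.query_model` (a standalone sketch, not part of the PR):

```python
from typing import Iterable, List, Optional


class EchoClient:
    """Hypothetical stand-in matching the LLMClient-style interface:
    all arguments are keyword-only, and a list of generations is returned."""

    def query_model(
        self,
        *,
        messages: Iterable,
        model: str,
        n: Optional[int] = 1,
        **kwargs,
    ) -> List[str]:
        # Echo the last user turn back, once per requested generation
        last = list(messages)[-1]["content"]
        return [f"[{model}] {last}"] * (n or 1)


client = EchoClient()
out = client.query_model(
    messages=[{"role": "user", "content": "ping"}], model="test-model", n=2
)
# out == ["[test-model] ping", "[test-model] ping"]
```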
97 changes: 97 additions & 0 deletions nemo_curator/services/nemo_client.py
@@ -0,0 +1,97 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Iterable, List, Optional, Union

from nemo_curator.services.conversation_formatter import ConversationFormatter
from nemo_curator.utils.import_utils import safe_import_from

from .model_client import AsyncLLMClient, LLMClient

NemoQueryLLM = safe_import_from("nemo.deploy.nlp", "NemoQueryLLM")


class NemoDeployClient(LLMClient):
"""
A wrapper around NemoQueryLLM for querying models in synthetic data generation
"""

def __init__(self, nemo_deploy: NemoQueryLLM) -> None:
self.client = nemo_deploy

def query_model(
self,
*,
messages: Iterable,
model: str,
conversation_formatter: Optional[ConversationFormatter] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
seed: Optional[int] = None,
stop: Union[Optional[str], List[str]] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
) -> List[str]:
if conversation_formatter is None:
raise ValueError(
"NemoDeployClient's query_model requires a conversation_formatter"
)

prompt = conversation_formatter.format_conversation(messages)
self.client.model_name = model

if n is not None:
warnings.warn("n is not supported in NemoDeployClient")

if isinstance(stop, str):
stop = [stop]

response = self.client.query_llm(
prompts=[prompt],
max_output_len=max_tokens,
random_seed=seed,
stop_words_list=stop,
temperature=temperature,
top_p=top_p,
top_k=top_k,
)[0]

return self._postprocess_response(response, stop)

@staticmethod
def _postprocess_response(responses: List[str], stop_words: List[str]) -> List[str]:
processed_responses = []
for response in responses:
# stop_words may be None when the caller passed no stop sequences
for stop in stop_words or []:
if response.endswith(stop):
response = response[: -len(stop)]
processed_responses.append(response.strip())
return processed_responses
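The stop-word trimming above can be exercised in isolation; this standalone sketch mirrors `_postprocess_response` (a hypothetical helper, not part of the PR):

```python
from typing import List, Optional


def trim_stop_words(responses: List[str], stop_words: Optional[List[str]]) -> List[str]:
    # Strip a trailing stop word, if present, then trim surrounding whitespace
    processed = []
    for response in responses:
        for stop in stop_words or []:
            if response.endswith(stop):
                response = response[: -len(stop)]
        processed.append(response.strip())
    return processed


out = trim_stop_words(["A curated sentence.</s>", "No stop word here "], ["</s>"])
# out == ["A curated sentence.", "No stop word here"]
```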

def query_reward_model(self, *, messages: Iterable, model: str) -> dict:
"""
Prompts an LLM Reward model to score a conversation between a user and assistant
Args:
messages: The conversation to calculate a score for.
Should be formatted like:
[{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...]
model: The name of the model that should be used to calculate the reward.
Must be a reward model, cannot be a regular LLM.
Returns:
A mapping of score_name -> score
"""
raise NotImplementedError(
"Reward model inference is not supported in NeMo Deploy Clients"
)