From 4bb6ddce08bbff4ad8ad967021b8342b976de58c Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 09:06:34 -0700 Subject: [PATCH 01/69] Begin implementation on OpenAI client Signed-off-by: Ryan Wolf --- nemo_curator/__init__.py | 1 + nemo_curator/services/__init__.py | 2 + nemo_curator/services/model_client.py | 23 +++++++++++ nemo_curator/services/openai_client.py | 56 ++++++++++++++++++++++++++ setup.py | 6 ++- 5 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 nemo_curator/services/__init__.py create mode 100644 nemo_curator/services/model_client.py create mode 100644 nemo_curator/services/openai_client.py diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index c9e79ff7..48d9d22d 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -27,6 +27,7 @@ from .modules import * +from .services import AsyncLLMClient, AsyncOpenAIClient, LLMClient, OpenAIClient from .utils.distributed_utils import get_client # Dask will automatically convert the list score type diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py new file mode 100644 index 00000000..7d3e4559 --- /dev/null +++ b/nemo_curator/services/__init__.py @@ -0,0 +1,2 @@ +from .model_client import AsyncLLMClient, LLMClient +from .openai_client import AsyncOpenAIClient, OpenAIClient diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py new file mode 100644 index 00000000..c0d5935d --- /dev/null +++ b/nemo_curator/services/model_client.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod + + +class LLMClient(ABC): + """ + Interface representing a client connecting to an LLM inference server + and making requests synchronously + """ + + @abstractmethod + def query_model(self, user_input: str) -> str: + raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") + + +class AsyncLLMClient(ABC): + """ + Interface representing a client connecting to an LLM inference server + and making requests asynchronously + """ + + @abstractmethod + async def query_model(self, user_input: str) -> str: + raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py new file mode 100644 index 00000000..8cb6f378 --- /dev/null +++ b/nemo_curator/services/openai_client.py @@ -0,0 +1,56 @@ +from typing import Iterable, List, Optional, Union + +from model_client import AsyncLLMClient, LLMClient +from openai import AsyncOpenAI, OpenAI + + +class OpenAIClient(LLMClient): + """ + A wrapper around OpenAI's Python client for querying models + """ + + def __init__(self, openai_client: OpenAI) -> None: + self.client = openai_client + + def query_model( + self, + *, + messages: Iterable, + model: str, + max_tokens: Optional[int] = None, + stop: Union[Optional[str], List[str]] = None, + top_p: Optional[float] = None + ) -> str: + return self.client.chat.completions.create( + messages=messages, + model=model, + max_tokens=max_tokens, + stop=stop, + top_p=top_p, + ) + + +class AsyncOpenAIClient(AsyncLLMClient): + """ + A wrapper around OpenAI's Python async client for querying models + """ + + def __init__(self, async_openai_client: AsyncOpenAI) -> None: + self.client = async_openai_client + + async def query_model( + self, + *, + messages: Iterable, + model: str, + max_tokens: Optional[int] = None, + stop: Union[Optional[str], List[str]] = None, + top_p: Optional[float] = None + ) -> str: + return 
self.client.chat.completions.create( + messages=messages, + model=model, + max_tokens=max_tokens, + stop=stop, + top_p=top_p, + ) diff --git a/setup.py b/setup.py index 74c9d431..ef6b0d2b 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,11 @@ "cugraph-cu12>=24.2", "dask-cuda>=24.2", "spacy[cuda12x]>=3.6.0, <4.0.0", - ] + ], + "synth": [ + "openai", + "nemo_toolkit[infer]>=1.23.0", + ], }, entry_points={ "console_scripts": [ From 850403f189bdfd1816081d607bd83a2a579f2c4c Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 09:23:40 -0700 Subject: [PATCH 02/69] Fix relative import Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 8cb6f378..94ce2874 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -1,8 +1,9 @@ from typing import Iterable, List, Optional, Union -from model_client import AsyncLLMClient, LLMClient from openai import AsyncOpenAI, OpenAI +from .model_client import AsyncLLMClient, LLMClient + class OpenAIClient(LLMClient): """ From e6cac7ac690e925c1d3417101d14c126aca75d2a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 09:28:36 -0700 Subject: [PATCH 03/69] Add temperature Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 94ce2874..926fbace 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -46,6 +46,7 @@ async def query_model( model: str, max_tokens: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, + temperature: Optional[float] = None, top_p: Optional[float] = None ) -> str: return self.client.chat.completions.create( @@ -53,5 +54,6 @@ async def query_model( model=model, max_tokens=max_tokens, stop=stop, + temperature=temperature, top_p=top_p, ) From 131b2d66ebf0bd6ea6e59aefdf278532455a25bf Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 15:43:28 -0700 Subject: [PATCH 04/69] Modify client interface and begin ultrachat Signed-off-by: Ryan Wolf --- nemo_curator/services/model_client.py | 25 ++++++++++- nemo_curator/services/openai_client.py | 18 ++++++-- nemo_curator/synthetic/__init__.py | 0 nemo_curator/synthetic/prompts.py | 3 ++ nemo_curator/synthetic/ultrachat.py | 59 ++++++++++++++++++++++++++ 5 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 nemo_curator/synthetic/__init__.py create mode 100644 nemo_curator/synthetic/prompts.py create mode 100644 nemo_curator/synthetic/ultrachat.py diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index c0d5935d..d2e74e7e 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Iterable, List, Optional, Union class LLMClient(ABC): @@ -8,7 +9,17 @@ class LLMClient(ABC): """ @abstractmethod - def query_model(self, user_input: str) -> str: + def query_model( + self, + *, + messages: Iterable, + model: str, + max_tokens: Optional[int] = None, + n: Optional[int] = 1, + stop: Union[Optional[str], List[str]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None + ) -> List[str]: raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") @@ -19,5 +30,15 @@ 
class AsyncLLMClient(ABC): """ @abstractmethod - async def query_model(self, user_input: str) -> str: + async def query_model( + self, + *, + messages: Iterable, + model: str, + max_tokens: Optional[int] = None, + n: Optional[int] = 1, + stop: Union[Optional[str], List[str]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None + ) -> List[str]: raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 926fbace..509c204a 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -19,17 +19,23 @@ def query_model( messages: Iterable, model: str, max_tokens: Optional[int] = None, + n: Optional[int] = 1, stop: Union[Optional[str], List[str]] = None, + temperature: Optional[float] = None, top_p: Optional[float] = None - ) -> str: - return self.client.chat.completions.create( + ) -> List[str]: + response = self.client.chat.completions.create( messages=messages, model=model, max_tokens=max_tokens, + n=n, stop=stop, + temperature=temperature, top_p=top_p, ) + return [choice.message.content for choice in response.choices] + class AsyncOpenAIClient(AsyncLLMClient): """ @@ -45,15 +51,19 @@ async def query_model( messages: Iterable, model: str, max_tokens: Optional[int] = None, + n: Optional[int] = 1, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None - ) -> str: - return self.client.chat.completions.create( + ) -> List[str]: + response = await self.client.chat.completions.create( messages=messages, model=model, max_tokens=max_tokens, + n=n, stop=stop, temperature=temperature, top_p=top_p, ) + + return [choice.message.content for choice in response.choices] diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py new file mode 100644 index 00000000..9d499e35 --- /dev/null +++ b/nemo_curator/synthetic/prompts.py @@ -0,0 +1,3 @@ +DEFAULT_META_TOPICS_PROMPT_TEMPLATE = ( + "Write a list of {ntopics} topics about the world." +) diff --git a/nemo_curator/synthetic/ultrachat.py b/nemo_curator/synthetic/ultrachat.py new file mode 100644 index 00000000..9e70e7a9 --- /dev/null +++ b/nemo_curator/synthetic/ultrachat.py @@ -0,0 +1,59 @@ +from typing import List, Optional, Union + +from nemo_curator.services.model_client import AsyncLLMClient, LLMClient +from nemo_curator.synthetic.prompts import DEFAULT_META_TOPICS_PROMPT_TEMPLATE + + +class UltraChatGenerator: + """ + Provides a collection of methods for generating synthetic data inspired + by the UltraChat paper: https://arxiv.org/abs/2305.14233 + """ + + def __init__(self, llm_client: LLMClient) -> None: + self.client = llm_client + + def generate_world_question_openlines(self): + pass + + def generate_meta_topics( + self, + ntopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_META_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of meta topics about the world + Args: + ntopics: The number of meta topics to generate. Can be an integer like 5 or a string like "five". + It is used where it is referenced in prompt_template + model: The name model that should be used to generate the meta topics. + Must be available in the LLMClient passed in the constructor. 
+ prompt_template: A format string of the prompt to use. It must have a {ntopics} + parameter that will be populated with the ntopics value passed in this function. + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt = prompt_template.format(ntopics=ntopics, **prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + meta_topics = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + + return meta_topics + + def generate_subtopics(self, meta_topics: List[str], model: str): + pass + + def generate_questions(self, subtopics): + pass + + def generate_creative_openlines(self): + pass + + def generate_data_assistance_openlines(self): + pass From fb82737c33140f69b08fe76067c7ec3cce54215b Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 16:06:48 -0700 Subject: [PATCH 05/69] Change type annotation in openai client Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 509c204a..3f192c8b 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -1,6 +1,7 @@ from typing import Iterable, List, Optional, Union from openai import AsyncOpenAI, OpenAI +from openai._types import NOT_GIVEN, NotGiven from .model_client import AsyncLLMClient, LLMClient @@ -18,11 +19,11 @@ def query_model( *, messages: Iterable, model: str, - max_tokens: Optional[int] = None, - n: Optional[int] = 1, - stop: Union[Optional[str], List[str]] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None + max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, + n: Union[Optional[int], NotGiven] = NOT_GIVEN, + stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN ) -> List[str]: response = self.client.chat.completions.create( messages=messages, @@ -50,11 +51,11 @@ async def query_model( *, messages: Iterable, model: str, - max_tokens: Optional[int] = None, - n: Optional[int] = 1, - stop: Union[Optional[str], List[str]] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None + max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, + n: Union[Optional[int], NotGiven] = NOT_GIVEN, + stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN ) -> List[str]: response = await self.client.chat.completions.create( messages=messages, From f3e6309235b11ca0798d4283872334e8b856c70a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Wed, 26 Jun 2024 16:14:51 -0700 Subject: [PATCH 06/69] Make imports easier Signed-off-by: Ryan Wolf --- nemo_curator/services/__init__.py | 2 ++ nemo_curator/synthetic/__init__.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py index 7d3e4559..83df8d3c 100644 --- a/nemo_curator/services/__init__.py +++ b/nemo_curator/services/__init__.py @@ -1,2 +1,4 @@ from .model_client import AsyncLLMClient, LLMClient from 
.openai_client import AsyncOpenAIClient, OpenAIClient + +__all__ = ["AsyncLLMClient", "LLMClient", "AsyncOpenAIClient", "OpenAIClient"] diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index e69de29b..2737fcf3 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -0,0 +1,4 @@ +from .prompts import DEFAULT_META_TOPICS_PROMPT_TEMPLATE +from .ultrachat import UltraChatGenerator + +__all__ = ["UltraChatGenerator", "DEFAULT_META_TOPICS_PROMPT_TEMPLATE"] From 5ad683f67f5bdff65c4bbe41737e3bbed962e816 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 14:43:51 -0700 Subject: [PATCH 07/69] Reformat to match nemotron report Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/__init__.py | 15 ++- nemo_curator/synthetic/nemotron.py | 138 ++++++++++++++++++++++++++++ nemo_curator/synthetic/prompts.py | 8 +- nemo_curator/synthetic/ultrachat.py | 59 ------------ 4 files changed, 155 insertions(+), 65 deletions(-) create mode 100644 nemo_curator/synthetic/nemotron.py delete mode 100644 nemo_curator/synthetic/ultrachat.py diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 2737fcf3..864b3c5b 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,4 +1,13 @@ -from .prompts import DEFAULT_META_TOPICS_PROMPT_TEMPLATE -from .ultrachat import UltraChatGenerator +from .nemotron import NemotronGenerator +from .prompts import ( + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, +) -__all__ = ["UltraChatGenerator", "DEFAULT_META_TOPICS_PROMPT_TEMPLATE"] +__all__ = [ + "NemotronGenerator", + "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", +] diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py new file mode 100644 index 00000000..117f9070 --- /dev/null +++ b/nemo_curator/synthetic/nemotron.py @@ -0,0 +1,138 @@ +from typing import List, Optional, Union + +from nemo_curator.services.model_client import AsyncLLMClient, LLMClient +from nemo_curator.synthetic.prompts import ( + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, +) + + +class NemotronGenerator: + """ + Provides a collection of methods for generating synthetic data + described in the Nemotron-4 340B Technical Report + (https://arxiv.org/abs/2406.11704v1) and inspired by the + UltraChat paper (https://arxiv.org/abs/2305.14233) + """ + + def __init__(self, llm_client: LLMClient) -> None: + self.client = llm_client + + def generate_world_question_openlines(self): + pass + + def generate_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the world + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + It is used where it is referenced in prompt_template + model: The name model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have a {ntopics} + parameter that will be populated with the ntopics value passed in this function. 
+ prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt = prompt_template.format(n_macro_topics=n_macro_topics, **prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + macro_topics = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + + return macro_topics + + def generate_subtopics( + self, + macro_topics: List[str], + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ): + """ + Prompts an LLM to generate a list of subtopics relating to a macro topic + Args: + macro_topics: A list of macro topics to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with an element of the macro_topics list passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM for each macro topic. The outer list will have the same length + as macro_topics, while the inner list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + subtopics = [] + for macro_topic in macro_topics: + prompt = prompt_template.format( + n_subtopics=n_subtopics, macro_topic=macro_topic, **prompt_kwargs + ) + messages = [{"role": "user", "content": prompt}] + subtopics_response = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + subtopics.append(subtopics_response) + + return subtopics + + def generate_open_qa_from_topics( + self, + topics: List[str], + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ): + """ + Prompts an LLM to generate a list of open Q&A questions based on topics + Args: + topics: A list of topics to generate questions for. + n_openlines: The number of questions to generate per topic. + model: The name model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with an element of the topics list passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM for each topic. 
The outer list will have the same length + as topics, while the inner list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + openlines = [] + for topic in topics: + prompt = prompt_template.format( + n_openlines=n_openlines, topic=topic, **prompt_kwargs + ) + messages = [{"role": "user", "content": prompt}] + subtopics_response = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + openlines.append(subtopics_response) + + return openlines + + def generate_creative_openlines(self): + pass + + def generate_data_assistance_openlines(self): + pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index 9d499e35..b2ea636d 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -1,3 +1,5 @@ -DEFAULT_META_TOPICS_PROMPT_TEMPLATE = ( - "Write a list of {ntopics} topics about the world." -) +DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass various aspects of our daily life, the world, and science? Your answer should be a list of topics. Make the topics as diverse as possible.For example, 1. Food and drinks. \n2. Technology.\n" + +DEFAULT_SUBTOPICS_PROMPT_TEMPLATE = "Can you generate {n_subtopics} comprehensive topics that encompass various aspects of {macro_topic}? Your answer should be a list of topics. Make the topics as diverse as possible." + +DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_openlines} questions or requests related to {topic}? The questions and requests should be as diverse possible. Your answer should be a list." diff --git a/nemo_curator/synthetic/ultrachat.py b/nemo_curator/synthetic/ultrachat.py deleted file mode 100644 index 9e70e7a9..00000000 --- a/nemo_curator/synthetic/ultrachat.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import List, Optional, Union - -from nemo_curator.services.model_client import AsyncLLMClient, LLMClient -from nemo_curator.synthetic.prompts import DEFAULT_META_TOPICS_PROMPT_TEMPLATE - - -class UltraChatGenerator: - """ - Provides a collection of methods for generating synthetic data inspired - by the UltraChat paper: https://arxiv.org/abs/2305.14233 - """ - - def __init__(self, llm_client: LLMClient) -> None: - self.client = llm_client - - def generate_world_question_openlines(self): - pass - - def generate_meta_topics( - self, - ntopics: Union[int, str], - model: str, - prompt_template: str = DEFAULT_META_TOPICS_PROMPT_TEMPLATE, - prompt_kwargs: dict = {}, - model_kwargs: dict = {}, - ) -> List[str]: - """ - Prompts an LLM to generate a list of meta topics about the world - Args: - ntopics: The number of meta topics to generate. Can be an integer like 5 or a string like "five". - It is used where it is referenced in prompt_template - model: The name model that should be used to generate the meta topics. - Must be available in the LLMClient passed in the constructor. - prompt_template: A format string of the prompt to use. It must have a {ntopics} - parameter that will be populated with the ntopics value passed in this function. - prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. - model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. - Returns: - A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
- """ - prompt = prompt_template.format(ntopics=ntopics, **prompt_kwargs) - messages = [{"role": "user", "content": prompt}] - meta_topics = self.client.query_model( - messages=messages, model=model, **model_kwargs - ) - - return meta_topics - - def generate_subtopics(self, meta_topics: List[str], model: str): - pass - - def generate_questions(self, subtopics): - pass - - def generate_creative_openlines(self): - pass - - def generate_data_assistance_openlines(self): - pass From 0d552b4476ee19779f775087736272032058f697 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 19:59:32 -0700 Subject: [PATCH 08/69] Add yaml conversion Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 38 ++++++++++++++++++++++++++++-- nemo_curator/synthetic/prompts.py | 2 ++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 117f9070..88f8a9e2 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -1,10 +1,13 @@ from typing import List, Optional, Union +import yaml + from nemo_curator.services.model_client import AsyncLLMClient, LLMClient from nemo_curator.synthetic.prompts import ( DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, ) @@ -19,6 +22,37 @@ class NemotronGenerator: def __init__(self, llm_client: LLMClient) -> None: self.client = llm_client + def convert_response_to_yaml_list( + self, + llm_response: str, + model: str, + prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Converts a response of an LLM to a list of strings by querying an LLM + Args: + llm_response: The original unformatted response of the LLM + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have a {llm_response} + parameter that will be populated with the llm_response value passed in this function. + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A parsed list of elements from the original LLM response + """ + prompt = prompt_template.format(llm_response=llm_response, **prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + yaml_response = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + parsed_response = yaml.safe_load(yaml_response) + + return parsed_response + def generate_world_question_openlines(self): pass @@ -67,7 +101,7 @@ def generate_subtopics( Args: macro_topics: A list of macro topics to generate subtopics for. n_subtopics: The number of subtopics to generate per macro topic - model: The name model that should be used to generate the macro topics. + model: The name model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_subtopics: Will be populated with the n_subtopics passed in this function @@ -106,7 +140,7 @@ def generate_open_qa_from_topics( Args: topics: A list of topics to generate questions for. 
n_openlines: The number of questions to generate per topic. - model: The name model that should be used to generate the macro topics. + model: The name model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_openlines: Will be populated with the n_subtopics passed in this function diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index b2ea636d..1362fea1 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -1,3 +1,5 @@ +DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE = "Can you convert this list of items into a yaml format? {llm_response} \n\n Your answer should only be a parsable yaml format." + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass various aspects of our daily life, the world, and science? Your answer should be a list of topics. Make the topics as diverse as possible.For example, 1. Food and drinks. \n2. Technology.\n" DEFAULT_SUBTOPICS_PROMPT_TEMPLATE = "Can you generate {n_subtopics} comprehensive topics that encompass various aspects of {macro_topic}? Your answer should be a list of topics. Make the topics as diverse as possible." From 87ebfc400cdddae7f42b971bd684f68a96f82e11 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 20:07:40 -0700 Subject: [PATCH 09/69] Fix index error Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 88f8a9e2..faf0aed3 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -49,7 +49,7 @@ def convert_response_to_yaml_list( yaml_response = self.client.query_model( messages=messages, model=model, **model_kwargs ) - parsed_response = yaml.safe_load(yaml_response) + parsed_response = yaml.safe_load(yaml_response[0]) return parsed_response From bb72a6831d01481b1d36d8aedb7aadd797b95152 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 20:24:52 -0700 Subject: [PATCH 10/69] Add error handling for yaml parsing Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/error.py | 20 ++++++++++++++++++++ nemo_curator/synthetic/nemotron.py | 28 +++++++++++++++++++++++++++- nemo_curator/synthetic/prompts.py | 13 +++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 nemo_curator/synthetic/error.py diff --git a/nemo_curator/synthetic/error.py b/nemo_curator/synthetic/error.py new file mode 100644 index 00000000..b89792ea --- /dev/null +++ b/nemo_curator/synthetic/error.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+class YamlConversionError(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + def __str__(self): + return self.message diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index faf0aed3..dfef08be 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -1,8 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import List, Optional, Union import yaml from nemo_curator.services.model_client import AsyncLLMClient, LLMClient +from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, @@ -49,7 +63,19 @@ def convert_response_to_yaml_list( yaml_response = self.client.query_model( messages=messages, model=model, **model_kwargs ) - parsed_response = yaml.safe_load(yaml_response[0]) + try: + parsed_response = yaml.safe_load(yaml_response[0]) + except yaml.scanner.ScannerError: + raise YamlConversionError( + f"Error parsing yaml response: {yaml_response[0]}" + ) + + # Ensure there are no additional hallucinations introduced + hallucination_free = all(elem in llm_response for elem in parsed_response) + if not hallucination_free: + raise YamlConversionError( + f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}" + ) return parsed_response diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index 1362fea1..fa27e56e 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -1,3 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE = "Can you convert this list of items into a yaml format? {llm_response} \n\n Your answer should only be a parsable yaml format." DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass various aspects of our daily life, the world, and science? Your answer should be a list of topics. Make the topics as diverse as possible.For example, 1. Food and drinks. \n2. 
Technology.\n" From 32c7f55bea274558a83272f0df6912ba129e2c09 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:04:28 -0700 Subject: [PATCH 11/69] Fix error Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index dfef08be..b7b290cb 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -65,7 +65,7 @@ def convert_response_to_yaml_list( ) try: parsed_response = yaml.safe_load(yaml_response[0]) - except yaml.scanner.ScannerError: + except yaml.scanner.ScannerError as _: raise YamlConversionError( f"Error parsing yaml response: {yaml_response[0]}" ) From a6d306e3874456b587a76a54d216d885bb3bd98d Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:15:48 -0700 Subject: [PATCH 12/69] Add additional yaml parsing check Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index b7b290cb..20eda86a 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -70,6 +70,11 @@ def convert_response_to_yaml_list( f"Error parsing yaml response: {yaml_response[0]}" ) + if not isinstance(parsed_response, list): + raise YamlConversionError( + f"Error: Parsed response was not a list: {parsed_response}" + ) + # Ensure there are no additional hallucinations introduced hallucination_free = all(elem in llm_response for elem in parsed_response) if not hallucination_free: From ece34b55cea656e2e51080b9e14090cfbd7955e4 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:20:18 -0700 Subject: [PATCH 13/69] Add more yaml error handling Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 20eda86a..7d4a8f27 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -75,12 +75,15 @@ def convert_response_to_yaml_list( f"Error: Parsed response was not a list: {parsed_response}" ) - # Ensure there are no additional hallucinations introduced - hallucination_free = all(elem in llm_response for elem in parsed_response) - if not hallucination_free: - raise YamlConversionError( - f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}" - ) + for elem in parsed_response: + if not isinstance(elem, str): + raise YamlConversionError( + f"Error: Parsed response contains non-string elements in list: {parsed_response}" + ) + if elem not in llm_response: + raise YamlConversionError( + f"Conversion introduced hallucinations. 
Original response:\n{llm_response}\nConverted response:\n{parsed_response}" + ) return parsed_response From 28d3a087695a77df3b701648702dd0e7a7300fb4 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:26:29 -0700 Subject: [PATCH 14/69] Export conversion error Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 864b3c5b..6b6bb774 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,3 +1,4 @@ +from .error import YamlConversionError from .nemotron import NemotronGenerator from .prompts import ( DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, @@ -10,4 +11,5 @@ "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", + "YamlConversionError", ] From 8cf295e73ac823a268e69204b432892d51e01584 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:34:57 -0700 Subject: [PATCH 15/69] Change variable naming Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 7d4a8f27..3741c12a 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -82,7 +82,7 @@ def convert_response_to_yaml_list( ) if elem not in llm_response: raise YamlConversionError( - f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}" + f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}\nHallucination:\n{elem}" ) return parsed_response @@ -192,10 +192,10 @@ def generate_open_qa_from_topics( n_openlines=n_openlines, topic=topic, **prompt_kwargs ) messages = [{"role": "user", "content": prompt}] - subtopics_response = self.client.query_model( + openline_response = self.client.query_model( messages=messages, model=model, **model_kwargs ) - openlines.append(subtopics_response) + openlines.append(openline_response) return openlines From 7fcd71934736d3fd684a441b8e95209de8b181b5 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 21:46:24 -0700 Subject: [PATCH 16/69] Make error catching more general Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 3741c12a..83a960b1 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -65,7 +65,7 @@ def convert_response_to_yaml_list( ) try: parsed_response = yaml.safe_load(yaml_response[0]) - except yaml.scanner.ScannerError as _: + except yaml.error.YAMLError as _: raise YamlConversionError( f"Error parsing yaml response: {yaml_response[0]}" ) From 76ddfdac5f0a281ca6a3191c3f7bcc4f8e99e26a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Sun, 30 Jun 2024 22:19:02 -0700 Subject: [PATCH 17/69] Refactor list out of nemotron Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 98 +++++++++++++++++++----------- nemo_curator/synthetic/prompts.py | 2 + 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 83a960b1..7f9f8873 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -20,6 +20,7 @@ from 
nemo_curator.synthetic.prompts import ( DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, ) @@ -123,46 +124,42 @@ def generate_macro_topics( def generate_subtopics( self, - macro_topics: List[str], + macro_topic: str, n_subtopics: Union[int, str], model: str, prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, prompt_kwargs: dict = {}, model_kwargs: dict = {}, - ): + ) -> List[str]: """ Prompts an LLM to generate a list of subtopics relating to a macro topic Args: - macro_topics: A list of macro topics to generate subtopics for. + macro_topic: The macro topic to generate subtopics for. n_subtopics: The number of subtopics to generate per macro topic model: The name model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_subtopics: Will be populated with the n_subtopics passed in this function - - macro_topic: Will be populated with an element of the macro_topics list passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. None are needed for the default template. model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. Returns: - A list of responses from the LLM for each macro topic. The outer list will have the same length - as macro_topics, while the inner list is only greater than length 1 if n > 1 is set in model_kwargs. + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. """ - subtopics = [] - for macro_topic in macro_topics: - prompt = prompt_template.format( - n_subtopics=n_subtopics, macro_topic=macro_topic, **prompt_kwargs - ) - messages = [{"role": "user", "content": prompt}] - subtopics_response = self.client.query_model( - messages=messages, model=model, **model_kwargs - ) - subtopics.append(subtopics_response) + prompt = prompt_template.format( + n_subtopics=n_subtopics, macro_topic=macro_topic, **prompt_kwargs + ) + messages = [{"role": "user", "content": prompt}] + subtopics_response = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) - return subtopics + return subtopics_response - def generate_open_qa_from_topics( + def generate_open_qa_from_topic( self, - topics: List[str], + topic: str, n_openlines: Union[str, int], model: str, prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, @@ -170,34 +167,65 @@ def generate_open_qa_from_topics( model_kwargs: dict = {}, ): """ - Prompts an LLM to generate a list of open Q&A questions based on topics + Prompts an LLM to generate a list of open Q&A questions based on a topic Args: - topics: A list of topics to generate questions for. + topic: The topic to generate questions for. n_openlines: The number of questions to generate per topic. model: The name model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. 
It must have the following parameters: - n_openlines: Will be populated with the n_subtopics passed in this function - - topic: Will be populated with an element of the topics list passed in this function + - topic: Will be populated with the topic passed in this function prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. None are needed for the default template. model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. Returns: - A list of responses from the LLM for each topic. The outer list will have the same length - as topics, while the inner list is only greater than length 1 if n > 1 is set in model_kwargs. + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. """ - openlines = [] - for topic in topics: - prompt = prompt_template.format( - n_openlines=n_openlines, topic=topic, **prompt_kwargs - ) - messages = [{"role": "user", "content": prompt}] - openline_response = self.client.query_model( - messages=messages, model=model, **model_kwargs - ) - openlines.append(openline_response) + prompt = prompt_template.format( + n_openlines=n_openlines, topic=topic, **prompt_kwargs + ) + messages = [{"role": "user", "content": prompt}] + openline_response = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + + return openline_response + + def revise_open_qa( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise an open Q&A question a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the question. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt = prompt_template.format( + openline=openline, n_revisions=n_revisions, **prompt_kwargs + ) + messages = [{"role": "user", "content": prompt}] + revisions = self.client.query_model( + messages=messages, model=model, **model_kwargs + ) - return openlines + return revisions def generate_creative_openlines(self): pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index fa27e56e..5104d913 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -18,3 +18,5 @@ DEFAULT_SUBTOPICS_PROMPT_TEMPLATE = "Can you generate {n_subtopics} comprehensive topics that encompass various aspects of {macro_topic}? Your answer should be a list of topics. Make the topics as diverse as possible." DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_openlines} questions or requests related to {topic}? The questions and requests should be as diverse possible. 
Your answer should be a list." + +DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE = "Question: {text1}\n\nCan you revise the question above to include more contexts or details? The revised questions can be any of the follows:\n1. Adding some context to the original question. The context might state the importance of the question, explain background knowledge, or add other reasonable information.\n2. Change the questions into a different format or style, e.g., imperative statements, length requirements for the answer, etc.\n3. Elongated questions that require to elaborate on specific topic or discuss a certain point.\n4. Any other related questions or statements.\n\nThe revised question should contain two, three, or four sentences. You should generate {n_tasks} revised questions or statements in a list. Make them as diverse as possible." From 2f7a03bd0d457bb5d36f98f652a0cf0b69848ab1 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 06:37:53 -0700 Subject: [PATCH 18/69] Add prompt helper function Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 68 ++++++++++++++++++------------ 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 7f9f8873..a1619bbd 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -37,6 +37,14 @@ class NemotronGenerator: def __init__(self, llm_client: LLMClient) -> None: self.client = llm_client + def _prompt( + self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict + ) -> List[str]: + prompt = prompt_template.format(**prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + + return self.client.query_model(messages=messages, model=model, **model_kwargs) + def convert_response_to_yaml_list( self, llm_response: str, @@ -59,11 +67,14 @@ def convert_response_to_yaml_list( Returns: A parsed list of elements from the original LLM response """ - prompt = prompt_template.format(llm_response=llm_response, **prompt_kwargs) - messages = [{"role": "user", "content": prompt}] - yaml_response = self.client.query_model( - messages=messages, model=model, **model_kwargs + prompt_kwargs["llm_response"] = llm_response + yaml_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, ) + try: parsed_response = yaml.safe_load(yaml_response[0]) except yaml.error.YAMLError as _: @@ -114,10 +125,12 @@ def generate_macro_topics( Returns: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. """ - prompt = prompt_template.format(n_macro_topics=n_macro_topics, **prompt_kwargs) - messages = [{"role": "user", "content": prompt}] - macro_topics = self.client.query_model( - messages=messages, model=model, **model_kwargs + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, ) return macro_topics @@ -147,12 +160,13 @@ def generate_subtopics( Returns: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
""" - prompt = prompt_template.format( - n_subtopics=n_subtopics, macro_topic=macro_topic, **prompt_kwargs - ) - messages = [{"role": "user", "content": prompt}] - subtopics_response = self.client.query_model( - messages=messages, model=model, **model_kwargs + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, ) return subtopics_response @@ -182,12 +196,13 @@ def generate_open_qa_from_topic( Returns: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. """ - prompt = prompt_template.format( - n_openlines=n_openlines, topic=topic, **prompt_kwargs - ) - messages = [{"role": "user", "content": prompt}] - openline_response = self.client.query_model( - messages=messages, model=model, **model_kwargs + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, ) return openline_response @@ -217,12 +232,13 @@ def revise_open_qa( Returns: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. """ - prompt = prompt_template.format( - openline=openline, n_revisions=n_revisions, **prompt_kwargs - ) - messages = [{"role": "user", "content": prompt}] - revisions = self.client.query_model( - messages=messages, model=model, **model_kwargs + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, ) return revisions From 76c4bdd9f9770ce4059b534cc2babfc87abf0782 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 06:59:53 -0700 Subject: [PATCH 19/69] Add revisions and writing prompts Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 77 +++++++++++++++++++++++++++++- nemo_curator/synthetic/prompts.py | 6 ++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index a1619bbd..6f074972 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -243,8 +243,81 @@ def revise_open_qa( return revisions - def generate_creative_openlines(self): - pass + def generate_writing_tasks( + self, + topic: str, + text_material_type: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of writing tasks based on a topic and document type + Args: + topic: The topic to generate writing tasks for. + text_material_type: The type of the document the question should ask to generate (e.g., "Email", "Poem") + n_openlines: The number of tasks to generate per topic and text material pair. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - topic: Will be populated with the topic passed in this function + - text_material_type: Will be populated with the text_material_type passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["text_material_type"] = text_material_type + prompt_kwargs["n_openlines"] = n_openlines + writing_tasks = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return writing_tasks + + def revise_writing_tasks( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise a writing task a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the task. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions def generate_data_assistance_openlines(self): pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index 5104d913..be8fa441 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -19,4 +19,8 @@ DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_openlines} questions or requests related to {topic}? The questions and requests should be as diverse possible. Your answer should be a list." -DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE = "Question: {text1}\n\nCan you revise the question above to include more contexts or details? The revised questions can be any of the follows:\n1. Adding some context to the original question. The context might state the importance of the question, explain background knowledge, or add other reasonable information.\n2. Change the questions into a different format or style, e.g., imperative statements, length requirements for the answer, etc.\n3. Elongated questions that require to elaborate on specific topic or discuss a certain point.\n4. Any other related questions or statements.\n\nThe revised question should contain two, three, or four sentences. 
You should generate {n_tasks} revised questions or statements in a list. Make them as diverse as possible." +DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE = "Question: {openline}\n\nCan you revise the question above to include more contexts or details? The revised questions can be any of the follows:\n1. Adding some context to the original question. The context might state the importance of the question, explain background knowledge, or add other reasonable information.\n2. Change the questions into a different format or style, e.g., imperative statements, length requirements for the answer, etc.\n3. Elongated questions that require to elaborate on specific topic or discuss a certain point.\n4. Any other related questions or statements.\n\nThe revised question should contain two, three, or four sentences. You should generate {n_revisions} revised questions or statements in a list. Make them as diverse as possible." + +DEFAULT_WRITING_TASK_PROMPT_TEMPLATE = 'Can you generate {n_openlines} tasks, each of which requires to create a "{text_material_type}" related to {topic}? Each task should be concise and include one or two sentences only. The tasks should be as diverse as possible. Your answer should be a list of tasks.' + +DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE = "TASK: {openline}\n\nCan you revise the task above to include more detailed requirements? These requirements can be any of the follows:\n1. Require to elaborate on a specific topic or discuss a certain point.\n2. Require to include some examples, data points, or references.\n3. Require to follow specific formats or styles, e.g., no more than 300 words, including specific words, etc.\n4. Any other reasonable requests to make the task more detailed.\n\nThe revised task should contain two, three, or four sentences. You should generate {n_revisions} revised tasks in a list. Make the tasks as diverse as possible." 
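
The patch above completes the writing-task helpers alongside the earlier open Q&A ones. The sketch below shows how they might be chained together once this commit is applied; it is illustrative only: the endpoint URL, API key, and model name are placeholders, and NemotronGenerator is assumed to take the LLMClient as its only constructor argument, as in the earlier patches in this series.

    from openai import OpenAI

    from nemo_curator import OpenAIClient
    from nemo_curator.synthetic import NemotronGenerator

    # Placeholder endpoint and credentials; any OpenAI-compatible service works here.
    openai_client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1", api_key="<insert API key>"
    )
    client = OpenAIClient(openai_client)
    generator = NemotronGenerator(client)  # constructor signature assumed from earlier patches
    model = "nvidia/nemotron-4-340b-instruct"  # placeholder model name

    # Ask for a handful of writing tasks about one topic and document type.
    task_response = generator.generate_writing_tasks(
        topic="Climate change",
        text_material_type="Poem",
        n_openlines=5,
        model=model,
    )[0]
    # The raw response is free text; convert it into a Python list of task strings
    # (this may raise YamlConversionError if the model's output cannot be parsed).
    tasks = generator.convert_response_to_yaml_list(task_response, model=model)
    # Tighten the first task with additional requirements.
    revised_tasks = generator.revise_writing_tasks(
        openline=tasks[0], n_revisions=2, model=model
    )[0]
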
From 2f15d897c2dbc44b39ee9a25b83376e9688705c1 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 07:19:24 -0700 Subject: [PATCH 20/69] Fix default prompt templates Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 6 ++++-- nemo_curator/synthetic/prompts.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 6f074972..066c0c85 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -21,7 +21,9 @@ DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, ) @@ -249,7 +251,7 @@ def generate_writing_tasks( text_material_type: str, n_openlines: Union[str, int], model: str, - prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, prompt_kwargs: dict = {}, model_kwargs: dict = {}, ) -> List[str]: @@ -288,7 +290,7 @@ def revise_writing_tasks( openline: str, n_revisions: Union[str, int], model: str, - prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, prompt_kwargs: dict = {}, model_kwargs: dict = {}, ) -> List[str]: diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index be8fa441..d23258d9 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE = "Can you convert this list of items into a yaml format? {llm_response} \n\n Your answer should only be a parsable yaml format." +DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE = "The following document contains a list of items. Parse the list of items into a yaml list of strings. Do not parse any other part of the document. There should be no additional formatting to your response, just the yaml list of strings.\n\n {llm_response}" DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass various aspects of our daily life, the world, and science? Your answer should be a list of topics. Make the topics as diverse as possible.For example, 1. Food and drinks. \n2. 
Technology.\n" From cc18dfec38d9b7132644e4a860b19952c3e17f6b Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 07:39:30 -0700 Subject: [PATCH 21/69] Add closed qa Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 39 +++++++++++++++++++++++++++++- nemo_curator/synthetic/prompts.py | 2 ++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 066c0c85..d8157898 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -18,6 +18,7 @@ from nemo_curator.services.model_client import AsyncLLMClient, LLMClient from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( + DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, @@ -181,7 +182,7 @@ def generate_open_qa_from_topic( prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, prompt_kwargs: dict = {}, model_kwargs: dict = {}, - ): + ) -> List[str]: """ Prompts an LLM to generate a list of open Q&A questions based on a topic Args: @@ -321,5 +322,41 @@ def revise_writing_tasks( return revisions + def generate_closed_qa_instructions( + self, + document: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of closed Q&A questions based on a reference document + Args: + document: The document to use when generating questions + n_openlines: The number of questions to generate per document. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - document: Will be populated with the document passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["document"] = document + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + def generate_data_assistance_openlines(self): pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index d23258d9..85b7ed98 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -24,3 +24,5 @@ DEFAULT_WRITING_TASK_PROMPT_TEMPLATE = 'Can you generate {n_openlines} tasks, each of which requires to create a "{text_material_type}" related to {topic}? Each task should be concise and include one or two sentences only. The tasks should be as diverse as possible. Your answer should be a list of tasks.' DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE = "TASK: {openline}\n\nCan you revise the task above to include more detailed requirements? These requirements can be any of the follows:\n1. 
Require to elaborate on a specific topic or discuss a certain point.\n2. Require to include some examples, data points, or references.\n3. Require to follow specific formats or styles, e.g., no more than 300 words, including specific words, etc.\n4. Any other reasonable requests to make the task more detailed.\n\nThe revised task should contain two, three, or four sentences. You should generate {n_revisions} revised tasks in a list. Make the tasks as diverse as possible." + +DEFAULT_CLOSED_QA_PROMPT_TEMPLATE = "TEXT: {document}\n\nGiven the text above, can you come up with {n_instructions} questions or tasks?They can be any of the follows:\n1. Asking certain information in the text;\n2. Summarizing, repharsing or explaining the text;\n3. Writing something similar to the text;\n4. Any other reasonable requests related to the text.\n\nMake the questions or tasks as diverse as possible." From d4755c015ff57d91f89e8cf64c6ab91f3596fa50 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 07:40:56 -0700 Subject: [PATCH 22/69] Fix prompt Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index 85b7ed98..f4e3b84c 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -25,4 +25,4 @@ DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE = "TASK: {openline}\n\nCan you revise the task above to include more detailed requirements? These requirements can be any of the follows:\n1. Require to elaborate on a specific topic or discuss a certain point.\n2. Require to include some examples, data points, or references.\n3. Require to follow specific formats or styles, e.g., no more than 300 words, including specific words, etc.\n4. Any other reasonable requests to make the task more detailed.\n\nThe revised task should contain two, three, or four sentences. You should generate {n_revisions} revised tasks in a list. Make the tasks as diverse as possible." -DEFAULT_CLOSED_QA_PROMPT_TEMPLATE = "TEXT: {document}\n\nGiven the text above, can you come up with {n_instructions} questions or tasks?They can be any of the follows:\n1. Asking certain information in the text;\n2. Summarizing, repharsing or explaining the text;\n3. Writing something similar to the text;\n4. Any other reasonable requests related to the text.\n\nMake the questions or tasks as diverse as possible." +DEFAULT_CLOSED_QA_PROMPT_TEMPLATE = "TEXT: {document}\n\nGiven the text above, can you come up with {n_openlines} questions or tasks? They can be any of the follows:\n1. Asking certain information in the text;\n2. Summarizing, repharsing or explaining the text;\n3. Writing something similar to the text;\n4. Any other reasonable requests related to the text.\n\nMake the questions or tasks as diverse as possible." 
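
With the closed Q&A prompt fixed above, a minimal sketch of generating document-grounded questions might look like the following. It is again illustrative only, with a placeholder endpoint, API key, and model name, and the same assumed generator construction as the earlier sketch.

    from openai import OpenAI

    from nemo_curator import OpenAIClient
    from nemo_curator.synthetic import NemotronGenerator

    client = OpenAIClient(
        OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key="<insert API key>")
    )
    generator = NemotronGenerator(client)
    model = "nvidia/nemotron-4-340b-instruct"  # placeholder model name

    document = (
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in "
        "Paris, completed in 1889 as the entrance arch to that year's World's Fair."
    )
    closed_qa_response = generator.generate_closed_qa_instructions(
        document=document,
        n_openlines=5,
        model=model,
    )[0]
    # Parse the free-text response into a list of individual questions or tasks.
    questions = generator.convert_response_to_yaml_list(closed_qa_response, model=model)
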
From 366fea887c31698c5e73741a26c32d84fe4f7466 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 08:03:23 -0700 Subject: [PATCH 23/69] Add math and coding Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 210 +++++++++++++++++++++++++++++ nemo_curator/synthetic/prompts.py | 14 ++ 2 files changed, 224 insertions(+) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index d8157898..2d0594f4 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -20,7 +20,13 @@ from nemo_curator.synthetic.prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, @@ -358,5 +364,209 @@ def generate_closed_qa_instructions( return openline_response + def generate_math_macro_topics( + self, + n_macro_topics: Union[int, str], + school_level: str, + model: str, + prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about math + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + school_level: The school level the math questions should be targeted at. + model: The name model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + prompt_kwargs["school_level"] = school_level + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + def generate_math_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a math macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + def classify_math_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs={}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to math + Args: + entity: The entity to classify + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + def generate_python_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the Python programming language + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + model: The name model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + def generate_python_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a Python macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + def classify_python_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs={}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to Python + Args: + entity: The entity to classify + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + def generate_data_assistance_openlines(self): pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index f4e3b84c..1507c0b5 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -26,3 +26,17 @@ DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE = "TASK: {openline}\n\nCan you revise the task above to include more detailed requirements? These requirements can be any of the follows:\n1. Require to elaborate on a specific topic or discuss a certain point.\n2. Require to include some examples, data points, or references.\n3. 
Require to follow specific formats or styles, e.g., no more than 300 words, including specific words, etc.\n4. Any other reasonable requests to make the task more detailed.\n\nThe revised task should contain two, three, or four sentences. You should generate {n_revisions} revised tasks in a list. Make the tasks as diverse as possible." DEFAULT_CLOSED_QA_PROMPT_TEMPLATE = "TEXT: {document}\n\nGiven the text above, can you come up with {n_openlines} questions or tasks? They can be any of the follows:\n1. Asking certain information in the text;\n2. Summarizing, repharsing or explaining the text;\n3. Writing something similar to the text;\n4. Any other reasonable requests related to the text.\n\nMake the questions or tasks as diverse as possible." + +DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE = "Can you generate {n_macro_topics} comprehensive topics that encompass the mathematics knowledge taughted in {school_level}? Your answer should be a list of topics. Make the topics as diverse as possible." + +DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE = 'List {n_subtopics} mathemathics topics that encompass various aspects of "{macro_topic}". Your answer should be a list of topics. Make the topics as diverse as possible.' + +DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Math concepts taught at elementary school, middle school, high school, and univiersity.\n- Important mathematics axioms, theorems, algorithms, equations, or inequalities.\n- Representative math problems, functions, and applications.\n\nYour answer should start with "Yes" or "No".' + +DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE = ( + "List {n_macro_topics} important concepts in the python language." +) + +DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE = 'List {n_subtopics} important concepts related to "{macro_topic}" in the python language.' + +DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Programming concepts like loops, functions, and data structures in python.\n- Important functions, objects, or libraries in python.\n- Mathematical concepts like linear algebra which can be implemented in python.\n- Basic algorithms or problems in computer science likes Greedy Search and Dynamics programming which can be addressed in python.\n\nYour answer should start with "Yes" or "No".' 
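
The math and Python helpers added above follow the same pattern as the general topic pipeline: generate macro topics, expand them into subtopics, and optionally classify whether an entity belongs to the domain. A hedged sketch, with the same placeholder endpoint and model name as the earlier sketches:

    from openai import OpenAI

    from nemo_curator import OpenAIClient
    from nemo_curator.synthetic import NemotronGenerator

    client = OpenAIClient(
        OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key="<insert API key>")
    )
    generator = NemotronGenerator(client)
    model = "nvidia/nemotron-4-340b-instruct"  # placeholder model name

    # Macro topics -> subtopics for school-level math.
    math_macro_response = generator.generate_math_macro_topics(
        n_macro_topics=5, school_level="middle school", model=model
    )[0]
    math_macro_topics = generator.convert_response_to_yaml_list(
        math_macro_response, model=model
    )
    math_subtopic_response = generator.generate_math_subtopics(
        macro_topic=math_macro_topics[0], n_subtopics=3, model=model
    )[0]

    # The classification helpers prompt for a "Yes"/"No" style answer, returned as raw text.
    verdict = generator.classify_math_entity(entity="Pythagorean theorem", model=model)[0]

    # The Python-language variants mirror the math ones.
    python_macro_response = generator.generate_python_macro_topics(
        n_macro_topics=5, model=model
    )[0]
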
From f563018de7b11ac5be8a278ced64bb22d1dbae20 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 08:18:55 -0700 Subject: [PATCH 24/69] Add problem generation Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 78 ++++++++++++++++++++++++++++++ nemo_curator/synthetic/prompts.py | 10 ++++ 2 files changed, 88 insertions(+) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 2d0594f4..a81fa22f 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -32,6 +32,8 @@ DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, ) @@ -468,6 +470,42 @@ def classify_math_entity( return classification_response + def generate_math_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of math problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + def generate_python_macro_topics( self, n_macro_topics: Union[int, str], @@ -568,5 +606,45 @@ def classify_python_entity( return classification_response + def generate_coding_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + language="Python", + prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of coding problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + language: The programming language to target when generating these questions. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + - language: Will be populated with the language passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. 
+ model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + prompt_kwargs["language"] = language + openline_response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + def generate_data_assistance_openlines(self): pass diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index 1507c0b5..e7e2e5ed 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -33,6 +33,10 @@ DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Math concepts taught at elementary school, middle school, high school, and univiersity.\n- Important mathematics axioms, theorems, algorithms, equations, or inequalities.\n- Representative math problems, functions, and applications.\n\nYour answer should start with "Yes" or "No".' +MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE = 'Generate {n_openlines} mathematics problems which are related to "{topic}" or can be addressed using "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE = 'Generate {n_openlines} mathematics problems which are related to "{topic}" or can be addressed using "{topic}". These problems should be suitable for beginners who just learnt "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE = ( "List {n_macro_topics} important concepts in the python language." ) @@ -40,3 +44,9 @@ DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE = 'List {n_subtopics} important concepts related to "{macro_topic}" in the python language.' DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE = 'Does the concept "{entity}" belong to one of the following categories?\n- Programming concepts like loops, functions, and data structures in python.\n- Important functions, objects, or libraries in python.\n- Mathematical concepts like linear algebra which can be implemented in python.\n- Basic algorithms or problems in computer science likes Greedy Search and Dynamics programming which can be addressed in python.\n\nYour answer should start with "Yes" or "No".' + +PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for beginners who just learnt "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for medium-level programmers with some experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for advanced programmers with solid knowledge and experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' 
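
The problem-generation helpers above accept alternate prompt templates to steer difficulty. A sketch using the method names as they stand at this point in the series; the templates are imported from nemo_curator.synthetic.prompts because they are not yet re-exported from the package __init__, and the endpoint and model name remain placeholders.

    from openai import OpenAI

    from nemo_curator import OpenAIClient
    from nemo_curator.synthetic import NemotronGenerator
    from nemo_curator.synthetic.prompts import (
        MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE,
        PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE,
    )

    client = OpenAIClient(
        OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key="<insert API key>")
    )
    generator = NemotronGenerator(client)
    model = "nvidia/nemotron-4-340b-instruct"  # placeholder model name

    # Beginner-level math problems about a single topic.
    beginner_math_response = generator.generate_math_problem(
        topic="Fractions",
        n_openlines=5,
        model=model,
        prompt_template=MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE,
    )[0]

    # Intermediate coding problems; generate_coding_problem is renamed to
    # generate_python_problem by the next commit in the series.
    coding_response = generator.generate_coding_problem(
        topic="List comprehensions",
        n_openlines=5,
        model=model,
        language="Python",
        prompt_template=PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE,
    )[0]
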
From 294a390e2d1a3b4569920979a0b034dbdea97a0d Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 08:20:29 -0700 Subject: [PATCH 25/69] Rename function Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index a81fa22f..44810a15 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -606,7 +606,7 @@ def classify_python_entity( return classification_response - def generate_coding_problem( + def generate_python_problem( self, topic: str, n_openlines: Union[str, int], From 728d585f49a98f11d19f4bd3bb3f833a66f80338 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 09:59:47 -0700 Subject: [PATCH 26/69] Add dialogue support Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/__init__.py | 38 +++++++++ nemo_curator/synthetic/nemotron.py | 129 +++++++++++++++++++++++++---- nemo_curator/synthetic/prompts.py | 6 ++ 3 files changed, 157 insertions(+), 16 deletions(-) diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 6b6bb774..968a0f1b 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,9 +1,28 @@ from .error import YamlConversionError from .nemotron import NemotronGenerator from .prompts import ( + DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE, + DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE, + DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE, + PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE, ) __all__ = [ @@ -11,5 +30,24 @@ "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE", + "DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE", + "DEFAULT_WRITING_TASK_PROMPT_TEMPLATE", + "DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE", + "DEFAULT_CLOSED_QA_PROMPT_TEMPLATE", + "DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE", + "MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE", + "MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE", + "DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE", + "PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE", + "DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE", + "DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE", + "DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE", "YamlConversionError", ] diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 44810a15..1cd11268 100644 --- 
a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -32,6 +32,7 @@ DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, ) @@ -68,7 +69,7 @@ def convert_response_to_yaml_list( Converts a response of an LLM to a list of strings by querying an LLM Args: llm_response: The original unformatted response of the LLM - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have a {llm_response} parameter that will be populated with the llm_response value passed in this function. @@ -126,7 +127,7 @@ def generate_macro_topics( Args: n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". It is used where it is referenced in prompt_template - model: The name model that should be used to generate the macro topics. + model: The name of the model that should be used to generate the macro topics. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have a {ntopics} parameter that will be populated with the ntopics value passed in this function. @@ -160,7 +161,7 @@ def generate_subtopics( Args: macro_topic: The macro topic to generate subtopics for. n_subtopics: The number of subtopics to generate per macro topic - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_subtopics: Will be populated with the n_subtopics passed in this function @@ -196,7 +197,7 @@ def generate_open_qa_from_topic( Args: topic: The topic to generate questions for. n_openlines: The number of questions to generate per topic. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_openlines: Will be populated with the n_subtopics passed in this function @@ -232,7 +233,7 @@ def revise_open_qa( Args: openline: An openline to revise n_revisions: The number of revisions to generate for the question. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - openline: Will be populated with the openline passed in this function @@ -270,7 +271,7 @@ def generate_writing_tasks( topic: The topic to generate writing tasks for. text_material_type: The type of the document the question should ask to generate (e.g., "Email", "Poem") n_openlines: The number of tasks to generate per topic and text material pair. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. 
Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - topic: Will be populated with the topic passed in this function @@ -308,7 +309,7 @@ def revise_writing_tasks( Args: openline: An openline to revise n_revisions: The number of revisions to generate for the task. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - openline: Will be populated with the openline passed in this function @@ -344,7 +345,7 @@ def generate_closed_qa_instructions( Args: document: The document to use when generating questions n_openlines: The number of questions to generate per document. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - document: Will be populated with the document passed in this function @@ -380,7 +381,7 @@ def generate_math_macro_topics( Args: n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". school_level: The school level the math questions should be targeted at. - model: The name model that should be used to generate the macro topics. + model: The name of the model that should be used to generate the macro topics. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_macro_topics: Will be populated with the n_macro_topics passed in this function @@ -416,7 +417,7 @@ def generate_math_subtopics( Args: macro_topic: The macro topic to generate subtopics for. n_subtopics: The number of subtopics to generate per macro topic - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_subtopics: Will be populated with the n_subtopics passed in this function @@ -450,7 +451,7 @@ def classify_math_entity( Prompts an LLM to classify if an entity is related to math Args: entity: The entity to classify - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - entity: Will be populated with the entity passed in this function @@ -484,11 +485,14 @@ def generate_math_problem( Args: topic: The topic to generate problems for. n_openlines: The number of problems to generate per topic. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. 
It must have the following parameters: - n_openlines: Will be populated with the n_subtopics passed in this function - topic: Will be populated with the topic passed in this function + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. None are needed for the default template. model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. @@ -518,7 +522,7 @@ def generate_python_macro_topics( Prompts an LLM to generate a list of macro topics about the Python programming language Args: n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". - model: The name model that should be used to generate the macro topics. + model: The name of the model that should be used to generate the macro topics. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_macro_topics: Will be populated with the n_macro_topics passed in this function @@ -552,7 +556,7 @@ def generate_python_subtopics( Args: macro_topic: The macro topic to generate subtopics for. n_subtopics: The number of subtopics to generate per macro topic - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_subtopics: Will be populated with the n_subtopics passed in this function @@ -586,7 +590,7 @@ def classify_python_entity( Prompts an LLM to classify if an entity is related to Python Args: entity: The entity to classify - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. prompt_template: A format string of the prompt to use. It must have the following parameters: - entity: Will be populated with the entity passed in this function @@ -621,13 +625,17 @@ def generate_python_problem( Args: topic: The topic to generate problems for. n_openlines: The number of problems to generate per topic. - model: The name model that should be used to generate the response. + model: The name of the model that should be used to generate the response. Must be available in the LLMClient passed in the constructor. language: The programming language to target when generating these questions. prompt_template: A format string of the prompt to use. It must have the following parameters: - n_openlines: Will be populated with the n_subtopics passed in this function - topic: Will be populated with the topic passed in this function - language: Will be populated with the language passed in this function + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. None are needed for the default template. model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. 
@@ -648,3 +656,92 @@ def generate_python_problem( def generate_data_assistance_openlines(self): pass + + def generate_dialogue( + self, + openline: str, + user_model: str, + assistant_model: str, + n_user_turns: int = 3, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a dialogue based on a given openline. + The LLM will alternate impersonating the user and the assistant. + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + n_user_turns: The number of user turns to go through. The openline counts as 1 user turn. + Therefore, if there are 3 user turns, 2 will be generated by the LLM impersonating the user. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. 
+ Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + for _ in range(n_user_turns - 1): + user_response = self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": assistant_response} + ) + + return conversation_history + + def _impersonate_user( + self, + conversation_history: List[dict], + model: str, + prompt_template: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> str: + # Convert the conversation history to a string + history_str = "" + for turn in conversation_history: + history_str += f"{turn['role'].capitalize()}: {turn['content']}" + prompt_kwargs["conversation_history"] = history_str + response = self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return response[0] diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index e7e2e5ed..a8baa1fa 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -50,3 +50,9 @@ PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for medium-level programmers with some experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for advanced programmers with solid knowledge and experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' + +DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{converstaion_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Directly give me the question without extraneous words." + +DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Make sure the question is complex and diverse enough and suitable as a followup question. Directly give me the question without extraneous words." + +DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the toneof User. Be critical. 
Make sure the question is concise and has a real-life tone. Directly give me the question without extraneous words." From 4c64c3a9eda6864be95e5187365263536f369884 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 10:10:22 -0700 Subject: [PATCH 27/69] Fix mispell Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index a8baa1fa..fbe7e026 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -51,7 +51,7 @@ PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE = 'Generate {n_openlines} {language} coding problems related to "{topic}". These problems should be suitable for advanced programmers with solid knowledge and experiences of "{topic}". Your answer should be a list of problems. Make them as diverse as possible.' -DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{converstaion_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Directly give me the question without extraneous words." +DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Directly give me the question without extraneous words." DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Make sure the question is complex and diverse enough and suitable as a followup question. Directly give me the question without extraneous words." From 8db6019a194e4209d952bcc3ab0bc3c224d65e9c Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 15:24:33 -0700 Subject: [PATCH 28/69] Add two turn generation Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 58 +++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 1cd11268..71cc3e6a 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -667,7 +667,7 @@ def generate_dialogue( prompt_kwargs: dict = {}, user_model_kwargs: dict = {}, assistant_model_kwargs: dict = {}, - ) -> List[str]: + ) -> List[dict]: """ Prompts an LLM to generate a dialogue based on a given openline. The LLM will alternate impersonating the user and the assistant. @@ -724,6 +724,62 @@ def generate_dialogue( return conversation_history + def generate_two_turn_prompt( + self, + openline: str, + user_model: str, + assistant_model: str, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a response as an assistant, then as the user based on a given openline. 
+ The conversation will look like "User -> Assistant -> User" + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. + Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + + user_response = self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + + return conversation_history + def _impersonate_user( self, conversation_history: List[dict], From 2d13d63222a3461d530d79f1581af5a53f64a99e Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 21:53:49 -0700 Subject: [PATCH 29/69] Add reward model as judge Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 32 +++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 71cc3e6a..14be8d88 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -584,7 +584,7 @@ def classify_python_entity( model: str, prompt_template: str = DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, prompt_kwargs: dict = {}, - model_kwargs={}, + model_kwargs: dict = {}, ) -> List[str]: """ Prompts an LLM to classify if an entity is related to Python @@ -801,3 +801,33 @@ def _impersonate_user( ) return response[0] + + def calculate_rewards( + self, + conversation_history: List[dict], + reward_model: str, + model_kwargs: dict = {}, + ) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + conversation_history: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be available in the LLMClient passed in the constructor. + Must be a reward model, cannot be a regular LLM. 
+ model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A mapping of score_name -> score + """ + response = self.client.query_model( + messages=conversation_history, model=reward_model, **model_kwargs + ) + metrics = [ + metric.split(":") + for metric in response.choices[0].message[0].content.split(",") + ] + scores = {category: float(score) for category, score in metrics} + + return scores From 8336452fc3138c21f3ba60fe7eb368fd94fb7fd4 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 22:07:56 -0700 Subject: [PATCH 30/69] Refactor reward query Signed-off-by: Ryan Wolf --- nemo_curator/services/model_client.py | 30 +++++++++++++++-- nemo_curator/services/openai_client.py | 46 ++++++++++++++++++++++++++ nemo_curator/synthetic/nemotron.py | 30 ----------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index d2e74e7e..aa0ed17b 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -18,10 +18,21 @@ def query_model( n: Optional[int] = 1, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, - top_p: Optional[float] = None + top_p: Optional[float] = None, ) -> List[str]: raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") + @abstractmethod + def query_reward_model( + self, + *, + messages: Iterable, + model: str, + ) -> dict: + raise NotImplementedError( + "Subclass of LLMClient must implement 'query_reward_model'" + ) + class AsyncLLMClient(ABC): """ @@ -39,6 +50,19 @@ async def query_model( n: Optional[int] = 1, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, - top_p: Optional[float] = None + top_p: Optional[float] = None, ) -> List[str]: - raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") + raise NotImplementedError( + "Subclass of AsyncLLMClient must implement 'query_model'" + ) + + @abstractmethod + async def query_reward_model( + self, + *, + messages: Iterable, + model: str, + ) -> dict: + raise NotImplementedError( + "Subclass of LLMClient must implement 'query_reward_model'" + ) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 3f192c8b..b1131e82 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -37,6 +37,28 @@ def query_model( return [choice.message.content for choice in response.choices] + def query_reward_model(self, *, messages: Iterable, model: str) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + messages: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be a reward model, cannot be a regular LLM. 
+ Returns: + A mapping of score_name -> score + """ + response = self.client.chat.completions.create(messages=messages, model=model) + + metrics = [ + metric.split(":") + for metric in response.choices[0].message[0].content.split(",") + ] + scores = {category: float(score) for category, score in metrics} + + return scores + class AsyncOpenAIClient(AsyncLLMClient): """ @@ -68,3 +90,27 @@ async def query_model( ) return [choice.message.content for choice in response.choices] + + async def query_reward_model(self, *, messages: Iterable, model: str) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + messages: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be a reward model, cannot be a regular LLM. + Returns: + A mapping of score_name -> score + """ + response = await self.client.chat.completions.create( + messages=messages, model=model + ) + + metrics = [ + metric.split(":") + for metric in response.choices[0].message[0].content.split(",") + ] + scores = {category: float(score) for category, score in metrics} + + return scores diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 14be8d88..a5acab2a 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -801,33 +801,3 @@ def _impersonate_user( ) return response[0] - - def calculate_rewards( - self, - conversation_history: List[dict], - reward_model: str, - model_kwargs: dict = {}, - ) -> dict: - """ - Prompts an LLM Reward model to score a conversation between a user and assistant - Args: - conversation_history: The conversation to calculate a score for. - Should be formatted like: - [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] - model: The name of the model that should be used to calculate the reward. - Must be available in the LLMClient passed in the constructor. - Must be a reward model, cannot be a regular LLM. - model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. 
- Returns: - A mapping of score_name -> score - """ - response = self.client.query_model( - messages=conversation_history, model=reward_model, **model_kwargs - ) - metrics = [ - metric.split(":") - for metric in response.choices[0].message[0].content.split(",") - ] - scores = {category: float(score) for category, score in metrics} - - return scores From 87acce030090091eae6374707b0a5ea120b3d7b4 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 22:13:03 -0700 Subject: [PATCH 31/69] Add error handling for non-reward models Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index b1131e82..59fcd43b 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -23,7 +23,7 @@ def query_model( n: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, - top_p: Union[Optional[float], NotGiven] = NOT_GIVEN + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: response = self.client.chat.completions.create( messages=messages, @@ -77,7 +77,7 @@ async def query_model( n: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, - top_p: Union[Optional[float], NotGiven] = NOT_GIVEN + top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: response = await self.client.chat.completions.create( messages=messages, @@ -107,10 +107,12 @@ async def query_reward_model(self, *, messages: Iterable, model: str) -> dict: messages=messages, model=model ) - metrics = [ - metric.split(":") - for metric in response.choices[0].message[0].content.split(",") - ] + try: + message = response.choices[0].message[0] + except TypeError as _: + raise ValueError(f"{model} is not a reward model.") + + metrics = [metric.split(":") for metric in message.content.split(",")] scores = {category: float(score) for category, score in metrics} return scores From fd1f0665817098aad3422e4c38894582503ebf79 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 1 Jul 2024 22:18:06 -0700 Subject: [PATCH 32/69] Add error handling to sync client Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 59fcd43b..ca233e82 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -51,10 +51,12 @@ def query_reward_model(self, *, messages: Iterable, model: str) -> dict: """ response = self.client.chat.completions.create(messages=messages, model=model) - metrics = [ - metric.split(":") - for metric in response.choices[0].message[0].content.split(",") - ] + try: + message = response.choices[0].message[0] + except TypeError as _: + raise ValueError(f"{model} is not a reward model.") + + metrics = [metric.split(":") for metric in message.content.split(",")] scores = {category: float(score) for category, score in metrics} return scores From 69c431f6537a31103eec2667c07d5f55fb4bca64 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 06:59:30 -0700 Subject: [PATCH 33/69] Add open qa pipeline Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 151 
+++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 6 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index a5acab2a..587d3639 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -111,9 +111,6 @@ def convert_response_to_yaml_list( return parsed_response - def generate_world_question_openlines(self): - pass - def generate_macro_topics( self, n_macro_topics: Union[int, str], @@ -654,9 +651,6 @@ def generate_python_problem( return openline_response - def generate_data_assistance_openlines(self): - pass - def generate_dialogue( self, openline: str, @@ -801,3 +795,148 @@ def _impersonate_user( ) return response[0] + + def run_open_qa_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + open_qa_from_topics_prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + revise_open_qa_prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + ) -> List[str]: + # Generate the macro topics + responses = self.generate_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Generate the openlines + raw_lines = [ + self.generate_open_qa_from_topic( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=open_qa_from_topics_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise 
YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + # Revise the openlines + raw_revisions = [ + self.revise_open_qa( + openline=line, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_open_qa_prompt_template, + )[0] + for line in openlines + ] + revised_openlines = [] + for line in raw_revisions: + try: + parsed_revision = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + revised_openlines.extend(parsed_revision) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return revised_openlines + + def run_writing_pipeline(): + pass + + def run_closed_qa_pipeline(): + pass + + def run_math_pipeline(): + pass + + def run_python_pipeline(): + pass From 24089724ecbd131bbeca650694396a55d6d32754 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 07:41:25 -0700 Subject: [PATCH 34/69] Improve docs and add writing pipeline Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 158 +++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 6 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 587d3639..053a1acb 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -122,12 +122,11 @@ def generate_macro_topics( """ Prompts an LLM to generate a list of macro topics about the world Args: - n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". - It is used where it is referenced in prompt_template + n_macro_topics: The number of macro topics to generate. model: The name of the model that should be used to generate the macro topics. Must be available in the LLMClient passed in the constructor. - prompt_template: A format string of the prompt to use. It must have a {ntopics} - parameter that will be populated with the ntopics value passed in this function. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. None are needed for the default template. model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. @@ -813,7 +812,46 @@ def run_open_qa_pipeline( additional_macro_topics: List[str] = [], additional_subtopics: List[str] = [], ignore_conversion_failure: bool = False, + combine_topics: bool = True, ) -> List[str]: + """ + Runs a pipeline for automatically generating Open Q&A openlines for a dialogue + Args: + n_macro_topics: The number of macro topics to generate + n_subtopics: The number of subtopics to generate per macro topic + n_openlines: The number of questions to generate per topic. + n_revisions: The number of revisions to generate per original question. + model: The name of the model that should be used to generate all the responses. 
+ Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + open_qa_from_topics_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + revise_open_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with a generated open Q&A openline + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated open Q&A prompts + """ # Generate the macro topics responses = self.generate_macro_topics( n_macro_topics=n_macro_topics, @@ -865,6 +903,10 @@ def run_open_qa_pipeline( raise e topic_list.extend(additional_subtopics) + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + # Generate the openlines raw_lines = [ self.generate_open_qa_from_topic( @@ -929,8 +971,112 @@ def run_open_qa_pipeline( return revised_openlines - def run_writing_pipeline(): - pass + def run_writing_pipeline( + self, + topics: List[str], + text_material_types: List[str], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + writing_task_prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + revise_writing_task_prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[str]: + """ + Runs a pipeline for automatically generating writing task openlines for a dialogue + Args: + topics: A list of topics to generate tasks for + text_material_types: A list of writing material types, like "Essay" or "Blog post" + n_openlines: The number of tasks to generate per (topic, text_material_type) pair. 
+ n_revisions: The number of revisions to generate per original task. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with one element of the topics list passed in this function + - text_material_type: Will be populated with one element of the text_material_types list passed in this function + No additional parameters may be passed to this prompt template. + revise_writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with one of the writing tasks generated in the pipeline. + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of synthetically generated writing task prompts + """ + # Generate the tasks + writing_tasks = [] + for topic in topics: + for material in text_material_types: + raw_tasks = self.generate_writing_tasks( + topic=topic, + text_material_type=material, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=writing_task_prompt_template, + )[0] + try: + parsed_tasks = self.convert_response_to_yaml_list( + raw_tasks, + model=model, + prompt_template=yaml_conversion_prompt_template, + ) + writing_tasks.extend(parsed_tasks) + if len(parsed_tasks) != n_openlines: + raise YamlConversionError( + f"Error: Length of writing tasks {len(parsed_tasks)} does not match desired n_openlines {n_openlines}: {parsed_tasks}" + ) + writing_tasks.extend(parsed_tasks) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + # Revise the tasks + raw_revisions = [ + self.revise_writing_tasks( + openline=line, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_writing_task_prompt_template, + )[0] + for line in writing_tasks + ] + revised_openlines = [] + for line in raw_revisions: + try: + parsed_revision = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + revised_openlines.extend(parsed_revision) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return revised_openlines def run_closed_qa_pipeline(): pass From 
c8c8039ff0fc92d54fa77c8af852ae3e86c66c9d Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 08:01:33 -0700 Subject: [PATCH 35/69] Add closed qa pipeline Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 72 ++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 053a1acb..a6caf252 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import yaml @@ -1078,8 +1078,74 @@ def run_writing_pipeline( return revised_openlines - def run_closed_qa_pipeline(): - pass + def run_closed_qa_pipeline( + self, + documents: List[str], + n_openlines: Union[str, int], + model: str, + closed_qa_prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[Tuple[int, str]]: + """ + Runs a pipeline for automatically generating closed Q&A openlines for a dialogue + Args: + documents: A list of documents to generate closed Q&A questions for + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + closed_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - document: Will be populated with one element of the documents list passed in this function + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of pairs where the first element represents the index of the document used to generate the question in the documents list + and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] 
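As a rough usage sketch of this pipeline (the endpoint, model name, and documents below are illustrative placeholders, not values from this patch), the returned (index, openline) pairs can be joined back to their source documents:

    from openai import OpenAI
    from nemo_curator.services import OpenAIClient
    from nemo_curator.synthetic import NemotronGenerator

    # Placeholder endpoint and key; any OpenAI-compatible inference server works here.
    client = OpenAIClient(OpenAI(base_url="https://example.invalid/v1", api_key="..."))
    generator = NemotronGenerator(client)

    documents = ["First reference document...", "Second reference document..."]
    pairs = generator.run_closed_qa_pipeline(
        documents=documents,
        n_openlines=3,
        model="my-instruct-model",  # placeholder model name
        ignore_conversion_failure=True,
    )
    # Pair each generated closed Q&A openline with the document it was generated from.
    closed_qa_pairs = [(documents[i], openline) for i, openline in pairs]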
+ """ + raw_instructions = [ + self.generate_closed_qa_instructions( + document=document, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=closed_qa_prompt_template, + )[0] + for document in documents + ] + document_openline_pairs = [] + for i, instruction in enumerate(raw_instructions): + try: + parsed_instructions = self.convert_response_to_yaml_list( + instruction, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_instructions) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_instructions)} does not match desired n_openlines {n_openlines}: {parsed_instructions}" + ) + document_openline_pairs.extend( + [(i, inst) for inst in parsed_instructions] + ) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return document_openline_pairs def run_math_pipeline(): pass From babdb403265573638dd2007b380b613b0f02611a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 08:21:21 -0700 Subject: [PATCH 36/69] Add math pipeline Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 146 ++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 2 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index a6caf252..57497242 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -1147,8 +1147,150 @@ def run_closed_qa_pipeline( return document_openline_pairs - def run_math_pipeline(): - pass + def run_math_pipeline( + self, + n_macro_topics: Union[str, int], + school_level: str, + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + math_problem_prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating math questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + school_level: The school level to target when generating macro topics. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. 
+ math_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated math prompts + """ + # Generate the macro topics + responses = self.generate_math_macro_topics( + n_macro_topics=n_macro_topics, + school_level=school_level, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_math_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self.generate_math_problem( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=math_problem_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise 
YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return openlines def run_python_pipeline(): pass From c3a999853e5bda666aef6a8e910cd4c29ddf404e Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 08:47:49 -0700 Subject: [PATCH 37/69] Add python pipeline Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 144 ++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 2 deletions(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 57497242..4a18b12b 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -1292,5 +1292,145 @@ def run_math_pipeline( return openlines - def run_python_pipeline(): - pass + def run_python_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + python_problem_prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Python questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + python_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - language: Will be populated with "Python" + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. 
+ base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated Python prompts + """ + # Generate the macro topics + responses = self.generate_python_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self.generate_python_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + for macro_topic in macro_topics + ] + topic_list = [] + for topic in raw_topics: + try: + parsed_topics = self.convert_response_to_yaml_list( + topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + topic_list.extend(parsed_topics) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self.generate_python_problem( + topic=t, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=python_problem_prompt_template, + )[0] + for t in topic_list + ] + openlines = [] + for line in raw_lines: + try: + parsed_line = self.convert_response_to_yaml_list( + line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + openlines.extend(parsed_line) + except YamlConversionError as e: + if ignore_conversion_failure: + continue + else: + raise e + + return openlines From 48665ee456c945e96299db8b6b8555085e8cc8eb Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 09:49:43 -0700 Subject: [PATCH 38/69] Add async nemotron generator Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/__init__.py | 2 + nemo_curator/synthetic/async_nemotron.py | 1675 ++++++++++++++++++++++ nemo_curator/synthetic/nemotron.py | 6 +- 3 files changed, 1680 insertions(+), 3 deletions(-) create mode 100644 
nemo_curator/synthetic/async_nemotron.py diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 968a0f1b..7dc47920 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,3 +1,4 @@ +from .async_nemotron import AsyncNemotronGenerator from .error import YamlConversionError from .nemotron import NemotronGenerator from .prompts import ( @@ -27,6 +28,7 @@ __all__ = [ "NemotronGenerator", + "AsyncNemotronGenerator", "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py new file mode 100644 index 00000000..5514c187 --- /dev/null +++ b/nemo_curator/synthetic/async_nemotron.py @@ -0,0 +1,1675 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +from typing import List, Optional, Tuple, Union + +import yaml + +from nemo_curator.services.model_client import AsyncLLMClient +from nemo_curator.synthetic.error import YamlConversionError +from nemo_curator.synthetic.prompts import ( + DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, +) + + +class AsyncNemotronGenerator: + """ + Provides a collection of methods for generating synthetic data + described in the Nemotron-4 340B Technical Report + (https://arxiv.org/abs/2406.11704v1) and inspired by the + UltraChat paper (https://arxiv.org/abs/2305.14233) + """ + + def __init__(self, llm_client: AsyncLLMClient) -> None: + self.client = llm_client + + async def _prompt( + self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict + ) -> List[str]: + prompt = prompt_template.format(**prompt_kwargs) + messages = [{"role": "user", "content": prompt}] + + return await self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + + async def convert_response_to_yaml_list( + self, + llm_response: str, + model: str, + prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Converts a response of an LLM to a list of strings by querying an LLM + Args: + llm_response: The original unformatted response of the LLM + model: The name of the model that should be used 
to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have a {llm_response} + parameter that will be populated with the llm_response value passed in this function. + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A parsed list of elements from the original LLM response + """ + prompt_kwargs["llm_response"] = llm_response + yaml_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + try: + parsed_response = yaml.safe_load(yaml_response[0]) + except yaml.error.YAMLError as _: + raise YamlConversionError( + f"Error parsing yaml response: {yaml_response[0]}" + ) + + if not isinstance(parsed_response, list): + raise YamlConversionError( + f"Error: Parsed response was not a list: {parsed_response}" + ) + + for elem in parsed_response: + if not isinstance(elem, str): + raise YamlConversionError( + f"Error: Parsed response contains non-string elements in list: {parsed_response}" + ) + if elem not in llm_response: + raise YamlConversionError( + f"Conversion introduced hallucinations. Original response:\n{llm_response}\nConverted response:\n{parsed_response}\nHallucination:\n{elem}" + ) + + return parsed_response + + async def generate_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the world + Args: + n_macro_topics: The number of macro topics to generate. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def generate_open_qa_from_topic( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of open Q&A questions based on a topic + Args: + topic: The topic to generate questions for. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def revise_open_qa( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise an open Q&A question a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the question. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
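Because every method on this class is awaitable, openline generation for many topics can be issued concurrently instead of one request at a time. A minimal sketch (topics, model name, and endpoint are assumed placeholders):

    import asyncio

    from openai import AsyncOpenAI
    from nemo_curator.services import AsyncOpenAIClient
    from nemo_curator.synthetic import AsyncNemotronGenerator

    async def generate_openlines():
        # Placeholder endpoint and key for an OpenAI-compatible inference server.
        client = AsyncOpenAIClient(
            AsyncOpenAI(base_url="https://example.invalid/v1", api_key="...")
        )
        generator = AsyncNemotronGenerator(client)
        topics = ["Climate change", "Space exploration"]
        # Fan the per-topic requests out concurrently.
        per_topic_responses = await asyncio.gather(
            *(
                generator.generate_open_qa_from_topic(
                    topic=topic, n_openlines=5, model="my-instruct-model"
                )
                for topic in topics
            )
        )
        return per_topic_responses

    asyncio.run(generate_openlines())

The raw responses can then be parsed with convert_response_to_yaml_list and refined with revise_open_qa in the same fashion.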
+ """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + async def generate_writing_tasks( + self, + topic: str, + text_material_type: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of writing tasks based on a topic and document type + Args: + topic: The topic to generate writing tasks for. + text_material_type: The type of the document the question should ask to generate (e.g., "Email", "Poem") + n_openlines: The number of tasks to generate per topic and text material pair. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - topic: Will be populated with the topic passed in this function + - text_material_type: Will be populated with the text_material_type passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["text_material_type"] = text_material_type + prompt_kwargs["n_openlines"] = n_openlines + writing_tasks = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return writing_tasks + + async def revise_writing_tasks( + self, + openline: str, + n_revisions: Union[str, int], + model: str, + prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to revise a writing task a given number of times + Args: + openline: An openline to revise + n_revisions: The number of revisions to generate for the task. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - openline: Will be populated with the openline passed in this function + - n_revisions: Will be populated with the n_revisions passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
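For illustration, a revision pass follows the same generate -> convert -> revise pattern used by the pipelines in nemotron.py. A sketch written for the body of an async function, with `generator` an AsyncNemotronGenerator as in the previous snippet and the topic and model name as placeholders:

    raw_tasks = await generator.generate_writing_tasks(
        topic="Climate change",
        text_material_type="Poem",
        n_openlines=5,
        model="my-instruct-model",
    )
    # Parse the single raw LLM response into a list of individual tasks.
    tasks = await generator.convert_response_to_yaml_list(
        raw_tasks[0], model="my-instruct-model"
    )
    revisions = await generator.revise_writing_tasks(
        openline=tasks[0], n_revisions=3, model="my-instruct-model"
    )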
+ """ + prompt_kwargs["openline"] = openline + prompt_kwargs["n_revisions"] = n_revisions + revisions = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return revisions + + async def generate_closed_qa_instructions( + self, + document: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of closed Q&A questions based on a reference document + Args: + document: The document to use when generating questions + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - document: Will be populated with the document passed in this function + - n_openlines: Will be populated with the n_openlines passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["document"] = document + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_math_macro_topics( + self, + n_macro_topics: Union[int, str], + school_level: str, + model: str, + prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about math + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + school_level: The school level the math questions should be targeted at. + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
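The math topic expansion can be chained by hand in the same way, with the subtopic requests fanned out concurrently. A sketch for the body of an async function (asyncio imported as above; school level, counts, and model name are placeholder assumptions; generate_math_subtopics is defined just below):

    responses = await generator.generate_math_macro_topics(
        n_macro_topics=5, school_level="university", model="my-instruct-model"
    )
    macro_topics = await generator.convert_response_to_yaml_list(
        responses[0], model="my-instruct-model"
    )
    # One concurrent subtopic request per macro topic.
    subtopic_responses = await asyncio.gather(
        *(
            generator.generate_math_subtopics(
                macro_topic=topic, n_subtopics=3, model="my-instruct-model"
            )
            for topic in macro_topics
        )
    )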
+ """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + prompt_kwargs["school_level"] = school_level + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_math_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a math macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def classify_math_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_MATH_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs={}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to math + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + async def generate_math_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of math problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. 
+ Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_python_macro_topics( + self, + n_macro_topics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of macro topics about the Python programming language + Args: + n_macro_topics: The number of macro topics to generate. Can be an integer like 5 or a string like "five". + model: The name of the model that should be used to generate the macro topics. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_macro_topics"] = n_macro_topics + macro_topics = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return macro_topics + + async def generate_python_subtopics( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of subtopics relating to a Python macro topic + Args: + macro_topic: The macro topic to generate subtopics for. + n_subtopics: The number of subtopics to generate per macro topic + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with the macro_topic passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. 
+ model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["n_subtopics"] = n_subtopics + prompt_kwargs["macro_topic"] = macro_topic + subtopics_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return subtopics_response + + async def classify_python_entity( + self, + entity: str, + model: str, + prompt_template: str = DEFAULT_PYTHON_CLASSIFICATION_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to classify if an entity is related to Python + Args: + entity: The entity to classify + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - entity: Will be populated with the entity passed in this function + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + prompt_kwargs["entity"] = entity + classification_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return classification_response + + async def generate_python_problem( + self, + topic: str, + n_openlines: Union[str, int], + model: str, + language="Python", + prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Prompts an LLM to generate a list of coding problems based on a topic + Args: + topic: The topic to generate problems for. + n_openlines: The number of problems to generate per topic. + model: The name of the model that should be used to generate the response. + Must be available in the LLMClient passed in the constructor. + language: The programming language to target when generating these questions. + prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_subtopics passed in this function + - topic: Will be populated with the topic passed in this function + - language: Will be populated with the language passed in this function + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + model_kwargs: Any additional keyword arguments that should be passed to the LLMClient.query_model call. + Returns: + A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + prompt_kwargs["topic"] = topic + prompt_kwargs["n_openlines"] = n_openlines + prompt_kwargs["language"] = language + openline_response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return openline_response + + async def generate_dialogue( + self, + openline: str, + user_model: str, + assistant_model: str, + n_user_turns: int = 3, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a dialogue based on a given openline. + The LLM will alternate impersonating the user and the assistant. + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + n_user_turns: The number of user turns to go through. The openline counts as 1 user turn. + Therefore, if there are 3 user turns, 2 will be generated by the LLM impersonating the user. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. + Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + for _ in range(n_user_turns - 1): + user_response = await self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": assistant_response} + ) + + return conversation_history + + async def generate_two_turn_prompt( + self, + openline: str, + user_model: str, + assistant_model: str, + prompt_template: str = DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE, + prompt_kwargs: dict = {}, + user_model_kwargs: dict = {}, + assistant_model_kwargs: dict = {}, + ) -> List[dict]: + """ + Prompts an LLM to generate a response as an assistant, then as the user based on a given openline. 
+ The conversation will look like "User -> Assistant -> User" + Args: + openline: The openline that will comprise the first user turn. + user_model: The model that will be impersonating the user. + Must be available in the LLMClient passed in the constructor. + assistant_model: The model that will be impersonating the assistant + Must be available in the LLMClient passed in the constructor. + prompt_template: A format string of the prompt to use when impersonating the user. + It must have the following parameters: + - converstation_history: Will be populated with a formatted history of the dialogue up to that point. + Some example templates found in nemo_curator.synthetic include: + - DIALOGUE_NORMAL_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE + - DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE + prompt_kwargs: Any additional keyword arguments that should be passed to the prompt template. + None are needed for the default template. + user_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the user. + assistant_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the assistant. + Returns: + A conversation between a User and Assistant + """ + conversation_history = [{"role": "user", "content": openline}] + first_assistant_response = await self.client.query_model( + messages=conversation_history, + model=assistant_model, + **assistant_model_kwargs, + )[0] + conversation_history.append( + {"role": "assistant", "content": first_assistant_response} + ) + + user_response = await self._impersonate_user( + conversation_history=conversation_history, + model=user_model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=user_model_kwargs, + ) + conversation_history.append({"role": "user", "content": user_response}) + + return conversation_history + + async def _impersonate_user( + self, + conversation_history: List[dict], + model: str, + prompt_template: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> str: + # Convert the conversation history to a string + history_str = "" + for turn in conversation_history: + history_str += f"{turn['role'].capitalize()}: {turn['content']}" + prompt_kwargs["conversation_history"] = history_str + response = await self._prompt( + model=model, + prompt_template=prompt_template, + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + ) + + return response[0] + + async def run_open_qa_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_SUBTOPICS_PROMPT_TEMPLATE, + open_qa_from_topics_prompt_template: str = DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE, + revise_open_qa_prompt_template: str = DEFAULT_REVISE_OPEN_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Open Q&A openlines for a dialogue + Args: + n_macro_topics: The number of macro topics to generate + n_subtopics: The number of subtopics to generate per macro topic + 
n_openlines: The number of questions to generate per topic. + n_revisions: The number of revisions to generate per original question. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + open_qa_from_topics_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + revise_open_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with a generated open Q&A openline + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. 
+ Returns: + A list of synthetically generated open Q&A prompts + """ + # Generate the macro topics + responses = await self.generate_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self._generate_parse_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + raw_topics = await asyncio.gather(*raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_openline( + subtopic=subtopic, + n_openlines=n_openlines, + model=model, + open_qa_from_topics_prompt_template=open_qa_from_topics_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + raw_lines = await asyncio.gather(*raw_lines) + openlines = [item for lines in raw_lines for item in lines] + + # Revise the openlines + raw_revisions = [ + self._revise_parse_openline( + openline=openline, + n_revisions=n_revisions, + model=model, + revise_open_qa_prompt_template=revise_open_qa_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for openline in openlines + ] + raw_revisions = await asyncio.gather(*raw_revisions) + revised_openlines = [item for revisions in raw_revisions for item in revisions] + + return revised_openlines + + async def _generate_parse_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + subtopic = await self.generate_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + try: + parsed_topics = await self.convert_response_to_yaml_list( + subtopic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + except 
YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_topics + + async def _generate_parse_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + open_qa_from_topics_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + openline = await self.generate_open_qa_from_topic( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=open_qa_from_topics_prompt_template, + )[0] + try: + parsed_line = await self.convert_response_to_yaml_list( + openline, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_line + + async def _revise_parse_openline( + self, + openline: str, + n_revisions: Union[int, str], + model: str, + revise_open_qa_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + revised_openline = await self.revise_open_qa( + openline=openline, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_open_qa_prompt_template, + )[0] + try: + parsed_revision = await self.convert_response_to_yaml_list( + revised_openline, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_revision + + async def run_writing_pipeline( + self, + topics: List[str], + text_material_types: List[str], + n_openlines: Union[str, int], + n_revisions: Union[str, int], + model: str, + writing_task_prompt_template: str = DEFAULT_WRITING_TASK_PROMPT_TEMPLATE, + revise_writing_task_prompt_template: str = DEFAULT_REVISE_WRITING_TASK_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[str]: + """ + Runs a pipeline for automatically generating writing task openlines for a dialogue + Args: + topics: A list of topics to generate tasks for + text_material_types: A list of writing material types, like "Essay" or "Blog post" + n_openlines: The number of tasks to generate per (topic, text_material_type) pair. + n_revisions: The number of revisions to generate per original task. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + writing_task_prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with one element of the topics list passed in this function + - text_material_type: Will be populated with one element of the text_material_types list passed in this function + No additional parameters may be passed to this prompt template. + revise_writing_task_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_revisions: Will be populated with the n_revisions passed in this function + - openline: Will be populated with one of the writing tasks generated in the pipeline. + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of synthetically generated writing task prompts + """ + # Generate the tasks + raw_writing_tasks = [] + for topic in topics: + for material in text_material_types: + raw_writing_tasks.append( + self._generate_parse_writing_task( + topic=topic, + material=material, + n_openlines=n_openlines, + model=model, + writing_task_prompt_template=writing_task_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + ) + raw_writing_tasks = await asyncio.gather(*raw_writing_tasks) + writing_tasks = [item for tasks in raw_writing_tasks for item in tasks] + + # Revise the tasks + raw_revised_openlines = [ + self._revise_parse_writing_task( + task=task, + n_revisions=n_revisions, + model=model, + revise_writing_task_prompt_template=revise_writing_task_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for task in writing_tasks + ] + raw_revised_openlines = await asyncio.gather(*raw_revised_openlines) + revised_openlines = [item for lines in raw_revised_openlines for item in lines] + + return revised_openlines + + async def _generate_parse_writing_task( + self, + topic: str, + material: str, + n_openlines: Union[int, str], + model: str, + writing_task_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_tasks = await self.generate_writing_tasks( + topic=topic, + text_material_type=material, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=writing_task_prompt_template, + )[0] + try: + parsed_tasks = await self.convert_response_to_yaml_list( + raw_tasks, + model=model, + prompt_template=yaml_conversion_prompt_template, + 
model_kwargs=conversion_model_kwargs, + ) + if len(parsed_tasks) != n_openlines: + raise YamlConversionError( + f"Error: Length of writing tasks {len(parsed_tasks)} does not match desired n_openlines {n_openlines}: {parsed_tasks}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_tasks + + async def _revise_parse_writing_task( + self, + task: str, + n_revisions: Union[int, str], + model: str, + revise_writing_task_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_revision = await self.revise_writing_tasks( + openline=task, + n_revisions=n_revisions, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=revise_writing_task_prompt_template, + )[0] + try: + parsed_revision = await self.convert_response_to_yaml_list( + raw_revision, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_revision) != n_revisions: + raise YamlConversionError( + f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_revision + + async def run_closed_qa_pipeline( + self, + documents: List[str], + n_openlines: Union[str, int], + model: str, + closed_qa_prompt_template: str = DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + ignore_conversion_failure: bool = False, + ) -> List[Tuple[int, str]]: + """ + Runs a pipeline for automatically generating closed Q&A openlines for a dialogue + Args: + documents: A list of documents to generate closed Q&A questions for + n_openlines: The number of questions to generate per document. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + closed_qa_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - document: Will be populated with one element of the documents list passed in this function + No additional parameters may be passed to this prompt template. + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + Returns: + A list of pairs where the first element represents the index of the document used to generate the question in the documents list + and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] 
+ """ + raw_qa = [ + self._generate_parse_closed_qa( + document_id=i, + document=document, + n_openlines=n_openlines, + model=model, + closed_qa_prompt_template=closed_qa_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for i, document in enumerate(documents) + ] + raw_qa = await asyncio.gather(*raw_qa) + document_openline_pairs = [item for lines in raw_qa for item in lines] + + return document_openline_pairs + + async def _generate_parse_closed_qa( + self, + document_id: int, + document: str, + n_openlines: Union[int, str], + model: str, + closed_qa_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_instruction = self.generate_closed_qa_instructions( + document=document, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=closed_qa_prompt_template, + )[0] + try: + parsed_instructions = self.convert_response_to_yaml_list( + raw_instruction, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_instructions) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_instructions)} does not match desired n_openlines {n_openlines}: {parsed_instructions}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return [(document_id, inst) for inst in parsed_instructions] + + async def run_math_pipeline( + self, + n_macro_topics: Union[str, int], + school_level: str, + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_MATH_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_MATH_SUBTOPICS_PROMPT_TEMPLATE, + math_problem_prompt_template: str = MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating math questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + school_level: The school level to target when generating macro topics. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + - school_level: Will be populated with the school_level passed in this function + No additional parameters may be passed to this prompt template. + subtopic_prompt_template: A format string of the prompt to use. 
It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + math_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - MATH_PROBLEM_GENERAL_PROMPT_TEMPLATE + - MATH_PROBLEM_BEGINNER_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated math prompts + """ + # Generate the macro topics + responses = await self.generate_math_macro_topics( + n_macro_topics=n_macro_topics, + school_level=school_level, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self._generate_parse_math_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + raw_topics = await asyncio.gather(*raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_math_openline( + subtopic=subtopic, + n_openlines=n_openlines, + model=model, + math_problem_prompt_template=math_problem_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + raw_lines = await 
asyncio.gather(*raw_lines) + openlines = [item for lines in raw_lines for item in lines] + + return openlines + + async def _generate_parse_math_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_topic = await self.generate_math_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + try: + parsed_topics = self.convert_response_to_yaml_list( + raw_topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_topics + + async def _generate_parse_math_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + math_problem_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_line = self.generate_math_problem( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=math_problem_prompt_template, + )[0] + try: + parsed_line = self.convert_response_to_yaml_list( + raw_line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_line + + async def run_python_pipeline( + self, + n_macro_topics: Union[str, int], + n_subtopics: Union[str, int], + n_openlines: Union[str, int], + model: str, + macro_topic_prompt_template: str = DEFAULT_PYTHON_MACRO_TOPICS_PROMPT_TEMPLATE, + subtopic_prompt_template: str = DEFAULT_PYTHON_SUBTOPICS_PROMPT_TEMPLATE, + python_problem_prompt_template: str = PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE, + yaml_conversion_prompt_template: str = DEFAULT_YAML_CONVERSION_PROMPT_TEMPLATE, + base_model_kwargs: dict = {}, + conversion_model_kwargs: dict = {}, + additional_macro_topics: List[str] = [], + additional_subtopics: List[str] = [], + ignore_conversion_failure: bool = False, + combine_topics: bool = True, + ) -> List[str]: + """ + Runs a pipeline for automatically generating Python questions for a dialogue + Args: + n_macro_topics: The number of macro topics to generate. + n_subtopics: The number of subtopics to generate per macro topic. + n_openlines: The number of questions to generate per topic. + model: The name of the model that should be used to generate all the responses. + Must be available in the LLMClient passed in the constructor. + macro_topic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_macro_topics: Will be populated with the n_macro_topics passed in this function + No additional parameters may be passed to this prompt template. 
+ subtopic_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_subtopics: Will be populated with the n_subtopics passed in this function + - macro_topic: Will be populated with a generated macro topic + No additional parameters may be passed to this prompt template. + python_problem_prompt_template: A format string of the prompt to use. It must have the following parameters: + - n_openlines: Will be populated with the n_openlines passed in this function + - language: Will be populated with "Python" + - topic: Will be populated with a generated topic + No additional parameters may be passed to this prompt template. + Some example templates found in nemo_curator.synthetic include: + - PYTHON_PROBLEM_BEGINNER_PROMPT_TEMPLATE + - PYTHON_PROBLEM_INTERMEDIATE_PROMPT_TEMPLATE + - PYTHON_PROBLEM_ADVANCED_PROMPT_TEMPLATE + yaml_conversion_prompt_template: A format string of the prompt to use. It must have the following parameters: + - llm_response: Will be populated with the raw LLM response from each stage of the pipeline + No additional parameters may be passed to this prompt template. + base_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the normal stages of the pipeline. + conversion_model_kwargs: Any additional keyword arguments that should be passed to the + LLMClient.query_model call for the yaml conversion stages of the pipeline. + ignore_conversion_failure: Ignores yaml conversion failures when able and discards the data + that conversion was attempted on + combine_topics: If True, mixes the macro topics with the subtopics when generating openlines. + If False, only the subtopics are used. + Returns: + A list of synthetically generated Python prompts + """ + # Generate the macro topics + responses = await self.generate_python_macro_topics( + n_macro_topics=n_macro_topics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=macro_topic_prompt_template, + ) + macro_topics = await self.convert_response_to_yaml_list( + responses[0], + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(macro_topics) != n_macro_topics and not ignore_conversion_failure: + raise YamlConversionError( + f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" + ) + macro_topics.extend(additional_macro_topics) + + # Generate the subtopics + raw_topics = [ + self._generate_parse_python_subtopic( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + subtopic_prompt_template=subtopic_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for macro_topic in macro_topics + ] + raw_topics = await asyncio.gather(*raw_topics) + topic_list = [item for subtopics in raw_topics for item in subtopics] + topic_list.extend(additional_subtopics) + + # Mix the macro topics with the subtopics + if combine_topics: + topic_list.extend(macro_topics) + + # Generate the openlines + raw_lines = [ + self._generate_parse_python_openline( + subtopic=subtopic, + n_openlines=n_openlines, + model=model, + python_problem_prompt_template=python_problem_prompt_template, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + base_model_kwargs=base_model_kwargs, + 
conversion_model_kwargs=conversion_model_kwargs, + ignore_conversion_failure=ignore_conversion_failure, + ) + for subtopic in topic_list + ] + raw_lines = await asyncio.gather(*raw_lines) + openlines = [item for lines in raw_lines for item in lines] + + return openlines + + async def _generate_parse_python_subtopic( + self, + macro_topic: str, + n_subtopics: Union[int, str], + model: str, + subtopic_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_topic = await self.generate_python_subtopics( + macro_topic=macro_topic, + n_subtopics=n_subtopics, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=subtopic_prompt_template, + )[0] + try: + parsed_topics = self.convert_response_to_yaml_list( + raw_topic, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_topics) != n_subtopics: + raise YamlConversionError( + f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_topics + + async def _generate_parse_python_openline( + self, + subtopic: str, + n_openlines: Union[int, str], + model: str, + math_problem_prompt_template: str, + yaml_conversion_prompt_template: str, + base_model_kwargs: dict, + conversion_model_kwargs: dict, + ignore_conversion_failure: bool, + ) -> List[str]: + raw_line = self.generate_python_problem( + topic=subtopic, + n_openlines=n_openlines, + model=model, + model_kwargs=base_model_kwargs, + prompt_template=math_problem_prompt_template, + )[0] + try: + parsed_line = self.convert_response_to_yaml_list( + raw_line, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_line) != n_openlines: + raise YamlConversionError( + f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_line diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 4a18b12b..18c7ac45 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union import yaml -from nemo_curator.services.model_client import AsyncLLMClient, LLMClient +from nemo_curator.services.model_client import LLMClient from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, @@ -1032,8 +1032,8 @@ def run_writing_pipeline( raw_tasks, model=model, prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, ) - writing_tasks.extend(parsed_tasks) if len(parsed_tasks) != n_openlines: raise YamlConversionError( f"Error: Length of writing tasks {len(parsed_tasks)} does not match desired n_openlines {n_openlines}: {parsed_tasks}" From 494c141e0e0f68cfad667ab05150f6f5124fbdcb Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 10:01:05 -0700 Subject: [PATCH 39/69] Fix await with index Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 44 +++++++++++++++--------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index 5514c187..66f16f32 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -949,7 +949,8 @@ async def _generate_parse_subtopic( model=model, model_kwargs=base_model_kwargs, prompt_template=subtopic_prompt_template, - )[0] + ) + subtopic = subtopic[0] try: parsed_topics = await self.convert_response_to_yaml_list( subtopic, @@ -986,7 +987,8 @@ async def _generate_parse_openline( model=model, model_kwargs=base_model_kwargs, prompt_template=open_qa_from_topics_prompt_template, - )[0] + ) + openline = openline[0] try: parsed_line = await self.convert_response_to_yaml_list( openline, @@ -1023,7 +1025,8 @@ async def _revise_parse_openline( model=model, model_kwargs=base_model_kwargs, prompt_template=revise_open_qa_prompt_template, - )[0] + ) + revised_openline = revised_openline[0] try: parsed_revision = await self.convert_response_to_yaml_list( revised_openline, @@ -1145,7 +1148,8 @@ async def _generate_parse_writing_task( model=model, model_kwargs=base_model_kwargs, prompt_template=writing_task_prompt_template, - )[0] + ) + raw_tasks = raw_tasks[0] try: parsed_tasks = await self.convert_response_to_yaml_list( raw_tasks, @@ -1182,7 +1186,8 @@ async def _revise_parse_writing_task( model=model, model_kwargs=base_model_kwargs, prompt_template=revise_writing_task_prompt_template, - )[0] + ) + raw_revision = raw_revision[0] try: parsed_revision = await self.convert_response_to_yaml_list( raw_revision, @@ -1268,15 +1273,16 @@ async def _generate_parse_closed_qa( conversion_model_kwargs: dict, ignore_conversion_failure: bool, ) -> List[str]: - raw_instruction = self.generate_closed_qa_instructions( + raw_instruction = await self.generate_closed_qa_instructions( document=document, n_openlines=n_openlines, model=model, model_kwargs=base_model_kwargs, prompt_template=closed_qa_prompt_template, - )[0] + ) + raw_instruction = raw_instruction[0] try: - parsed_instructions = self.convert_response_to_yaml_list( + parsed_instructions = await self.convert_response_to_yaml_list( raw_instruction, model=model, prompt_template=yaml_conversion_prompt_template, @@ -1428,7 +1434,8 @@ async def _generate_parse_math_subtopic( model=model, model_kwargs=base_model_kwargs, prompt_template=subtopic_prompt_template, - )[0] + ) + raw_topic = raw_topic[0] try: parsed_topics = self.convert_response_to_yaml_list( raw_topic, @@ 
-1459,15 +1466,16 @@ async def _generate_parse_math_openline( conversion_model_kwargs: dict, ignore_conversion_failure: bool, ) -> List[str]: - raw_line = self.generate_math_problem( + raw_line = await self.generate_math_problem( topic=subtopic, n_openlines=n_openlines, model=model, model_kwargs=base_model_kwargs, prompt_template=math_problem_prompt_template, - )[0] + ) + raw_line = raw_line[0] try: - parsed_line = self.convert_response_to_yaml_list( + parsed_line = await self.convert_response_to_yaml_list( raw_line, model=model, prompt_template=yaml_conversion_prompt_template, @@ -1617,9 +1625,10 @@ async def _generate_parse_python_subtopic( model=model, model_kwargs=base_model_kwargs, prompt_template=subtopic_prompt_template, - )[0] + ) + raw_topic = raw_topic[0] try: - parsed_topics = self.convert_response_to_yaml_list( + parsed_topics = await self.convert_response_to_yaml_list( raw_topic, model=model, prompt_template=yaml_conversion_prompt_template, @@ -1648,15 +1657,16 @@ async def _generate_parse_python_openline( conversion_model_kwargs: dict, ignore_conversion_failure: bool, ) -> List[str]: - raw_line = self.generate_python_problem( + raw_line = await self.generate_python_problem( topic=subtopic, n_openlines=n_openlines, model=model, model_kwargs=base_model_kwargs, prompt_template=math_problem_prompt_template, - )[0] + ) + raw_line = raw_line[0] try: - parsed_line = self.convert_response_to_yaml_list( + parsed_line = await self.convert_response_to_yaml_list( raw_line, model=model, prompt_template=yaml_conversion_prompt_template, From 2fb48db93b144de95d5d7a9eff6bb557d6e9bc76 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 10:13:29 -0700 Subject: [PATCH 40/69] Add seed parameter Signed-off-by: Ryan Wolf --- nemo_curator/services/model_client.py | 2 ++ nemo_curator/services/openai_client.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index aa0ed17b..7585c400 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -16,6 +16,7 @@ def query_model( model: str, max_tokens: Optional[int] = None, n: Optional[int] = 1, + seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, @@ -48,6 +49,7 @@ async def query_model( model: str, max_tokens: Optional[int] = None, n: Optional[int] = 1, + seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index ca233e82..b3f691d0 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -21,6 +21,7 @@ def query_model( model: str, max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, n: Union[Optional[int], NotGiven] = NOT_GIVEN, + seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, @@ -30,6 +31,7 @@ def query_model( model=model, max_tokens=max_tokens, n=n, + seed=seed, stop=stop, temperature=temperature, top_p=top_p, @@ -77,6 +79,7 @@ async def query_model( model: str, max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, n: Union[Optional[int], NotGiven] = NOT_GIVEN, + seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: 
Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, @@ -86,6 +89,7 @@ async def query_model( model=model, max_tokens=max_tokens, n=n, + seed=seed, stop=stop, temperature=temperature, top_p=top_p, From 39acac1d1e2697fbb5952ed3161cdeb7f3155737 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 14:42:45 -0700 Subject: [PATCH 41/69] Add missing await Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index 66f16f32..e50d3165 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -1437,7 +1437,7 @@ async def _generate_parse_math_subtopic( ) raw_topic = raw_topic[0] try: - parsed_topics = self.convert_response_to_yaml_list( + parsed_topics = await self.convert_response_to_yaml_list( raw_topic, model=model, prompt_template=yaml_conversion_prompt_template, From 4c888e4320529c438efebc7d459cbba35516061b Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 14:55:18 -0700 Subject: [PATCH 42/69] Fix parameter names Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index e50d3165..f23b19e7 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -1651,7 +1651,7 @@ async def _generate_parse_python_openline( subtopic: str, n_openlines: Union[int, str], model: str, - math_problem_prompt_template: str, + python_problem_prompt_template: str, yaml_conversion_prompt_template: str, base_model_kwargs: dict, conversion_model_kwargs: dict, @@ -1662,7 +1662,7 @@ async def _generate_parse_python_openline( n_openlines=n_openlines, model=model, model_kwargs=base_model_kwargs, - prompt_template=math_problem_prompt_template, + prompt_template=python_problem_prompt_template, ) raw_line = raw_line[0] try: From 4724d68f5de01054693744a1a2e4bb79df5364ed Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 15:05:53 -0700 Subject: [PATCH 43/69] Fix subscript await issues Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index f23b19e7..b8e25ef1 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -696,7 +696,8 @@ async def generate_dialogue( messages=conversation_history, model=assistant_model, **assistant_model_kwargs, - )[0] + ) + first_assistant_response = first_assistant_response[0] conversation_history.append( {"role": "assistant", "content": first_assistant_response} ) @@ -713,7 +714,8 @@ async def generate_dialogue( messages=conversation_history, model=assistant_model, **assistant_model_kwargs, - )[0] + ) + assistant_response = assistant_response[0] conversation_history.append( {"role": "assistant", "content": assistant_response} ) @@ -760,7 +762,8 @@ async def generate_two_turn_prompt( messages=conversation_history, model=assistant_model, **assistant_model_kwargs, - )[0] + ) + first_assistant_response = first_assistant_response[0] conversation_history.append( {"role": "assistant", "content": first_assistant_response} ) 
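The subscript-await fixes in patches 39 and 43 above all trace back to one Python pitfall: subscripting binds more tightly than await, so an expression like "await self.client.query_model(...)[0]" indexes the coroutine object itself and raises a TypeError instead of indexing the awaited list. Below is a self-contained sketch of the failure mode and of the corrected pattern these patches adopt; the stand-in function is illustrative only and is not code from the repository.

import asyncio
from typing import List


async def query_model() -> List[str]:
    # Stand-in for an AsyncLLMClient.query_model call that returns a list of responses.
    return ["first response", "second response"]


async def broken() -> str:
    # Raises TypeError: 'coroutine' object is not subscriptable,
    # because [0] is applied to the coroutine before it is awaited.
    return await query_model()[0]


async def fixed() -> str:
    # The pattern the patches adopt: await first, then index the resulting list.
    responses = await query_model()
    return responses[0]


print(asyncio.run(fixed()))  # -> "first response"
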
From de27abc7c664c37f90edea8c49b8047f9fd42c99 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 15:26:42 -0700 Subject: [PATCH 44/69] Switch parsing method for reward model Signed-off-by: Ryan Wolf --- nemo_curator/services/openai_client.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index b3f691d0..c4a610c3 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -53,13 +53,14 @@ def query_reward_model(self, *, messages: Iterable, model: str) -> dict: """ response = self.client.chat.completions.create(messages=messages, model=model) - try: - message = response.choices[0].message[0] - except TypeError as _: - raise ValueError(f"{model} is not a reward model.") + if response.choices[0].logprobs is None: + raise ValueError( + f"Logprobs not found. {model} is likely not a reward model." + ) - metrics = [metric.split(":") for metric in message.content.split(",")] - scores = {category: float(score) for category, score in metrics} + scores = { + score.token: score.logprob for score in response.choices[0].logprobs.content + } return scores @@ -113,12 +114,13 @@ async def query_reward_model(self, *, messages: Iterable, model: str) -> dict: messages=messages, model=model ) - try: - message = response.choices[0].message[0] - except TypeError as _: - raise ValueError(f"{model} is not a reward model.") + if response.choices[0].logprobs is None: + raise ValueError( + f"Logprobs not found. {model} is likely not a reward model." + ) - metrics = [metric.split(":") for metric in message.content.split(",")] - scores = {category: float(score) for category, score in metrics} + scores = { + score.token: score.logprob for score in response.choices[0].logprobs.content + } return scores From 8daea940c778cb4df98e6db14fd41bc1ac59f16f Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 2 Jul 2024 16:03:35 -0700 Subject: [PATCH 45/69] Add initial docs Signed-off-by: Ryan Wolf --- docs/user-guide/index.rst | 3 +++ docs/user-guide/syntheticdata.rst | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 docs/user-guide/syntheticdata.rst diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 74c219c2..5675a95f 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -18,6 +18,9 @@ :ref:`GPU Accelerated Exact and Fuzzy Deduplication ` Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. +:ref:`Synthetic Data Generation ` + Synthetic data generation tools and example pipelines are available within NeMo Curator. + :ref:`Downstream Task Decontamination ` After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data. When dealing with large datasets, there is a potential for leakage of this test data into the model’s training dataset. NeMo Curator allows you to remove sections of documents in your dataset that are present in downstream tasks. diff --git a/docs/user-guide/syntheticdata.rst b/docs/user-guide/syntheticdata.rst new file mode 100644 index 00000000..5a3d0235 --- /dev/null +++ b/docs/user-guide/syntheticdata.rst @@ -0,0 +1,22 @@ + +.. 
_data-curator-syntheticdata: + +====================================== +Synthetic Data Generation +====================================== +-------------------------------------- +Background +-------------------------------------- +Synthetic data generation has become increasingly useful in large language model training. +It is used in pretraining, fine-tuning, and evaluation. +Synthetically generated data can be useful for adapting an LLM to low-resource languages/domains, or performing knowledge distillation from other models, among other purposes. +There are a variety of ways to construct synthetic data generation pipelines, with numerous LLM and classical filters. + +NeMo Curator has a simple, easy-to-use set of tools that allow you to use prebuilt synthetic generation pipelines or build your own. +Any model inference service that uses the OpenAI API is compatible with the synthetic data generation module, allowing you to generate your data from any model. +NeMo Curator has prebuilt synthetic data generation pipelines for supervised fine-tuning (SFT) and preference data that were used to generate data for the training of `Nemotron-4 340B `_. +And, you can easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator. + +----------------------------------------- +Usage +----------------------------------------- From 6ae83b1067f0b326e14d906a8ba49c6c18170737 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 21:20:09 -0700 Subject: [PATCH 46/69] Add nemo deploy client Signed-off-by: Ryan Wolf --- nemo_curator/services/__init__.py | 13 +++ nemo_curator/services/model_client.py | 19 ++++ nemo_curator/services/nemo_client.py | 89 +++++++++++++++++++ nemo_curator/services/openai_client.py | 26 ++++++ nemo_curator/synthetic/__init__.py | 20 ++++- .../synthetic/conversation_formatter.py | 28 ++++++ nemo_curator/synthetic/mixtral.py | 38 ++++++++ nemo_curator/synthetic/nemotron.py | 36 ++++++++ 8 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 nemo_curator/services/nemo_client.py create mode 100644 nemo_curator/synthetic/conversation_formatter.py create mode 100644 nemo_curator/synthetic/mixtral.py diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py index 83df8d3c..a0b05c8a 100644 --- a/nemo_curator/services/__init__.py +++ b/nemo_curator/services/__init__.py @@ -1,3 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .model_client import AsyncLLMClient, LLMClient from .openai_client import AsyncOpenAIClient, OpenAIClient diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index 7585c400..0e1f9399 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -1,6 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from abc import ABC, abstractmethod from typing import Iterable, List, Optional, Union +from nemo_curator.synthetic.conversation_formatter import ConversationFormatter + class LLMClient(ABC): """ @@ -14,6 +29,7 @@ def query_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, max_tokens: Optional[int] = None, n: Optional[int] = 1, seed: Optional[int] = None, @@ -29,6 +45,7 @@ def query_reward_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, ) -> dict: raise NotImplementedError( "Subclass of LLMClient must implement 'query_reward_model'" @@ -47,6 +64,7 @@ async def query_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, max_tokens: Optional[int] = None, n: Optional[int] = 1, seed: Optional[int] = None, @@ -64,6 +82,7 @@ async def query_reward_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, ) -> dict: raise NotImplementedError( "Subclass of LLMClient must implement 'query_reward_model'" diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py new file mode 100644 index 00000000..aa5dba71 --- /dev/null +++ b/nemo_curator/services/nemo_client.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import warnings +from typing import Iterable, List, Optional, Union + +from nemo.deploy.nlp import NemoQueryLLM + +from nemo_curator.synthetic.conversation_formatter import ConversationFormatter + +from .model_client import AsyncLLMClient, LLMClient + + +class NemoDeployClient(LLMClient): + """ + A wrapper around NemoQueryLLM for querying models in synthetic data generation + """ + + def __init__(self, nemo_deploy: NemoQueryLLM) -> None: + self.client = nemo_deploy + + def query_model( + self, + *, + messages: Iterable, + model: str, + conversation_formatter: Optional[ConversationFormatter] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + seed: Optional[int] = None, + stop: Union[Optional[str], List[str]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + ) -> List[str]: + + prompt = conversation_formatter.format_conversation(messages) + self.client.model_name = model + + if n is not None: + warnings.warn("n is not supported in NemoDeployClient") + + if isinstance(stop, str): + stop = [stop] + + response = self.client.query_llm( + prompts=[prompt], + model=model, + max_output_len=max_tokens, + random_seed=seed, + stop_words_list=stop, + temperature=temperature, + top_p=top_p, + )[0] + + return self._postprocess_response(response, stop) + + @staticmethod + def _postprocess_response(response: str, stop_words: List[str]) -> str: + for stop in stop_words: + if response.endswith(stop): + response = response[: -len(stop)] + response = response.strip() + return response + + def query_reward_model(self, *, messages: Iterable, model: str) -> dict: + """ + Prompts an LLM Reward model to score a conversation between a user and assistant + Args: + messages: The conversation to calculate a score for. + Should be formatted like: + [{"role": "user", "content": "Write a sentence"}, {"role": "assistant", "content": "This is a sentence"}, ...] + model: The name of the model that should be used to calculate the reward. + Must be a reward model, cannot be a regular LLM. + Returns: + A mapping of score_name -> score + """ + raise NotImplementedError( + "Reward model inference is not supported in NeMo Deploy Clients" + ) diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index c4a610c3..23e42470 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -1,8 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import warnings from typing import Iterable, List, Optional, Union from openai import AsyncOpenAI, OpenAI from openai._types import NOT_GIVEN, NotGiven +from nemo_curator.synthetic.conversation_formatter import ConversationFormatter + from .model_client import AsyncLLMClient, LLMClient @@ -19,6 +35,7 @@ def query_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, n: Union[Optional[int], NotGiven] = NOT_GIVEN, seed: Union[Optional[int], NotGiven] = NOT_GIVEN, @@ -26,6 +43,10 @@ def query_model( temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: + + if conversation_formatter is not None: + warnings.warn("conversation_formatter is not used in an OpenAIClient") + response = self.client.chat.completions.create( messages=messages, model=model, @@ -78,6 +99,7 @@ async def query_model( *, messages: Iterable, model: str, + conversation_formatter: Optional[ConversationFormatter] = None, max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN, n: Union[Optional[int], NotGiven] = NOT_GIVEN, seed: Union[Optional[int], NotGiven] = NOT_GIVEN, @@ -85,6 +107,10 @@ async def query_model( temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: + + if conversation_formatter is not None: + warnings.warn("conversation_formatter is not used in an AsyncOpenAIClient") + response = await self.client.chat.completions.create( messages=messages, model=model, diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 7dc47920..db71b2d4 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,6 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .async_nemotron import AsyncNemotronGenerator +from .conversation_formatter import ConversationFormatter from .error import YamlConversionError -from .nemotron import NemotronGenerator +from .mixtral import Mixtral8x7BFormatter +from .nemotron import NemotronFormatter, NemotronGenerator from .prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, @@ -29,6 +44,9 @@ __all__ = [ "NemotronGenerator", "AsyncNemotronGenerator", + "NemotronFormatter", + "ConversationFormatter", + "Mixtral8x7BFormatter", "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", diff --git a/nemo_curator/synthetic/conversation_formatter.py b/nemo_curator/synthetic/conversation_formatter.py new file mode 100644 index 00000000..c4db1cc2 --- /dev/null +++ b/nemo_curator/synthetic/conversation_formatter.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC, abstractmethod +from typing import List + + +class ConversationFormatter(ABC): + """ + Represents a way of formatting a conversation with an LLM + such that it can respond appropriately + """ + + @abstractmethod + def format_conversation(self, conv: List[dict]) -> str: + raise NotImplementedError( + "format_conversation must be implemented by subclasses" + ) diff --git a/nemo_curator/synthetic/mixtral.py b/nemo_curator/synthetic/mixtral.py new file mode 100644 index 00000000..96fe5c73 --- /dev/null +++ b/nemo_curator/synthetic/mixtral.py @@ -0,0 +1,38 @@ +from typing import List + +from .conversation_formatter import ConversationFormatter + + +class Mixtral8x7BFormatter(ConversationFormatter): + +    BASE_PROMPT = " [INST] \n" + +    @staticmethod +    def format_conversation(conv: List[dict]) -> str: +        """ +        Formats a conversation between a user and assistant in the Mixtral-8x7B format +        described here: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 +        Args: +            conv: A conversation between a user and assistant +        Returns: +            A conversation formatted as text +        """ +        prompt = Mixtral8x7BFormatter.BASE_PROMPT + +        for i, turn in enumerate(conv): +            user_turn = i % 2 == 0 + +            if user_turn: +                if turn["role"] != "user": +                    raise ValueError( +                        f"Conversation turn {i} is not 'user'. All even number turns should be." +                    ) +                prompt += turn["content"] + " [/INST]" +            else: +                if turn["role"] != "assistant": +                    raise ValueError( +                        f"Conversation turn {i} is not 'assistant'. All odd number turns should be." +                    ) +                prompt += turn["content"] + "[INST] " + +        return prompt diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 18c7ac45..2bd8bd38 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -16,6 +16,7 @@ import yaml from nemo_curator.services.model_client import LLMClient +from nemo_curator.synthetic.conversation_formatter import ConversationFormatter from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, @@ -1434,3 +1435,38 @@ def run_python_pipeline( raise e return openlines + + +class NemotronFormatter(ConversationFormatter): + +    BASE_PROMPT = "System\n\nUser\n" + +    @staticmethod +    def format_conversation(conv: List[dict]) -> str: +        """ +        Formats a conversation between a user and assistant in the Nemotron 340B format +        described here: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/nemotron-4-340b-instruct +        Args: +            conv: A conversation between a user and assistant +        Returns: +            A conversation formatted as text +        """ +        prompt = NemotronFormatter.BASE_PROMPT + +        for i, turn in enumerate(conv): +            user_turn = i % 2 == 0 + +            if user_turn: +                if turn["role"] != "user": +                    raise ValueError( +                        f"Conversation turn {i} is not 'user'. All even number turns should be."
+ ) + prompt += turn["content"] + "\nAssistant\n" + else: + if turn["role"] != "assistant": + raise ValueError( + f"Conversation turn {i} is not 'assistant'. All odd number turns should be." + ) + prompt += turn["content"] + "\nUser\n" + + return prompt From 7daefb70d47afe7712e4286795111af8354a1579 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:17:27 -0700 Subject: [PATCH 47/69] Add easy import Signed-off-by: Ryan Wolf --- nemo_curator/__init__.py | 8 +++++++- nemo_curator/services/__init__.py | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index 48d9d22d..b9314559 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -27,7 +27,13 @@ from .modules import * -from .services import AsyncLLMClient, AsyncOpenAIClient, LLMClient, OpenAIClient +from .services import ( + AsyncLLMClient, + AsyncOpenAIClient, + LLMClient, + NemoDeployClient, + OpenAIClient, +) from .utils.distributed_utils import get_client # Dask will automatically convert the list score type diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py index a0b05c8a..39ad92e0 100644 --- a/nemo_curator/services/__init__.py +++ b/nemo_curator/services/__init__.py @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. from .model_client import AsyncLLMClient, LLMClient +from .nemo_client import NemoDeployClient from .openai_client import AsyncOpenAIClient, OpenAIClient -__all__ = ["AsyncLLMClient", "LLMClient", "AsyncOpenAIClient", "OpenAIClient"] +__all__ = [ + "AsyncLLMClient", + "LLMClient", + "AsyncOpenAIClient", + "OpenAIClient", + "NemoDeployClient", +] From c0509f9f1ba8f4188a6b074ab3f8a3b25342805a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:30:56 -0700 Subject: [PATCH 48/69] Move conversation formatter Signed-off-by: Ryan Wolf --- nemo_curator/services/__init__.py | 2 ++ nemo_curator/{synthetic => services}/conversation_formatter.py | 0 nemo_curator/services/openai_client.py | 2 +- nemo_curator/synthetic/__init__.py | 2 -- nemo_curator/synthetic/mixtral.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename nemo_curator/{synthetic => services}/conversation_formatter.py (100%) diff --git a/nemo_curator/services/__init__.py b/nemo_curator/services/__init__.py index 39ad92e0..ff769b2e 100644 --- a/nemo_curator/services/__init__.py +++ b/nemo_curator/services/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from .conversation_formatter import ConversationFormatter from .model_client import AsyncLLMClient, LLMClient from .nemo_client import NemoDeployClient from .openai_client import AsyncOpenAIClient, OpenAIClient @@ -21,4 +22,5 @@ "AsyncOpenAIClient", "OpenAIClient", "NemoDeployClient", + "ConversationFormatter", ] diff --git a/nemo_curator/synthetic/conversation_formatter.py b/nemo_curator/services/conversation_formatter.py similarity index 100% rename from nemo_curator/synthetic/conversation_formatter.py rename to nemo_curator/services/conversation_formatter.py diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 23e42470..ebdced1a 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -17,7 +17,7 @@ from openai import AsyncOpenAI, OpenAI from openai._types import NOT_GIVEN, NotGiven -from nemo_curator.synthetic.conversation_formatter import ConversationFormatter +from nemo_curator.services.conversation_formatter import ConversationFormatter from .model_client import AsyncLLMClient, LLMClient diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index db71b2d4..22aa72fc 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. from .async_nemotron import AsyncNemotronGenerator -from .conversation_formatter import ConversationFormatter from .error import YamlConversionError from .mixtral import Mixtral8x7BFormatter from .nemotron import NemotronFormatter, NemotronGenerator @@ -45,7 +44,6 @@ "NemotronGenerator", "AsyncNemotronGenerator", "NemotronFormatter", - "ConversationFormatter", "Mixtral8x7BFormatter", "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", diff --git a/nemo_curator/synthetic/mixtral.py b/nemo_curator/synthetic/mixtral.py index 96fe5c73..9b7568f3 100644 --- a/nemo_curator/synthetic/mixtral.py +++ b/nemo_curator/synthetic/mixtral.py @@ -1,6 +1,6 @@ from typing import List -from .conversation_formatter import ConversationFormatter +from nemo_curator.services.conversation_formatter import ConversationFormatter class Mixtral8x7BFormatter(ConversationFormatter): From e96471290dfa83d340d21454308df5813f1579cc Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:32:42 -0700 Subject: [PATCH 49/69] Add other file Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index aa5dba71..dcf22e29 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -16,7 +16,7 @@ from nemo.deploy.nlp import NemoQueryLLM -from nemo_curator.synthetic.conversation_formatter import ConversationFormatter +from nemo_curator.services.conversation_formatter import ConversationFormatter from .model_client import AsyncLLMClient, LLMClient From e500814ebfc76986867ed56cd7a41b65ee3d7a78 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:33:23 -0700 Subject: [PATCH 50/69] Update nemotron import Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/nemotron.py b/nemo_curator/synthetic/nemotron.py index 2bd8bd38..9e10a30d 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py 
@@ -15,8 +15,8 @@ import yaml +from nemo_curator.services.conversation_formatter import ConversationFormatter from nemo_curator.services.model_client import LLMClient -from nemo_curator.synthetic.conversation_formatter import ConversationFormatter from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, From 2b4d3ffad95dfb5cfe3fb86ae8f097946d81340d Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:34:30 -0700 Subject: [PATCH 51/69] Update model client import Signed-off-by: Ryan Wolf --- nemo_curator/services/model_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index 0e1f9399..5c7a257d 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from typing import Iterable, List, Optional, Union -from nemo_curator.synthetic.conversation_formatter import ConversationFormatter +from nemo_curator.services.conversation_formatter import ConversationFormatter class LLMClient(ABC): From 7acbee99888d30e737158e96b58357c5798abeb3 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:39:20 -0700 Subject: [PATCH 52/69] Remove model in query call Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index dcf22e29..df81051a 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -54,7 +54,6 @@ def query_model( response = self.client.query_llm( prompts=[prompt], - model=model, max_output_len=max_tokens, random_seed=seed, stop_words_list=stop, From 06b73109fceca1f0c0fde1db98763125d35ae784 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Thu, 4 Jul 2024 23:49:47 -0700 Subject: [PATCH 53/69] Add extra index Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index df81051a..fb0a4d3c 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -59,7 +59,7 @@ def query_model( stop_words_list=stop, temperature=temperature, top_p=top_p, - )[0] + )[0][0] return self._postprocess_response(response, stop) From f05b13ac0f4b0cab4c1e102b2c56b178a3d4c1e6 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 07:24:12 -0700 Subject: [PATCH 54/69] Fix response indexing Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index fb0a4d3c..5cde1122 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -59,17 +59,19 @@ def query_model( stop_words_list=stop, temperature=temperature, top_p=top_p, - )[0][0] + )[0] return self._postprocess_response(response, stop) @staticmethod - def _postprocess_response(response: str, stop_words: List[str]) -> str: - for stop in stop_words: - if response.endswith(stop): - response = response[: -len(stop)] - response = response.strip() - return response + def _postprocess_response(responses: List[str], stop_words: List[str]) -> List[str]: + processed_responses = [] + for response in responses: + for stop in 
stop_words: + if response.endswith(stop): + response = response[: -len(stop)] + processed_responses.append(response.strip()) + return processed_responses def query_reward_model(self, *, messages: Iterable, model: str) -> dict: """ From 0efc8089accd9102c5937851525ed3f7080d986a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 07:33:35 -0700 Subject: [PATCH 55/69] Add top k Signed-off-by: Ryan Wolf --- nemo_curator/services/model_client.py | 2 ++ nemo_curator/services/nemo_client.py | 1 + nemo_curator/services/openai_client.py | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index 5c7a257d..0f929023 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -35,6 +35,7 @@ def query_model( seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, + top_k: Optional[int] = None, top_p: Optional[float] = None, ) -> List[str]: raise NotImplementedError("Subclass of LLMClient must implement 'query_model'") @@ -70,6 +71,7 @@ async def query_model( seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, + top_k: Optional[int] = None, top_p: Optional[float] = None, ) -> List[str]: raise NotImplementedError( diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index 5cde1122..fc4caca9 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -40,6 +40,7 @@ def query_model( seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, temperature: Optional[float] = None, + top_k: Optional[int] = None, top_p: Optional[float] = None, ) -> List[str]: diff --git a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index ebdced1a..5719cf5b 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -41,11 +41,14 @@ def query_model( seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_k: Optional[int] = None, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: if conversation_formatter is not None: warnings.warn("conversation_formatter is not used in an OpenAIClient") + if top_k is not None: + warnings.warn("top_k is not used in an OpenAIClient") response = self.client.chat.completions.create( messages=messages, @@ -105,11 +108,14 @@ async def query_model( seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, + top_k: Optional[int] = None, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, ) -> List[str]: if conversation_formatter is not None: warnings.warn("conversation_formatter is not used in an AsyncOpenAIClient") + if top_k is not None: + warnings.warn("top_k is not used in an AsyncOpenAIClient") response = await self.client.chat.completions.create( messages=messages, From c8d1419799b6c5fabf86168f4f4e38da0cc8dfb4 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 07:46:22 -0700 Subject: [PATCH 56/69] Remove extras Signed-off-by: Ryan Wolf --- setup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ef6b0d2b..1f9b3fe3 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,7 @@ # Numpy 2.0 breaks with 
spacy https://github.com/explosion/spaCy/issues/13528 # TODO: Remove when issue is fixed "numpy<2", + "openai", ], extras_require={ "cuda12x": [ @@ -77,10 +78,6 @@ "dask-cuda>=24.2", "spacy[cuda12x]>=3.6.0, <4.0.0", ], - "synth": [ - "openai", - "nemo_toolkit[infer]>=1.23.0", - ], }, entry_points={ "console_scripts": [ From 2d11a8c14a79a036c8067ad0fb3323c589936e3e Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 07:58:35 -0700 Subject: [PATCH 57/69] Add safe import for nemo deploy Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index fc4caca9..bcb4329e 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -14,12 +14,13 @@ import warnings from typing import Iterable, List, Optional, Union -from nemo.deploy.nlp import NemoQueryLLM - from nemo_curator.services.conversation_formatter import ConversationFormatter +from nemo_curator.utils.import_utils import safe_import_from from .model_client import AsyncLLMClient, LLMClient +NemoQueryLLM = safe_import_from("nemo.deploy.nlp", "NemoQueryLLM") + class NemoDeployClient(LLMClient): """ @@ -43,7 +44,6 @@ def query_model( top_k: Optional[int] = None, top_p: Optional[float] = None, ) -> List[str]: - prompt = conversation_formatter.format_conversation(messages) self.client.model_name = model From 20afd89da5516b9357a1a0ef81ee8f84020f1596 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 08:23:40 -0700 Subject: [PATCH 58/69] Add pandas conversions Signed-off-by: Ryan Wolf --- nemo_curator/datasets/doc_dataset.py | 40 +++++++++++++++++++++++++++- tests/test_dataset.py | 25 +++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 tests/test_dataset.py diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 32b23114..ce846571 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Union +from typing import List, Optional, Union import dask.dataframe as dd @@ -130,6 +130,44 @@ def to_pickle( ): raise NotImplementedError("DocumentDataset does not support to_pickle yet") + @classmethod + def from_pandas( + cls, + data, + npartitions: Optional[int] = None, + chunksize: Optional[int] = None, + sort: Optional[bool] = True, + name: Optional[str] = None, + ): + """ + Creates a document dataset from a pandas data frame. + For more information on the arguments see Dask's from_pandas documentation + https://docs.dask.org/en/stable/generated/dask.dataframe.from_pandas.html + + Args: + data: A pandas dataframe + Returns: + A document dataset with a pandas backend (on the CPU). 
+ """ + return cls( + dd.from_pandas( + data=data, + npartitions=npartitions, + chunksize=chunksize, + sort=sort, + name=name, + ) + ) + + def to_pandas(self): + """ + Creates a pandas dataframe from a DocumentDataset + + Returns: + A pandas dataframe (on the CPU) + """ + return self.df.compute() + def _read_json_or_parquet( input_files: Union[str, List[str]], diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 00000000..f16d49b2 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,25 @@ +import dask.dataframe as dd +import pandas as pd + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset + + +def all_equal(left_result: pd.DataFrame, right_result: pd.DataFrame): + l_cols = set(left_result.columns) + r_cols = set(right_result.columns) + assert l_cols == r_cols + for col in left_result.columns: + left = left_result[col].reset_index(drop=True) + right = right_result[col].reset_index(drop=True) + assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n" + + +class TestDocumentDataset: + def test_to_from_pandas(self): + original_df = pd.DataFrame( + {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]} + ) + dataset = DocumentDataset.from_pandas(original_df) + converted_df = dataset.to_pandas() + all_equal(original_df, converted_df) From 2987c9aba2daf484f3db1f3814e25a00e2548f88 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 08:28:34 -0700 Subject: [PATCH 59/69] Add partition default Signed-off-by: Ryan Wolf --- nemo_curator/datasets/doc_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index ce846571..38026452 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -134,7 +134,7 @@ def to_pickle( def from_pandas( cls, data, - npartitions: Optional[int] = None, + npartitions: Optional[int] = 1, chunksize: Optional[int] = None, sort: Optional[bool] = True, name: Optional[str] = None, From 3f8dcc8536cc8c7e5a06c6d91ce3a0a4b8d8e469 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 09:37:32 -0700 Subject: [PATCH 60/69] Add no format Signed-off-by: Ryan Wolf --- .../services/conversation_formatter.py | 18 ++++++++++++++++++ nemo_curator/services/nemo_client.py | 5 +++++ 2 files changed, 23 insertions(+) diff --git a/nemo_curator/services/conversation_formatter.py b/nemo_curator/services/conversation_formatter.py index c4db1cc2..bd307412 100644 --- a/nemo_curator/services/conversation_formatter.py +++ b/nemo_curator/services/conversation_formatter.py @@ -26,3 +26,21 @@ def format_conversation(self, conv: List[dict]) -> str: raise NotImplementedError( "format_converstaion must be implemented by subclasses" ) + + +class NoFormat(ConversationFormatter): + + def format_conversation(self, conv: List[dict]) -> str: + if len(conv) != 1: + raise ValueError( + "There must be exactly one turn in the conversation to use NoFormat" + ) + + turn = conv[0] + + if turn["role"] != "user": + raise ValueError( + "Conversation turn 0 is not 'user'. All even number turns should be." 
+ ) + + return turn["content"] diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index bcb4329e..6ae1c578 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -44,6 +44,11 @@ def query_model( top_k: Optional[int] = None, top_p: Optional[float] = None, ) -> List[str]: + if conversation_formatter is None: + raise ValueError( + "NemoDeployClient's query_model requires a conversation_formatter" + ) + prompt = conversation_formatter.format_conversation(messages) self.client.model_name = model From 0926cbd2db0925503893242460c9c03005893a92 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 09:45:55 -0700 Subject: [PATCH 61/69] Move no format location Signed-off-by: Ryan Wolf --- .../services/conversation_formatter.py | 18 ---------- nemo_curator/synthetic/__init__.py | 2 ++ nemo_curator/synthetic/no_format.py | 34 +++++++++++++++++++ 3 files changed, 36 insertions(+), 18 deletions(-) create mode 100644 nemo_curator/synthetic/no_format.py diff --git a/nemo_curator/services/conversation_formatter.py b/nemo_curator/services/conversation_formatter.py index bd307412..c4db1cc2 100644 --- a/nemo_curator/services/conversation_formatter.py +++ b/nemo_curator/services/conversation_formatter.py @@ -26,21 +26,3 @@ def format_conversation(self, conv: List[dict]) -> str: raise NotImplementedError( "format_converstaion must be implemented by subclasses" ) - - -class NoFormat(ConversationFormatter): - - def format_conversation(self, conv: List[dict]) -> str: - if len(conv) != 1: - raise ValueError( - "There must be exactly one turn in the conversation to use NoFormat" - ) - - turn = conv[0] - - if turn["role"] != "user": - raise ValueError( - "Conversation turn 0 is not 'user'. All even number turns should be." - ) - - return turn["content"] diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 22aa72fc..44a4b6c1 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -15,6 +15,7 @@ from .error import YamlConversionError from .mixtral import Mixtral8x7BFormatter from .nemotron import NemotronFormatter, NemotronGenerator +from .no_format import NoFormat from .prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE, @@ -45,6 +46,7 @@ "AsyncNemotronGenerator", "NemotronFormatter", "Mixtral8x7BFormatter", + "NoFormat", "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", "DEFAULT_SUBTOPICS_PROMPT_TEMPLATE", "DEFAULT_OPEN_QA_FROM_TOPICS_PROMPT_TEMPLATE", diff --git a/nemo_curator/synthetic/no_format.py b/nemo_curator/synthetic/no_format.py new file mode 100644 index 00000000..744c87b3 --- /dev/null +++ b/nemo_curator/synthetic/no_format.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List + +from nemo_curator.services.conversation_formatter import ConversationFormatter + + +class NoFormat(ConversationFormatter): + +    def format_conversation(self, conv: List[dict]) -> str: +        if len(conv) != 1: +            raise ValueError( +                "There must be exactly one turn in the conversation to use NoFormat" +            ) + +        turn = conv[0] + +        if turn["role"] != "user": +            raise ValueError( +                "Conversation turn 0 is not 'user'. All even number turns should be." +            ) + +        return turn["content"] From e2beb5b5b400a1451482fd2013f5886d6122bc7a Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Fri, 5 Jul 2024 11:46:06 -0700 Subject: [PATCH 62/69] Use top_k in nemo client Signed-off-by: Ryan Wolf --- nemo_curator/services/nemo_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index 6ae1c578..99c1d730 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -65,6 +65,7 @@ def query_model( stop_words_list=stop, temperature=temperature, top_p=top_p, + top_k=top_k, )[0] return self._postprocess_response(response, stop) From b918c1459d03dec111f0a6fa8121cd55ba47ffb0 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 17:39:55 -0700 Subject: [PATCH 63/69] Address vibhu's review Signed-off-by: Ryan Wolf --- nemo_curator/datasets/doc_dataset.py | 2 +- nemo_curator/services/model_client.py | 2 + nemo_curator/services/nemo_client.py | 3 + nemo_curator/services/openai_client.py | 4 + nemo_curator/synthetic/async_nemotron.py | 377 +++++++++++------------ nemo_curator/synthetic/mixtral.py | 4 +- nemo_curator/synthetic/nemotron.py | 4 +- 7 files changed, 201 insertions(+), 195 deletions(-) diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py index 38026452..48840270 100644 --- a/nemo_curator/datasets/doc_dataset.py +++ b/nemo_curator/datasets/doc_dataset.py @@ -166,7 +166,7 @@ def to_pandas(self): Returns: A pandas dataframe (on the CPU) """ - return self.df.compute() + return self.df.to_backend("pandas").compute() def _read_json_or_parquet( diff --git a/nemo_curator/services/model_client.py b/nemo_curator/services/model_client.py index 5c7a257d..ca8ff6c4 100644 --- a/nemo_curator/services/model_client.py +++ b/nemo_curator/services/model_client.py @@ -34,6 +34,7 @@ def query_model( n: Optional[int] = 1, seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, + stream: bool = False, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -70,6 +71,7 @@ async def query_model( n: Optional[int] = 1, seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, + stream: bool = False, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, diff --git a/nemo_curator/services/nemo_client.py b/nemo_curator/services/nemo_client.py index 99c1d730..f83ba624 100644 --- a/nemo_curator/services/nemo_client.py +++ b/nemo_curator/services/nemo_client.py @@ -40,6 +40,7 @@ def query_model( n: Optional[int] = None, seed: Optional[int] = None, stop: Union[Optional[str], List[str]] = None, + stream: bool = False, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, @@ -54,6 +55,8 @@ def query_model( if n is not None: warnings.warn("n is not supported in NemoDeployClient") + if stream: + warnings.warn("streaming is not supported in NemoDeployClient") if isinstance(stop, str): stop = [stop] diff --git 
a/nemo_curator/services/openai_client.py b/nemo_curator/services/openai_client.py index 5719cf5b..350eb308 100644 --- a/nemo_curator/services/openai_client.py +++ b/nemo_curator/services/openai_client.py @@ -40,6 +40,7 @@ def query_model( n: Union[Optional[int], NotGiven] = NOT_GIVEN, seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + stream: Union[Optional[bool], NotGiven] = False, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_k: Optional[int] = None, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, @@ -57,6 +58,7 @@ def query_model( n=n, seed=seed, stop=stop, + stream=stream, temperature=temperature, top_p=top_p, ) @@ -107,6 +109,7 @@ async def query_model( n: Union[Optional[int], NotGiven] = NOT_GIVEN, seed: Union[Optional[int], NotGiven] = NOT_GIVEN, stop: Union[Optional[str], List[str], NotGiven] = NOT_GIVEN, + stream: Union[Optional[bool], NotGiven] = False, temperature: Union[Optional[float], NotGiven] = NOT_GIVEN, top_k: Optional[int] = None, top_p: Union[Optional[float], NotGiven] = NOT_GIVEN, @@ -124,6 +127,7 @@ async def query_model( n=n, seed=seed, stop=stop, + stream=stream, temperature=temperature, top_p=top_p, ) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index b8e25ef1..c01de79e 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import asyncio -from typing import List, Optional, Tuple, Union +import os +from typing import Any, Coroutine, List, Optional, Tuple, Union +import tqdm +import tqdm.asyncio import yaml +from nemo_curator.log import create_logger from nemo_curator.services.model_client import AsyncLLMClient from nemo_curator.synthetic.error import YamlConversionError from nemo_curator.synthetic.prompts import ( @@ -47,8 +51,22 @@ class AsyncNemotronGenerator: UltraChat paper (https://arxiv.org/abs/2305.14233) """ - def __init__(self, llm_client: AsyncLLMClient) -> None: + def __init__( + self, + llm_client: AsyncLLMClient, + logger: Union[logging.LoggerAdapter, str] = "./", + max_concurrent_requests: Optional[int] = None, + ) -> None: self.client = llm_client + self.max_concurrent_requests = max_concurrent_requests + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "nemotron-generator.log"), + name="AsyncNemotronGenrator", + ) + else: + self._logger = logger async def _prompt( self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict @@ -114,6 +132,49 @@ async def convert_response_to_yaml_list( return parsed_response + async def _try_convert_yaml_list( + self, + response: str, + model: str, + yaml_conversion_prompt_template: str, + conversion_model_kwargs: dict, + expected_length: int, + ignore_conversion_failure: bool, + ): + try: + parsed_list = await self.convert_response_to_yaml_list( + response, + model=model, + prompt_template=yaml_conversion_prompt_template, + model_kwargs=conversion_model_kwargs, + ) + if len(parsed_list) != expected_length: + raise YamlConversionError( + f"Error: Length of parsed list {len(parsed_list)} does not match expected length {expected_length}: {parsed_list}" + ) + except YamlConversionError as e: + if ignore_conversion_failure: + return [] + else: + raise e + + return parsed_list + + async def _gather( + self, requests: List[Coroutine[Any, Any, 
List[str]]] + ) -> List[str]: + max_requests = self.max_concurrent_requests + if max_requests is None: + max_requests = len(requests) + + final_list = [] + for i in tqdm(range(0, len(requests), max_requests)): + request_slice = requests[i : i + max_requests] + result = await tqdm.asyncio.gather(*request_slice) + final_list.extend(result) + + return final_list + async def generate_macro_topics( self, n_macro_topics: Union[int, str], @@ -858,7 +919,9 @@ async def run_open_qa_pipeline( Returns: A list of synthetically generated open Q&A prompts """ + self._logger.info("Starting open q&a pipeline") # Generate the macro topics + self._logger.info("Starting macro topic generation") responses = await self.generate_macro_topics( n_macro_topics=n_macro_topics, model=model, @@ -876,6 +939,7 @@ async def run_open_qa_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) + self._logger.info("Finished macro topic generation") # Generate the subtopics raw_topics = [ @@ -891,9 +955,11 @@ async def run_open_qa_pipeline( ) for macro_topic in macro_topics ] - raw_topics = await asyncio.gather(*raw_topics) + self._logger.info("Starting subtopic generation") + raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) + self._logger.info("Finished subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -913,8 +979,10 @@ async def run_open_qa_pipeline( ) for subtopic in topic_list ] - raw_lines = await asyncio.gather(*raw_lines) + self._logger.info("Starting openline generation") + raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] + self._logger.info("Finished openline generation") # Revise the openlines raw_revisions = [ @@ -930,8 +998,11 @@ async def run_open_qa_pipeline( ) for openline in openlines ] - raw_revisions = await asyncio.gather(*raw_revisions) + self._logger.info("Starting openline revision") + raw_revisions = await self._gather(raw_revisions) revised_openlines = [item for revisions in raw_revisions for item in revisions] + self._logger.info("Finished openline revision") + self._logger.info("Finished open q&a pipeline") return revised_openlines @@ -954,24 +1025,14 @@ async def _generate_parse_subtopic( prompt_template=subtopic_prompt_template, ) subtopic = subtopic[0] - try: - parsed_topics = await self.convert_response_to_yaml_list( - subtopic, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_topics) != n_subtopics: - raise YamlConversionError( - f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_topics + return self._try_convert_yaml_list( + subtopic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) async def _generate_parse_openline( self, @@ -992,24 +1053,14 @@ async def _generate_parse_openline( prompt_template=open_qa_from_topics_prompt_template, ) openline = openline[0] - try: - parsed_line = await self.convert_response_to_yaml_list( - openline, - model=model, - 
prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_line) != n_openlines: - raise YamlConversionError( - f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_line + return self._try_convert_yaml_list( + openline, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) async def _revise_parse_openline( self, @@ -1030,24 +1081,14 @@ async def _revise_parse_openline( prompt_template=revise_open_qa_prompt_template, ) revised_openline = revised_openline[0] - try: - parsed_revision = await self.convert_response_to_yaml_list( - revised_openline, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_revision) != n_revisions: - raise YamlConversionError( - f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_revision + return self._try_convert_yaml_list( + revised_openline, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_revisions, + ignore_conversion_failure=ignore_conversion_failure, + ) async def run_writing_pipeline( self, @@ -1093,6 +1134,7 @@ async def run_writing_pipeline( Returns: A list of synthetically generated writing task prompts """ + self._logger.info("Starting writing pipeline") # Generate the tasks raw_writing_tasks = [] for topic in topics: @@ -1110,8 +1152,10 @@ async def run_writing_pipeline( ignore_conversion_failure=ignore_conversion_failure, ) ) - raw_writing_tasks = await asyncio.gather(*raw_writing_tasks) + self._logger.info("Starting writing task generation") + raw_writing_tasks = await self._gather(raw_writing_tasks) writing_tasks = [item for tasks in raw_writing_tasks for item in tasks] + self._logger.info("Finished writing task generation") # Revise the tasks raw_revised_openlines = [ @@ -1127,8 +1171,11 @@ async def run_writing_pipeline( ) for task in writing_tasks ] - raw_revised_openlines = await asyncio.gather(*raw_revised_openlines) + self._logger.info("Starting writing task revision") + raw_revised_openlines = await self._gather(raw_revised_openlines) revised_openlines = [item for lines in raw_revised_openlines for item in lines] + self._logger.info("Finished writing task revision") + self._logger.info("Finished writing pipeline") return revised_openlines @@ -1153,24 +1200,14 @@ async def _generate_parse_writing_task( prompt_template=writing_task_prompt_template, ) raw_tasks = raw_tasks[0] - try: - parsed_tasks = await self.convert_response_to_yaml_list( - raw_tasks, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_tasks) != n_openlines: - raise YamlConversionError( - f"Error: Length of writing tasks {len(parsed_tasks)} does not match desired n_openlines {n_openlines}: {parsed_tasks}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_tasks + return 
self._try_convert_yaml_list( + raw_tasks, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) async def _revise_parse_writing_task( self, @@ -1191,24 +1228,14 @@ async def _revise_parse_writing_task( prompt_template=revise_writing_task_prompt_template, ) raw_revision = raw_revision[0] - try: - parsed_revision = await self.convert_response_to_yaml_list( - raw_revision, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_revision) != n_revisions: - raise YamlConversionError( - f"Error: Length of revisions {len(parsed_revision)} does not match desired n_revisions {n_revisions}: {parsed_revision}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_revision + return self._try_convert_yaml_list( + raw_revision, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_revisions, + ignore_conversion_failure=ignore_conversion_failure, + ) async def run_closed_qa_pipeline( self, @@ -1245,6 +1272,7 @@ async def run_closed_qa_pipeline( A list of pairs where the first element represents the index of the document used to generate the question in the documents list and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] """ + self._logger.info("Starting closed q&a pipeline") raw_qa = [ self._generate_parse_closed_qa( document_id=i, @@ -1259,8 +1287,9 @@ async def run_closed_qa_pipeline( ) for i, document in enumerate(documents) ] - raw_qa = await asyncio.gather(*raw_qa) + raw_qa = await self._gather(raw_qa) document_openline_pairs = [item for lines in raw_qa for item in lines] + self._logger.info("Finished closed q&a pipeline") return document_openline_pairs @@ -1284,22 +1313,14 @@ async def _generate_parse_closed_qa( prompt_template=closed_qa_prompt_template, ) raw_instruction = raw_instruction[0] - try: - parsed_instructions = await self.convert_response_to_yaml_list( - raw_instruction, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_instructions) != n_openlines: - raise YamlConversionError( - f"Error: Length of openlines {len(parsed_instructions)} does not match desired n_openlines {n_openlines}: {parsed_instructions}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e + parsed_instructions = self._try_convert_yaml_list( + raw_instruction, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) return [(document_id, inst) for inst in parsed_instructions] @@ -1359,7 +1380,9 @@ async def run_math_pipeline( Returns: A list of synthetically generated math prompts """ + self._logger.info("Starting math pipeline") # Generate the macro topics + self._logger.info("Starting math macro topic generation") responses = await self.generate_math_macro_topics( n_macro_topics=n_macro_topics, school_level=school_level, @@ -1378,6 +1401,7 @@ async def run_math_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics 
{n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) + self._logger.info("Finished math macro topic generation") # Generate the subtopics raw_topics = [ @@ -1393,9 +1417,11 @@ async def run_math_pipeline( ) for macro_topic in macro_topics ] - raw_topics = await asyncio.gather(*raw_topics) + self._logger.info("Starting math subtopic generation") + raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) + self._logger.info("Finished math subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -1415,8 +1441,11 @@ async def run_math_pipeline( ) for subtopic in topic_list ] - raw_lines = await asyncio.gather(*raw_lines) + self._logger.info("Starting math openline generation") + raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] + self._logger.info("Finished math openline generation") + self._logger.info("Finished math pipeline") return openlines @@ -1439,24 +1468,14 @@ async def _generate_parse_math_subtopic( prompt_template=subtopic_prompt_template, ) raw_topic = raw_topic[0] - try: - parsed_topics = await self.convert_response_to_yaml_list( - raw_topic, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_topics) != n_subtopics: - raise YamlConversionError( - f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_topics + return self._try_convert_yaml_list( + raw_topic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) async def _generate_parse_math_openline( self, @@ -1477,24 +1496,14 @@ async def _generate_parse_math_openline( prompt_template=math_problem_prompt_template, ) raw_line = raw_line[0] - try: - parsed_line = await self.convert_response_to_yaml_list( - raw_line, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_line) != n_openlines: - raise YamlConversionError( - f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_line + return self._try_convert_yaml_list( + raw_line, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) async def run_python_pipeline( self, @@ -1551,7 +1560,9 @@ async def run_python_pipeline( Returns: A list of synthetically generated Python prompts """ + self._logger.info("Starting python pipeline") # Generate the macro topics + self._logger.info("Starting python macro topic generation") responses = await self.generate_python_macro_topics( n_macro_topics=n_macro_topics, model=model, @@ -1569,6 +1580,7 @@ async def run_python_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) + 
self._logger.info("Finished python macro topic generation") # Generate the subtopics raw_topics = [ @@ -1584,9 +1596,11 @@ async def run_python_pipeline( ) for macro_topic in macro_topics ] - raw_topics = await asyncio.gather(*raw_topics) + self._logger.info("Starting python subtopic generation") + raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) + self._logger.info("Finished python subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -1606,8 +1620,11 @@ async def run_python_pipeline( ) for subtopic in topic_list ] - raw_lines = await asyncio.gather(*raw_lines) + self._logger.info("Starting python openline generation") + raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] + self._logger.info("Finished python openline generation") + self._logger.info("Finished python pipeline") return openlines @@ -1630,24 +1647,14 @@ async def _generate_parse_python_subtopic( prompt_template=subtopic_prompt_template, ) raw_topic = raw_topic[0] - try: - parsed_topics = await self.convert_response_to_yaml_list( - raw_topic, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_topics) != n_subtopics: - raise YamlConversionError( - f"Error: Length of subtopics {len(parsed_topics)} does not match desired n_subtopics {n_subtopics}: {parsed_topics}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_topics + return self._try_convert_yaml_list( + raw_topic, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_subtopics, + ignore_conversion_failure=ignore_conversion_failure, + ) async def _generate_parse_python_openline( self, @@ -1668,21 +1675,11 @@ async def _generate_parse_python_openline( prompt_template=python_problem_prompt_template, ) raw_line = raw_line[0] - try: - parsed_line = await self.convert_response_to_yaml_list( - raw_line, - model=model, - prompt_template=yaml_conversion_prompt_template, - model_kwargs=conversion_model_kwargs, - ) - if len(parsed_line) != n_openlines: - raise YamlConversionError( - f"Error: Length of openlines {len(parsed_line)} does not match desired n_openlines {n_openlines}: {parsed_line}" - ) - except YamlConversionError as e: - if ignore_conversion_failure: - return [] - else: - raise e - - return parsed_line + return self._try_convert_yaml_list( + raw_line, + model=model, + yaml_conversion_prompt_template=yaml_conversion_prompt_template, + conversion_model_kwargs=conversion_model_kwargs, + expected_length=n_openlines, + ignore_conversion_failure=ignore_conversion_failure, + ) diff --git a/nemo_curator/synthetic/mixtral.py b/nemo_curator/synthetic/mixtral.py index 9b7568f3..71a6f372 100644 --- a/nemo_curator/synthetic/mixtral.py +++ b/nemo_curator/synthetic/mixtral.py @@ -5,7 +5,7 @@ class Mixtral8x7BFormatter(ConversationFormatter): - BASE_PROMPT = " [INST] \n" + PROMPT_PREFIX = " [INST] \n" @staticmethod def format_conversation(conv: List[dict]) -> str: @@ -17,7 +17,7 @@ def format_conversation(conv: List[dict]) -> str: Returns: A conversation formatted as text """ - prompt = Mixtral8x7BFormatter.BASE_PROMPT + prompt = Mixtral8x7BFormatter.PROMPT_PREFIX for i, turn in enumerate(conv): user_turn = i % 2 == 0 diff --git a/nemo_curator/synthetic/nemotron.py 
b/nemo_curator/synthetic/nemotron.py index 9e10a30d..fedf7c19 100644 --- a/nemo_curator/synthetic/nemotron.py +++ b/nemo_curator/synthetic/nemotron.py @@ -1439,7 +1439,7 @@ def run_python_pipeline( class NemotronFormatter(ConversationFormatter): - BASE_PROMPT = "System\n\nUser\n" + PROMPT_PREFIX = "System\n\nUser\n" @staticmethod def format_conversation(conv: List[dict]) -> str: @@ -1451,7 +1451,7 @@ def format_conversation(conv: List[dict]) -> str: Returns: A conversation formatted as text """ - prompt = NemotronFormatter.BASE_PROMPT + prompt = NemotronFormatter.PROMPT_PREFIX for i, turn in enumerate(conv): user_turn = i % 2 == 0 From b79ce6b1f95cd7f1f534b6581facec91a4101bec Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 17:48:15 -0700 Subject: [PATCH 64/69] Add logging import Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index c01de79e..ec47d4d6 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import asyncio +import logging import os from typing import Any, Coroutine, List, Optional, Tuple, Union From 8957e120a248ced7b5a24a5317230c42455d84e8 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 17:49:34 -0700 Subject: [PATCH 65/69] Fix import Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index ec47d4d6..d15cb67d 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -16,9 +16,9 @@ import os from typing import Any, Coroutine, List, Optional, Tuple, Union -import tqdm import tqdm.asyncio import yaml +from tqdm import tqdm from nemo_curator.log import create_logger from nemo_curator.services.model_client import AsyncLLMClient From 1400a3264ecf0679c86e0f96b0b69cf17d79543b Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 17:53:43 -0700 Subject: [PATCH 66/69] Fix tqdm Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index d15cb67d..17e30bc6 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -16,9 +16,8 @@ import os from typing import Any, Coroutine, List, Optional, Tuple, Union -import tqdm.asyncio import yaml -from tqdm import tqdm +from tqdm.asyncio import tqdm from nemo_curator.log import create_logger from nemo_curator.services.model_client import AsyncLLMClient @@ -171,7 +170,7 @@ async def _gather( final_list = [] for i in tqdm(range(0, len(requests), max_requests)): request_slice = requests[i : i + max_requests] - result = await tqdm.asyncio.gather(*request_slice) + result = await tqdm.gather(*request_slice) final_list.extend(result) return final_list From 0926d6ec19662684add21c86bb650a598ed5a03d Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 18:02:43 -0700 Subject: [PATCH 67/69] Add missing awaits Signed-off-by: Ryan Wolf --- nemo_curator/synthetic/async_nemotron.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git 
a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index 17e30bc6..4c879b9b 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -1025,7 +1025,7 @@ async def _generate_parse_subtopic( prompt_template=subtopic_prompt_template, ) subtopic = subtopic[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( subtopic, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1053,7 +1053,7 @@ async def _generate_parse_openline( prompt_template=open_qa_from_topics_prompt_template, ) openline = openline[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( openline, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1081,7 +1081,7 @@ async def _revise_parse_openline( prompt_template=revise_open_qa_prompt_template, ) revised_openline = revised_openline[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( revised_openline, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1200,7 +1200,7 @@ async def _generate_parse_writing_task( prompt_template=writing_task_prompt_template, ) raw_tasks = raw_tasks[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_tasks, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1228,7 +1228,7 @@ async def _revise_parse_writing_task( prompt_template=revise_writing_task_prompt_template, ) raw_revision = raw_revision[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_revision, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1313,7 +1313,7 @@ async def _generate_parse_closed_qa( prompt_template=closed_qa_prompt_template, ) raw_instruction = raw_instruction[0] - parsed_instructions = self._try_convert_yaml_list( + parsed_instructions = await self._try_convert_yaml_list( raw_instruction, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1468,7 +1468,7 @@ async def _generate_parse_math_subtopic( prompt_template=subtopic_prompt_template, ) raw_topic = raw_topic[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_topic, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1496,7 +1496,7 @@ async def _generate_parse_math_openline( prompt_template=math_problem_prompt_template, ) raw_line = raw_line[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_line, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1647,7 +1647,7 @@ async def _generate_parse_python_subtopic( prompt_template=subtopic_prompt_template, ) raw_topic = raw_topic[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_topic, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, @@ -1675,7 +1675,7 @@ async def _generate_parse_python_openline( prompt_template=python_problem_prompt_template, ) raw_line = raw_line[0] - return self._try_convert_yaml_list( + return await self._try_convert_yaml_list( raw_line, model=model, yaml_conversion_prompt_template=yaml_conversion_prompt_template, From fbe929228d03d6dcdbbd6ae011ed1e66325c2422 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 18:19:04 -0700 Subject: [PATCH 68/69] Standardize names Signed-off-by: Ryan Wolf --- 
nemo_curator/synthetic/async_nemotron.py | 72 ++++++++++++------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/nemo_curator/synthetic/async_nemotron.py b/nemo_curator/synthetic/async_nemotron.py index 4c879b9b..d52fd755 100644 --- a/nemo_curator/synthetic/async_nemotron.py +++ b/nemo_curator/synthetic/async_nemotron.py @@ -60,13 +60,13 @@ def __init__( self.client = llm_client self.max_concurrent_requests = max_concurrent_requests if isinstance(logger, str): - self._logger = create_logger( + self.logger = create_logger( rank=0, log_file=os.path.join(logger, "nemotron-generator.log"), name="AsyncNemotronGenrator", ) else: - self._logger = logger + self.logger = logger async def _prompt( self, model: str, prompt_template: str, prompt_kwargs: dict, model_kwargs: dict @@ -919,9 +919,9 @@ async def run_open_qa_pipeline( Returns: A list of synthetically generated open Q&A prompts """ - self._logger.info("Starting open q&a pipeline") + self.logger.info("Starting open q&a pipeline") # Generate the macro topics - self._logger.info("Starting macro topic generation") + self.logger.info("Starting macro topic generation") responses = await self.generate_macro_topics( n_macro_topics=n_macro_topics, model=model, @@ -939,7 +939,7 @@ async def run_open_qa_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) - self._logger.info("Finished macro topic generation") + self.logger.info("Finished macro topic generation") # Generate the subtopics raw_topics = [ @@ -955,11 +955,11 @@ async def run_open_qa_pipeline( ) for macro_topic in macro_topics ] - self._logger.info("Starting subtopic generation") + self.logger.info("Starting subtopic generation") raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) - self._logger.info("Finished subtopic generation") + self.logger.info("Finished subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -979,10 +979,10 @@ async def run_open_qa_pipeline( ) for subtopic in topic_list ] - self._logger.info("Starting openline generation") + self.logger.info("Starting openline generation") raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] - self._logger.info("Finished openline generation") + self.logger.info("Finished openline generation") # Revise the openlines raw_revisions = [ @@ -998,11 +998,11 @@ async def run_open_qa_pipeline( ) for openline in openlines ] - self._logger.info("Starting openline revision") + self.logger.info("Starting openline revision") raw_revisions = await self._gather(raw_revisions) revised_openlines = [item for revisions in raw_revisions for item in revisions] - self._logger.info("Finished openline revision") - self._logger.info("Finished open q&a pipeline") + self.logger.info("Finished openline revision") + self.logger.info("Finished open q&a pipeline") return revised_openlines @@ -1134,7 +1134,7 @@ async def run_writing_pipeline( Returns: A list of synthetically generated writing task prompts """ - self._logger.info("Starting writing pipeline") + self.logger.info("Starting writing pipeline") # Generate the tasks raw_writing_tasks = [] for topic in topics: @@ -1152,10 +1152,10 @@ async def run_writing_pipeline( ignore_conversion_failure=ignore_conversion_failure, ) ) - self._logger.info("Starting writing task generation") + 
self.logger.info("Starting writing task generation") raw_writing_tasks = await self._gather(raw_writing_tasks) writing_tasks = [item for tasks in raw_writing_tasks for item in tasks] - self._logger.info("Finished writing task generation") + self.logger.info("Finished writing task generation") # Revise the tasks raw_revised_openlines = [ @@ -1171,11 +1171,11 @@ async def run_writing_pipeline( ) for task in writing_tasks ] - self._logger.info("Starting writing task revision") + self.logger.info("Starting writing task revision") raw_revised_openlines = await self._gather(raw_revised_openlines) revised_openlines = [item for lines in raw_revised_openlines for item in lines] - self._logger.info("Finished writing task revision") - self._logger.info("Finished writing pipeline") + self.logger.info("Finished writing task revision") + self.logger.info("Finished writing pipeline") return revised_openlines @@ -1272,7 +1272,7 @@ async def run_closed_qa_pipeline( A list of pairs where the first element represents the index of the document used to generate the question in the documents list and the second element represents a synthetically generated closed Q&A prompt. Example: [(0, "Summarize this document"), ...] """ - self._logger.info("Starting closed q&a pipeline") + self.logger.info("Starting closed q&a pipeline") raw_qa = [ self._generate_parse_closed_qa( document_id=i, @@ -1289,7 +1289,7 @@ async def run_closed_qa_pipeline( ] raw_qa = await self._gather(raw_qa) document_openline_pairs = [item for lines in raw_qa for item in lines] - self._logger.info("Finished closed q&a pipeline") + self.logger.info("Finished closed q&a pipeline") return document_openline_pairs @@ -1380,9 +1380,9 @@ async def run_math_pipeline( Returns: A list of synthetically generated math prompts """ - self._logger.info("Starting math pipeline") + self.logger.info("Starting math pipeline") # Generate the macro topics - self._logger.info("Starting math macro topic generation") + self.logger.info("Starting math macro topic generation") responses = await self.generate_math_macro_topics( n_macro_topics=n_macro_topics, school_level=school_level, @@ -1401,7 +1401,7 @@ async def run_math_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) - self._logger.info("Finished math macro topic generation") + self.logger.info("Finished math macro topic generation") # Generate the subtopics raw_topics = [ @@ -1417,11 +1417,11 @@ async def run_math_pipeline( ) for macro_topic in macro_topics ] - self._logger.info("Starting math subtopic generation") + self.logger.info("Starting math subtopic generation") raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) - self._logger.info("Finished math subtopic generation") + self.logger.info("Finished math subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -1441,11 +1441,11 @@ async def run_math_pipeline( ) for subtopic in topic_list ] - self._logger.info("Starting math openline generation") + self.logger.info("Starting math openline generation") raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] - self._logger.info("Finished math openline generation") - self._logger.info("Finished math pipeline") + self.logger.info("Finished math openline generation") + self.logger.info("Finished math pipeline") 
return openlines @@ -1560,9 +1560,9 @@ async def run_python_pipeline( Returns: A list of synthetically generated Python prompts """ - self._logger.info("Starting python pipeline") + self.logger.info("Starting python pipeline") # Generate the macro topics - self._logger.info("Starting python macro topic generation") + self.logger.info("Starting python macro topic generation") responses = await self.generate_python_macro_topics( n_macro_topics=n_macro_topics, model=model, @@ -1580,7 +1580,7 @@ async def run_python_pipeline( f"Error: Length of macro topics {len(macro_topics)} does not match desired n_macro_topics {n_macro_topics}: {macro_topics}" ) macro_topics.extend(additional_macro_topics) - self._logger.info("Finished python macro topic generation") + self.logger.info("Finished python macro topic generation") # Generate the subtopics raw_topics = [ @@ -1596,11 +1596,11 @@ async def run_python_pipeline( ) for macro_topic in macro_topics ] - self._logger.info("Starting python subtopic generation") + self.logger.info("Starting python subtopic generation") raw_topics = await self._gather(raw_topics) topic_list = [item for subtopics in raw_topics for item in subtopics] topic_list.extend(additional_subtopics) - self._logger.info("Finished python subtopic generation") + self.logger.info("Finished python subtopic generation") # Mix the macro topics with the subtopics if combine_topics: @@ -1620,11 +1620,11 @@ async def run_python_pipeline( ) for subtopic in topic_list ] - self._logger.info("Starting python openline generation") + self.logger.info("Starting python openline generation") raw_lines = await self._gather(raw_lines) openlines = [item for lines in raw_lines for item in lines] - self._logger.info("Finished python openline generation") - self._logger.info("Finished python pipeline") + self.logger.info("Finished python openline generation") + self.logger.info("Finished python pipeline") return openlines From 8f66396d388b536756c567243a3e626576ac820e Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Mon, 8 Jul 2024 18:21:36 -0700 Subject: [PATCH 69/69] Address Ayush nit Signed-off-by: Ryan Wolf --- docs/user-guide/syntheticdata.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/user-guide/syntheticdata.rst b/docs/user-guide/syntheticdata.rst index 5a3d0235..ae3206b4 100644 --- a/docs/user-guide/syntheticdata.rst +++ b/docs/user-guide/syntheticdata.rst @@ -15,8 +15,4 @@ There are a variety of ways to construct synthetic data generation pipelines, wi NeMo Curator has a simple, easy-to-use set of tools that allow you to use prebuilt synthetic generation pipelines or build your own. Any model inference service that uses the OpenAI API is compatible with the synthetic data generation module, allowing you to generate your data from any model. NeMo Curator has prebuilt synthetic data generation pipelines for supervised fine-tuning (SFT) and preference data that were used to generate data for the training of `Nemotron-4 340B `_. -And, you can easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator. - ------------------------------------------ -Usage ------------------------------------------ +And, you can easily interweave filtering and deduplication steps in your synthetic data pipeline with the other modules in NeMo Curator. \ No newline at end of file
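For context, a minimal sketch of how the pieces introduced in this series fit together: an OpenAI-compatible endpoint wrapped in AsyncOpenAIClient and handed to the asynchronous Nemotron generator to run one of the prebuilt pipelines. The endpoint URL, API key, model name, and the AsyncNemotronGenerator import path are placeholders and assumptions, not values taken from the patches.

import asyncio

from openai import AsyncOpenAI

from nemo_curator import AsyncOpenAIClient
from nemo_curator.synthetic import AsyncNemotronGenerator  # import path assumed


async def main():
    # Any service exposing the OpenAI API works; URL, key, and model below are placeholders.
    openai_client = AsyncOpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key="<your api key>",
    )
    llm_client = AsyncOpenAIClient(openai_client)
    generator = AsyncNemotronGenerator(llm_client, max_concurrent_requests=10)

    # One of the prebuilt pipelines; keyword arguments mirror the parameters
    # that run_open_qa_pipeline uses internally in this patch series.
    openlines = await generator.run_open_qa_pipeline(
        n_macro_topics=5,
        n_subtopics=3,
        n_openlines=2,
        n_revisions=2,
        model="mistralai/mixtral-8x7b-instruct-v0.1",
        ignore_conversion_failure=True,
    )
    print(openlines[:3])


asyncio.run(main())

Under this setup, the generator's _gather helper slices the pending coroutines into batches of max_concurrent_requests and awaits each batch with tqdm, so that argument caps how many requests are in flight at once while progress is reported.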