From a2df24668efe0c611c9031343a14c46c3c9cb466 Mon Sep 17 00:00:00 2001
From: Yang Yu
Date: Thu, 3 Oct 2024 01:46:44 -0500
Subject: [PATCH] clean up notebook and fix pre-commit

Signed-off-by: Yang Yu
---
 .../config/heuristic_filter_en.yaml           |   2 +-
 tutorials/pretraining-data-curation/helper.py |  24 ++--
 .../red-pajama-v2-curation-tutorial.ipynb     | 114 +++++++-----------
 .../start-distributed-notebook.sh             |   2 +-
 4 files changed, 59 insertions(+), 83 deletions(-)

diff --git a/tutorials/pretraining-data-curation/config/heuristic_filter_en.yaml b/tutorials/pretraining-data-curation/config/heuristic_filter_en.yaml
index 6819e4e16..ee0bd4f55 100644
--- a/tutorials/pretraining-data-curation/config/heuristic_filter_en.yaml
+++ b/tutorials/pretraining-data-curation/config/heuristic_filter_en.yaml
@@ -31,4 +31,4 @@ filters:
   - name: nemo_curator.filters.heuristic_filter.WordCountFilter
     params:
       min_words: 50
-      max_words: 100000
\ No newline at end of file
+      max_words: 100000
diff --git a/tutorials/pretraining-data-curation/helper.py b/tutorials/pretraining-data-curation/helper.py
index 22a4389c8..2f7af9508 100644
--- a/tutorials/pretraining-data-curation/helper.py
+++ b/tutorials/pretraining-data-curation/helper.py
@@ -1,21 +1,23 @@
 import gzip
-import os
 import json
+import os
+
+import cudf
 import dask.bag as db


 def convert_single_file(input_output_paths):
     input_path, output_path = input_output_paths
-
-    with gzip.open(input_path, 'rt', encoding='utf-8') as f_in:
-        with open(output_path, 'w', encoding='utf-8') as f_out:
+
+    with gzip.open(input_path, "rt", encoding="utf-8") as f_in:
+        with open(output_path, "w", encoding="utf-8") as f_out:
             for line in f_in:
                 try:
                     # Parse each line as a separate JSON object
                     item = json.loads(line)
                     # Write the JSON object to the .jsonl file
                     json.dump(item, f_out)
-                    f_out.write('\n')
+                    f_out.write("\n")
                 except json.JSONDecodeError as e:
                     print(f"Error decoding JSON in file {input_path}: {e}")
                     continue
@@ -24,16 +26,18 @@ def convert_json_gz_to_jsonl(input_dir, output_dir, partition_size=2):
     # Ensure the output directory exists
     os.makedirs(output_dir, exist_ok=True)
-
+
     # List all .json.gz files in the input directory
     file_paths = []
     for filename in os.listdir(input_dir):
-        if filename.endswith('.json.gz'):
+        if filename.endswith(".json.gz"):
             input_path = os.path.join(input_dir, filename)
-            output_filename = os.path.splitext(os.path.splitext(filename)[0])[0] + '.jsonl'
+            output_filename = (
+                os.path.splitext(os.path.splitext(filename)[0])[0] + ".jsonl"
+            )
             output_path = os.path.join(output_dir, output_filename)
             file_paths.append((input_path, output_path))
-
+
     # Create a Dask bag from the file paths and apply the function in parallel
     bag = db.from_sequence(file_paths, partition_size=partition_size)
     bag.map(convert_single_file).compute()
@@ -47,4 +51,4 @@ def convert_str_id_to_int(df, id_column="id"):
     dx = df[id_column].str.rsplit("-", n=1, expand=True)
     df["doc_id"] = dx[1].astype("int64").values
     df["dataset_id"] = dx[0].hash_values()
-    return df
\ No newline at end of file
+    return df
diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
index 05e8225fd..0d6999254 100644
--- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
+++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
@@ -5,20 +5,7 @@
    "id": "8498f9f4-3d9a-481e-80fb-af41f7d7aa7d",
    "metadata": {},
    "source": [
-    "# 
Pre-training Data Curation in NeMo Curator\n", - "\n", - "The NeMo Curator is a Python library that consists of a collection of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within the NeMo Data Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora.\n", - "\n", - "NeMo Curator includes the following modules to perform data curation:\n", - "\n", - "- Data download and Extraction\n", - "- Language identification and separation\n", - "- Text reformatting and cleaning\n", - "- Quality filtering\n", - "- Document-level deduplication\n", - "- Multilingual downstream-task decontamination\n", - "- Distributed Data Classification\n", - "- Personal identifiable information (PII) redaction" + "# Pretraining Data Curation in NeMo Curator" ] }, { @@ -26,7 +13,7 @@ "id": "28d17c49", "metadata": {}, "source": [ - "# Table of Contents\n", + "## Table of Contents\n", "\n", "1. [Introduction](#introduction)\n", "2. [Getting Started](#get-start)\n", @@ -36,6 +23,24 @@ "6. [Quality filtering](#filter)" ] }, + { + "cell_type": "markdown", + "id": "4c55d981", + "metadata": {}, + "source": [ + "# 1. Introduction\n", + "\n", + "\n", + "In this tutorial, we will show how to curate large-scale data for LLM pretraining in a distributed environment using NeMo-Curator. Specifically, we will focus on the following modules in NeMo-Curator:\n", + "\n", + "- Language identification and separation\n", + "- Text reformatting and cleaning\n", + "- Quality filtering\n", + "- Document-level deduplication\n", + "\n", + "For demonstration, we will use the [RedPajama-Data-v2](#rpv2) dataset, an open dataset for LLM pretraining." + ] + }, { "cell_type": "markdown", "id": "520eef06-0edb-4108-a048-af006dea8601", @@ -44,25 +49,20 @@ "tags": [] }, "source": [ - "# Introduction\n", - "\n", - "\n", - "In this tutorial, we will be demonstrating how to curate a LLM pre-training dataset using NeMo Curator.\n", - "\n", - "## System Information\n", + "## 1.1 System Information\n", "Here is the information on the system this notebook was run on:\n", "\n", "- **GPU**: 2 A100 nodes (each with 8 A100-SXM4-80GB)\n", "\n", - "- **CUDA & Nvidia Drivers**: CUDA 12.2 with Driver 535.104.12\n", + "- **CUDA & Nvidia Drivers**: CUDA 12.4 with Driver 535.104.12\n", "\n", - "- **OS**: Ubuntu 20.04.5 LTS\n", + "- **OS**: Ubuntu 22.04.4 LTS\n", "\n", - "## Running NeMo-Curator\n", + "## 1.2 Running NeMo-Curator\n", "\n", - "NeMo-curator came pre-installed in Nemo framework container. This notebook use 24.07 release of the Nemo framework container. User can pull the container following the steps below:\n", + "NeMo-curator came pre-installed in Nemo Framework container. This notebook use 24.07 release of the NeMo Framework container. User can pull the container following the steps below:\n", "\n", - "- Get access to the NeMo Frameworm container on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)\n", + "- Get access to the NeMo Framework container on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)\n", "\n", "- Set your docker credentials\n", "\n", @@ -85,7 +85,7 @@ "id": "7d57dd35-cce6-4bfa-b34a-fb4a2ea584e0", "metadata": {}, "source": [ - "# Getting started\n", + "# 2. Getting started\n", "\n", "\n", "NeMo-Curator uses dask for parallelization. Before we start using curator, we need to start a dask cluster. 
To start a multi-node dask cluster in slurm, we can use the `start-distributed-notebook.sh` script in this directory to start the cluster. The user will want to change the following:\n", @@ -177,7 +177,7 @@ "id": "bf008174-a7b6-4a62-b421-0e3d84e305f2", "metadata": {}, "source": [ - "# RedPajama-Data-v2\n", + "# 3. RedPajama-Data-v2\n", "" ] }, @@ -195,16 +195,6 @@ "The raw rpv2 data is stored in compressed json. We will first decompress the json.gz file and write them into jsonl files. For this, we will use a helper function `convert_json_gz_to_jsonl` in `helper.py`\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d13a997", - "metadata": {}, - "outputs": [], - "source": [ - "from helper import convert_json_gz_to_jsonl" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -222,6 +212,8 @@ } ], "source": [ + "from helper import convert_json_gz_to_jsonl\n", + "\n", "input_data_dir = os.path.join(base_dir,\"rpv2-2023-06-raw\")\n", "output_data_dir = os.path.join(base_dir,\"rpv2-2023-06\")\n", "\n", @@ -368,7 +360,7 @@ "id": "22b85d83-fd01-49cd-8618-006cf6806461", "metadata": {}, "source": [ - "Removing the raw dataset to save disk space:" + "[Optional] Removing the raw dataset to save disk space:" ] }, { @@ -380,7 +372,7 @@ }, "outputs": [], "source": [ - "!rm -rf /lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06" + "!rm -rf {base_dir}/rpv2-2023-06" ] }, { @@ -485,7 +477,7 @@ }, "outputs": [], "source": [ - "!rm -rf /lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-sharded" + "!rm -rf {base_dir}/rpv2-2023-06-sharded" ] }, { @@ -535,30 +527,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "2b7e6c9b-2aa3-4a8b-89ea-6dc4ca585b8d", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-08-23 16:03:29-- https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin\n", - "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.74.12, 13.227.74.45, 13.227.74.118, ...\n", - "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.74.12|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 131266198 (125M) [application/octet-stream]\n", - "Saving to: ‘/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-language/lid.176.bin’\n", - "\n", - "lid.176.bin 100%[===================>] 125.18M 39.2MB/s in 4.9s \n", - "\n", - "2024-08-23 16:03:35 (25.5 MB/s) - ‘/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-language/lid.176.bin’ saved [131266198/131266198]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P {model_path}" ] @@ -678,7 +652,7 @@ }, "outputs": [], "source": [ - "!rm -rf \"/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-id\"" + "!rm -rf {base_dir}/rpv2-2023-06-id" ] }, { @@ -802,7 +776,7 @@ }, "outputs": [], "source": [ - "!rm -rf \"/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-language/data/EN\"" + "!rm -rf {base_dir}/rpv2-2023-06-language/data/EN" ] }, { @@ -2948,7 +2922,6 @@ ], "source": [ "output_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/connected_components.parquet\")\n", - "# As i repartition (i dont need to shuffle the whole thing \n", "cc_result = dask_cudf.read_parquet(output_path, split_row_groups=False).repartition(npartitions=1)\n", "\n", "# Set 'group' as the index and shuffle to ensure all same 'group' values are in the same partition\n", @@ -2961,10 +2934,10 @@ " df = df.drop(columns=['cumcount'])\n", " return df\n", "\n", - "# Apply the function to each partition\n", + "# Find duplicates by applying the function to each partition\n", "docs_to_remove = cc_result.map_partitions(assign_cumcount, meta=cc_result)\n", "\n", - "# Reset the index if necessary\n", + "# Reset the index\n", "docs_to_remove = docs_to_remove.reset_index()\n", "\n", "docs_to_remove = docs_to_remove[[\"dataset_id\", \"doc_id\"]]\n", @@ -4665,7 +4638,6 @@ ], "source": [ "output_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06-and-14/connected_components.parquet\")\n", - "# As i repartition (i dont need to shuffle the whole thing \n", "cc_result = dask_cudf.read_parquet(output_path, split_row_groups=False).repartition(npartitions=1)\n", "\n", "# Set 'group' as the index and shuffle to ensure all same 'group' values are in the same partition\n", @@ -4678,10 +4650,10 @@ " df = df.drop(columns=['cumcount'])\n", " return df\n", "\n", - "# Apply the function to each partition\n", + "# Find duplicates by applying the function to each partition\n", "docs_to_remove = cc_result.map_partitions(assign_cumcount, meta=cc_result)\n", "\n", - "# Reset the index if necessary\n", + "# Reset the index\n", "docs_to_remove = docs_to_remove.reset_index()\n", "\n", "docs_to_remove = docs_to_remove[[\"dataset_id\", \"doc_id\"]]\n", @@ -4718,11 +4690,11 @@ } ], "source": [ - "output_resharded_dir = expand_outdir_and_mkdir(\"/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-and-14-deduped-resharded\")\n", + "output_resharded_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"rpv2-2023-06-and-14-deduped-resharded\"))\n", "\n", "t0 = time.time()\n", "reshard_jsonl(\n", - " '/lustre/fsw/portfolios/coreai/users/yayu/data.fs5/rpv2-2023-06-and-14-deduped',\n", + " os.path.join(base_dir, \"rpv2-2023-06-and-14-deduped\"),\n", " output_resharded_dir,\n", " output_file_size=\"100M\",\n", " start_index=0,\n", diff --git a/tutorials/pretraining-data-curation/start-distributed-notebook.sh b/tutorials/pretraining-data-curation/start-distributed-notebook.sh index 2a554e5a7..a35993390 100644 --- 
a/tutorials/pretraining-data-curation/start-distributed-notebook.sh
+++ b/tutorials/pretraining-data-curation/start-distributed-notebook.sh
@@ -75,4 +75,4 @@ export DASK_DATAFRAME__QUERY_PLANNING=False
 srun \
     --container-mounts=${MOUNTS} \
     --container-image=${CONTAINER_IMAGE} \
-    ${CONTAINER_ENTRYPOINT}
\ No newline at end of file
+    ${CONTAINER_ENTRYPOINT}
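
For reference, below is a minimal usage sketch of the two helpers this patch reformats (`convert_json_gz_to_jsonl` and `convert_str_id_to_int`). The `base_dir` value and the example document IDs are assumptions for illustration only; the `rpv2-2023-06-raw` and `rpv2-2023-06` directory names follow the layout used in the notebook, and the cuDF portion requires a GPU.

```python
import os

import cudf

from helper import convert_json_gz_to_jsonl, convert_str_id_to_int

# Assumed working directory -- adjust to your environment.
base_dir = "/path/to/workspace"

# Decompress the raw RedPajama-v2 .json.gz shards into .jsonl files,
# one Dask-bag task per partition of input files.
convert_json_gz_to_jsonl(
    input_dir=os.path.join(base_dir, "rpv2-2023-06-raw"),
    output_dir=os.path.join(base_dir, "rpv2-2023-06"),
    partition_size=2,
)

# Split a string document ID (hypothetical format shown) into an integer
# doc_id plus a hashed dataset_id column, as needed before deduplication.
df = cudf.DataFrame({"id": ["rpv2-2023-06-0000000001", "rpv2-2023-06-0000000002"]})
print(convert_str_id_to_int(df, id_column="id")[["dataset_id", "doc_id"]])
```

In the notebook itself, `convert_str_id_to_int` is applied per partition across the full dataset; the toy frame here only illustrates the ID layout the helper expects.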