diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b2a38eaf --- /dev/null +++ b/.gitignore @@ -0,0 +1,153 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +*.cpp + +# C extensions +*.so +*.c + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Dask cache +dask-worker-space/ + +# Data downloaded and generated when running the examples. +data/ + +# SLURM Files +*.out +*.err + +# Text Editor / IDE Files +.vscode diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 00000000..4861cafe --- /dev/null +++ b/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = google +indent_width = 2 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..3bd14fc1 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,118 @@ +# Checklist + +We are glad you are contributing to NeMo Curator! Before you make a PR, be sure to read over this guide in detail. +This checklist ensures that NeMo Curator stays easy-to-use by both users and developers. +Not all steps are necessary for some contributions, so read the linked sections for more information about each item. + +1. [Follow the general principles in your design](#general-principles) +1. [Write your code in the proper place](#repo-structure) +1. [Write examples and documentation for using your code](#examples-and-documentation) +1. [Format using the style guide](#python-style) +1. [Write unit tests](#unit-tests) +1. [Make a pull request](#pull-requests-pr-guidelines) + +## General principles +1. 
**User-oriented**: make it easy for end users, even at the cost of writing more code in the background
+1. **Robust**: make it hard for users to make mistakes.
+1. **Reusable**: for every piece of code, think about how it can be reused in the future and make it easy to reuse.
+1. **Readable**: code should be easy to read.
+1. **Legal**: if you copy even one line of code from the Internet, make sure its license is compatible with NeMo Curator's license. Give credit and link back to the code.
+1. **Sensible**: code should make sense. If you think a piece of code might be confusing, write comments.
+
+## Repo Structure
+The repository is home to flexible Python modules, sample scripts, tests, and more.
+Here is a brief overview of where everything lives:
+- [config](config/) - A collection of example configuration files for many of the curator's modules.
+- [docs](docs/) - Walkthroughs and motivations for each of the modules.
+- [examples](examples/) - Example scripts for how users may want to compose the curator.
+- [nemo_curator](nemo_curator/) - The main home for all of NeMo Curator's Python APIs.
+  - [modules](nemo_curator/modules) - Classes for the modules.
+  - [filters](nemo_curator/filters) - Classes for the filters.
+  - [utils](nemo_curator/utils) - Common utilities for file/network operations.
+- [tests](tests/) - Unit tests for each module.
+
+## Examples and Documentation
+Examples provide an easy way for users to see how the curator works in action.
+There should be at least one example per module in the curator.
+They should be lightweight and rely on the core `nemo_curator` modules for their functionality.
+Most should be designed for a user to get up and running on their local machines, but distributed examples are welcome where they make sense.
+Python scripts should be the primary way to showcase your module.
+However, SLURM scripts or other cluster scripts should be included if special steps are needed to run the module.
+
+The documentation should complement each example by going through the motivation behind why a user would use each module.
+It should include both an explanation of the module and how it is used in its corresponding example.
+The documentation should also cover potential pitfalls and performance considerations when running the module at scale.
+The existing examples and documentation should serve as a good reference for what is expected.
+
+## Python style
+We use ``black`` as our code formatter. To fix your formatting, run `pip install pre-commit && pre-commit install && pre-commit run --all`.
+
+1. Include docstrings for every class and method exposed to the user.
+1. Avoid wildcard imports: do not use ``from X import *`` unless ``__all__`` is defined in ``X.py``.
+1. Minimize the use of ``**kwargs``.
+1. Raising an error is preferred to ``assert``. Write: ```if X: raise Error``` instead of ```assert X```.
+1. Classes are preferred to standalone methods.
+1. Methods should be atomic. A method shouldn't be longer than 75 lines, i.e. it should fit on the screen without scrolling.
+1. If a method has arguments that don't fit into one line, each argument should be on its own line for readability.
+1. Add ``__init__.py`` for every folder.
+1. F-strings are preferred to other string formatting.
+1. Loggers are preferred to print statements.
+1. Private functions (functions starting with ``_``) shouldn't be called outside their host file.
+1. If a comment spans multiple lines, use ``'''`` instead of ``#``.
+
+## Unit tests
+Unit tests should be simple and fast.
+Developers should be able to run them frequently while developing without any slowdown. +``` +pytest +# If you don't have NVIDIA GPU do: +# pytest --cpu +``` + +## Pull Requests (PR) Guidelines + +**Send your PRs to the `main` or `dev` branch** + +1) Make sure your PR does one thing. Have a clear answer to "What does this PR do?". +2) Read General Principles and style guide below +3) Make sure you sign your commits. E.g. use ``git commit -sS`` when committing. +4) Make sure all unittests finish successfully before sending PR ``pytest`` or (if your dev box does not have GPU) ``pytest --cpu`` from the root folder +5) Send your PR and request a review + +The `dev` branch is for active development and may be unstable. Unit tests are expected to pass before merging into `dev` or `main`. +Every release `dev` and `main` will sync to be the same. + +Full text of the DCO: + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +1 Letterman Drive +Suite D4700 +San Francisco, CA, 94129 + +Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or + +(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or + +(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. + +(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. +``` + +## Whom should you ask for review: + +Joseph Jennings (@jjennings) or Ryan Wolf (@rywolf) + +They may ask for other reviewers depending on the scope of the change. Your pull requests must pass all checks and peer-review before they can be merged. + + +Thank you for contributing to NeMo Curator! diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..60911ade
--- /dev/null
+++ b/README.md
@@ -0,0 +1,129 @@
+# NeMo Curator
+
+NeMo Curator is a Python library of scalable data-mining modules for curating natural language processing (NLP) data for training large language models (LLMs). The modules within NeMo Curator enable NLP researchers to mine high-quality text at scale from massive uncurated web corpora. For a demonstration of how each of the modules in NeMo Curator improves downstream performance, check out the [module ablation](#module-ablation-and-compute-performance) section.
+
+NeMo Curator is built on [Dask](https://www.dask.org/) and [RAPIDS](https://developer.nvidia.com/rapids) to scale data curation and provide GPU acceleration. The Python interface provides easy methods to expand the functionality of your curation pipeline without worrying about how it will scale. More information can be found in the [usage section](#usage). There are many ways to integrate NeMo Curator into your pipeline. Check out the [installation instructions](#installation) for how to get started using it.
+
+## Features
+We currently support the following data-curation modules. For more details on each module, visit its documentation page in the [NeMo framework user guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html).
+ - [Data download and text extraction](docs/user-guide/Download.rst)
+   - Default implementations of download and extraction for Common Crawl, Wikipedia, and ArXiv data
+   - Users can easily customize the download and extraction steps and extend them to other datasets
+ - [Language identification and separation](docs/user-guide/LanguageIdentificationUnicodeFormatting.rst)
+   - Language identification with [fastText](https://fasttext.cc/docs/en/language-identification.html) and [pycld2](https://pypi.org/project/pycld2/)
+ - [Text reformatting and cleaning](docs/user-guide/LanguageIdentificationUnicodeFormatting.rst)
+   - Fix unicode decoding errors via [ftfy](https://ftfy.readthedocs.io/en/latest/)
+ - [Quality filtering](docs/user-guide/QualityFiltering.rst)
+   - Multilingual heuristic-based filtering
+   - Classifier-based filtering via [fastText](https://fasttext.cc/)
+ - [Document-level deduplication](docs/user-guide/GpuDeduplication.rst)
+   - Both exact and fuzzy deduplication are accelerated using cuDF and Dask.
+   - For fuzzy deduplication, our implementation follows the method described in [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990).
+ - [Multilingual downstream-task decontamination](docs/user-guide/TaskDecontamination.rst)
+   - Our implementation follows the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)
+ - [Distributed data classification](docs/user-guide/DistributedDataClassification.rst)
+   - Multi-node, multi-GPU classifier inference
+   - Allows for sophisticated domain and quality classification
+   - Flexible interface for extending to your own classifier network
+ - [Personally identifiable information (PII) redaction](docs/user-guide/PersonalIdentifiableInformationIdentificationAndRemoval.rst)
+   - Identification tools for removing addresses, credit card numbers, social security numbers, and more.
+
+These modules are designed to be flexible and allow for reordering with few exceptions. The [NeMo Framework Launcher](https://github.com/NVIDIA/NeMo-Megatron-Launcher) includes prebuilt pipelines for you to start with and modify as needed.
+
+## Learn More
+- [Documentation](docs/)
+- [Examples](examples/)
+- [Module Ablation and Compute Performance](#module-ablation-and-compute-performance)
+
+## Installation
+
+NeMo Curator currently requires a GPU with CUDA 12 or above installed in order to be used.
+
+NeMo Curator can be installed manually by cloning the repository and installing as follows:
+```
+pip install --extra-index-url https://pypi.nvidia.com .
+```
+NeMo Curator is also available in the [NeMo Framework Container](https://registry.ngc.nvidia.com/orgs/ea-bignlp/teams/ga-participants/containers/nemofw-training), access to which can be requested [here](https://developer.nvidia.com/nemo-framework). It comes preinstalled in the container.
+
+## Usage
+
+### Python Library
+
+```Python
+from nemo_curator import Sequential, Modify, ScoreFilter, TaskDecontamination
+from nemo_curator.download import download_common_crawl
+from nemo_curator.filters import WordCountFilter, FastTextQualityFilter
+from nemo_curator.modifiers import UnicodeReformatter
+from nemo_curator.tasks import Winogrande, Squad, TriviaQA
+
+# Download your dataset
+dataset = download_common_crawl("/datasets/common_crawl/", "2021-04", "2021-10", url_limit=10)
+# Build your pipeline
+curation_pipeline = Sequential([
+  Modify(UnicodeReformatter()),
+  ScoreFilter(WordCountFilter(min_words=80)),
+  ScoreFilter(FastTextQualityFilter(model_path="model.bin")),
+  TaskDecontamination([Winogrande(), Squad(), TriviaQA()])
+])
+# Curate your dataset
+curated_dataset = curation_pipeline(dataset)
+```
+
+NeMo Curator provides a collection of robust Python modules that can be chained together to construct your entire data curation pipeline. These modules can be run on your local machine or in a distributed compute environment like SLURM with no modifications. NeMo Curator provides simple base classes that you can inherit from to create your own filters, document modifiers, and other extensions without needing to worry about how they scale (see the custom filter sketch at the end of this section). The [examples](examples/) directory contains scripts showcasing each of these modules. The data curation section of the [NeMo framework user guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/index.html) provides in-depth documentation on how each of the modules works. If you need more information on modifying NeMo Curator for your use case, the [implementation section](#implementation) provides a good starting point.
+
+### Scripts
+
+We also provide CLI scripts in case those are more convenient. The scripts under `nemo_curator/scripts` map closely to each of the Python modules. Visit the [documentation](docs) for each of the Python modules for more information about the scripts associated with it.
+
+
+### NeMo Framework Launcher
+The [NeMo Framework Launcher](https://github.com/NVIDIA/NeMo-Megatron-Launcher) is another way to interface with NeMo Curator. The launcher allows
+for easy parameter and cluster configuration and will automatically generate the SLURM batch scripts that wrap around the CLI scripts required to run your pipeline.
+Note: This is not the only way to run NeMo Curator on SLURM. There are example scripts in [`examples/slurm`](examples/slurm/) for running NeMo Curator on SLURM without the launcher.
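+
+### Custom Filters
+
+To illustrate the base classes mentioned above, here is a minimal sketch of a user-defined filter. It is illustrative rather than an official example: it assumes the `DocumentFilter` base class with `score_document`/`keep_document` hooks used by the built-in filters, and the `DigitRatioFilter` name and its threshold are hypothetical.
+
+```Python
+from nemo_curator import ScoreFilter
+from nemo_curator.filters import DocumentFilter
+
+
+class DigitRatioFilter(DocumentFilter):
+    """Hypothetical filter that drops documents dominated by digits."""
+
+    def __init__(self, max_digit_ratio=0.3):
+        super().__init__()
+        self._max_digit_ratio = max_digit_ratio
+
+    def score_document(self, text):
+        # Score each document by the fraction of its characters that are digits
+        return sum(char.isdigit() for char in text) / max(len(text), 1)
+
+    def keep_document(self, score):
+        # Keep only documents below the digit-ratio threshold
+        return score <= self._max_digit_ratio
+
+
+# Apply it like any built-in filter, keeping the score in a "digit_ratio" field
+filter_step = ScoreFilter(DigitRatioFilter(), score_field="digit_ratio")
+less_noisy_dataset = filter_step(dataset)
+```
+
+Because the filter only defines per-document scoring and keeping logic, the same code runs unchanged on a local machine or across a distributed cluster.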
+
+## Module Ablation and Compute Performance
+
+The modules within NeMo Curator were in large part designed to curate high-quality documents from Common Crawl snapshots and to be able to do so
+in a scalable manner. In order to assess the quality of the Common Crawl documents curated by the modules in NeMo Curator, we performed a series
+of ablation experiments in which we trained a 357M-parameter GPT-style model on the datasets resulting from the different stages of our data curation
+pipeline implemented in NeMo Curator. The figure below demonstrates that the different data curation modules implemented within NeMo Curator
+lead to improved model zero-shot downstream task performance.
+
+<!-- Figure: zero-shot downstream task performance across the stages of the NeMo Curator pipeline (centered image, alt text "drawing", omitted from this excerpt). -->
+
+In terms of scalability and compute performance, using the RAPIDS + Dask fuzzy deduplication, we are able to deduplicate the 1.1 Trillion token Red Pajama dataset in 1.8 hours using 64 A100s.
+
+Additionally, using the CPU-based modules, the table below shows the time required and resulting data size reduction of each step of processing the [Common Crawl snapshot from November/December of 2020](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)):
+
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2">Dataset</th>
+      <th colspan="2">Download and text extraction</th>
+      <th colspan="2">Text cleaning</th>
+      <th colspan="2">Quality filtering</th>
+    </tr>
+    <tr>
+      <th>Time</th>
+      <th>Output Size</th>
+      <th>Time</th>
+      <th>Output Size</th>
+      <th>Time</th>
+      <th>Output Size</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Common Crawl 2020-50</td>
+      <td>36 hrs</td>
+      <td>2.8 TB</td>
+      <td>1 hr</td>
+      <td>2.8 TB</td>
+      <td>0.2 hr</td>
+      <td>0.52 TB</td>
+    </tr>
+  </tbody>
+</table>
+ +## Implementation + +As mentioned above, the modules within NeMo Curator enable users to scale data-mining and NLP processing tasks to many nodes within a compute cluster. +The modules accomplish this using [Dask](https://www.dask.org/) with [cuDF](https://docs.rapids.ai/api/cudf/nightly/user_guide/10min/) (for the GPU-accelerated modules). +At the core of the NeMo Curator, `DocumentDataset` (the main dataset class) is just a simple wrapper around a Dask dataframe. Dask allows NeMo Curator to scale to arbitrary cluster sizes, and it supports a variety of distributed computing platforms. It supports reading and writing to different file formats, and it can balance these operations among nodes in the cluster. Importantly, Dask also supports the RAPIDS cuDF library for GPU-acclerated exact and fuzzy deduplication. \ No newline at end of file diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..2be787ab --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,24 @@ +# Security + +NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. + +If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.** + +## Reporting Potential Security Vulnerability in an NVIDIA Product + +To report a potential security vulnerability in any NVIDIA product: +- Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) +- E-Mail: psirt@nvidia.com + - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) + - Please include the following information: + - Product/Driver name and version/branch that contains the vulnerability + - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) + - Instructions to reproduce the vulnerability + - Proof-of-concept or exploit code + - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability + +While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 
+ +## NVIDIA Product Security + +For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security \ No newline at end of file diff --git a/config/arxiv_builder.yaml b/config/arxiv_builder.yaml new file mode 100644 index 00000000..007566d6 --- /dev/null +++ b/config/arxiv_builder.yaml @@ -0,0 +1,11 @@ +download_module: nemo_curator.download.arxiv.ArxivDownloader +download_params: {} +iterator_module: nemo_curator.download.arxiv.ArxivIterator +iterator_params: + log_frequency: 1000 +extract_module: nemo_curator.download.arxiv.ArxivExtractor +extract_params: {} +format: + text: str + id: str + source_id: str \ No newline at end of file diff --git a/config/cc_warc_builder.yaml b/config/cc_warc_builder.yaml new file mode 100644 index 00000000..3e6a8ed9 --- /dev/null +++ b/config/cc_warc_builder.yaml @@ -0,0 +1,12 @@ +download_module: nemo_curator.download.commoncrawl.CommonCrawlWARCDownloader +download_params: {} +iterator_module: nemo_curator.download.commoncrawl.CommonCrawlWARCIterator +iterator_params: {} +extract_module: nemo_curator.download.commoncrawl.CommonCrawlWARCExtractor +extract_params: {} +format: + text: str + language: str + url: str + warc_id: str + source_id: str \ No newline at end of file diff --git a/config/fasttext_langid.yaml b/config/fasttext_langid.yaml new file mode 100644 index 00000000..86b18761 --- /dev/null +++ b/config/fasttext_langid.yaml @@ -0,0 +1,5 @@ +input_field: text +filters: + - name: nemo_curator.filters.classifier_filter.FastTextLangId + params: + model_path: diff --git a/config/fasttext_quality_filter.yaml b/config/fasttext_quality_filter.yaml new file mode 100644 index 00000000..05054781 --- /dev/null +++ b/config/fasttext_quality_filter.yaml @@ -0,0 +1,12 @@ +input_field: text +filters: + - name: nemo_curator.filters.classifier_filter.FastTextQualityFilter + params: + # FastText Model file + model_path: + # Pareto sampling parameter + # (Higher alpha values will allow fewer low-quality documents + # to pass through) + alpha: 3 + # The label used for high-quality documents + label: "__label__hq" diff --git a/config/heuristic_filter_code.yaml b/config/heuristic_filter_code.yaml new file mode 100644 index 00000000..7d6b36e4 --- /dev/null +++ b/config/heuristic_filter_code.yaml @@ -0,0 +1,20 @@ +input_field: text +filters: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter Python code data. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + # Change this based on the language of the data + # Code filter implementations are in nemo_curator/filter/code.py + - name: nemo_curator.filters.code.PythonCommentToCodeFilter + params: + min_comment_to_code_ratio: 0.001 + max_comment_to_code_ratio: 0.85 + - name: nemo_curator.filters.code.NumberOfLinesOfCodeFilter + params: + min_lines: 5 + max_lines: 20000 + - name: nemo_curator.filters.code.TokenizerFertilityFilter + params: + path_to_tokenizer: + min_char_to_token_ratio: 2 diff --git a/config/heuristic_filter_en.yaml b/config/heuristic_filter_en.yaml new file mode 100644 index 00000000..4e3bbb79 --- /dev/null +++ b/config/heuristic_filter_en.yaml @@ -0,0 +1,105 @@ +input_field: text +filters: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. 
+ # This particular cascade of filters is intended to filter English language data. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + - name: nemo_curator.filters.heuristic_filter.NonAlphaNumericFilter + params: + max_non_alpha_numeric_to_text_ratio: 0.25 + - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter + params: + max_symbol_to_word_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.NumbersFilter + params: + max_number_to_text_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.UrlsFilter + params: + max_url_to_text_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter + params: + max_white_space_ratio: 0.25 + - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter + params: + max_parentheses_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter + params: + max_repeated_line_fraction: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter + params: + max_repeated_paragraphs_ratio: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.WordCountFilter + params: + min_words: 50 + max_words: 100000 + - name: nemo_curator.filters.heuristic_filter.PunctuationFilter + params: + max_num_sentences_without_endmark_ratio: 0.85 + - name: nemo_curator.filters.heuristic_filter.WordsWithoutAlphabetsFilter + params: + min_words_with_alphabets: 0.8 + - name: nemo_curator.filters.heuristic_filter.CommonEnglishWordsFilter + params: + min_num_common_words: 2 + stop_at_false: True + - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter + params: + max_mean_word_length: 10 + min_mean_word_length: 3 + - name: nemo_curator.filters.heuristic_filter.LongWordFilter + params: + max_word_length: 1000 + - name: nemo_curator.filters.heuristic_filter.EllipsisFilter + params: + max_num_lines_ending_with_ellipsis_ratio: 0.3 + # Top N-Gram filters for N-grams 2, 3, and 4 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 2 + max_repeating_ngram_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 3 + max_repeating_ngram_ratio: 0.18 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 4 + max_repeating_ngram_ratio: 0.16 + # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 5 + max_repeating_duplicate_ngram_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 6 + max_repeating_duplicate_ngram_ratio: 0.14 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 7 + max_repeating_duplicate_ngram_ratio: 0.13 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 8 + max_repeating_duplicate_ngram_ratio: 0.12 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 9 + 
max_repeating_duplicate_ngram_ratio: 0.11 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 10 + max_repeating_duplicate_ngram_ratio: 0.10 + - name: nemo_curator.filters.heuristic_filter.BulletsFilter + params: + max_bullet_lines_ratio: 0.9 \ No newline at end of file diff --git a/config/heuristic_filter_non-en.yaml b/config/heuristic_filter_non-en.yaml new file mode 100644 index 00000000..783d0e54 --- /dev/null +++ b/config/heuristic_filter_non-en.yaml @@ -0,0 +1,97 @@ +input_field: text +filters: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter generic non-English data that use spaces for separating words. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter + params: + max_symbol_to_word_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.NumbersFilter + params: + max_number_to_text_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.UrlsFilter + params: + max_url_to_text_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter + params: + max_white_space_ratio: 0.25 + - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter + params: + max_parentheses_ratio: 0.1 + - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter + params: + max_repeated_line_fraction: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter + params: + max_repeated_paragraphs_ratio: 0.7 + - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: nemo_curator.filters.heuristic_filter.WordCountFilter + params: + min_words: 50 + max_words: 100000 + # NOTE: This filter tends to remove many documents and will need to + # be tuned per language + - name: nemo_curator.filters.heuristic_filter.PunctuationFilter + params: + max_num_sentences_without_endmark_ratio: 0.85 + - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter + params: + max_mean_word_length: 10 + min_mean_word_length: 3 + - name: nemo_curator.filters.heuristic_filter.LongWordFilter + params: + max_word_length: 1000 + - name: nemo_curator.filters.heuristic_filter.EllipsisFilter + params: + max_num_lines_ending_with_ellipsis_ratio: 0.3 + # Top N-Gram filters for N-grams 2, 3, and 4 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 2 + max_repeating_ngram_ratio: 0.2 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 3 + max_repeating_ngram_ratio: 0.18 + - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter + params: + n: 4 + max_repeating_ngram_ratio: 0.16 + # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 5 + max_repeating_duplicate_ngram_ratio: 0.15 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 6 + 
max_repeating_duplicate_ngram_ratio: 0.14 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 7 + max_repeating_duplicate_ngram_ratio: 0.13 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 8 + max_repeating_duplicate_ngram_ratio: 0.12 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 9 + max_repeating_duplicate_ngram_ratio: 0.11 + - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter + params: + n: 10 + max_repeating_duplicate_ngram_ratio: 0.10 + - name: nemo_curator.filters.heuristic_filter.BulletsFilter + params: + max_bullet_lines_ratio: 0.9 \ No newline at end of file diff --git a/config/lm_tasks.yaml b/config/lm_tasks.yaml new file mode 100644 index 00000000..3d38ec6f --- /dev/null +++ b/config/lm_tasks.yaml @@ -0,0 +1,48 @@ +tasks: + # The Python modules below define language model downstream evaluation + # task data. If one of the below tasks is specified, N-grams will + # be constructed from the documents that make up the task data + # using the script prepare_task_data. + # find_matching_ngrams will then search for these N-grams + # in the training documents, and remove_matching_ngrams will + # split the documents based on matches + - name: nemo_curator.tasks.Winogrande + params: {} + - name: nemo_curator.tasks.Squad + params: {} + - name: nemo_curator.tasks.TriviaQA + params: {} + - name: nemo_curator.tasks.Quac + params: {} + - name: nemo_curator.tasks.WebQA + params: {} + - name: nemo_curator.tasks.Race + params: {} + - name: nemo_curator.tasks.Drop + params: {} + - name: nemo_curator.tasks.WiC + params: {} + - name: nemo_curator.tasks.PIQA + params: {} + - name: nemo_curator.tasks.ArcEasy + params: {} + - name: nemo_curator.tasks.ArcChallenge + params: {} + - name: nemo_curator.tasks.OpenBookQA + params: {} + - name: nemo_curator.tasks.BoolQ + params: {} + - name: nemo_curator.tasks.Copa + params: {} + - name: nemo_curator.tasks.RTE + params: {} + - name: nemo_curator.tasks.MultiRC + params: {} + - name: nemo_curator.tasks.WSC + params: {} + - name: nemo_curator.tasks.CB + params: {} + - name: nemo_curator.tasks.ANLI + params: {} + - name: nemo_curator.tasks.Record + params: {} diff --git a/config/pii_config.yaml b/config/pii_config.yaml new file mode 100644 index 00000000..725fde30 --- /dev/null +++ b/config/pii_config.yaml @@ -0,0 +1,16 @@ +pii_config: + language: 'en' + supported_entities: + - PERSON + - ADDRESS + anonymize: + #type: 'replace' + #new_value: ABC + action: 'mask' + chars_to_mask: 40 + masking_char: '*' + + #type: 'hash' + #hash_type: 'sha256' + + #type: 'redact' \ No newline at end of file diff --git a/config/wikipedia_builder.yaml b/config/wikipedia_builder.yaml new file mode 100644 index 00000000..47831537 --- /dev/null +++ b/config/wikipedia_builder.yaml @@ -0,0 +1,15 @@ +download_module: nemo_curator.download.wikipedia.WikipediaDownloader +download_params: {} +iterator_module: nemo_curator.download.wikipedia.WikipediaIterator +iterator_params: + language: 'en' +extract_module: nemo_curator.download.wikipedia.WikipediaExtractor +extract_params: + language: 'en' +format: + text: str + title: str + id: str + url: str + language: str + source_id: str \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..06edad0f --- /dev/null +++ b/docs/README.md @@ -0,0 +1,3 @@ +# Documentation +## User Guide +The NeMo User Guide is the recommended way to view the 
documentation for each of NeMo Curator's modules.
diff --git a/docs/user-guide/CPUvsGPU.rst b/docs/user-guide/CPUvsGPU.rst
new file mode 100644
index 00000000..c3159b21
--- /dev/null
+++ b/docs/user-guide/CPUvsGPU.rst
@@ -0,0 +1,98 @@
+======================================
+CPU and GPU Modules with Dask
+======================================
+
+NeMo Curator provides GPU-accelerated modules alongside its CPU modules.
+These modules are based on RAPIDS to enable scaling workflows to massive dataset sizes.
+The remaining modules are CPU based and rely on Dask to scale to multi-node clusters.
+When working with these different modules, it's important to understand how to properly set up your Dask cluster and how to manage where your dataset is stored in memory.
+
+-----------------------------------------
+Initializing the Dask Cluster
+-----------------------------------------
+
+NeMo Curator provides a simple function ``get_client`` that can be used to start a local Dask cluster or connect to an existing one.
+All of the ``examples/`` use it to set up a Dask cluster.
+
+.. code-block:: python
+
+    import argparse
+
+    from nemo_curator.utils.distributed_utils import get_client
+    from nemo_curator.utils.script_utils import add_distributed_args
+
+
+    def main(args):
+        # Set up the Dask client
+        client = get_client(args, args.device)
+
+        # Perform some computation...
+
+
+    def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)):
+        return add_distributed_args(parser)
+
+
+    if __name__ == "__main__":
+        main(attach_args().parse_args())
+
+In this short example, you can see that ``get_client`` takes an argparse object with options configured in the corresponding ``add_distributed_args``.
+
+* ``--device`` controls what type of Dask cluster to create. "cpu" will create a CPU-based local Dask cluster, while "gpu" will create a GPU-based local cluster.
+  If "cpu" is specified, the number of processes started with the cluster can be specified with the ``--n-workers`` argument.
+  By default, this argument is set to ``os.cpu_count()``.
+  If "gpu" is specified, one worker is started per GPU.
+  It is possible to run entirely CPU-based workflows on a GPU cluster, though the process count (and therefore the number of parallel tasks) will be limited by the number of GPUs on your machine.
+
+* ``--scheduler-address`` and ``--scheduler-file`` are used for connecting to an existing Dask cluster.
+  Supplying one of these is essential if you are running a Dask cluster on SLURM or Kubernetes.
+  All other arguments are ignored if either of these is passed, as the cluster configuration is done when you create the scheduler and workers on your cluster.
+
+-----------------------------------------
+CPU Modules
+-----------------------------------------
+
+As mentioned in the ``DocumentDataset`` documentation, the underlying storage format for datasets in NeMo Curator is just a Dask dataframe.
+For the CPU modules, Dask uses pandas dataframes to hold dataframe partitions.
+Most modules in NeMo Curator are CPU based.
+Therefore, the default behavior for reading and writing datasets is to operate on them in CPU memory with a pandas backend.
+The following two function calls are equivalent.
+
+.. code-block:: python
+
+    books = DocumentDataset.read_json(files, add_filename=True)
+    books = DocumentDataset.read_json(files, add_filename=True, backend="pandas")
+
+-----------------------------------------
+GPU Modules
+-----------------------------------------
+
+The following NeMo Curator modules are GPU based.
+
+* Exact Deduplication
+* Fuzzy Deduplication
+* Distributed Data Classification
+
+  * Domain Classification
+  * Quality Classification
+
+GPU modules store the ``DocumentDataset`` using a ``cudf`` backend instead of a ``pandas`` one.
+To read a dataset into GPU memory, one could use the following function call.
+
+.. code-block:: python
+
+    gpu_books = DocumentDataset.read_json(files, add_filename=True, backend="cudf")
+
+Even if you start a GPU Dask cluster, you can't operate on datasets that use a ``pandas`` backend.
+The ``DocumentDataset`` must either have been originally read in with a ``cudf`` backend, or it must be transferred during the script.
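+
+For example, a dataset that was read onto the CPU can be moved to the GPU before running the GPU modules and moved back afterwards.
+The snippet below is a sketch rather than an official API: it assumes the underlying Dask dataframe is reachable through the ``df`` attribute (as described in the ``DocumentDataset`` documentation), that a ``DocumentDataset`` can be constructed directly from a Dask dataframe, and that your installed Dask version supports ``to_backend``.
+
+.. code-block:: python
+
+    from nemo_curator.datasets import DocumentDataset
+
+    # Read on the CPU with the default pandas backend
+    books = DocumentDataset.read_json(files, add_filename=True)
+
+    # Move the underlying Dask dataframe to GPU memory (cudf) for the GPU modules
+    gpu_books = DocumentDataset(books.df.to_backend("cudf"))
+
+    # ...run exact/fuzzy deduplication or distributed classification here...
+
+    # Move the results back to host memory (pandas) for the CPU modules
+    cpu_books = DocumentDataset(gpu_books.df.to_backend("pandas"))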
+
+-----------------------------------------
+Dask with SLURM
+-----------------------------------------
+
+We provide an example SLURM script pipeline in ``examples/slurm``.
+This pipeline has a script ``start-slurm.sh`` that provides configuration options similar to what ``get_client`` provides.
+Every SLURM cluster is different, so make sure you understand how your SLURM cluster works so the scripts can be easily adapted.
+``start-slurm.sh`` calls ``container-entrypoint.sh``, which sets up a Dask scheduler and workers across the cluster.
+
+Our Python examples are designed to work such that they can be run locally on their own, or easily substituted into ``start-slurm.sh`` to run on multiple nodes.
+You can easily adapt your own scripts as well by following the pattern of adding ``get_client`` with ``add_distributed_args``.
\ No newline at end of file
diff --git a/docs/user-guide/DataCuration.rsts b/docs/user-guide/DataCuration.rsts
new file mode 100644
index 00000000..55083b44
--- /dev/null
+++ b/docs/user-guide/DataCuration.rsts
@@ -0,0 +1,2 @@
+Data Curation
+!!!!!!!!!!!!!
diff --git a/docs/user-guide/DistributedDataClassification.rst b/docs/user-guide/DistributedDataClassification.rst
new file mode 100644
index 00000000..b7a99a20
--- /dev/null
+++ b/docs/user-guide/DistributedDataClassification.rst
@@ -0,0 +1,71 @@
+============================================
+Distributed Data Classification
+============================================
+
+-----------------------------------------
+Background
+-----------------------------------------
+
+When preparing text data for training a large language model (LLM), it is useful to classify
+text documents in various ways. Doing so enhances the LLM's performance by enabling it to produce more
+contextually appropriate and accurate language across various subjects. NeMo Curator provides this module to
+help a user run inference with pre-trained models on large amounts of text documents. We achieve
+this by chunking the datasets across multiple computing nodes, each equipped with multiple GPUs, to
+accelerate the classification task in a distributed way. In other words, because the classification of
+a single text document is independent of other documents within a dataset, we can distribute the
+workload across multiple nodes and multiple GPUs to perform parallel processing.
+
+Domain classification and quality classification are two tasks we include as examples within our module.
+Here, we summarize why each is useful for training an LLM.
+
+Domain classification is useful because it helps the LLM understand the context and specific domain of
+the input text. Because different domains have different linguistic characteristics and terminologies,
+an LLM's ability to generate contextually relevant responses can be improved by tailoring training data
+to a specific domain. Overall, this helps provide more accurate and specialized information.
+
+Quality classification is useful for filtering out noisy or low-quality data. This allows the model to
+focus on learning from high-quality and informative examples, which contributes to the LLM's robustness
+and enhances its ability to generate reliable and meaningful outputs. Additionally, quality
+classification helps mitigate biases and inaccuracies that may arise from poorly curated training data.
+
+-----------------------------------------
+Usage
+-----------------------------------------
+
+NeMo Curator provides a base class ``DistributedDataClassifier`` that can be extended to fit your specific model.
+The only requirement is that the model can fit on a single GPU.
+We have also provided two subclasses that focus on domain and quality classification.
+Let's see how ``DomainClassifier`` works in a small excerpt taken from ``examples/distributed_data_classification_examples/domain_api_example.py``:
+
+.. code-block:: python
+
+    labels = [
+        "Adult",
+        "Arts_and_Entertainment",
+        "Autos_and_Vehicles",
+        ...,
+        "Shopping",
+        "Sports",
+        "Travel_and_Transportation",
+    ]
+
+    model_file_name = "pytorch_model_file.pth"
+
+    files = get_all_files_paths_under("books_dataset/")
+    input_dataset = DocumentDataset.read_json(files, backend="cudf", add_filename=True)
+
+    domain_classifier = DomainClassifier(
+        model_file_name=model_file_name,
+        labels=labels,
+        filter_by=["Games", "Sports"],
+    )
+    result_dataset = domain_classifier(dataset=input_dataset)
+
+    result_dataset.to_json("games_and_sports/", write_to_filename=True)
+
+This module functions very similarly to the ``ScoreFilter`` module.
+The key difference is that it operates on the GPU instead of the CPU.
+Therefore, the Dask cluster must be started as a GPU one.
+Additionally, ``DomainClassifier`` requires the ``DocumentDataset`` to be on the GPU (i.e., have ``backend="cudf"``).
+It is easy to extend ``DistributedDataClassifier`` to your own model.
+Check out ``nemo_curator.modules.distributed_data_classifier.py`` for reference.
\ No newline at end of file
diff --git a/docs/user-guide/DocumentDataset.rst b/docs/user-guide/DocumentDataset.rst
new file mode 100644
index 00000000..8711227a
--- /dev/null
+++ b/docs/user-guide/DocumentDataset.rst
@@ -0,0 +1,139 @@
+======================================
+Working with DocumentDataset
+======================================
+-----------------------------------------
+Background
+-----------------------------------------
+Text datasets store metadata along with the core text/document.
+``jsonl`` files are common because they are easy to process and inspect.
+``parquet`` files are also a common format.
+In both cases, a single dataset is often represented with multiple underlying files (called shards).
+For example, if you have a large dataset named "books", it is likely you will store it in shards with each shard being named something like ``books_00.jsonl``, ``books_01.jsonl``, ``books_02.jsonl``, etc.
+
+How you store your dataset in memory is just as important as how you store it on disk.
+If you have a large dataset that is too big to fit directly into memory, you will have to somehow distribute it across multiple machines/nodes.
+Furthermore, if curating your dataset takes a long time, it is likely to get interrupted due to some unforeseen failure or another.
+NeMo Curator's ``DocumentDataset`` employs `Dask's distributed dataframes `_ to manage large datasets across multiple nodes and allow for easy restarting of interrupted curation.
+``DocumentDataset`` supports reading and writing sharded ``jsonl`` and ``parquet`` files, both on local disk and directly from remote sources like S3.
+
+-----------------------------------------
+Usage
+-----------------------------------------
+############################
+Reading and Writing
+############################
+``DocumentDataset`` is the standard format for text datasets in NeMo Curator.
+Imagine we have a "books" dataset stored in the following structure:
+::
+
+    books_dataset/
+        books_00.jsonl
+        books_01.jsonl
+        books_02.jsonl
+
+You could read, filter the dataset, and write it using the following methods:
+
+.. code-block:: python

+    import nemo_curator as nc
+    from nemo_curator.datasets import DocumentDataset
+    from nemo_curator.utils.file_utils import get_all_files_paths_under
+    from nemo_curator.filters import WordCountFilter
+
+    files = get_all_files_paths_under("books_dataset/")
+    books = DocumentDataset.read_json(files, add_filename=True)
+
+    filter_step = nc.ScoreFilter(
+        WordCountFilter(min_words=80),
+        text_field="text",
+        score_field="word_count",
+    )
+
+    long_books = filter_step(books)
+
+    long_books.to_json("long_books/", write_to_filename=True)
+
+Let's walk through this code line by line.
+
+* ``files = get_all_files_paths_under("books_dataset/")`` This retrieves a list of all files in the given directory.
+  In our case, this is equivalent to writing
+
+  .. code-block:: python
+
+    files = ["books_dataset/books_00.jsonl",
+             "books_dataset/books_01.jsonl",
+             "books_dataset/books_02.jsonl"]
+
+* ``books = DocumentDataset.read_json(files, add_filename=True)`` This will read the listed files into memory.
+  The ``add_filename=True`` option preserves the name of the shard (``books_00.jsonl``, ``books_01.jsonl``, etc.) as an additional ``filename`` field.
+  When the dataset is written back to disk, this option (in conjunction with the ``write_to_filename`` option) ensures that documents stay in their original shard.
+  This can be useful for manually inspecting the results of filtering shard by shard.
+* ``filter_step = ...`` This constructs and applies a heuristic filter for the length of the document.
+  More information is provided in the filtering page of the documentation.
+* ``long_books.to_json("long_books/", write_to_filename=True)`` This writes the filtered dataset to a new directory.
+  As mentioned above, the ``write_to_filename=True`` option preserves the sharding of the dataset.
+  If the dataset was not read in with ``add_filename=True``, setting ``write_to_filename=True`` will throw an error.
+
+``DocumentDataset`` is just a wrapper around a `Dask dataframe `_.
+The underlying dataframe can be accessed with the ``DocumentDataset.df`` member variable.
+It is important to understand how Dask handles computation.
+To quote from their `documentation `_:
+
+  Dask is lazily evaluated. The result from a computation isn't computed until you ask for it. Instead, a Dask task graph for the computation is produced.
+
+Because of this, the call to ``DocumentDataset.read_json`` will not execute immediately.
+Instead, tasks that read each shard of the dataset will be placed on the task graph. +The task graph is only executed when a call to ``DocumentDataset.df.compute()`` is made, or some operation that depends on ``DocumentDataset.df`` calls ``.compute()``. +This allows us to avoid reading massive datasets into memory. +In our case, ``long_books.to_json()`` internally calls ``.compute()``, so the task graph will be executed then. + +############################ +Resuming from Interruptions +############################ +It can be helpful to track which documents in a dataset have already been processed so that long curation jobs can be resumed if they are interrupted. +NeMo Curator provides a utility for easily tracking which dataset shards have already been processed. +Consider a modified version of the code above: + +.. code-block:: python + + from nemo_curator.utils.file_utils import get_remaining_files + + files = get_remaining_files("books_dataset/", "long_books/", "jsonl") + books = DocumentDataset.read_json(files, add_filename=True) + + filter_step = nc.ScoreFilter( + WordCountFilter(min_words=80), + text_field="text", + score_field="word_count", + ) + + long_books = filter_step(books) + + long_books.to_json("long_books/", write_to_filename=True) + +``get_remaining_files`` compares the input directory (``"books_dataset/"``) and the output directory (``"long_books"``) and returns a list of all the shards in the input directory that have not yet been written to the output directory. + + + +While Dask provides an easy way to avoid reading too much data into memory, there are times when we may need to call ``persist()`` or a similar operation that forces the dataset into memory. +In these cases, we recommend processing the input dataset in batches using a simple wrapper function around ``get_remaining_files`` as shown below. + +.. code-block:: python + + from nemo_curator.utils.file_utils import get_batched_files + + for files in get_batched_files("books_dataset/", "long_books/", "jsonl", batch_size=64): + books = DocumentDataset.read_json(files, add_filename=True) + + filter_step = nc.ScoreFilter( + WordCountFilter(min_words=80), + text_field="text", + score_field="word_count", + ) + + long_books = filter_step(books) + + long_books.to_json("long_books/", write_to_filename=True) + +This will read in 64 shards at a time, process them, and write them back to disk. +Like ``get_remaining_files``, it only includes files that are in the input directory and not in the output directory. \ No newline at end of file diff --git a/docs/user-guide/Download.rst b/docs/user-guide/Download.rst new file mode 100644 index 00000000..66a34463 --- /dev/null +++ b/docs/user-guide/Download.rst @@ -0,0 +1,190 @@ + +.. _data-curator-download: + +====================================== +Downloading and Extracting Text +====================================== +----------------------------------------- +Background +----------------------------------------- +Publicly hosted text datasets are stored in various locations and formats. Downloading a massive public dataset is usually the first step in data curation, +and it can be cumbersome due to the dataset's massive size and hosting method. +Also, massive pretraining text datasets are rarely in a format that can be immediately operated on for further curation and training. 
For example, the Common Crawl stores its data in a compressed web archive format (:code:`.warc.gz`) for its raw crawl data, but formats
like :code:`.jsonl` are more common for data curation due to their ease of use.
However, extraction can be by far the most computationally expensive step of the data curation pipeline, so it can be beneficial to do some filtering prior to
the extraction step to limit the number of documents that undergo this heavy computation.

NeMo Curator provides example utilities for downloading and extracting Common Crawl, ArXiv, and Wikipedia data.
In addition, it provides a flexible interface to extend the utility to other datasets.
Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
and extracting the relevant text with jusText to output :code:`.jsonl` files.

-----------------------------------------
Usage
-----------------------------------------

``nemo_curator.download`` has a collection of functions for handling the download and extraction of online datasets.
By "download", we typically mean the transfer of data from a web-hosted data source to local file storage.
By "extraction", we typically mean the process of converting a data format from its raw form (e.g., ``.warc.gz``) to a standardized format (e.g., ``.jsonl``) and discarding irrelevant data.

* ``download_common_crawl`` will download and extract the compressed web archive files of Common Crawl snapshots to a target directory.
  Common Crawl has an S3 bucket and a direct HTTPS endpoint. If you want to use the S3 bucket, ensure you have properly set up your credentials with `s5cmd `_.
  Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:

  .. code-block:: python

    from nemo_curator.download import download_common_crawl

    common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")

  * ``"/extracted/output/folder"`` is the path on your local filesystem where the final extracted files will be placed.
  * ``"2020-50"`` is the first Common Crawl snapshot that will be included in the download.
  * ``"2021-04"`` is the last Common Crawl snapshot that will be included in the download.
  * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.

  The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.

  NeMo Curator's Common Crawl extraction process looks like this under the hood:

  1. Decode the HTML within the record from binary to text.
  2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
  3. Finally, extract the relevant text with `jusText `_ from the HTML and write it out as a single string within the ``text`` field of a JSON entry within a ``.jsonl`` file.
* ``download_wikipedia`` will download and extract the latest Wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per IP address.

  ..
code-block:: python + + from nemo_curator.download import download_wikipedia + + wikipedia = download_wikipedia("/extracted/output/folder", dump_date="20240201") + + * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed. + * ``dump_date="20240201"`` fixes the Wikipedia dump to a specific date. If no date is specified, the latest dump is used. + +* ``download_arxiv`` will download and extract latex versions of ArXiv papers. They are hosted on S3, so ensure you have properly setup your credentials with `s5cmd `_. + + .. code-block:: python + + from nemo_curator.download import download_arxiv + + arxiv = download_arxiv("/extracted/output/folder") + + * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed. + + +All of these functions return a ``DocumentDataset`` of the underlying dataset and metadata that was obtained during extraction. If the dataset has been downloaded and extracted at the path passed to it, it will read from the files there instead of downloading and extracting them again. +Due to how massive each of these datasets are (with Common Crawl snapshots being on the order of hundreds of terrabytes) all of these datasets are sharded accross different files. +They all have a ``url_limit`` parameter that allows you to only download a small number of shards. + +----------------------------------------- +Related Scripts +----------------------------------------- +In addition to the Python module described above, NeMo Curator provides several CLI scripts that you may find useful for performing the same function. + +The :code:`download_and_extract` script within NeMo Curator is a generic tool that can be used to download and extract from a number of different +datasets. In general, it can be called as follows in order to download and extract text from the web + +.. code-block:: bash + + download_and_extract \ + --input-url-file= \ + --builder-config-file= \ + --output-json-dir= + +This utility takes as input a list of URLs that point to files that contain prepared, unextracted data (e.g., pre-crawled web pages from Common Crawl), a config file that describes how to download and extract the data, and the output directory to where the extracted text will be written in jsonl format (one json written to each document per line). For each URL provided in the list of URLs, a corresponding jsonl file will be written to the output directory. + +The config file that must be provided at runtime, should take the following form + +.. code-block:: yaml + + download_module: nemo_curator.download.mydataset.DatasetDownloader + download_params: {} + iterator_module: nemo_curator.download.mydataset.DatasetIterator + iterator_params: {} + extract_module: nemo_curator.download.mydataset.DatasetExtractor + extract_params: {} + +Each pair of lines corresponds to an implementation of the abstract DocumentDownloader, DocumentIterator and DocumentExtractor classes. In this case the dummy names of DatasetDownloader, DatasetIterator, and DatasetExtractor have been provided. For this example, each of these have been defined within the fictitious file :code:`nemo_curator/download/mydataset.py`. Already within NeMo Curator, we provide implementations of each of these classes for the Common Crawl, Wikipedia and ArXiv datasets. 
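
To make the config file above more concrete, here is a hedged sketch of what a minimal :code:`nemo_curator/download/mydataset.py` might contain. The import path and the method names (``download``, ``iterate``, ``extract``) are assumptions inferred from the abstract class names mentioned above; verify them against the base classes shipped with NeMo Curator before using this as a template.

.. code-block:: python

    import os
    import urllib.request

    # Assumed import location for the abstract base classes; check your
    # installed version of NeMo Curator for the exact module path.
    from nemo_curator.download.doc_builder import (
        DocumentDownloader,
        DocumentExtractor,
        DocumentIterator,
    )


    class DatasetDownloader(DocumentDownloader):
        """Downloads each URL to a local file and returns the local path."""

        def __init__(self, download_dir="./mydataset_raw"):
            super().__init__()
            self._download_dir = download_dir
            os.makedirs(download_dir, exist_ok=True)

        def download(self, url):
            output_file = os.path.join(self._download_dir, os.path.basename(url))
            if not os.path.exists(output_file):
                urllib.request.urlretrieve(url, output_file)
            return output_file


    class DatasetIterator(DocumentIterator):
        """Yields one raw record (metadata, content) per line of the downloaded file."""

        def iterate(self, file_path):
            with open(file_path, encoding="utf-8") as f:
                for i, line in enumerate(f):
                    record_id = f"{os.path.basename(file_path)}-{i:06d}"
                    yield {"id": record_id}, line


    class DatasetExtractor(DocumentExtractor):
        """Turns a raw record into the fields written to the output .jsonl."""

        def extract(self, content):
            text = content.strip()
            if not text:
                return None  # Skip empty records
            return {"text": text}

Once such classes exist, the config file simply points at their fully qualified names and the :code:`download_and_extract` utility wires them together.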
+ +############################### +Common Crawl Example +############################### + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Setup +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If you prefer, the download process can pull WARC files from S3 using `s5cmd `_. +This utility is preinstalled in the NeMo Framework Container, but you must have the necessary credentials within :code:`~/.aws/config` in order to use it. +If you would prefer to use this over `wget `_ instead, you may set :code:`aws=True` in the :code:`download_params` as follows + +.. code-block:: yaml + + download_module: nemo_curator.download.commoncrawl.CommonCrawlWARCDownloader + download_params: + aws: True + iterator_module: nemo_curator.download.commoncrawl.CommonCrawlWARCIterator + iterator_params: {} + extract_module: nemo_curator.download.commoncrawl.CommonCrawlWARCExtractor + extract_params: {} + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Downloading and Extracting Common Crawl +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +As described in the first section of this document, the first step towards using the :code:`download_and_extract` for Common Crawl will be to create a list of URLs that point to the location of the WARC files hosted by Common Crawl. +Within NeMo Curator, we provide the utility :code:`get_common_crawl_urls` to obtain these urls. This utility can be run as follows + +.. code-block:: bash + + get_common_crawl_urls \ + --cc-snapshot-index-file=./url_data/collinfo.json \ + --starting-snapshot="2020-50" \ + --ending-snapshot="2020-50" \ + --output-warc-url-file=./url_data/warc_urls_cc_2020_50.txt + +This script pulls the Common Crawl index from `https://index.commoncrawl.org` and stores the index to the file +specified by the argument :code:`--cc-snapshot-index-file`. It then retrieves all WARC urls between the +dates specified by the arguments :code:`--starting-snapshot` and :code:`--ending-snapshot`. +Finally, it writes all WARC urls to the text file :code:`--output-warc-urls`. This file is a simple text file +with the following format:: + + https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00000.warc.gz + https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00001.warc.gz + https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00002.warc.gz + https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00003.warc.gz + https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00004.warc.gz + ... + +For the CC-MAIN-2020-50 snapshot there are a total of 72,000 compressed WARC files each between 800 - 900 MB. + +Now with the prepared list of URLs, we can use the Common Crawl config included in the :code:`config` directory under the root directory of the repository. This config uses the download, data loader and extraction classes defined in the file :code:`nemo_curator/download/commoncrawl.py`. +With this config and the input list of URLs, the :code:`download_and_extract` utility can be used as follows for downloading and extracting text from Common Crawl + +.. 
code-block:: bash + + download_and_extract \ + --input-url-file=./url_data/warc_urls_cc_2020_50.txt \ + --builder-config-file=./config/cc_warc_builder.yaml \ + --output-json-dir=/datasets/CC-MAIN-2020-50/json + + +As the text is extracted from the WARC records, the prepared documents are written to the directory specified by :code:`--output-json-dir`. Here is an +example of a single line of an output `.jsonl` file extracted from a WARC record + +.. code-block:: json + + {"text": "커뮤니티\n\n어린이 요리 교실은 평소 조리와 제과 제빵에 관심이 있는 초등학생을 대상으로 나이프스킬, 한식, 중식, 양식, 제과, 제빵, 디저트, + 생활요리 등 요리 기초부터 시작해 다양한 요리에 대해 배우고, 경험할 수 있도록 구성되었다.\n\n요즘 부모들의 자녀 요리 교육에 대한 관심이 높아지고 + 있는데, 어린이 요리교실은 자녀들이 어디서 어떻게 요리를 처음 시작할지 막막하고 어려워 고민하는 이들을 위해 만들어졌다.\n\n그 뿐만 아니라 학생들이 + 식재료를 다루는 과정에서 손으로 만지고 느끼는 것이 감각을 자극하여 두뇌발달에 도움을 주며, 조리를 통해 자신의 감정을 자연스럽게 표현할 수 + 있고 이를 통해 정서적 안정을 얻을 수 있다. 또한, 다양한 사물을 만져 보면서 차이점을 구별하고 사물의 특징에 대해 인지할 수 있으므로 인지 능력 향상에 + 도움이 되며, 만지고 느끼고 비교하는 과정에서 감각 기능을 향상시킬 수 있다.\n\n방과 후 시간이 되지 않는 초등학생들을 위해 평일반 뿐만 아니라 주말반도 + 운영하고 있으며 두 분의 선생님들의 안전적인 지도하에 수업이 진행된다. 한국조리예술학원은 젊은 감각과 학생들과의 소통을 통해 자발적인 교육을 가르친다. + 자세한 학원 문의는 한국조리예술학원 홈페이지나 대표 전화, 카카오톡 플러스친구를 통해 가능하다.", "id": "a515a7b6-b6ec-4bed-998b-8be2f86f8eac", + "source_id": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2020-50/segments/1606141163411.0/warc/CC-MAIN-20201123153826-20201123183826-00000.warc.gz", + "url": "http://hanjowon.co.kr/web/home.php?mid=70&go=pds.list&pds_type=1&start=20&num=67&s_key1=&s_que=", "language": "KOREAN"} + +Once all records have been processed within a WARC file, it is by default deleted from disk. + diff --git a/docs/user-guide/GpuDeduplication.rst b/docs/user-guide/GpuDeduplication.rst new file mode 100644 index 00000000..d23e8ee7 --- /dev/null +++ b/docs/user-guide/GpuDeduplication.rst @@ -0,0 +1,83 @@ + +.. _data-curator-gpu-deduplication: + +####################################################### +GPU Accelerated Exact and Fuzzy Deduplication +####################################################### + +----------------------------------------- +Background +----------------------------------------- + +Training on randomly selected documents for many epochs can be sub-optimal to downstream performance for language models. +For more information on when this is harmful, please see `Muennighoff et al., 2023 `_ and `Tirumala et al., 2023 `_. +The exact and fuzzy document-level deduplication module in the NeMo Curator aims at reducing the occurence of duplicate and +near-duplicate documents in the dataset. Exact deduplication refers to removing identical (i.e., document strings are equal) +documents from the dataset, while fuzzy deduplication refers to removing near-identical (e.g., an excerpt of a document is used in another document) +documents from the dataset. + +Both functionalities are supported in NeMo Curator and accelerated using `RAPIDS `_. +Exact dedpulication works by hashing each document and only keeping one document per hash. +Fuzzy deduplication is more involved and follows the method outlined in `Microsoft Turing NLG 530B `_. + +----------------------------------------- +Usage +----------------------------------------- +As exact deduplication is a much less involved procedure and requires significantly less compute, +we typically will first run exact deduplication before fuzzy deduplication. Also, from our experience in +deduplicating Common Crawl snapshots, a significant portion of the duplicates are in fact exact duplicates. 
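
Conceptually, exact deduplication boils down to the hashing step described above. The following is only a minimal pandas sketch of that idea, not the accelerated NeMo Curator module, and the ``text`` column name is an assumption:

.. code-block:: python

    import hashlib

    import pandas as pd

    df = pd.DataFrame({"text": ["a document", "a document", "a different document"]})

    # Hash every document; identical strings produce identical digests.
    df["_hash"] = df["text"].map(lambda t: hashlib.md5(t.encode("utf-8")).hexdigest())

    # Keep only the first document seen for each hash value.
    deduplicated = df.drop_duplicates(subset="_hash").drop(columns="_hash")
    print(len(deduplicated))  # 2

The module in NeMo Curator applies the same idea at scale, with the hashing accelerated by RAPIDS as described above.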
+ +When removing near-duplicates within the corpus we perform fuzzy deduplication at the document level in order to remove documents that +have high Jaccard similarity. Our approach closely resembles the approach described in `Smith et al., 2020 `_. This +approach can essentially be split into two conceptual changes. The first stage involves computing MinHashes Signatures on +documents and then performing Locality Sensitive Hashing (LSH) to find candidate duplucates. Due to the approximate nature of the bucketing via MinHash + LSH +(`Leskovec et al., 2020 `_) we process each of the buckets to remove any potential false positives that may have been hashed into the buckets. + + + +Before running either of these modules, users should assign a unique document ID to each document in the corpus. +This can be accomplished using the :code:`add_id` module within the NeMo Curator: + +.. code-block:: bash + + add_id \ + --input-data-dir= \ + --log-dir=./log/add_id + +By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001". +If the dataset already has a unique ID this step can be skipped. + +**Note**: Fuzzy deduplication only works with numeric ID's or the specific ID format generated by the :code:`add_id` script. If the +dataset does not contain ID's in this format it's recommended to convert to an integer based ID or ID created by the :code:`add_id` script. + +Once a unique ID has been added to each document, users can proceed with exact and fuzzy deduplication which roughly require the following +steps (all scripts are included in the :code:`nemo_curator/scripts/` subdirectory): + +* Exact dedup + 1. Input: Data directories + 2. Output: _exact_duplicates.parquet. List of exact duplicates and the document hash. + +* Fuzzy Dedup + 1. Minhashes (Compute minhashes) + 1. Input: Data Directories + 2. Output: minhashes.parquet for each data dir. + 2. Buckets (Minhash Buckets/LSH) + 1. Input: Minhash directories + 2. Output: _buckets.parquet + 3. Map Buckets + 1. Input: Buckets.parquet + Data Dirs + 2. Output: anchor_docs_with_bk.parquet + 4. Jaccard Shuffle + 1. Input: anchor_docs_with_bk.parquet + Data Dirs + 2. Output: shuffled_docs.parquet + 5. Jaccard compute + 1. Input: Shuffled docs.parquet + 2. Output: jaccard_similarity_results.parquet + 6. Connected Components + 1. Input: jaccard_similarity_results.parquet + 2. Output: connected_components.parquet + +In addition to the scripts, there are examples in the `examples` directory that showcase using the python module +directly in your own code. It also has examples on how to remove documents from the corpus using the list of duplicate IDs generated from exact or fuzzy +deduplication. + diff --git a/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst b/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst new file mode 100644 index 00000000..ddd107bf --- /dev/null +++ b/docs/user-guide/LanguageIdentificationUnicodeFormatting.rst @@ -0,0 +1,93 @@ + +.. _data-curator-languageidentification: + +####################################################### +Language Identification and Unicode Fixing +####################################################### + +----------------------------------------- +Background +----------------------------------------- +Large unlabeled text corpora often contain a variety of languages. +However, data curation usually includes steps that are language specific (e.g. 
using language-tuned heuristics for quality filtering) +and many curators are only interested in curating a monolingual dataset. +Datasets also may have improperly decoded unicode characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."). + +NeMo Curator provides utilities to identify languages and fix improperly decoded unicode characters. +The language identification is performed using `fastText `_ and unicode fixing is performed using `ftfy `_. +Even though a preliminary language identification may have been performed on the unextracted text (as is the case in our Common Crawl pipeline +using pyCLD2), `fastText `_ is more accurate so it can be used for a second pass. + +----------------------------------------- +Usage +----------------------------------------- + +We provide an example of how to use the language identification and unicode reformatting utility at ``examples/identify_languages_and_fix_unicode.py``. +At a high level, the module first identifies the languages of the documents and removes any documents for which it has high uncertainty about the language. +Notably, this line uses one of the ``DocmentModifiers`` that NeMo Curator provides: + +.. code-block:: python + + cleaner = nc.Modify(UnicodeReformatter()) + cleaned_data = cleaner(lang_data) + +``DocumentModifier``s like ``UnicodeReformatter`` are very similar to ``DocumentFilter``s. +They implement a single ``modify_document`` function that takes in a document and outputs a modified document. +Here is the implementation of the ``UnicodeReformatter`` modifier: + +.. code-block:: python + + class UnicodeReformatter(DocumentModifier): + def __init__(self): + super().__init__() + + def modify_document(self, text: str) -> str: + return ftfy.fix_text(text) + +Also like the ``DocumentFilter`` functions, ``modify_document`` can be annotated with ``batched`` to take in a pandas series of documents instead of a single document. + +----------------------------------------- +Related Scripts +----------------------------------------- + +To perform the language identification, we can use the config file provided in the `config` directory +and provide the path to a local copy of the `lid.176.bin` language identification fastText model. Then, with the general purpose +:code:`filter_documents` tool, we can compute language scores and codes for each document in the corpus as follows + +.. code-block:: bash + + filter_documents \ + --input-data-dir= \ + --filter-config-file=./config/fasttext_langid.yaml \ + --log-scores \ + --log-dir=./log/lang_id + + +This will apply the fastText model, compute the score and obtain the language class, and then write this +information as additonal keys within each json document. + +With the language information present within the keys of each json, the :code:`separate_by_metadata`, will first construct +a count of the documents by language within the corpus and then from that information, split each file across all the languages +within that file. Below is an example run command for :code:`separate_by_metadata` + +.. code-block:: bash + + separate_by_metadata \ + --input-data-dir= \ + --input-metadata-field=language \ + --output-data-dir= \ + --output-metadata-distribution=./data/lang_distro.json + +After running this module, the output directory will consist of one directory per language present within the corpus and all documents +within those directories will contain text that originates from the same language. 
Finally, the text within a specific language can have +its unicode fixed using the :code:`text_cleaning` module + +.. code-block:: bash + + text_cleaning \ + --input-data-dir=/EN \ + --output-clean-dir= + + +The above :code:`text_cleaning` module uses the heuristics defined within the :code:`ftfy` package that is commonly used for fixing +improperly decoded unicode. diff --git a/docs/user-guide/PersonalIdentifiableInformationIdentificationAndRemoval.rst b/docs/user-guide/PersonalIdentifiableInformationIdentificationAndRemoval.rst new file mode 100644 index 00000000..9e05b688 --- /dev/null +++ b/docs/user-guide/PersonalIdentifiableInformationIdentificationAndRemoval.rst @@ -0,0 +1,104 @@ + +.. _data-curator-pii: + +====================================== +PII Identification and Removal +====================================== + +-------------------------------------- +Background +-------------------------------------- + +The purpose of the personal identifiable information (PII) de-identification tool is to help scrub sensitive data +out of datasets. The following is the list of sensitive data types that +are currently supported by the tool: + +- Name +- Email Address +- Physical Address +- Phone Numbers +- IP Address +- Credit/Debit Card Numbers +- US Social Security Numbers +- Dates + +The tool utilizes `Dask `_ to parallelize tasks and hence it can be +used to scale up to terabytes of data easily. Although Dask can be deployed on various +distributed compute environments such as HPC clusters, Kubernetes and other cloud +offerings such as AWS EKS, Google cloud etc, the current implementation only supports +Dask on HPC clusters that use SLURM as the resource manager. + +----------------------------------------- +Usage +----------------------------------------- +######################################################## +Reading documents and de-identifying +######################################################## + +Imagine we have a "books" dataset stored in the following structure: +:: + + books_dataset/ + books_00.jsonl + books_01.jsonl + books_02.jsonl + +You could read, de-identify the dataset, and write it to an output directory using the following approach + +.. code-block:: python + + from nemo_curator.datasets import DocumentDataset + from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client + from nemo_curator.utils.file_utils import get_batched_files + from nemo_curator.modules.modify import Modify + from nemo_curator.modifiers.pii_modifier import PiiModifierBatched + + modifier = PiiModifierBatched( + language="en", + supported_entities=["PERSON", "EMAIL_ADDRESS"], + anonymize_action="replace", + batch_size=1000, + device="gpu") + + for file_names in get_batched_files( + "book_dataset, + "output_directory", + "jsonl", + 32 + ): + source_data = read_data(file_names, file_type="jsonl", backend='pandas', add_filename=True) + dataset = DocumentDataset(source_data) + print(f"Dataset has {source_data.npartitions} partitions") + + modify = Modify(modifier, batched=True) + modified_dataset = modify(dataset) + write_to_disk(modified_dataset.df, + "output_directory", + write_to_filename=True, + output_type="jsonl" + ) + +Let's walk through this code line by line. 
+ +* ``modifier = PiiModifierBatched`` creates an instance of ``PiiModifierBatched`` class that is responsible for PII de-identification +* ``for file_names in get_batched_files`` retrieves a batch of 32 documents from the `book_dataset` +* ``source_data = read_data(file_names, file_type="jsonl", backend='pandas', add_filename=True)`` reads the data from all the files using Dask using Pandas as the backend. The ``add_filename`` argument ensures that the output files have the same filename as the input files. +* ``dataset = DocumentDataset(source_data)`` creates an instance of ``DocumentDataset`` using the batch files. ``DocumentDataset`` is the standard format for text datasets in NeMo Curator. +* ``modify = Modify(modifier, batched=True)`` creates an instance of the ``Modify`` class. This class can take any modifier as an argument +* ``modified_dataset = modify(dataset)`` modifies the data in the dataset by performing the PII de-identification based upon the passed parameters. +* ``write_to_disk(modified_dataset.df ....`` writes the de-identified documents to disk. + +The PII redaction module can also be invoked via ``script/find_pii_and_deidentify.py`` script which provides a CLI based interface. To see a complete list of options supported by the script just execute + +``python nemo_curator/scripts/find_pii_and_deidentify.py`` + +To launch the script from within a SLURM environment, the script ``examples/slurm/start-slurm.sh`` can be modified and used. + + +############################ +Resuming from Interruptions +############################ +It can be helpful to track which documents in a dataset have already been processed so that long curation jobs can be resumed if they are interrupted. +NeMo Curator provides a utility for easily tracking which dataset shards have already been processed. A call to ``get_batched_files`` will return an iterator over the files that have yet to be processed by a modifier such as ``PiiModifierBatched`` +When you re-run the code example provided above, NeMo Curator ensures that only unprocessed files are processed by the PII module. + diff --git a/docs/user-guide/QualityFiltering.rst b/docs/user-guide/QualityFiltering.rst new file mode 100644 index 00000000..6d044e22 --- /dev/null +++ b/docs/user-guide/QualityFiltering.rst @@ -0,0 +1,250 @@ + +.. _data-curator-qualityfiltering: + +============================================ +Classifier and Heuristic Quality Filtering +============================================ + +----------------------------------------- +Background +----------------------------------------- + +Large datasets often contain many documents considered to be "low quality". +In this context, "low quality" data simply means data we don't want a downstream model to learn from, and "high quality" data is data that we do want a downstream model to learn from. +The metrics that define quality can vary. +There are heuristics that measure quality by gathering simple statistics like how many punctutation marks a document has, how long is the document, and how repetitive is the document. +You can then filter documents by these statistics. +In contrast, you may have a high quality collection of data that you want a new dataset to align with. +You could train a simple classifier to differentiate between documents that look similar to those high quality documents and documents that do not. + +NeMo Curator offers modules for both kinds of filtering, and it provides an easy interface for adding your own filters and combining them with existing ones. 
+You can also use these modules to collect statistics and metadata on your documents without removing any of them. +There are 30+ filters available for English, non-English, and code datasets. + +----------------------------------------- +Usage +----------------------------------------- + +The ``ScoreFilter`` is at the center of the filtering in NeMo Curator. +Let's examine this small example: + +.. code-block:: python + + import nemo_curator as nc + from nemo_curator.datasets import DocumentDataset + from nemo_curator.utils.file_utils import get_all_files_paths_under + from nemo_curator.filters import WordCountFilter + + files = get_all_files_paths_under("books_dataset/") + books = DocumentDataset.read_json(files, add_filename=True) + + filter_step = nc.ScoreFilter( + WordCountFilter(min_words=80), + text_field="text", + score_field="word_count", + ) + + long_books = filter_step(books) + + long_books.to_json("long_books/", write_to_filename=True) + +The central part to focus on is the creation of the ``filter_step``. +``WordCountFilter(min_words=80)`` creates and configures a filter object. +A filter object is a class that inherits from the abstract base class ``nemo_curator.filters.DocumentFilter``. +This base class requires the inheritor to implement two methods, ``score_document`` and ``keep_document``. +For this example, let's look at a simplified version of the ``WordCountFilter``. + +.. code-block:: python + + class WordCountFilter(DocumentFilter): + + def __init__(self, min_words=50, max_words=100000, lang='en'): + self._min_words = min_words + self._max_words = max_words + self._word_splitter = get_word_splitter(lang) + self._name = 'word_count' + + def score_document(self, text: str): + return len(self._word_splitter(text)) + + def keep_document(self, score: int): + return self._min_words <= score <= self._max_words + +With this implementation, it becomes clear what each function is doing. +``score_document`` takes the text of a document, and returns the number of words in the document. +``keep_document`` takes in the score outputted by ``score_document`` (the number of words in this case) and returns ``True`` if the score indicates the document should be kept and ``False`` if the document should be removed. +Now, it's important to note that ``WordCountFilter`` and ``DocumentFilter`` only operate on a single document. +In order to apply the filter to a whole dataset, we must use ``ScoreFilter``. + +.. code-block:: python + + filter_step = nc.ScoreFilter( + WordCountFilter(min_words=80), + text_field="text", + score_field="word_count", + ) + +The construction of ``ScoreFilter`` creates a function that can be applied to a ``DocumentDataset`` instead of just a single document. +``text_field`` designates the field in the dataset that holds the documents that should get passed to the filter's ``score_document`` function. +``score_field`` is an optional argument that allows you to record the score in the given metadata field of the document, and if specified, it will be written to disk with the rest of the metadata. + +In some cases, the dataset may come with metadata that you want to filter directly. Or, you might want to simply add a new piece of metadata without filtering on it. +The ``Filter`` and ``Score`` modules allow you to accomplish each task respectively. + +For example, if the dataset in the above example came pre-populated with the ``word_count`` field, you could rewrite it as follows: + +.. 
code-block:: python

    books = DocumentDataset.read_json(files, add_filename=True)

    filter_step = nc.Filter(
        WordCountFilter(min_words=80).keep_document,
        filter_field="word_count",
    )

    long_books = filter_step(books)

    long_books.to_json("long_books/", write_to_filename=True)

Alternatively, if you simply want to track the length of the words in the documents and not filter based on them, you could rewrite it as follows:

.. code-block:: python

    books = DocumentDataset.read_json(files, add_filename=True)

    filter_step = nc.Score(
        WordCountFilter(min_words=80).score_document,
        text_field="text",
        score_field="word_count",
    )

    annotated_books = filter_step(books)

    annotated_books.to_json("annotated_books/", write_to_filename=True)


############################
Batched Filtering
############################

While the scoring and filtering functions defined above operate on single documents, NeMo Curator can take advantage of functions that operate in batches for improved performance.
To accomplish this, you can annotate your functions with the ``batched`` decorator.
This decorator will cause a pandas series of documents/scores to be passed to the function instead of a single document/score.
Here is the ``WordCountFilter`` rewritten to use batches in ``keep_document``.

.. code-block:: python

    import pandas as pd

    from nemo_curator.utils.decorators import batched

    class WordCountFilter(DocumentFilter):

      def __init__(self, min_words=50, max_words=100000, lang='en'):
        self._min_words = min_words
        self._max_words = max_words
        self._word_splitter = get_word_splitter(lang)
        self._name = 'word_count'

      def score_document(self, text: str):
        return len(self._word_splitter(text))

      @batched
      def keep_document(self, scores: pd.Series):
        # Compare the whole series of scores against both bounds at once.
        pass_min = self._min_words <= scores
        pass_max = scores <= self._max_words
        return pass_min & pass_max


-----------------------------------------
Classifier Filtering
-----------------------------------------

The classifier-based filtering approach we have implemented closely follows that used in `Brown et al., 2020 `_,
and trains a binary skip-gram classifier that can be used to distinguish between low and high quality documents. To implement this, we use the
functions provided by fastText. Following the examples provided in the fastText documentation, we first create a file consisting of
high and low-quality training documents. We provide an example of how to train and use a model in ``examples/classifier_filtering.py``.

We also provide CLI scripts for the same functionality. The :code:`prepare_fasttext_training_data` script will randomly sample documents
from an input dataset and will prepare them to be used to train a fastText skip-gram classifier. For a high-quality dataset we recommend sampling from
either OpenWebText2 or Wikipedia, and an unfiltered version of Common Crawl can be used for a low-quality dataset.

.. code-block:: bash

    prepare_fasttext_training_data \
      --input-data-dir= \
      --output-num-samples= \
      --label='__label__cc' \
      --output-train-file=${res_dir}/cc_samples.txt \

    prepare_fasttext_training_data \
      --input-data-dir= \
      --output-num-samples= \
      --label='__label__hq' \
      --output-train-file=${res_dir}/hq_samples.txt \

Once the samples have been prepared and written to :code:`.txt` files, users can train a quality classifier with the :code:`train_fasttext` script.
:code:`train_fasttext` will read in all of the samples within the :code:`.txt` files, split the data into training and +validation sets and train the binary skip-gram classifier. After training, it evaluates the model on the validation samples and writes the predictions +to a jsonl file prints the confusion matrix to stdout. + +.. code-block:: bash + + train_fasttext \ + --fasttext-files-dir=${res_dir} \ + --output-train-file=${res_dir}/fasttext_samples.train \ + --output-validation-file=${res_dir}/fasttext_samples.valid \ + --output-model=${res_dir}/cc_filter_test.bin \ + --output-predictions=${res_dir}/preds.jsonl + +Finally, with the model trained and able to provide quality scores, it can be used to for quality filtering. Similar to how +:code:`filter_documents` performs language identification with the fastText model :code:`lid.176.bin`, we provide a default config that can +be used for classifier-based quality filtering with a fastText model. Additionally, this filter implements Pareto-based sampling approach +as is described in `Brown et al., 2020 `_. + +.. code-block:: bash + + filter_documents \ + --input-data-dir= \ + --filter-config-file=./config/fasttext_quality_filter.yaml \ + --output-retained-document-dir= \ + --output-removed-document-dir= \ + --log-dir=${log_dir}/fasttext_classifier \ + +----------------------------------------- +Heuristic Filtering +----------------------------------------- + +As with other filtering steps, the heuristic-based filtering in NeMo Curator can be carried out using ``ScoreFilter`` or the :code:`filter_documents` +utility. Filters can be chained in NeMo Curator using ``Sequential`` as follows. + +.. code-block:: python + + filter_step = nc.Sequential([ + ScoreFilter( + WordCountFilter(min_words=80), + score_field="word_count", + ), + ScoreFilter(IncompleteStoryFilter()), + ScoreFilter(RepeatingTopNGramsFilter(n=2, max_repeating_ngram_ratio=0.2)), + ScoreFilter(RepeatingTopNGramsFilter(n=3, max_repeating_ngram_ratio=0.18)), + ScoreFilter(RepeatingTopNGramsFilter(n=4, max_repeating_ngram_ratio=0.16)), + ]) + +The filter config file :code:`config/heuristic_filter.yaml` provides a generic list of heuristic filters that have been tested +and shown to provide documents that when used for training, lead to improvements in language model downstream task performance. +The filters are general enough that users should feel free to remove certain filters within the cascade of filters and experiment +with the results of different filter configurations/parameters. + +Additionally, these filters have been used for curating high-quality non-English documents. However, it is advised that when applying +to non-English data that users write out the document scores by specifying the :code:`--document-score-dir` argument. This will allow users to +examine if a particular filter is responsible for undesirably removing many documents from a corpus. + +.. code-block:: bash + + filter_documents \ + --input-data-dir= \ + --filter-config-file=./config/heuristic_filter_en.yaml \ + --output-retained-document-dir= \ + --output-removed-document-dir= \ + --output-document-score-dir= \ + --log-dir=${log_dir}/heuristic_filter diff --git a/docs/user-guide/TaskDecontamination.rst b/docs/user-guide/TaskDecontamination.rst new file mode 100644 index 00000000..25bd6547 --- /dev/null +++ b/docs/user-guide/TaskDecontamination.rst @@ -0,0 +1,91 @@ + +.. 
_data-curator-downstream:

#######################################################
Downstream Task Decontamination/Deduplication
#######################################################

-----------------------------------------
Background
-----------------------------------------

After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data.
When dealing with large datasets there is a potential for leakage of this test data into the model's training dataset.
Therefore, NeMo Curator follows the approach of `OpenAI GPT3 `_ and `Microsoft Turing NLG 530B `_
to remove sections of documents in your dataset that are present in downstream tasks.

-----------------------------------------
Usage
-----------------------------------------

The ``TaskDecontamination`` module provides the central functionality in NeMo Curator.
Let's examine this small example:

.. code-block:: python

    import nemo_curator as nc
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under
    from nemo_curator.tasks import Winogrande, Squad, TriviaQA

    files = get_all_files_paths_under("books_dataset/")
    books = DocumentDataset.read_json(files, add_filename=True)

    downstream_tasks = [
        Winogrande(),
        Squad(),
        TriviaQA(),
    ]

    task_decontaminate = nc.TaskDecontamination(downstream_tasks)

    decontaminated_books = task_decontaminate(books)

    decontaminated_books.to_json("decontaminated_books/", write_to_filename=True)

If you would like more fine-grained control over the task decontamination process, NeMo Curator provides several CLI tools you can manually apply.
You can use the :code:`prepare_task_data`, :code:`find_matching_ngrams` and :code:`remove_matching_ngrams`
scripts in order to remove any task data that might be contained (i.e., "contaminate") within your training data.
You will need a list of your downstream tasks to modify the task config (:code:`config/lm_tasks.yaml`).
If your task does not already exist as a class, you will need to construct a class that extends :code:`nemo_curator.tasks.DownstreamTask`.

Then, you can start by constructing the N-grams from the task documents using the :code:`prepare_task_data` module.
This module requires an input configuration file that contains the different modules that describe how to form N-grams from the task data of interest.
An example of a configuration file is provided in :code:`config/lm_tasks.yaml`. A number of tasks are already implemented within NeMo Curator
and can be found within :code:`nemo_curator.tasks`. Should users desire to add their own tasks, they can define their own class similar
to those defined in :code:`nemo_curator.tasks`. Once all N-grams have been computed, they are written as keys of a dictionary that is written to a pickle file.
This step only needs to be done once per set of tasks. This pickle file can be reused across datasets that share the same downstream tasks.

.. code-block:: bash

    prepare_task_data \
        --task-config-file=./config/lm_tasks.yaml \
        --output-task-ngrams=./data/task_ngrams.pkl



Once users have computed the task N-grams, they can use the :code:`find_matching_ngrams` module in order to search for matches within their corpus.
+This module task as input the path to the users dataset consisting of JSONL files as well as precomputed task N-grams, and as output provides a pickle +file consisting of the count of how many times a specific task N-gram ocurred within the training set. This N-gram count will be used in the final +step to determine if an a document should be split and the N-gram removed. + +.. code-block:: bash + + find_matching_ngrams \ + --input-data-dir= \ + --input-task-ngrams=./data/task_ngrams.pkl \ + --output-matched-ngram-data=./data/matched_ngrams.pkl + +As a final step in the task decontamination procedure, the counts associated with the matched N-grams are used to determine if a particular N-gram +should be removed from the training corpus. If the N-gram has a count that is higher than a user-defined threshold, it is not considered. Otherwise, +it is considered and will be removed from the corpus. When an N-gram is removed from the corpus, a user-defined character window that extends from +the N-gram in both directions is also removed from the corpus. Additionally, the document will be split into two separate documents. If the split +document is too short after splitting, it will be removed. Additionally, documents that are split more than a user-defined number of times are also +removed from the corpus. For more information on the task decontamination procedure, please see `Brown et al., 2020 `_ and `Smith et al., 2021 `_ + +.. code-block:: bash + + remove_matching_ngrams \ + --input-data-dir= \ + --input-matched-ngrams=./data/matched_ngrams.pkl \ + --output-task-deduped-dir= diff --git a/docs/user-guide/images/zeroshot_ablations.png b/docs/user-guide/images/zeroshot_ablations.png new file mode 100644 index 00000000..10be3494 Binary files /dev/null and b/docs/user-guide/images/zeroshot_ablations.png differ diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst new file mode 100644 index 00000000..e4cbd75e --- /dev/null +++ b/docs/user-guide/index.rst @@ -0,0 +1,43 @@ +.. include:: DataCuration.rsts + +:ref:`Downloading and Extracting Text ` + Downloading a massive public dataset is usually the first step in data curation, and it can be cumbersome due to the dataset’s massive size and hosting method. This section describes how to download and extract large corpora efficiently. + +:ref:`Working with DocumentDataset ` + DocumentDataset is the standard format for datasets in NeMo Curator. This section describes how to get datasets in and out of this format, as well as how DocumentDataset interacts with the modules. + +:ref:`CPU and GPU Modules with Dask ` + NeMo Curator provides both CPU based modules and GPU based modules and supports methods for creating compatible Dask clusters and managing the dataset transfer between CPU and GPU. + +:ref:`Document Filtering ` + This section describes how to use the 30+ filters available within the NeMo Curator and implement custom filters to apply to the documents within the corpora. + +:ref:`Language Identification and Unicode Fixing ` + Large, unlabeled text corpora often contain a variety of languages. The NeMo Curator provides utilities to identify languages and fix improperly decoded Unicode characters. + +:ref:`GPU Accelerated Exact and Fuzzy Deduplication ` + Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. 
+ +:ref:`Classifier and Heuristic Quality Filtering ` + Classifier-based filtering involves training a small text classifer to label a document as either high quality or low quality. Heuristic-based filtering uses simple rules (e.g. Are there too many punctuation marks?) to score a document. NeMo Curator offers both classifier and heuristic-based quality filtering of documents. + +:ref:`Downstream Task Decontamination/Deduplication ` + After training, large language models are usually evaluated by their performance on downstream tasks consisting of unseen test data. When dealing with large datasets, there is a potential for leakage of this test data into the model’s training dataset. NeMo Curator allows you to remove sections of documents in your dataset that are present in downstream tasks. + +:ref:`Personally Identifiable Information Identification and Removal ` + The purpose of the personally identifiable information (PII) redaction tool is to help scrub sensitive data out of training datasets + +.. toctree:: + :maxdepth: 4 + :titlesonly: + + + Download.rst + DocumentDataset.rst + CPUvsGPU.rst + QualityFiltering.rst + LanguageIdentificationUnicodeFormatting.rst + GpuDeduplication.rst + TaskDecontamination.rst + PersonalIdentifiableInformationIdentificationAndRemoval.rst + DistributedDataClassification.rst \ No newline at end of file diff --git a/examples/classifier_filtering.py b/examples/classifier_filtering.py new file mode 100644 index 00000000..6c3588c7 --- /dev/null +++ b/examples/classifier_filtering.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
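
# This example prepares labeled low/high quality samples, trains a fastText
# skip-gram quality classifier, and then filters a dataset with ScoreFilter
# using the resulting model.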
+ +import fasttext +import random +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers import FastTextLabelModifier +from nemo_curator.filters import BatchedFastTextQualityFilter +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def load_dataset(input_data_dir): + files = list(get_all_files_paths_under(input_data_dir)) + raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True) + dataset = DocumentDataset(raw_data) + + return dataset + +def create_samples(data_path, label, num_samples): + raw_dataset = load_dataset(data_path) + label_quality = nc.Modify(FastTextLabelModifier(label)) + + labeled_dataset = label_quality(raw_dataset) + labeled_samples = labeled_dataset.df.sample(frac=num_samples / len(labeled_dataset.df)) + + return labeled_samples["text"].compute().values.tolist() + +def main(args): + # Params + low_quality_data_path = "/path/to/low_quality" + high_quality_data_path = "/path/to/high_quality" + num_low_quality_samples = 1000 + num_high_quality_samples = 1000 + filtered_output = "/path/to/output" + + # Prepare samples for the classifier + client = get_client(args, args.device) + low_quality_samples = create_samples(low_quality_data_path, "__label__lq", num_low_quality_samples) + high_quality_samples = create_samples(high_quality_data_path, "__label__hq", num_high_quality_samples) + + train_samples = low_quality_samples + high_quality_samples + random.shuffle(train_samples) + train_file = "./fasttext.train" + model_path = "./fasttext_model.bin" + with open(train_file, "w") as f: + for sample in train_samples: + f.write(sample) + f.write('\n') + + # Train fastText classifier + model = fasttext.train_supervised( + input=train_file, + lr=0.01, + dim=100, + epoch=5, + wordNgrams=2, + ) + model.save_model(model_path) + + # Filter data + target_dataset = load_dataset(low_quality_data_path) + filter_pipeline = nc.ScoreFilter(BatchedFastTextQualityFilter(model_path), score_field="quality_score", batched=True, score_type=float) + filtered_dataset = filter_pipeline(target_dataset) + + # Write filtered dataset + write_to_disk(filtered_dataset.df, filtered_output, write_to_filename=True) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/distributed_data_classification_examples/domain_api_example.py b/examples/distributed_data_classification_examples/domain_api_example.py new file mode 100644 index 00000000..55d44e3a --- /dev/null +++ b/examples/distributed_data_classification_examples/domain_api_example.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import argparse + +from nemo_curator import DomainClassifier +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client + + +def main(args): + global_st = time.time() + + labels = [ + "Adult", + "Arts_and_Entertainment", + "Autos_and_Vehicles", + "Beauty_and_Fitness", + "Books_and_Literature", + "Business_and_Industrial", + "Computers_and_Electronics", + "Finance", + "Food_and_Drink", + "Games", + "Health", + "Hobbies_and_Leisure", + "Home_and_Garden", + "Internet_and_Telecom", + "Jobs_and_Education", + "Law_and_Government", + "News", + "Online_Communities", + "People_and_Society", + "Pets_and_Animals", + "Real_Estate", + "Science", + "Sensitive_Subjects", + "Shopping", + "Sports", + "Travel_and_Transportation", + ] + + model_file_name = "/path/to/pytorch_model_file.pth" + + # Input can be a string or list + input_file_path = "/path/to/data" + output_file_path = "./" + + client = get_client(args, cluster_type=args.device) + + input_dataset = DocumentDataset.read_json( + input_file_path, backend="cudf", add_filename=True + ) + + domain_classifier = DomainClassifier( + model_file_name=model_file_name, + labels=labels, + filter_by=["Games", "Sports"], + ) + result_dataset = domain_classifier(dataset=input_dataset) + + result_dataset.to_json( + output_file_dir=output_file_path, write_to_filename=True + ) + + global_et = time.time() + print( + f"Total time taken for domain classifier inference: {global_et-global_st} s", + flush=True, + ) + + client.close() + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + parser.add_argument( + "--scheduler-address", + type=str, + default=None, + help="Address to the scheduler of a created dask cluster. If not provided" + "a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--scheduler-file", + type=str, + default=None, + help="Path to the scheduler file of a created dask cluster. If not provided" + " a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--nvlink-only", + action="store_true", + help="Start a local cluster with only NVLink enabled." + "Only applicable when protocol=ucx and no scheduler file/address is specified", + ) + parser.add_argument( + "--protocol", + type=str, + default="ucx", + help="Protcol to use for dask cluster" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 + ) + parser.add_argument( + "--rmm-pool-size", + type=str, + default="14GB", + help="Initial pool size to use for the RMM Pool Memory allocator" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 + ) + parser.add_argument("--enable-spilling", action="store_true") + parser.add_argument("--set-torch-to-use-rmm", action="store_true") + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Device to run the script on. 
Either 'cpu' or 'gpu'.", + ) + + return parser + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/distributed_data_classification_examples/quality_api_example.py b/examples/distributed_data_classification_examples/quality_api_example.py new file mode 100644 index 00000000..65e28d18 --- /dev/null +++ b/examples/distributed_data_classification_examples/quality_api_example.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import argparse + +from nemo_curator import QualityClassifier +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client + + +def main(args): + global_st = time.time() + + labels = ["High", "Medium", "Low"] + model_file_name = "/path/to/pytorch_model_file.pth" + + # Input can be a string or list + input_file_path = "/path/to/data" + output_file_path = "./" + + client = get_client(args, cluster_type=args.device) + + input_dataset = DocumentDataset.from_json( + input_file_path, backend="cudf", add_filename=True + ) + + quality_classifier = QualityClassifier( + model_file_name=model_file_name, + labels=labels, + filter_by=["High", "Medium"], + ) + result_dataset = quality_classifier(dataset=input_dataset) + print(result_dataset.df.head()) + + global_et = time.time() + print( + f"Total time taken for quality classifier inference: {global_et-global_st} s", + flush=True, + ) + + client.close() + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + parser.add_argument( + "--scheduler-address", + type=str, + default=None, + help="Address to the scheduler of a created dask cluster. If not provided" + "a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--scheduler-file", + type=str, + default=None, + help="Path to the scheduler file of a created dask cluster. If not provided" + " a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--nvlink-only", + action="store_true", + help="Start a local cluster with only NVLink enabled." + "Only applicable when protocol=ucx and no scheduler file/address is specified", + ) + parser.add_argument( + "--protocol", + type=str, + default="ucx", + help="Protcol to use for dask cluster" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 + ) + parser.add_argument( + "--rmm-pool-size", + type=str, + default="14GB", + help="Initial pool size to use for the RMM Pool Memory allocator" + "Note: This only applies to the localCUDACluster. 
If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 + ) + parser.add_argument("--enable-spilling", action="store_true") + parser.add_argument("--set-torch-to-use-rmm", action="store_true") + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Device to run the script on. Either 'cpu' or 'gpu'.", + ) + + return parser + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/download_arxiv.py b/examples/download_arxiv.py new file mode 100644 index 00000000..945457aa --- /dev/null +++ b/examples/download_arxiv.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo_curator.download import download_arxiv +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + # Params + output_directory = "/path/to/output" + + # Only download 10 shards as an example + url_limit = 10 + + # Set up Dask client + client = get_client(args, args.device) + + # Download and sample data + arxiv = download_arxiv(output_directory, url_limit=url_limit) + sample = arxiv.df.sample(frac=10 / len(arxiv)) + + # Inspect the samples + print(sample.compute()) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/download_common_crawl.py b/examples/download_common_crawl.py new file mode 100644 index 00000000..2ded1532 --- /dev/null +++ b/examples/download_common_crawl.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
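For reference, the shards written by the ArXiv download example above can be reloaded later without re-running the downloader. A minimal sketch, assuming the default JSONL output and reusing the placeholder output directory from the example:

```python
from nemo_curator.datasets import DocumentDataset

# Reload the extracted shards written by download_arxiv() above.
# "/path/to/output" is the same placeholder directory used in the example,
# and JSONL output is assumed.
arxiv_dataset = DocumentDataset.read_json("/path/to/output", backend="pandas")
print(arxiv_dataset.df.head())
```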
+ +import argparse + +from nemo_curator.download import download_common_crawl +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + # Params + start_snapshot = "2021-04" + end_snapshot = "2021-10" + output_directory = "/path/to/output" + + # Only download 10 shards as an example + url_limit = 10 + + # Set up Dask client + client = get_client(args, args.device) + + # Download and sample data + common_crawl = download_common_crawl(output_directory, start_snapshot, end_snapshot, url_limit=url_limit) + sample = common_crawl.df.sample(frac=10 / len(common_crawl)) + + # Inspect the samples + print(sample.compute()) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/download_wikipedia.py b/examples/download_wikipedia.py new file mode 100644 index 00000000..377e2ee5 --- /dev/null +++ b/examples/download_wikipedia.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo_curator.download import download_wikipedia +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + # Params + dump_date = "20240201" + output_directory = "/path/to/output" + + # Only download 10 shards as an example + url_limit = 10 + + # Set up Dask client + client = get_client(args, args.device) + + # Download and sample data + wikipedia = download_wikipedia(output_directory, dump_date=dump_date, url_limit=url_limit) + sample = wikipedia.df.sample(frac=10 / len(wikipedia)) + + # Inspect the samples + print(sample.compute()) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/exact_deduplication.py b/examples/exact_deduplication.py new file mode 100644 index 00000000..2cd50bb2 --- /dev/null +++ b/examples/exact_deduplication.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
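The `frac=10 / len(dataset)` idiom used by the download examples keeps roughly ten documents in expectation, regardless of dataset size. A small, self-contained illustration of the same sampling with plain Dask (synthetic data rather than real download output):

```python
import pandas as pd
import dask.dataframe as dd

# Synthetic stand-in for a downloaded dataset: 1,000 tiny documents.
ddf = dd.from_pandas(
    pd.DataFrame({"text": [f"document {i}" for i in range(1000)]}), npartitions=4
)

# Mirrors the sampling in the download examples: frac = 10 / len(ddf)
# keeps ~10 rows in expectation no matter how large the dataset is.
sample = ddf.sample(frac=10 / len(ddf))
print(len(sample.compute()))  # roughly 10
```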
+ +import argparse + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import ExactDuplicates +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.script_utils import add_distributed_args +import time + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + + dataset_dir = "/path/to/data" + log_dir = "./" + output_dir = "./" + dataset_id_field = "id" + dataset_text_field = "text" + client = get_client(args, args.device) + backend = "cudf" if args.device == "gpu" else "pandas" + + if args.device == "gpu": + client.run(pre_imports) + + t0 = time.time() + input_dataset = DocumentDataset.read_json(dataset_dir, backend=backend) + + exact_dup = ExactDuplicates( + logger=log_dir, + id_field=dataset_id_field, + text_field=dataset_text_field, + # cache_dir=output_dir # Optionally write the output to disk + ) + + duplicates = exact_dup(dataset=input_dataset) + + # If caching, the result is a path to the output dataset. + if isinstance(duplicates, str): + duplicates = DocumentDataset.read_parquet(duplicates, backend=backend) + + # It's easy to apply dataframe operations to the dataset by using the underlying df. + + # By default all duplicate IDs are included in the result + # Keep one document from each group of duplicates and mark the others for removal + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html + docs_to_remove = duplicates.df.map_partitions( + lambda x: x[x._hashes.duplicated(keep="first")] + ) + + # When there are few duplicates we can compute the results to a list and use `isin`. + result = input_dataset.df[ + ~input_dataset.df[dataset_id_field].isin(docs_to_remove[dataset_id_field].compute()) + ] + write_to_disk(result, output_dir, output_type="parquet") + print(time.time()-t0) + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + return add_distributed_args(parser) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/find_pii_and_deidentify.py b/examples/find_pii_and_deidentify.py new file mode 100644 index 00000000..0ac1eb56 --- /dev/null +++ b/examples/find_pii_and_deidentify.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
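The removal logic in the exact-deduplication example above reduces to two pandas operations: `duplicated(keep="first")` flags every document after the first in each hash group, and `isin` drops the flagged IDs from the original dataset. A tiny in-memory illustration of the same pattern:

```python
import pandas as pd

# Four documents where ids 1 and 2 share a hash; keep the first of each group.
docs = pd.DataFrame({"id": [1, 2, 3, 4], "_hashes": ["a", "a", "b", "c"]})

to_remove = docs[docs["_hashes"].duplicated(keep="first")]
kept = docs[~docs["id"].isin(to_remove["id"])]

print(kept["id"].tolist())  # [1, 3, 4]
```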
+ +import argparse + +import dask.dataframe +import pandas as pd + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers.pii_modifier import PiiModifierBatched +from nemo_curator.modules.modify import Modify +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def console_script(): + parser = argparse.ArgumentParser() + arguments = add_distributed_args(parser).parse_args() + _ = get_client(arguments, arguments.device) + + dataframe = pd.DataFrame({'text': ['Sarah and Ryan went out to play', 'Jensen is the CEO of NVIDIA']}) + dd = dask.dataframe.from_pandas(dataframe, npartitions=1) + dataset = DocumentDataset(dd) + + modifier = PiiModifierBatched( + log_dir='./logs', + batch_size=2000, + language='en', + supported_entities=['PERSON', "EMAIL_ADDRESS"], + anonymize_action='replace') + + modify = Modify(modifier, batched=True) + modified_dataset = modify(dataset) + modified_dataset.df.to_json('output_files/*.jsonl', lines=True, orient='records') + + +if __name__ == "__main__": + console_script() \ No newline at end of file diff --git a/examples/gpu_deduplication_example/README.md b/examples/gpu_deduplication_example/README.md new file mode 100644 index 00000000..9a5e64c1 --- /dev/null +++ b/examples/gpu_deduplication_example/README.md @@ -0,0 +1,26 @@ +### Deduplication Steps + +1. Exact dedup + 1. Input: Data directories + 2. Output: exact_duplicates.parquet. List of exact duplicates and the document hash. + +Fuzzy Dedup +1. Minhashes (Compute minhashes) + 1. Input: Data Directories + 2. Output: minhashes.parquet for each data dir. +2. Buckets (Minhash Buckets) + 1. Input: Minhash directories + 2. Output: buckets.parquet +3. Jaccard Map Buckets + Jaccard shuffle + 1. Input: buckets.parquet + Data Dir + 2. Output: shuffled_docs.parquet +4. Jaccard compute + 1. Input: shuffled_docs.parquet + 2. Output: dedup_final_results.parquet +5. Connected Components + 1. Input: dedup_final_results.parquet + 2. Output: connected_components.parquet + + +When calling the main `run-workflow.sh` script that points to these runscripts, users can also set the relevant `LIBCUDF_CUFILE_POLICY`. +It is recommended to set `LIBCUDF_CUFILE_POLICY=OFF` for all runs calling the script. diff --git a/examples/gpu_deduplication_example/batch.sh b/examples/gpu_deduplication_example/batch.sh new file mode 100644 index 00000000..eca7145c --- /dev/null +++ b/examples/gpu_deduplication_example/batch.sh @@ -0,0 +1,38 @@ +#! /bin/bash + +#SBATCH --job-name=nemo-data-curator:gpu-deduplication +#SBATCH --nodes=8 +#SBATCH --exclusive +#SBATCH --time=04:00:00 + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
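For reference, the find_pii_and_deidentify example above writes its redacted output as JSONL shards under `output_files/`. A minimal sketch of inspecting one shard afterwards; the shard name is an assumption (Dask substitutes the partition index for the `*` in the output path):

```python
import pandas as pd

# Hypothetical follow-up to the PII example: read back one JSONL shard and
# look at the redacted text. "0.jsonl" assumes Dask's partition-index naming
# for the '*' placeholder in 'output_files/*.jsonl'.
redacted = pd.read_json("output_files/0.jsonl", lines=True)
print(redacted["text"].tolist())
```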
+ +# +# This script can be used for running both exact and fuzzy document-level +# deduplication using Dask and cuDF +# + +base_dir=`pwd` # Assumes base dir is top-level dir of repo +RUNSCRIPT=${RUNSCRIPT:-${base_dir}/examples/gpu_deduplication_example/run-minhash.sh} +LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-OFF} +echo $RUNSCRIPT + +docker_image='nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03' +mounts="${base_dir}:${base_dir}" + +srun -l \ + --container-mounts=${mounts} \ + --container-image=${docker_image} \ + bash -c "echo ${RUNSCRIPT};echo ${LIBCUDF_CUFILE_POLICY}; LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY} RUNSCRIPT=${RUNSCRIPT} bash ${base_dir}/examples/gpu_deduplication_example/run-workflow.sh" diff --git a/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh new file mode 100644 index 00000000..757629e3 --- /dev/null +++ b/examples/gpu_deduplication_example/create-list-of-exact-duplicate-ids.sh @@ -0,0 +1,53 @@ +#! /bin/bash + +#SBATCH --job-name=nemo-data-curator:create-exact-dup-id-list +#SBATCH --nodes=1 +#SBATCH --exclusive +#SBATCH --time=0:30:00 + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux + +## Log and intermediate results dirs +base_dir=`pwd` +src_dir="${base_dir}/workspace/nemo-data-curator" +log_dir=${src_dir}/workspace/log/create_exact_dup_id_list +res_dir=${src_dir}/workspace/data/create_exact_dup_id_list +conf_dir=${src_dir}/workspace/config +mkdir -p ${log_dir} ${res_dir} ${conf_dir} + +## Container related variables +docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" +mounts="${base_dir}:${base_dir}" + +## Set relevant filepath +input_id_list_dir= + +srun -l \ + --mpi=pmix \ + --output=${log_dir}/create_exact_dup_id_list_%j.out \ + --error=${log_dir}/create_exact_dup_id_list_%j.err \ + --container-image=${docker_image} \ + --container-mounts=${mounts} \ + create_list_of_duplicate_ids \ + --input-id-list-dir=${input_id_list_dir} \ + --input-bucket-key="_hashes" \ + --output-id-list-dir=${res_dir}/exact_dup_ids \ + --output-bucket-list-dir=${res_dir}/buckets \ + --log-dir=${log_dir}/create_exact_dup_id_list + +# Concatenate the extracted list of ids +cat ${res_dir}/exact_dup_ids/*.txt > ${res_dir}/exact_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh b/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh new file mode 100644 index 00000000..70b0d13b --- /dev/null +++ b/examples/gpu_deduplication_example/create-list-of-fuzzy-duplicate-ids.sh @@ -0,0 +1,66 @@ +#! /bin/bash + +#SBATCH --job-name=nemo-data-curator:create-fuzzy-dup-id-list +#SBATCH --nodes=1 +#SBATCH --exclusive +#SBATCH --time=0:30:00 + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux + +## Log and intermediate results dirs +base_dir=`pwd` +src_dir="${base_dir}/workspace/nemo-data-curator" +log_dir=${src_dir}/workspace/log/create_fuzzy_dup_id_list +res_dir=${src_dir}/workspace/data/create_fuzzy_dup_id_list +conf_dir=${src_dir}/workspace/config +mkdir -p ${log_dir} ${res_dir} ${conf_dir} + +## Container related variables +docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" +mounts="${base_dir}:${base_dir}" + +## Set relevant filepath +input_id_list_dir= + +# Generate the mapping and prepare the connected components +srun -l \ + --nodes=1 \ + --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ + --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ + --container-image=${docker_image} \ + --container-mounts=${mounts} \ + prepare_fuzzy_ids \ + --path-to-connected-components=${input_id_list_dir} \ + --output-indexed-connected-components=${res_dir}/indexed_connected_components.parquet \ + --output-id-mapping=${res_dir}/mapping.json + +srun -l \ + --mpi=pmix \ + --output=${log_dir}/create_fuzzy_dup_id_list_%j.out \ + --error=${log_dir}/create_fuzzy_dup_id_list_%j.err \ + --container-image=${docker_image} \ + --container-mounts=${mounts} \ + create_list_of_duplicate_ids \ + --input-id-list-dir=${res_dir}/indexed_connected_components.parquet \ + --input-bucket-key="group" \ + --id-mapping=${res_dir}/mapping.json \ + --output-id-list-dir=${res_dir}/fuzzy_dup_ids \ + --output-bucket-list-dir=${res_dir}/buckets \ + --log-dir=${log_dir}/create_fuzzy_dup_id_list + +# Concatenate the extracted list of ids +cat ${res_dir}/fuzzy_dup_ids/*.txt > ${res_dir}/fuzzy_duplicate_id_list.txt diff --git a/examples/gpu_deduplication_example/remove-duplicates.sh b/examples/gpu_deduplication_example/remove-duplicates.sh new file mode 100644 index 00000000..275c9f15 --- /dev/null +++ b/examples/gpu_deduplication_example/remove-duplicates.sh @@ -0,0 +1,52 @@ +#! /bin/bash + +#SBATCH --job-name=nemo-data-curator:remove-duplicates +#SBATCH --nodes=10 +#SBATCH --exclusive +#SBATCH --time=01:00:00 + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -eux + +## Log and intermediate results dirs +base_dir=`pwd` +src_dir="${base_dir}/workspace/nemo-data-curator" +log_dir=${src_dir}/workspace/log/remove_duplicates +res_dir=${src_dir}/workspace/data/remove_duplicates +conf_dir=${src_dir}/workspace/config +mkdir -p ${log_dir} ${res_dir} ${conf_dir} + +## Container related variables +docker_image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11" +mounts="${base_dir}:${base_dir}" + +## Set relevant filepaths +input_data_dir="" +input_id_list="" +output_data_dir="" +fname=$(basename ${input_id_list}) +tag=$(basename $fname .txt) + +srun -l \ + --output=${log_dir}/remove_duplicates_${tag}_%j.out \ + --error=${log_dir}/remove_duplicates_${tag}_%j.err \ + --container-image=${docker_image} \ + --container-mounts=${mounts} \ + remove_duplicates \ + --input-data-dir=${input_data_dir} \ + --input-id-list=${input_id_list} \ + --output-deduped-dir=${output_data_dir}/all_deduped \ + --log-dir=${log_dir}/all_deduped_${tag} diff --git a/examples/gpu_deduplication_example/run-buckets.sh b/examples/gpu_deduplication_example/run-buckets.sh new file mode 100644 index 00000000..7ca1d102 --- /dev/null +++ b/examples/gpu_deduplication_example/run-buckets.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +minhash_dir="/outputdir/minhashes" +datasets=$(ls ${minhash_dir}) +for dataset in $datasets; do + input_minhash_dirs="$input_minhash_dirs $minhash_dir/$dataset/minhashes.parquet" +done +output_dir="/outputdir" + +buckets_per_shuffle=1 + +mkdir -p $output_dir +echo $input_minhash_dirs + +# Remove old buckets +rm -r ${output_dir}/buckets.parquet + +python -u minhash_buckets.py \ + --input-data-dirs $input_minhash_dirs \ + --minhash-length 260 \ + --output-bucket-dir $output_dir/ \ + --log-dir $LOGDIR \ + --protocol ucx \ + --num-bands 20 \ + --buckets-per-shuffle=$buckets_per_shuffle \ + --split-out=512 \ + --scheduler-file $LOGDIR/scheduler.json + +echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-cc.sh b/examples/gpu_deduplication_example/run-cc.sh new file mode 100644 index 00000000..ab0c6210 --- /dev/null +++ b/examples/gpu_deduplication_example/run-cc.sh @@ -0,0 +1,26 @@ + +base_dir="/outputdir" +cc_folder="CC" +output_dir="${base_dir}/${cc_folder}_output" +cache_dir="${base_dir}/${cc_folder}_cache" +jaccard_pairs_path="/outputdir/dedup_final_results.parquet" + + +echo "output_dir set to $output_dir" +echo "cache_dir set to $cache_dir" + +export RAPIDS_NO_INITIALIZE="1" +export CUDF_SPILL="1" + +### compute connected component +#rm -r $cache_dir +mkdir -p $output_dir $cache_dir + +python -u connected_component.py \ + --jaccard-pairs-path $jaccard_pairs_path \ + --output-dir $output_dir \ + --cache-dir $cache_dir \ + --log-dir $LOGDIR \ + --profile-path $PROFILESDIR \ + --num-files $NUM_FILES \ + --scheduler-file $LOGDIR/scheduler.json diff --git a/examples/gpu_deduplication_example/run-jaccard.sh b/examples/gpu_deduplication_example/run-jaccard.sh new file mode 100644 index 00000000..6ee51d30 --- /dev/null +++ b/examples/gpu_deduplication_example/run-jaccard.sh @@ -0,0 +1,16 @@ + +shuffled_docs_dir="/outputdir/shuffled_docs.parquet" +output_dir="/outputdir" + + +export CUDF_SPILL="1" + +python jaccard_compute.py \ + --shuffled-docs-path $shuffled_docs_dir \ + --output-dir $output_dir \ + --log-dir $LOGDIR \ + --num-files $NUM_FILES \ + --scheduler-file $LOGDIR/scheduler.json + + +echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-minhash.sh b/examples/gpu_deduplication_example/run-minhash.sh new file mode 
100644 index 00000000..79e069cd --- /dev/null +++ b/examples/gpu_deduplication_example/run-minhash.sh @@ -0,0 +1,42 @@ +#! /bin/bash + +# Assumes each directory contains Jsonl files +input_data_dirs="/datadir/dataset1/ \ +/datadir/dataset2/ \ +/datadir/dataset3/" + +output_dir="/outputdir/minhashes" + +# NOTE: The script implicitly assumes that the last part +# of the input data paths is the dataset name and will choose +# output dir names as follows: +# /outputdir/minhashes/dataset1 +# /outputdir/minhashes/dataset2 +# /outputdir/minhashes/dataset3 +# This can cause issues if the last part of the +# dirname is the same across datasets + +mkdir -p $output_dir + +# Is a good number for files 200MB or lesser +# Use a smaller value for larger jsonl files +files_per_partition=20 + +mkdir -p $output_dir +echo $input_data_dirs + +python -u compute_minhashes.py \ + --input-data-dirs $input_data_dirs \ + --minhash-length 260 \ + --char-ngram 5 \ + --hash-bytes 4 \ + --seed 42 \ + --output-minhash-dir $output_dir \ + --log-dir $LOGDIR \ + --num-files $NUM_FILES \ + --files-per-partition $files_per_partition \ + --profile-path $PROFILESDIR \ + --log-frequency 250 \ + --scheduler-file $LOGDIR/scheduler.json + +echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-shuffle.sh b/examples/gpu_deduplication_example/run-shuffle.sh new file mode 100644 index 00000000..e559dbbb --- /dev/null +++ b/examples/gpu_deduplication_example/run-shuffle.sh @@ -0,0 +1,35 @@ +input_data_dirs="/datadir/dataset1/ \ +/datadir/dataset2/ \ +/datadir/dataset3/" +buckets_dir="/outputdir/buckets.parquet" +output_dir="/outputdir" + + +export CUDF_SPILL="1" + +## Run jaccard Mapping +echo "Starting Jaccard mapping..." +python jaccard_map_buckets.py \ + --input-bucket-dir $buckets_dir \ + --input-data-dirs $input_data_dirs \ + --output-dir $output_dir \ + --log-dir $LOGDIR \ + --text-ddf-blocksize 512 \ + --num-files $NUM_FILES \ + --scheduler-file $LOGDIR/scheduler.json + +### Run jaccard Shuffle + +echo "Starting Jaccard Shuffle..." + +python jaccard_shuffle.py \ + --input-bucket-mapping-dir $output_dir/anchor_docs_with_bk.parquet \ + --input-data-dirs $input_data_dirs \ + --output-dir $output_dir \ + --text-ddf-blocksize 256 \ + --bucket-mapping-ddf-blocksize 512 \ + --num-files $NUM_FILES \ + --parts-per-worker 1 \ + --scheduler-file $LOGDIR/scheduler.json + +echo "Time Check: `date`" diff --git a/examples/gpu_deduplication_example/run-workflow.sh b/examples/gpu_deduplication_example/run-workflow.sh new file mode 100755 index 00000000..b7e1392f --- /dev/null +++ b/examples/gpu_deduplication_example/run-workflow.sh @@ -0,0 +1,70 @@ +#! /bin/bash + +echo "Starting Workflow..." 
+echo "Time Check: `date`" +if [[ -z "$SLURM_JOB_ID" ]]; then + TODAY="`date +"%Y_%m_%d"`" +else + TODAY="`date +"%Y_%m_%d"`-$SLURM_JOB_ID" +fi + +# Prepare output directory +export JOB_DIR=rapids-dedup-scripts/DEDUP-$TODAY +export FULL_OUTPUT_DIR=$HOME/$JOB_DIR +export LOGDIR=$FULL_OUTPUT_DIR/logs +export PROFILESDIR=$FULL_OUTPUT_DIR/profiles +# Take the default location within the container +RUNSCRIPT=${RUNSCRIPT:--/opt/nemo-data-curator/examples/gpu_deduplication_example/run-minhash.sh} +echo $RUNSCRIPT +mkdir -p $LOGDIR +mkdir -p $PROFILESDIR + +cd /opt/nemo-data-curator/nemo_curator/gpu_deduplication +#-----# + + +# Env vars +export RAPIDS_NO_INITIALIZE="1" +export CUDF_SPILL="1" + +export LIBCUDF_CUFILE_POLICY=${LIBCUDF_CUFILE_POLICY:-ALWAYS} + +# Network interface specific to the cluster being used +export INTERFACE=ibp12s0 +export PROTOCOL=ucx +echo $INTERFACE + +# This variable can be set to limit the number of jsonl files that +# are used in the dedup. Setting to -1 reads in all files +export NUM_FILES=-1 + +# Start the scheduler on the rank 0 node +if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then + echo "Starting scheduler" + DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True \ + DASK_DISTRIBUTED__RMM__POOL_SIZE=1GB \ + dask scheduler \ + --scheduler-file $LOGDIR/scheduler.json \ + --protocol $PROTOCOL \ + --interface $INTERFACE >> $LOGDIR/scheduler.log 2>&1 & +fi +sleep 30 + +# Start the workers on each node +echo "Starting workers..." +dask-cuda-worker --scheduler-file $LOGDIR/scheduler.json --rmm-pool-size 72GiB --interface $INTERFACE --rmm-async >> $LOGDIR/worker_$HOSTNAME.log 2>&1 & + +sleep 60 + +if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then + echo "Time Check: `date`" + bash $RUNSCRIPT + echo "Time Check: `date`" + touch $LOGDIR/done.txt +fi + +# All nodes wait until done +while [ ! -f $LOGDIR/done.txt ] +do + sleep 15 +done diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py new file mode 100644 index 00000000..b9fbe353 --- /dev/null +++ b/examples/identify_languages_and_fix_unicode.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.filters import FastTextLangId +from nemo_curator.modifiers import UnicodeReformatter +from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata +from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def load_dataset(input_data_dir): + files = list(get_all_files_paths_under(input_data_dir)) + raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True) + dataset = DocumentDataset(raw_data) + + return dataset + +def main(args): + # Params + multilingual_data_path = "/path/to/multilingual" + language_separated_output_path = "/path/to/lang_separated" + cleaned_data_output_path = "/path/to/cleaned" + + # Download a fastText language identification model + # and see a list of supported languages here: + # https://fasttext.cc/docs/en/language-identification.html + model_path = "/path/to/model.bin" + target_language = "EN" + language_field = "language" + + # Prepare samples for the classifier + client = get_client(args, args.device) + + # Filter data + multilingual_dataset = load_dataset(multilingual_data_path) + language_id_pipeline = nc.ScoreFilter(FastTextLangId(model_path), score_field=language_field, score_type='object') + filtered_dataset = language_id_pipeline(multilingual_dataset) + + # Remove the language score + filtered_dataset.df[language_field] = filtered_dataset.df[language_field].apply(lambda score: score[1]) + + # Split the dataset by language + language_stats = separate_by_metadata(filtered_dataset.df, language_separated_output_path, metadata_field=language_field).compute() + + # Read the language specific data and fix the unicode in it + lang_data_path = os.path.join(language_separated_output_path, target_language) + if not os.path.exists(lang_data_path): + raise RuntimeError(f"Dataset did not have language: {target_language}") + lang_data = load_dataset(lang_data_path) + + cleaner = nc.Modify(UnicodeReformatter()) + cleaned_data = cleaner(lang_data) + + # Write the cleaned_data + write_to_disk(cleaned_data.df, cleaned_data_output_path, write_to_filename=True) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/raw_download_common_crawl.py b/examples/raw_download_common_crawl.py new file mode 100644 index 00000000..ade3fd67 --- /dev/null +++ b/examples/raw_download_common_crawl.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
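The language-identification example above splits the corpus by detected language and then cleans the target language's text with `UnicodeReformatter`. A tiny in-memory sketch of just the cleaning step, run on two synthetic documents instead of the language-separated output (the mojibake string is illustrative; the reformatter is expected to normalize such artifacts):

```python
import pandas as pd
import dask.dataframe as dd

import nemo_curator as nc
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modifiers import UnicodeReformatter

# Two synthetic documents, one containing a typical mojibake artifact.
ddf = dd.from_pandas(
    pd.DataFrame({"text": ["The quick brown fox â€” jumped", "plain ascii text"]}),
    npartitions=1,
)
dataset = DocumentDataset(ddf)

# Same cleaning step as in the example above, applied to the toy dataset.
cleaner = nc.Modify(UnicodeReformatter())
cleaned = cleaner(dataset)
print(cleaned.df.compute()["text"].tolist())
```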
+ +import argparse + +from nemo_curator.download import batch_download, CommonCrawlWARCDownloader +from nemo_curator.utils.download_utils import get_common_crawl_urls +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir + + +def main(args): + # Params + start_snapshot = "2021-04" + end_snapshot = "2021-10" + output_directory = "/path/to/output" + + # Only download 10 shards as an example + url_limit = 10 + + # Set up Dask client + client = get_client(args, args.device) + + # Download the raw compressed WARC files + # Unlike the download_common_crawl function, this does not extract the files + output_directory = expand_outdir_and_mkdir(output_directory) + downloader = CommonCrawlWARCDownloader(output_directory) + urls = get_common_crawl_urls(start_snapshot, end_snapshot) + urls = urls[:url_limit] + output_files = batch_download(urls, downloader) + + print(f"Finished downloading {len(output_files)} files at:") + for file in output_files: + print(file) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/examples/slurm/container-entrypoint.sh b/examples/slurm/container-entrypoint.sh new file mode 100755 index 00000000..d3d49b17 --- /dev/null +++ b/examples/slurm/container-entrypoint.sh @@ -0,0 +1,70 @@ +#! /bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Start the scheduler on the rank 0 node +if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then + echo "Starting scheduler" + if [[ $DEVICE == 'cpu' ]]; then + dask scheduler \ + --scheduler-file $SCHEDULER_FILE \ + --protocol $PROTOCOL \ + --interface $INTERFACE >> $SCHEDULER_LOG 2>&1 & + fi + if [[ $DEVICE == 'gpu' ]]; then + DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True \ + DASK_DISTRIBUTED__RMM__POOL_SIZE=$RMM_SCHEDULER_POOL_SIZE \ + dask scheduler \ + --scheduler-file $SCHEDULER_FILE \ + --protocol $PROTOCOL \ + --interface $INTERFACE >> $SCHEDULER_LOG 2>&1 & + fi +fi + +# Wait for the scheduler to start +sleep 30 + +# Start the workers on each node +echo "Starting workers..." 
+export WORKER_LOG=$LOGDIR/worker_${SLURM_NODEID}-${SLURM_LOCALID}.log +if [[ $DEVICE == 'cpu' ]]; then + dask worker \ + --scheduler-file $SCHEDULER_FILE \ + --memory-limit $CPU_WORKER_MEMORY_LIMIT \ + --nworkers -1 \ + --interface $INTERFACE >> $WORKER_LOG 2>&1 & +fi +if [[ $DEVICE == 'gpu' ]]; then + dask-cuda-worker \ + --scheduler-file $SCHEDULER_FILE \ + --rmm-pool-size $RMM_WORKER_POOL_SIZE \ + --interface $INTERFACE \ + --rmm-async >> $WORKER_LOG 2>&1 & +fi + +# Wait for the workers to start +sleep 60 + +if [[ -z "$SLURM_NODEID" ]] || [[ $SLURM_NODEID == 0 ]]; then + echo "Starting $SCRIPT_PATH" + bash -c "$SCRIPT_COMMAND" + touch $DONE_MARKER +fi + +# All nodes wait until done to keep the workers and scheduler active +while [ ! -f $DONE_MARKER ] +do + sleep 15 +done \ No newline at end of file diff --git a/examples/slurm/start-slurm.sh b/examples/slurm/start-slurm.sh new file mode 100644 index 00000000..4454655d --- /dev/null +++ b/examples/slurm/start-slurm.sh @@ -0,0 +1,83 @@ +#! /bin/bash + +#SBATCH --job-name=nemo-curator:example-script +#SBATCH --nodes=2 +#SBATCH --exclusive +#SBATCH --time=04:00:00 + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ================================================================= +# Begin easy customization +# ================================================================= + +# Base directory for all SLURM job logs and files +# Does not affect directories referenced in your script +export BASE_JOB_DIR=`pwd`/nemo-curator-jobs +export JOB_DIR=$BASE_JOB_DIR/$SLURM_JOB_ID + +# Logging information +export LOGDIR=$JOB_DIR/logs +export PROFILESDIR=$JOB_DIR/profiles +export SCHEDULER_FILE=$LOGDIR/scheduler.json +export SCHEDULER_LOG=$LOGDIR/scheduler.log +export DONE_MARKER=$LOGDIR/done.txt + +# Main script to run +# In the script, Dask must connect to a cluster through the Dask scheduler +# We recommend passing the path to a Dask scheduler's file in a +# nemo_curator.utils.distributed_utils.get_client call like the examples +export DEVICE='cpu' +export SCRIPT_PATH=/path/to/script.py +export SCRIPT_COMMAND="python $SCRIPT_PATH \ + --scheduler-file $SCHEDULER_FILE \ + --device $DEVICE" + +# Container parameters +export CONTAINER_IMAGE=/path/to/container +# Make sure to mount the directories your script references +export BASE_DIR=`pwd` +export MOUNTS="${BASE_DIR}:${BASE_DIR}" +# Below must be path to entrypoint script on your system +export CONTAINER_ENTRYPOINT=$BASE_DIR/examples/slurm/container-entrypoint.sh + +# Network interface specific to the cluster being used +export INTERFACE=eth0 +export PROTOCOL=tcp + +# CPU related variables +# 0 means no memory limit +export CPU_WORKER_MEMORY_LIMIT=0 + +# GPU related variables +export RAPIDS_NO_INITIALIZE="1" +export CUDF_SPILL="1" +export RMM_SCHEDULER_POOL_SIZE="1GB" +export RMM_WORKER_POOL_SIZE="72GiB" +export LIBCUDF_CUFILE_POLICY=OFF + + +# ================================================================= +# End easy customization +# 
================================================================= + +mkdir -p $LOGDIR +mkdir -p $PROFILESDIR + +# Start the container +srun \ + --container-mounts=${MOUNTS} \ + --container-image=${CONTAINER_IMAGE} \ + ${CONTAINER_ENTRYPOINT} \ No newline at end of file diff --git a/examples/task_decontamination.py b/examples/task_decontamination.py new file mode 100644 index 00000000..91730455 --- /dev/null +++ b/examples/task_decontamination.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import nemo_curator as nc +from nemo_curator.datasets import DocumentDataset +from nemo_curator.tasks import ( + Winogrande, + Squad, + TriviaQA, + Quac, + WebQA, + Race, + Drop, + WiC, + PIQA, + ArcEasy, + ArcChallenge, + OpenBookQA, + BoolQ, + Copa, + RTE, + MultiRC, + WSC, + CB, + ANLI, + Record +) +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def load_dataset(input_data_dir): + files = list(get_all_files_paths_under(input_data_dir)) + raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True) + dataset = DocumentDataset(raw_data) + + return dataset + +def main(args): + # Params + contaminated_dataset_path = "/path/to/input" + decontaminated_output_path = "/path/to/output" + + downstream_tasks = [ + Winogrande(), + Squad(), + TriviaQA(), + Quac(), + WebQA(), + Race(), + Drop(), + WiC(), + PIQA(), + ArcEasy(), + ArcChallenge(), + OpenBookQA(), + BoolQ(), + Copa(), + RTE(), + MultiRC(), + WSC(), + CB(), + ANLI(), + Record() + ] + + # Prepare samples for the classifier + client = get_client(args, args.device) + + # Filter data + target_dataset = load_dataset(contaminated_dataset_path) + decontaminator = nc.TaskDecontamination(downstream_tasks) + decontaminated_dataset = decontaminator(target_dataset) + + # Write filtered dataset + write_to_disk(decontaminated_dataset.df, decontaminated_output_path, write_to_filename=True) + +def attach_args(parser=argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)): + return add_distributed_args(parser) + +if __name__ == "__main__": + main(attach_args().parse_args()) \ No newline at end of file diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py new file mode 100644 index 00000000..26205067 --- /dev/null +++ b/nemo_curator/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modules import * \ No newline at end of file diff --git a/nemo_curator/_compat.py b/nemo_curator/_compat.py new file mode 100644 index 00000000..1dc07d9e --- /dev/null +++ b/nemo_curator/_compat.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dask +from packaging.version import parse as parseVersion + +_dask_version = parseVersion(dask.__version__) + +# TODO: remove when dask min version gets bumped +DASK_SHUFFLE_METHOD_ARG = _dask_version > parseVersion("2024.1.0") +DASK_P2P_ERROR = _dask_version < parseVersion("2023.10.0") diff --git a/nemo_curator/datasets/__init__.py b/nemo_curator/datasets/__init__.py new file mode 100644 index 00000000..276cbc10 --- /dev/null +++ b/nemo_curator/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .doc_dataset import DocumentDataset + +__all__ = ["DocumentDataset"] \ No newline at end of file diff --git a/nemo_curator/datasets/doc_dataset.py b/nemo_curator/datasets/doc_dataset.py new file mode 100644 index 00000000..396aa9b6 --- /dev/null +++ b/nemo_curator/datasets/doc_dataset.py @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
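The flags in `_compat.py` above gate behavior on the installed Dask version. As an illustration of how such a flag is typically consumed at a call site (the keyword names below are an assumption for illustration, not taken from the NeMo Curator code):

```python
import dask
from packaging.version import parse as parse_version

# Same comparison style as _compat.py: newer Dask releases renamed some
# keyword arguments, so callers branch on a version-derived flag.
DASK_SHUFFLE_METHOD_ARG = parse_version(dask.__version__) > parse_version("2024.1.0")

# Hypothetical call site: pick the keyword name based on the flag.
shuffle_kwargs = (
    {"shuffle_method": "tasks"} if DASK_SHUFFLE_METHOD_ARG else {"shuffle": "tasks"}
)
print(shuffle_kwargs)
```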
+ +import dask_cudf +import dask.dataframe as dd + +from nemo_curator.utils.distributed_utils import read_data, write_to_disk +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +class DocumentDataset: + """ + A collection of documents and document metadata. + Internally it may be distributed across multiple nodes, and may be on GPUs. + """ + def __init__(self, dataset_df): + self.df = dataset_df + + def __len__(self): + return len(self.df) + + def persist(self): + return DocumentDataset(self.df.persist()) + + @classmethod + def read_json( + cls, + input_files, + backend="pandas", + files_per_partition=1, + add_filename=False, + ): + return cls(_read_json_or_parquet( + input_files=input_files, + file_type="jsonl", + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + )) + + @classmethod + def read_parquet( + cls, + input_files, + backend="pandas", + files_per_partition=1, + add_filename=False, + ): + return cls(_read_json_or_parquet( + input_files=input_files, + file_type="parquet", + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + )) + + @classmethod + def read_pickle( + cls, + input_files, + backend="pandas", + files_per_partition=1, + add_filename=False, + ): + raw_data = read_data( + input_files=input_files, + file_type="pickle", + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + ) + + return cls(raw_data) + + def to_json( + self, + output_file_dir, + write_to_filename=False, + ): + """ + See nemo_curator.utils.distributed_utils.write_to_disk docstring for other parameters. + + """ + write_to_disk( + df=self.df, + output_file_dir=output_file_dir, + write_to_filename=write_to_filename, + output_type="jsonl", + ) + + def to_parquet( + self, + output_file_dir, + write_to_filename=False, + ): + """ + See nemo_curator.utils.distributed_utils.write_to_disk docstring for other parameters. + + """ + write_to_disk( + df=self.df, + output_file_dir=output_file_dir, + write_to_filename=write_to_filename, + output_type="parquet", + ) + + def to_pickle( + self, + output_file_dir, + write_to_filename=False, + ): + raise NotImplementedError("DocumentDataset does not support to_pickle yet") + + +def _read_json_or_parquet( + input_files, + file_type, + backend, + files_per_partition, + add_filename, +): + """ + `input_files` may be a list or a string type. + If `input_files` is a list, it may be a list of JSONL or Parquet files, + e.g., `input_files = ["file1.jsonl", "file2.jsonl", "file3.jsonl"]`, + or a list of directories containing JSONL or Parquet files, + e.g., `input_files = ["dir1", "dir2", "dir3"]`, + where each of `dir1`, `dir2`, and `dir3` contain all JSONL or Parquet files. + If `input_files` is a string, it may be a single JSONL or Parquet file, + such as `input_files = "my_file.jsonl"`, + or it may also be a single directory containing JSONL or Parquet files, + such as `input_files = "my_directory"`. + + See nemo_curator.utils.distributed_utils.read_data docstring for other parameters. + + Returns a DataFrame to be used in initializing a DocumentDataset. + + """ + file_ext = "." 
+ file_type + + if isinstance(input_files, list): + # List of jsonl or parquet files + if all(f.endswith(file_ext) for f in input_files): + raw_data = read_data( + input_files, + file_type=file_type, + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + ) + + # List of directories + else: + dfs = [] + + for data_path in input_files: + files = get_all_files_paths_under( + root=data_path, recurse_subdirectories=False + ) + df = read_data( + files, + file_type=file_type, + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + ) + dfs.append(df) + + if backend == "cudf": + raw_data = dask_cudf.concat(dfs, ignore_unknown_divisions=True) + else: + raw_data = dd.concat(dfs, ignore_unknown_divisions=True) + + elif isinstance(input_files, str): + # Single file + if input_files.endswith(file_ext): + files = [input_files] + + # Directory of jsonl or parquet files + else: + files = get_all_files_paths_under( + root=input_files, recurse_subdirectories=False + ) + + raw_data = read_data( + input_files=files, + file_type=file_type, + backend=backend, + files_per_partition=files_per_partition, + add_filename=add_filename, + ) + + else: + raise TypeError("File input must be a string or list.") + + return raw_data diff --git a/nemo_curator/distributed_data_classification/__init__.py b/nemo_curator/distributed_data_classification/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/nemo_curator/distributed_data_classification/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_curator/distributed_data_classification/arg_utils.py b/nemo_curator/distributed_data_classification/arg_utils.py new file mode 100644 index 00000000..967d1c42 --- /dev/null +++ b/nemo_curator/distributed_data_classification/arg_utils.py @@ -0,0 +1,163 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import warnings + +warnings.filterwarnings("ignore") + + +def add_input_output_args(parser): + """ + This function adds the command line arguments related to input and output files. + + Args: + parser: An argparse ArgumentParser object. + Returns: + An argparse ArgumentParser with 3 additional arguments. 
+ + """ + parser.add_argument( + "--input_file_path", + type=str, + help="The path of the input files", + required=True, + ) + parser.add_argument( + "--input_file_type", + type=str, + help="The type of the input files", + required=True, + ) + parser.add_argument( + "--output_file_path", + type=str, + help="The path of the output files", + required=True, + ) + return parser + + +def add_cluster_args(parser): + """ + This function adds the command line arguments related to Dask cluster setup. + + Args: + parser: An argparse ArgumentParser object. + Returns: + An argparse ArgumentParser with 8 additional arguments. + + """ + parser.add_argument( + "--scheduler-address", + type=str, + default=None, + help="""Address to the scheduler of a created Dask cluster. + If not provided, a single node LocalCUDACluster will be started.""", + ) + parser.add_argument( + "--scheduler-file", + type=str, + default=None, + help="""Path to the scheduler file of a created Dask cluster. + If not provided, a single node LocalCUDACluster will be started.""", + ) + parser.add_argument( + "--protocol", + type=str, + default="ucx", + help="""Protocol to use for Dask cluster. + Note: This only applies to the LocalCUDACluster. + If providing a user created cluster, refer to + https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol""", + ) + parser.add_argument( + "--nvlink-only", + action="store_true", + help="""Start a local cluster with only NVLink enabled. + Only applicable when protocol=ucx and no scheduler file/address is specified.""", + ) + parser.add_argument( + "--rmm_pool_size", + type=str, + help="The size of the RMM pool to be used by each worker.", + default="14GB", + ) + parser.add_argument( + "--CUDA_VISIBLE_DEVICES", + type=str, + help="The GPUs to be used by the cluster.", + default=None, + ) + parser.add_argument("--enable_spilling", action="store_true") + parser.add_argument("--set_torch_to_use_rmm", action="store_true") + return parser + + +def add_model_args(parser): + """ + This function adds the command line arguments related to the model. + + Args: + parser: An argparse ArgumentParser object. + Returns: + An argparse ArgumentParser with 4 additional arguments. + + """ + # Add a mutually exclusive group for model_file_name and model_file_names + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--model_file_name", + type=str, + help="The path to the model file", + required=False, + ) + group.add_argument( + "--model_file_names", + type=str, + nargs="*", + help="A list of model file paths", + required=False, + ) + parser.add_argument( + "--autocast", + action="store_true", + help="Whether to use autocast or not", + ) + parser.add_argument( + "--batch_size", + type=int, + default=128, + help="The batch size to be used for inference", + ) + return parser + + +def create_arg_parser(): + """ + This function creates the argument parser to add the command line arguments. + + Returns: + An argparse ArgumentParser object. 
+ + """ + import argparse + + parser = argparse.ArgumentParser(description="Run multi-node multi-GPU inference") + parser = add_cluster_args(parser) + parser = add_input_output_args(parser) + parser = add_model_args(parser) + return parser diff --git a/nemo_curator/distributed_data_classification/domain_classifier_inference.py b/nemo_curator/distributed_data_classification/domain_classifier_inference.py new file mode 100644 index 00000000..7fdeb6cf --- /dev/null +++ b/nemo_curator/distributed_data_classification/domain_classifier_inference.py @@ -0,0 +1,275 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import warnings + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import torch +from packaging import version +from transformers import __version__ as TRANSFORMERS_VERSION +from transformers.models.deberta_v2 import DebertaV2TokenizerFast + +from nemo_curator.distributed_data_classification.arg_utils import create_arg_parser +from nemo_curator.distributed_data_classification.pytorch_utils import ( + CFG, + CustomModel, + TestDataset, + collate, +) +from nemo_curator.utils.distributed_utils import ( + get_client, + load_object_on_worker, + process_all_batches, + read_data, + write_to_disk, +) +from nemo_curator.utils.file_utils import get_remaining_files + +warnings.filterwarnings("ignore") + + +def inference_per_partition( + df, + max_chars, + batch_size, + num_workers, + model_file_name, + labels, + autocast, +): + """ + This function runs domain classification on a subset of the data. + It loads the CFG, a data iterator, and then calls the `process_all_batches` function, + which loads the domain classifier and runs inference. + + Args: + df: A Dask DataFrame partition with a "text" column and a dummy "pred" column. + max_chars: The maximum number of characters allowed in the truncated text. + batch_size: How many samples per batch to load with PyTorch DataLoader. + num_workers: How many subprocesses to use for PyTorch DataLoader. + model_file_name: The path to the model file. + labels: The list of domain labels. + autocast: A boolean representing whether to perform inference with mixed precision. + Returns: + The input Dask DataFrame with the calculated "pred" column. 
+ + """ + cfg = cfg_per_partition() + + dataset_valid = TestDataset(cfg, df, max_chars) + loader_valid = torch.utils.data.DataLoader( + dataset_valid, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + ) + device = torch.device("cuda") + load_model_kwargs = {"cfg": cfg, "device": device, "model_path": model_file_name} + run_inference_kwargs = {"autocast": autocast} + st = time.time() + preds = process_all_batches( + loader_valid, + load_model, + load_model_kwargs, + run_inference, + run_inference_kwargs, + ) + preds = preds.cpu().numpy() + df["pred"] = [labels[i] for i in preds] + + et = time.time() + print( + f"Time taken for inference for num_batches: {len(loader_valid)} : {et-st} s", + flush=True, + ) + + return df + + +def cfg_per_partition(): + """ + This function loads the CFG on the worker currently running the task. + See `load_object_on_worker` function. + + Returns: + A CFG with a set `tokenizer` attribute. + + """ + return load_object_on_worker("cfg_with_tokenizer", load_cfg_with_tokenizer, {}) + + +def load_cfg_with_tokenizer(): + """ + This function loads the CFG needed for domain classification. + + Returns: + A CFG with a set `tokenizer` attribute. + + """ + cfg = CFG() + tokenizer = DebertaV2TokenizerFast.from_pretrained(cfg.model) + cfg.tokenizer = tokenizer + return cfg + + +def load_model(cfg, device, model_path): + """ + This function loads the domain model and prepares it to be used for inference. + It is needed as an input to the `process_all_batches` function within the `inference_per_partition` function. + + Args: + cfg: A CFG object. + device: A specified PyTorch device, such as torch.device("cuda") or torch.device("cpu"). + model_path: The path to the model file. + Returns: + The loaded model. + + """ + model = CustomModel(cfg, out_dim=27, config_path=None, pretrained=True) + model = model.to(device) + sd = torch.load(os.path.join(model_path), map_location="cpu") + sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} + if version.parse(TRANSFORMERS_VERSION) >= version.parse("4.31.0"): + sd.pop("model.embeddings.position_ids", None) + + model.load_state_dict(sd, strict=True) + model.eval() + return model + + +def run_inference(batch, model, autocast=False): + """ + This function runs the domain classifier on a batch of data. + It is needed as an input to the `process_all_batches` function within the `inference_per_partition` function. + + Args: + batch: A subset of the data as we are iterating through PyTorch DataLoader. + model: The loaded domain classification model. + autocast: A boolean representing whether to perform inference with mixed precision. + Returns: + A tensor of predictions. 
+ + """ + with torch.no_grad(): + batch = collate(batch) + if autocast: + with torch.autocast(device_type="cuda"): + out = model(batch)[:, 0, :] + else: + out = model(batch)[:, 0, :] + pred_idx = torch.sigmoid(out).argmax(1) + + return pred_idx + + +def main(): + labels = [ + "Adult", + "Arts_and_Entertainment", + "Autos_and_Vehicles", + "Beauty_and_Fitness", + "Books_and_Literature", + "Business_and_Industrial", + "Computers_and_Electronics", + "Finance", + "Food_and_Drink", + "Games", + "Health", + "Hobbies_and_Leisure", + "Home_and_Garden", + "Internet_and_Telecom", + "Jobs_and_Education", + "Law_and_Government", + "News", + "Online_Communities", + "People_and_Society", + "Pets_and_Animals", + "Real_Estate", + "Reference", + "Science", + "Sensitive_Subjects", + "Shopping", + "Sports", + "Travel_and_Transportation", + ] + + args = create_arg_parser().parse_args() + print(f"Arguments parsed = {args}", flush=True) + max_chars = 2000 + batch_size = args.batch_size + num_workers = 0 + + client = get_client(args, cluster_type="gpu") + print("Starting domain classifier inference", flush=True) + global_st = time.time() + files_per_run = len(client.scheduler_info()["workers"]) * 2 + input_files = get_remaining_files( + args.input_file_path, args.output_file_path, args.input_file_type + ) + print(f"Total input files {len(input_files)}", flush=True) + + if args.input_file_type == "pickle": + add_filename = False + else: + add_filename = True + + for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): + batch_st = time.time() + current_batch_files = input_files[i : i + files_per_run] + print( + f"File Batch ID {file_batch_id}: total input files {len(current_batch_files)}", + flush=True, + ) + df = read_data( + input_files=current_batch_files, + file_type=args.input_file_type, + add_filename=add_filename, + ) + print(f"Total input Dask DataFrame partitions {df.npartitions}", flush=True) + meta_df = df._meta.copy() + meta_df["pred"] = [0] * len(meta_df) + df = df.map_partitions( + inference_per_partition, + max_chars, + batch_size, + num_workers, + args.model_file_name, + labels, + args.autocast, + meta=meta_df, + enforce_metadata=False, + ) + write_to_disk( + df=df, + output_file_dir=args.output_file_path, + write_to_filename=add_filename, + ) + batch_et = time.time() + print( + f"File Batch ID {file_batch_id}: completed in {batch_et-batch_st} seconds", + flush=True, + ) + + global_et = time.time() + print( + f"Total time taken for domain classifier inference: {global_et-global_st} s", + flush=True, + ) + client.close() + + +def console_script(): + main() diff --git a/nemo_curator/distributed_data_classification/generate_statistics.py b/nemo_curator/distributed_data_classification/generate_statistics.py new file mode 100644 index 00000000..5dae3ef9 --- /dev/null +++ b/nemo_curator/distributed_data_classification/generate_statistics.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import time + +from nemo_curator.distributed_data_classification.arg_utils import ( + add_cluster_args, + add_input_output_args, +) +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +def value_counts(df, column_name): + """ + This function groups a DataFrame by the specified column and counts the occurrences of each group. + It is essentially the same as pandas.Series.value_counts, except it returns a DataFrame. + + Args: + df: A DataFrame. + column_name: The column by which to group the DataFrame. + Returns: + A DataFrame with two columns: column_name and a second column containing the counts per group. + + """ + return df.groupby(column_name).size().reset_index() + + +def main(): + parser = argparse.ArgumentParser( + description="Generate label statistics and write them to disk" + ) + + parser = add_cluster_args(parser) + parser = add_input_output_args(parser) + parser.add_argument( + "--label", + type=str, + help="The label column on which to generate statistics", + required=True, + ) + args = parser.parse_args() + print(f"Arguments parsed = {args}", flush=True) + client = get_client(args, cluster_type="gpu") + + print("Starting statistics workflow", flush=True) + st = time.time() + + df = read_data( + input_files=get_all_files_paths_under(args.input_file_path, recurse_subdirecties=False), + file_type=args.input_file_type, + add_filename=True, + ) + input_files = get_all_files_paths_under(args.input_file_path, recurse_subdirecties=False) + + result = value_counts(df, column_name=args.label) + result = result.rename(columns={0: "count"}) + result.to_json(args.output_file_path) + + et = time.time() + print(f"Statistics workflow completed in {et-st}", flush=True) + client.close() + + +def console_script(): + main() diff --git a/nemo_curator/distributed_data_classification/pytorch_utils.py b/nemo_curator/distributed_data_classification/pytorch_utils.py new file mode 100644 index 00000000..58d9b5bf --- /dev/null +++ b/nemo_curator/distributed_data_classification/pytorch_utils.py @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
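
As an aside on the value_counts helper in generate_statistics.py above: it behaves like pandas.Series.value_counts but returns a DataFrame, which is convenient for writing to disk. A minimal pandas-only sketch with toy labels (the script itself operates on Dask/GPU DataFrames):

import pandas as pd

df = pd.DataFrame({"domain_pred": ["News", "Sports", "News", "Games"]})
counts = df.groupby("domain_pred").size().reset_index().rename(columns={0: "count"})
print(counts)
# Same counts as df["domain_pred"].value_counts(), but kept as a DataFrame with the
# label column preserved, so it can be written out with to_json as the script does.
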
+ +import os + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import torch +import torch.nn as nn +from torch.utils.data import Dataset +from transformers import AutoConfig, AutoModel + + +class CFG: + model = "microsoft/deberta-v3-base" + fc_dropout = 0.2 + + def __init__(self, max_len=512): + self.max_len = max_len + + +def collate(inputs): + inputs = {k: v.to("cuda") for k, v in inputs.items()} + mask_len = int(inputs["attention_mask"].sum(axis=1).max()) + for k, v in inputs.items(): + # CPMP: no need to truncate labels + if k != "labels": + inputs[k] = inputs[k][:, :mask_len] + return inputs + + +class CustomModel(nn.Module): + def __init__(self, cfg, out_dim, config_path=None, pretrained=False): + super().__init__() + self.cfg = cfg + if config_path is None: + self.config = AutoConfig.from_pretrained( + cfg.model, output_hidden_states=True + ) + else: + self.config = torch.load(config_path) + if pretrained: + self.model = AutoModel.from_pretrained(cfg.model, config=self.config) + else: + self.model = AutoModel(self.config) + self.fc_dropout = nn.Dropout(cfg.fc_dropout) + self.fc = nn.Linear(self.config.hidden_size, out_dim) + self._init_weights(self.fc) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def feature(self, input_ids, attention_mask): + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + last_hidden_states = outputs[0] + return last_hidden_states + + def forward(self, batch): + feature = self.feature(batch["input_ids"], batch["attention_mask"]) + output = self.fc(self.fc_dropout(feature)) + return output + + +class TestDataset(Dataset): + def __init__(self, cfg, df, max_chars): + self.cfg = cfg + text = df["text"].str.slice(0, max_chars).to_arrow().to_pylist() + with torch.no_grad(): + self.tokens = cfg.tokenizer.batch_encode_plus( + text, + return_tensors="pt", + add_special_tokens=True, + max_length=cfg.max_len, + pad_to_max_length=True, + truncation=True, + return_token_type_ids=False, + ) + self.max_chars = max_chars + self.dataset_len = len(text) + + def __len__(self): + return self.dataset_len + + def __getitem__(self, item): + return {k: v[item] for k, v in self.tokens.items()} diff --git a/nemo_curator/distributed_data_classification/quality_classifier_inference.py b/nemo_curator/distributed_data_classification/quality_classifier_inference.py new file mode 100644 index 00000000..90d84134 --- /dev/null +++ b/nemo_curator/distributed_data_classification/quality_classifier_inference.py @@ -0,0 +1,314 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import warnings + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import torch +from transformers.models.deberta_v2 import DebertaV2TokenizerFast + +from nemo_curator.distributed_data_classification.arg_utils import create_arg_parser +from nemo_curator.distributed_data_classification.pytorch_utils import ( + CFG, + CustomModel, + TestDataset, + collate, +) +from nemo_curator.utils.distributed_utils import ( + get_client, + load_object_on_worker, + process_all_batches, + read_data, + write_to_disk, +) +from nemo_curator.utils.file_utils import get_remaining_files + +warnings.filterwarnings("ignore") + + +def inference_per_partition( + df, + max_chars, + batch_size, + num_workers, + model_file_name, + labels, + autocast, + include_model_name=False, +): + """ + This function runs quality classification on a subset of the data. + It loads the CFG, a data iterator, and then calls the `process_all_batches` function, + which loads the quality classifier and runs inference. + It also contains some additional logic to handle binary versus multiclass classification. + + Args: + df: A Dask DataFrame partition with a "text" column and a dummy "quality_pred" column. + max_chars: The maximum number of characters allowed in the truncated text. + batch_size: How many samples per batch to load with PyTorch DataLoader. + num_workers: How many subprocesses to use for PyTorch DataLoader. + model_file_name: The path to the model file. + labels: The list of domain labels. + autocast: A boolean representing whether to perform inference with mixed precision. + include_model_name: A boolean representing whether to include the model name in the "quality_pred" column name. + Returns: + The input Dask DataFrame with the calculated "quality_pred" column. + + """ + cfg = cfg_per_partition() + + dataset_valid = TestDataset(cfg, df, max_chars) + loader_valid = torch.utils.data.DataLoader( + dataset_valid, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + ) + device = torch.device("cuda") + if len(labels) == 1: + raise ValueError("Labels must be more than 1") + + # binary case + if len(labels) == 2: + out_dim = 1 + binary_classification = True + else: + out_dim = len(labels) + binary_classification = False + + load_model_kwargs = { + "cfg": cfg, + "device": device, + "model_path": model_file_name, + "out_dim": out_dim, + } + run_inference_kwargs = { + "autocast": autocast, + "binary_classification": binary_classification, + } + st = time.time() + probs = process_all_batches( + loader_valid, + load_model, + load_model_kwargs, + run_inference, + run_inference_kwargs, + ) + if binary_classification: + preds = (probs > 0.5).to(torch.int64).squeeze() + else: + preds = torch.argmax(probs, dim=1) + # TODO: Do this without a CPU roundtrip in the future + if include_model_name: + filename = os.path.basename(model_file_name) + df[f"quality_pred_{filename}"] = [ + labels[i] for i in preds.to("cpu").numpy().tolist() + ] + df[f"quality_prob_{filename}"] = probs.to("cpu").numpy().tolist() + else: + df["quality_pred"] = [labels[i] for i in preds.to("cpu").numpy().tolist()] + df["quality_prob"] = probs.to("cpu").numpy().tolist() + et = time.time() + print( + f"Time taken for inference for num_batches: {len(loader_valid)} : {et-st} s", + flush=True, + ) + + return df + + +def cfg_per_partition(): + """ + This function loads the CFG on the worker currently running the task. 
+ See `load_object_on_worker` function. + + Returns: + A CFG with a set `tokenizer` attribute. + + """ + return load_object_on_worker("cfg_with_tokenizer", load_cfg_with_tokenizer, {}) + + +def load_cfg_with_tokenizer(): + """ + This function loads the CFG needed for quality classification. + + Returns: + A CFG with a set `tokenizer` attribute. + + """ + cfg = CFG(max_len=1024) + tokenizer = DebertaV2TokenizerFast.from_pretrained(cfg.model) + cfg.tokenizer = tokenizer + return cfg + + +def load_model(cfg, device, model_path, out_dim): + """ + This function loads the quality model and prepares it to be used for inference. + It is needed as an input to the `process_all_batches` function within the `inference_per_partition` function. + + Args: + cfg: A CFG object. + device: A specified PyTorch device, such as torch.device("cuda") or torch.device("cpu"). + model_path: The path to the model file. + out_dim: An integer which corresponds to the number of labels. Use 1 for binary classification. + Returns: + The loaded model. + + """ + model = CustomModel(cfg, out_dim=out_dim, config_path=None, pretrained=True) + model = model.to(device) + sd = torch.load(model_path, map_location="cpu") + if "model_state_dict" in sd: + sd = sd["model_state_dict"] + sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} + model.load_state_dict(sd, strict=True) + model.eval() + return model + + +def run_inference(batch, model, autocast=False, binary_classification=False): + """ + This function runs the quality classifier on a batch of data. + It is needed as an input to the `process_all_batches` function within the `inference_per_partition` function. + + Args: + batch: A subset of the data as we are iterating through PyTorch DataLoader. + model: The loaded quality classification model. + autocast: A boolean representing whether to perform inference with mixed precision. + binary_classification: A boolean representing whether it is a binary classification model. + Returns: + A tensor of predictions. + + """ + with torch.no_grad(): + batch = collate(batch) + if autocast: + with torch.autocast(device_type="cuda"): + out = model(batch)[:, 0, :] + else: + out = model(batch)[:, 0, :] + if binary_classification: + probs = torch.sigmoid(out) + else: + probs = torch.softmax(out, dim=1) + return probs + + +def add_quality_model_specific_args(parser): + """ + This function adds a command line argument for the number of labels. + + Args: + parser: An argparse ArgumentParser object. + Returns: + An argparse ArgumentParser with 1 additional argument. + + """ + parser.add_argument("--num-labels", type=int, default=3) + return parser + + +def get_labels(num_labels): + """ + This function returns a list of quality labels, depending on how many labels the user expects. + + Args: + num_labels: An integer representing the number of possible classification labels. + Returns: + A list of label names. 
+ + """ + if num_labels == 3: + labels = ["High", "Medium", "Low"] + elif num_labels == 2: + labels = ["Medium_High", "Low"] + return labels + + +def main(): + parser = create_arg_parser() + parser = add_quality_model_specific_args(parser) + args = parser.parse_args() + labels = get_labels(args.num_labels) + print(f"Arguments parsed = {args}", flush=True) + max_chars = 6000 + batch_size = args.batch_size + num_workers = 0 + + client = get_client(args, cluster_type="gpu") + print("Starting quality classifier inference", flush=True) + global_st = time.time() + files_per_run = len(client.scheduler_info()["workers"]) * 2 + input_files = get_remaining_files( + args.input_file_path, args.output_file_path, args.input_file_type + ) + print(f"Total input files {len(input_files)}", flush=True) + + if args.input_file_type == "pickle": + add_filename = False + else: + add_filename = True + + for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): + batch_st = time.time() + current_batch_files = input_files[i : i + files_per_run] + print( + f"File Batch ID {file_batch_id}: total input files {len(current_batch_files)}", + flush=True, + ) + df = read_data( + input_files=current_batch_files, + file_type=args.input_file_type, + add_filename=add_filename, + ) + print(f"Total input Dask DataFrame partitions {df.npartitions}", flush=True) + meta_df = df._meta.copy() + meta_df["quality_pred"] = ["low"] * len(meta_df) + meta_df["quality_prob"] = [[0, 0, 1]] * len(meta_df) + df = df.map_partitions( + inference_per_partition, + max_chars, + batch_size, + num_workers, + args.model_file_name, + labels, + args.autocast, + meta=meta_df, + enforce_metadata=False, + ) + write_to_disk( + df=df, + output_file_dir=args.output_file_path, + write_to_filename=add_filename, + ) + batch_et = time.time() + print( + f"File Batch ID {file_batch_id}: completed in {batch_et-batch_st} seconds", + flush=True, + ) + + global_et = time.time() + print( + f"Total time taken for quality classifier inference: {global_et-global_st} s", + flush=True, + ) + client.close() + + +def console_script(): + main() diff --git a/nemo_curator/distributed_data_classification/quality_classifier_multiple_models_inference.py b/nemo_curator/distributed_data_classification/quality_classifier_multiple_models_inference.py new file mode 100644 index 00000000..403741c2 --- /dev/null +++ b/nemo_curator/distributed_data_classification/quality_classifier_multiple_models_inference.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import time + +from dask.distributed import wait + +from nemo_curator.distributed_data_classification.arg_utils import create_arg_parser +from nemo_curator.distributed_data_classification.quality_classifier_inference import ( + add_quality_model_specific_args, + get_labels, + inference_per_partition, +) +from nemo_curator.utils.distributed_utils import ( + get_client, + offload_object_on_worker, + read_data, + write_to_disk, +) +from nemo_curator.utils.file_utils import get_remaining_files + + +def delete_model_and_tokenizer_from_workers(client): + """ + Offloads cfg_with_tokenizer and model from all Dask client workers. + + Args: + client: A Dask client object. + + """ + task_ls = [] + # TODO: client.run does not work anymore + # See: https://dask.discourse.group/t/cannot-run-client-run-function-when-function-contains-get-worker-in-distributed-2023-3-2-1/1772 + # find a better alternate + for worker in client.scheduler_info()["workers"]: + task_ls.append( + client.submit( + offload_object_on_worker, + "cfg_with_tokenizer", + workers=[worker], + allow_other_workers=False, + pure=False, + ) + ) + task_ls.append( + client.submit( + offload_object_on_worker, + "model", + workers=[worker], + allow_other_workers=False, + pure=False, + ) + ) + wait(task_ls) + for t in task_ls: + assert t.result() == True + del task_ls + + +def main(): + parser = create_arg_parser() + parser = add_quality_model_specific_args(parser) + args = parser.parse_args() + labels = get_labels(args.num_labels) + print(f"Arguments parsed = {args}", flush=True) + + max_chars = 6000 + batch_size = args.batch_size + num_workers = 0 + client = get_client(args, cluster_type="gpu") + client.upload_file("quality_classifier_inference.py") + + print("Starting quality classifier inference", flush=True) + global_st = time.time() + files_per_run = len(client.scheduler_info()["workers"]) * 2 + input_files = get_remaining_files( + args.input_file_path, args.output_file_path, args.input_file_type + ) + print(f"Total input files {len(input_files)}", flush=True) + + if args.input_file_type == "pickle": + add_filename = False + else: + add_filename = True + + for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): + batch_st = time.time() + current_batch_files = input_files[i : i + files_per_run] + print( + f"File Batch ID {file_batch_id}: total input files {len(current_batch_files)}", + flush=True, + ) + df = read_data( + input_files=current_batch_files, + file_type=args.input_file_type, + add_filename=add_filename, + ) + print(f"Total input Dask DataFrame partitions {df.npartitions}", flush=True) + + for model_file_path in args.model_file_names: + meta_df = df._meta.copy() + model_file_name = os.path.basename(model_file_path) + print(f"model_file_name={model_file_name}", flush=True) + print("--" * 30, flush=True) + meta_df[f"quality_pred_{model_file_name}"] = ["low"] * len(meta_df) + meta_df[f"quality_prob_{model_file_name}"] = [[0, 0, 1]] * len(meta_df) + df = df.map_partitions( + inference_per_partition, + max_chars, + batch_size, + num_workers, + model_file_path, + labels, + args.autocast, + include_model_name=True, + meta=meta_df, + enforce_metadata=False, + ) + df = df.persist() + wait(df) + delete_model_and_tokenizer_from_workers(client) + + write_to_disk( + df=df, + output_file_dir=args.output_file_path, + write_to_filename=add_filename, + ) + batch_et = time.time() + print( + f"File Batch ID {file_batch_id}: completed in {batch_et-batch_st} seconds", + flush=True, + ) + + global_et = time.time() + 
print( + f"Total time taken for multiple quality classifier inference models: {global_et-global_st} s", + flush=True, + ) + client.close() + + +def console_script(): + main() diff --git a/nemo_curator/distributed_data_classification/verify_results.py b/nemo_curator/distributed_data_classification/verify_results.py new file mode 100644 index 00000000..da79d692 --- /dev/null +++ b/nemo_curator/distributed_data_classification/verify_results.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import pandas as pd + + +def parse_args(): + """ + This function adds command line arguments related to the two files we wish to compare. + + Returns: + An argparse Namespace object. + + """ + parser = argparse.ArgumentParser(description="Run verification") + parser.add_argument( + "--results_file_path", + type=str, + help="The path of the input files", + required=True, + ) + parser.add_argument( + "--expected_results_file_path", + type=str, + help="The path of the expected_result file", + required=True, + ) + parser.add_argument( + "--results_pred_column", + type=str, + help="The prediction column name for the input files", + default="pred", + ) + parser.add_argument( + "--expected_pred_column", + type=str, + help="The prediction column name for the expected_result file", + default="pred", + ) + return parser.parse_args() + + +def verify_same_counts(got_counts, expected_counts): + """ + This function compares the results of `value_counts` on two Series. + If the value counts are not the same, it prints information about the percent difference between them. + + Args: + got_counts: A dictionary of value counts for the input Series. + expected_counts: A dictionary of the expected value counts. + + """ + exact_same_counts = got_counts == expected_counts + if exact_same_counts: + print("Results are exactly the same") + else: + print("Results are not exactly the same, see below") + total_count = sum(expected_counts.values()) + total_diff = 0 + for key in expected_counts: + if got_counts[key] != expected_counts[key]: + diff = expected_counts[key] - got_counts[key] + print( + f"Expected doc count for label {key} {expected_counts[key]} but got {got_counts[key]}" + ) + total_diff = total_diff + abs(diff) + + diff_percent = (total_diff / total_count) * 100 + print( + f"Total difference: {total_diff} out of {total_count}, diff percentage: {diff_percent}%" + ) + print("---" * 30) + + +def verify_same_dataframe( + got_df, expected_df, results_pred_column, expected_pred_column +): + """ + This function verifies whether a column from one DataFrame is identical to another column in another DataFrame. + It prints information about the differences between the two columns. + + Args: + got_df: The input DataFrame. + expected_df: The expected DataFrame. + results_pred_column: The column of interest in the input DataFrame. + expected_pred_column: The column of interest in the expected DataFrame. 
+ + """ + # Compare the values in the two DataFrames element-wise + matches_df = got_df.merge( + expected_df, + left_on=["text", results_pred_column], + right_on=["text", expected_pred_column], + indicator="Matched", + how="outer", + ) + + matches_df = matches_df[matches_df["Matched"] == "both"] + # Calculate the match ratio + total_values = len(expected_df) + match_ratio = len(matches_df) / total_values + + different_count = abs(total_values - len(matches_df)) + + print(f"DataFrame Match Ratio: {match_ratio:.4f}") + print(f"Out of {len(expected_df)} rows {different_count} are different", flush=True) + print("---" * 30) + + +def verify_results( + results_file_path, + expected_results_file_path, + results_pred_column, + expected_pred_column, +): + """ + This function compares an input file with its expected result file. + See `verify_same_counts` and `verify_same_dataframe`. + + Args: + results_file_path: The path of the input files. + expected_results_file_path: The path of the expected_result file. + results_pred_column: The prediction column name for the input files. + expected_pred_column: The prediction column name for the expected_result file. + + """ + expected_df = pd.read_json(expected_results_file_path, lines=True) + expected_df = expected_df.sort_values(by=["text"]).reset_index(drop=True) + expected_counts = expected_df[expected_pred_column].value_counts().to_dict() + + expected_columns = expected_df.columns + if results_pred_column != expected_pred_column: + expected_columns = [ + results_pred_column if item == expected_pred_column else item + for item in expected_columns + ] + + got_paths = [p for p in os.scandir(results_file_path)] + got_df = [pd.read_json(path, lines=True)[expected_columns] for path in got_paths] + got_df = pd.concat(got_df, ignore_index=True) + got_df = got_df.sort_values(by=["text"]).reset_index(drop=True) + got_counts = got_df[results_pred_column].value_counts().to_dict() + + verify_same_counts(got_counts, expected_counts) + verify_same_dataframe( + got_df, expected_df, results_pred_column, expected_pred_column + ) + + +def main(): + """ + This script is useful for determining whether your predicted classifications match an expected output. + With this in mind, it is only useful if users have the expected results already computed. + """ + args = parse_args() + verify_results( + args.results_file_path, + args.expected_results_file_path, + args.results_pred_column, + args.expected_pred_column, + ) + + +def console_script(): + main() diff --git a/nemo_curator/download/__init__.py b/nemo_curator/download/__init__.py new file mode 100644 index 00000000..7b5d1387 --- /dev/null +++ b/nemo_curator/download/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .doc_builder import DocumentDownloader, DocumentIterator, DocumentExtractor, download_and_extract, import_downloader, import_extractor, import_iterator, batch_download +from .commoncrawl import download_common_crawl, CommonCrawlWARCDownloader, CommonCrawlWARCExtractor, CommonCrawlWARCIterator, CommonCrawlWARCDownloaderExtractOnly +from .wikipedia import download_wikipedia, WikipediaDownloader, WikipediaIterator, WikipediaExtractor +from .arxiv import download_arxiv, ArxivDownloader, ArxivIterator, ArxivExtractor + +__all__ = ["DocumentDownloader", "DocumentIterator", "DocumentExtractor", "download_and_extract", "import_downloader", "import_extractor", "import_iterator", "download_common_crawl", "CommonCrawlWARCDownloader", "CommonCrawlWARCExtractor", "CommonCrawlWARCIterator", "CommonCrawlWARCDownloaderExtractOnly", "download_wikipedia", "WikipediaDownloader", "WikipediaIterator", "WikipediaExtractor", "batch_download", "download_arxiv", "ArxivDownloader", "ArxivIterator", "ArxivExtractor"] \ No newline at end of file diff --git a/nemo_curator/download/arxiv.py b/nemo_curator/download/arxiv.py new file mode 100644 index 00000000..d9556db8 --- /dev/null +++ b/nemo_curator/download/arxiv.py @@ -0,0 +1,410 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import gzip +import tempfile +import tarfile +import subprocess + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.file_utils import get_all_files_paths_under, expand_outdir_and_mkdir +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentIterator, + DocumentExtractor, + download_and_extract +) +from nemo_curator.utils.download_utils import get_arxiv_urls + +# The iterator and extractor code are in large part taken +# from the Red-Pajama repo +# https://github.com/togethercomputer/RedPajama-Data/tree/main/data_prep/arxiv + + +class ArxivDownloader(DocumentDownloader): + + def __init__(self, download_dir, verbose=False): + super().__init__() + self._download_dir = download_dir + self._verbose = False + + def download(self, tarfile): + output_file = os.path.join(self._download_dir, tarfile) + s3path = os.path.join("s3://arxiv/src", tarfile) + if os.path.exists(output_file): + print(f"tar file: {output_file} exists. 
Not downloading") + else: + print(f"Downloading {s3path} and writing to {output_file}") + cmd = ["s5cmd", "--request-payer=requester", "cp", s3path, output_file] + if self._verbose: + stdout, stderr = None, None + else: + stdout, stderr = subprocess.DEVNULL, subprocess.DEVNULL + p = subprocess.run( + cmd, + stdout=stdout, + stderr=stderr, + ) + if p.returncode != 0: + print(f"Failed to download {s3path} to {output_file}") + + return output_file + + +class ArxivIterator(DocumentIterator): + + def __init__(self, log_frequency=1000): + super().__init__() + self._log_frequency = log_frequency + self._counter = 0 + + def iterate(self, file_path): + self._counter = 0 + download_dir = os.path.split(file_path)[0] + bname = os.path.split(file_path)[-1] + with tempfile.TemporaryDirectory(dir=download_dir) as tmpdir: + with tarfile.open(file_path) as tf: + tf.extractall(members=tf.getmembers(), path=tmpdir) + for i, item in enumerate(get_all_files_paths_under(tmpdir)): + if self._counter > 0 and self._counter % self._log_frequency == 0: + print(f"Extracted {self._counter} papers from {file_path}") + self._counter += 1 + + tex_files = self._tex_proj_loader(item) + arxiv_id = os.path.splitext(os.path.split(item)[-1])[0] + + # get the arxiv id in the correct format + try: + clean_arxiv_id = self._format_arxiv_id(arxiv_id) + except Exception as e: + print(f"[WARNING] failed to format arxiv id " + f"{arxiv_id}; exception={e}") + clean_arxiv_id = arxiv_id + + if tex_files is None: + continue + + yield {'id': clean_arxiv_id, 'source_id': f'{bname}'}, tex_files + + def _tex_proj_loader(self, file_or_dir_path): + r""" function to load the tex files from a tar file or a gzip file. The + function will return a tuple containing a list of tex files and the + timestamp of the project. + + @param file_or_dir_path: path to the tar file or the gzip file + + @return: tuple containing a list of tex files and the timestamp of the + project + """ + files_and_content = [] + + try: + # if it is a directory, open it as a tarfile + with tarfile.open(file_or_dir_path) as sub_tf: + for member in sub_tf.getmembers(): + if member.name.endswith(".tex"): + + file_content = sub_tf.extractfile(member).read() + + try: + file_content = file_content.decode("utf-8") + except UnicodeDecodeError: + # self._logger.info(f"UnicodeDecodeError: {file_or_dir_path}") + return None + + files_and_content.append(file_content) + + except tarfile.ReadError: + # otherwise we try opening it as a gzip file + try: + with gzip.open(file_or_dir_path, "rb") as gz: + file_content = gz.read() + except Exception: + # all fails, we skip this file + # self._logger.info(f"[ERROR] {e}: {file_or_dir_path}") + return None + + try: + file_content = file_content.decode("utf-8") + except UnicodeDecodeError: + # self._logger.info(f"UnicodeDecodeError: {file_or_dir_path}") + return None + + files_and_content.append(file_content) + + except Exception as e: + print(f"[ERROR] {e}: {file_or_dir_path}") + return None + + return files_and_content + + def _format_arxiv_id(self, arxiv_id): + r""" this function brings the raw arxiv-id into a format compliant with the + specification from arxiv. This is used to create the url to the arxiv + abstract page. 
+ + - Format prior to March 2007: + /YYMMNNN where N is a 3-digit number + - Format after March 2007: /YYMM.NNNNN where N is a + 5 (or 6)-digit number + + References: https://info.arxiv.org/help/arxiv_identifier.html + + @param arxiv_id: raw arxiv id which can be in one of the + following formats: + - + - + + @return: formatted arxiv id + """ + match = re.search(r'^([a-zA-Z-]*)([\d\.]+)$', arxiv_id) + + if match is None: + raise ValueError(f"Invalid arxiv id: {arxiv_id}") + + if match.group(1) == "": + return match.group(2) + + return f"{match.group(1)}/{match.group(2)}" + + +class ArxivExtractor(DocumentExtractor): + + def __init__(self): + super().__init__() + + def extract(self, content): + r""" function takes a list of tex files and returns a cleaned version of + the tex project. The cleaned version is a concatenation of the tex files + with the following modifications: + + - if multiple latex files, then concatenate them + - remove all comments (i.e. all lines starting with %) + - remove everything before the first \section header + - remove everything after the first occurrence of either \appendix or + \bibliography + - inline-expand definitions and macros + + @param tex_files: list of file_content strings + + @return: cleaned tex project as a string, empty string + if no tex files are provided + """ + if len(content) == 0: + return None + + # build dictionaries that contain the definitions of all macros in all tex + # files. This is later used to expand all macros used in the text with + # their definitions, so that consistency among different authors is + # ensured. + + non_arg_macros = {} + for file_content in content: + non_arg_macros.update(self._build_non_arg_macros_dict(file_content)) + + # TODO: macros that take arguments are not supported yet + arg_macros = {} + + # join multiple latex files with a newline character + try: + cleaned_latex_file_str = "\n".join( + self._clean_tex_file( + file_content=file_content, + arg_macros=arg_macros, + non_arg_macros=non_arg_macros, + ) for file_content in content) + except Exception: + return {}, None + + # Don't return meta + if cleaned_latex_file_str is not None: + if len(cleaned_latex_file_str) > 0: + return {}, cleaned_latex_file_str + + def _clean_tex_file(self, file_content, arg_macros, non_arg_macros): + r""" function takes a tex file as input and returns a cleaned version. The + cleaned version is a concatenation of the tex files with the + following modifications: + + - remove all comments (i.e. all lines starting with %) + - remove everything before the first section-like header + - remove everything after the first occurrence of either \appendix or + \bibliography + - inline-expand definitions and macros + + @param file_content: the content of the tex file as a string. + + @return: cleaned tex file as a string + """ + # find the first occurence of a \section-like header and replace everything + # before it with an empty string. 
This matches the following pattern: + # \[optional-args]{name} + pattern = r"^(.*?)(" + pattern += r"\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|" + pattern += r"\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|" + pattern += r"\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|" + pattern += r"\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|" + pattern += r"\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|" + pattern += r"\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}" + pattern += r"\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}" + pattern += r")" + + # if no section like header is found, then we return an empty string + if not re.search(pattern, file_content, flags=re.DOTALL): + return "" + + # replace everything with the second group of the match (i.e. everything + # after and including the section header) + file_content = re.sub( + pattern=pattern, + repl=r"\2", + string=file_content, + flags=re.DOTALL # make sure that the dot matches also newlines + ) + + # remove all line comments + file_content = re.sub( + pattern=r"(?m)^%.*\n?", + repl=r"", + string=file_content, + flags=re.MULTILINE, + ) + + # remove all in comments within a line + file_content = re.sub( + # pattern matches a "%" that is not preceded by a backslash (=comment) + pattern=r"[^\\]%.+$", + repl=r"", + string=file_content, + flags=re.MULTILINE) + + # find the first occurence of either \appendix or \bibliography and + # replace everything after it with an empty string + pattern = r"(" + pattern += r"\\appendix|" + pattern += r"\\begin\{references\}|" + pattern += r"\\begin\{REFERENCES\}|" + pattern += r"\\begin\{thebibliography\}|" + pattern += r"\\bibliography\{.*\}" + pattern += r").*$" + + file_content = re.sub( + pattern=pattern, + repl=r'', + string=file_content, + flags=re.DOTALL # make sure that the dot matches also newlines + ) + + # inline-expand all non-arg macros + for macro_name, macro_value in non_arg_macros.items(): + file_content = re.sub( + # make pattern grouped to make sure that the macro is not part + # of a longer alphanumeric word + pattern=r"(" + macro_name + r")" + r"([^a-zA-Z0-9])", + # replace the macro with its value and add back the character that + # was matched after the macro + repl=macro_value + r"\2", + string=file_content, + ) + + # inline-expand all macros that use args + # TODO: inline-expand macros with args + for macro_name, macro_value in arg_macros.items(): + pass + + return file_content + + def _build_non_arg_macros_dict(self, file_content): + r""" function takes the content of a tex file and returns a dictionary + that contains the definitions of all macros that do not use arguments. + The dictionary is of the form {macro_name: macro_value}. + + @param file_content: the content of the tex file as a string. + + @return: dict + """ + # regex for extracting \newcommand macros without arguments + non_arg_nc_reg = re.compile( + # this regex matches the following: + # \newcommand{\macro_name}{macro_value} + # \newcommand*{\macro_name}{macro_value} + # where macro_name is only allowed to contain letters and numbers; + # macro_value can contain any character. + pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$', + flags=re.MULTILINE, + ) + + # regex for extracting \def macros without arguments + non_arg_def_reg = re.compile( + # this regex matches the following: + # \def\macro_name{macro_value} + # where macro_name is only allowed to contain letters and numbers; + # macro_value can contain any character. 
+ pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$', + flags=re.MULTILINE, + ) + + # Extract all user-defined LaTeX macros from the preamble + macros = {} + for reg in [non_arg_nc_reg, non_arg_def_reg]: + for match in reg.finditer(file_content): + # convert the macro name and value to a raw string that can be + # used in re.sub + macro_name = match \ + .group(1).encode("unicode-escape").decode("utf-8") + macro_val = match \ + .group(2).encode("unicode-escape").decode("utf-8") + + macros[macro_name] = macro_val + + return macros + +def download_arxiv(output_path: str, output_type: str="jsonl", raw_download_dir=None, keep_raw_download=False, force_download=False, url_limit=None) -> DocumentDataset: + """ + Downloads Arxiv tar files and extracts them + + Args: + output_path: The path to the root directory of the files + output_type: The file type to save the data as. + raw_download_dir: Path to store the raw download files for intermediate processing. + If None, they are stored in a folder named "downloads" under output_path. + keep_raw_download: If True, keeps the compressed WARC files that have not been extracted. + force_download: If False, will skip processing all files in output_paths that already exist and + directly read from them instead. + url_limit: The maximum number of raw files to download from the snapshot. If None, all + files from the range of snapshots are downloaded. + """ + arxiv_urls = get_arxiv_urls() + if url_limit: + arxiv_urls = arxiv_urls[:url_limit] + output_paths = list(map(lambda url: os.path.join(output_path, f"{url}.{output_type}"), arxiv_urls)) + + if not raw_download_dir: + raw_download_dir = os.path.join(output_path, "downloads") + expand_outdir_and_mkdir(raw_download_dir) + downloader = ArxivDownloader(raw_download_dir) + iterator = ArxivIterator() + extractor = ArxivExtractor() + + output_format = { + "text": str, + "id": str, + "source_id": str, + "filename": str, + } + dataset = download_and_extract(arxiv_urls, output_paths, downloader, iterator, extractor, output_format, output_type=output_type, keep_raw_download=keep_raw_download, force_download=force_download) + + return dataset \ No newline at end of file diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py new file mode 100644 index 00000000..8051db20 --- /dev/null +++ b/nemo_curator/download/commoncrawl.py @@ -0,0 +1,320 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
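
A quick, standalone check (with an illustrative TeX snippet, not taken from any real paper) of the \newcommand regex that _build_non_arg_macros_dict above uses to collect argument-free macro definitions:

import re

tex = r"""
\newcommand{\dataset}{C4}
\newcommand*{\modelname}{DeBERTa}
"""
non_arg_nc_reg = re.compile(r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
                            flags=re.MULTILINE)
print({m.group(1): m.group(2) for m in non_arg_nc_reg.finditer(tex)})
# {'\\dataset': 'C4', '\\modelname': 'DeBERTa'}
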
+ + +import os +import subprocess +import pycld2 as cld2 +import justext +import lxml +import unicodedata +from charset_normalizer import detect +from urllib.parse import urlparse +from warcio.archiveiterator import ArchiveIterator + +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentIterator, + DocumentExtractor, + download_and_extract, +) +from nemo_curator.utils.download_utils import get_common_crawl_urls +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir + + +def decode_html(html_bytes): + # Convert from bytes to text using utf-8 encoding + try: + return html_bytes.decode('utf-8') + except UnicodeDecodeError: + # If utf-8 fails, try to find a different encoding + return try_decode_with_detected_encoding(html_bytes) + + +def try_decode_with_detected_encoding(html_bytes): + detected_encoding = detect(html_bytes)['encoding'] + bad_detection = not detected_encoding or detected_encoding == 'utf-8' + if bad_detection: + return None + try: + return html_bytes.decode(detected_encoding) + except: + return None + + +def lang_detect(decoded_html): + try: + details = cld2.detect(decoded_html)[2] + except Exception: + # Remove control characters + cleaned_html = ''.join(i for i in decoded_html if unicodedata.category(i)[0] != 'C') + details = cld2.detect(cleaned_html)[2] + + return details[0][0].upper() + + +def extract_text( + html, + stop_words, + length_low=70, + length_high=200, + stopwords_low=0.30, + stopwords_high=0.32, + max_link_density=0.2, + max_heading_distance=200, + no_headings=False, + logger=None, +): + # Segment the HTML into paragraphs + try: + # Form the DOM tree + dom = justext.core.html_to_dom(html) + cleaned_dom = justext.core.preprocessor(dom) + # Get the paragraphs from the DOM + handler = justext.core.ParagraphMaker() + lxml.sax.saxify(cleaned_dom, handler) + except (lxml.etree.ParserError, ValueError, Exception): + # Return nothing when we cannot segment the document + if logger is not None: + logger.info("Could not segment paragaphs in the document") + return + paragraphs = handler.paragraphs + + # Context free classification + justext.core.classify_paragraphs( + paragraphs, + stop_words, + length_low, + length_high, + stopwords_low, + stopwords_high, + max_link_density, + no_headings, + ) + + # Copy the context free class to the class_style + # This handles the headings as described in the + # documentation + for paragraph in paragraphs: + paragraph.class_type = paragraph.cf_class + + # Context sensitive classification + justext.core.revise_paragraph_classification( + paragraphs, + max_heading_distance, + ) + + return [p.text for p in paragraphs if not p.is_boilerplate] + + +def get_stop_list_dict(languages=[]): + + # Name mapping for language names from CLD2 (values) + # and jusText (keys) + lang_map = { + 'Haitian': 'HAITIAN_CREOLE', + 'Norwegian_Bokmal': 'NORWEGIAN', + 'Norwegian_Nynorsk': 'NORWEGIAN_N', + 'Waray_Waray': 'WARAY_PHILIPPINES', + } + if len(languages) == 0: + languages = justext.get_stoplists() + # Remove latin as it yields a lot of low quality documents + languages_no_latin = list(languages) + languages_no_latin.remove('Latin') + languages = frozenset(languages_no_latin) + + stop_list_dict = {} + for language in languages: + if language in lang_map: + lang_key = lang_map[language] + else: + lang_key = language.upper() + stop_list_dict[lang_key] = justext.get_stoplist(language) + + return stop_list_dict + + +def get_all_stop_words(): + stop_words = set() + 
for language in justext.get_stoplists(): + stop_words.update(justext.get_stoplist(language)) + + return frozenset(stop_words) + + +class CommonCrawlWARCDownloader(DocumentDownloader): + """ + Downloads WARC files from the Common Crawl + """ + + def __init__(self, download_dir, aws=False, verbose=False): + """ + Creates a downloader + + Args: + download_dir: Path to store raw compressed WARC files + aws: If True, uses the s5cmd command to download from the Common Crawl's S3 bucket. + If False, uses wget. + verbose: If True, logs stdout and stderr of the download command (s5cmd/wget) + """ + super().__init__() + self._download_dir = download_dir + self._aws = aws + self._verbose = verbose + + def download(self, url): + # Download each URL to the directory + urlpath = urlparse(url).path[1:] + output_name = urlpath.replace('/', '-') + output_file = os.path.join(self._download_dir, output_name) + if os.path.exists(output_file): + print(f"WARC file: {output_file} exists. Not downloading") + else: + print(f"Downloading {url} and writing to {output_file}") + # Download with either wget or s5cmd (aws) + if self._aws: + s3path = os.path.join("s3://commoncrawl/", urlpath) + cmd = ["s5cmd", "cp", s3path, output_file] + else: + cmd = ["wget", url, "-O", output_file] + if self._verbose: + stdout, stderr = None, None + else: + stdout, stderr = subprocess.DEVNULL, subprocess.DEVNULL + p = subprocess.run( + cmd, + stdout=stdout, + stderr=stderr, + ) + if p.returncode != 0: + print(f"Failed to download {url} to {output_file}") + + return output_file + + +class CommonCrawlWARCDownloaderExtractOnly(DocumentDownloader): + """ + A 'dummy' downloader that simply puts pre-downloaded + files on the queue + """ + + def __init__(self, aws=False, verbose=False): + super().__init__() + + def download(self, url): + print(f"Putting WARC file {url} on the queue for extraction") + return url + + +class CommonCrawlWARCIterator(DocumentIterator): + + def __init__(self, log_frequency=1000): + super().__init__() + self._counter = 0 + self._log_frequency = log_frequency + + def iterate(self, file_path): + # Loop over all records in the current WARC + self._counter = 0 + bname = os.path.split(file_path)[-1] + with open(file_path, 'rb') as file_pointer: + ai = ArchiveIterator(file_pointer, arc2warc=True) + for k, rec in enumerate(ai): + # Get the response from the crawl + if rec.rec_type == 'response': + if self._counter > 0 and self._counter % self._log_frequency == 0: + print(f"Extracted {self._counter} records in WARC") + self._counter += 1 + content = rec.content_stream().read() + warc_id = rec.rec_headers.get_header('WARC-Record-ID')[10:-1] + url = rec.rec_headers.get_header('WARC-Target-URI') + meta = { + 'url': url, + 'warc_id': warc_id, + 'source_id': f'{bname}', + } + yield meta, content + + +class CommonCrawlWARCExtractor(DocumentExtractor): + + def __init__(self): + self._stop_lists = get_stop_list_dict() + super().__init__() + + def extract(self, content): + html = decode_html(content) + if html is not None: + # Language detection and HTML extraction + lang = lang_detect(html) + text = None + if lang in self._stop_lists: + text = extract_text(html, self._stop_lists[lang]) + if text is not None: + if len(text) > 0: + text = '\n\n'.join(text) + meta = {'language': lang} + return meta, text + else: + return None, None + +def download_common_crawl(output_path: str, start_snapshot: str, end_snapshot: str, output_type: str="jsonl", news=False, aws=False, raw_download_dir=None, keep_raw_download=False, force_download=False, 
url_limit=None) -> DocumentDataset: + """ + Downloads Common Crawl WARC snapshots and extracts them using jusText + + Args: + output_path: The path to the root directory of the files + start_snapshot: The first common crawl snapshot to include. Snapshots must be + specified by YYYY-WeekNumber (e.g., '2020-50' or '2021-04'). For the CC-NEWS dataset, + (specified with news=True flag) this changes to Year-Month (YYYY-MM). + end_snapshot: The last common crawl snapshot to include. Must be chronologically + after the starting snapshot. + output_type: The file type to save the data as. + news: If True, gets WARC URLs for the CC-NEWS dataset instead of the CC-MAIN datasets. + Also assumes that the format for the start and end snapshots is 'YYYY-MM' (Year-Month). + aws: Whether to download from Common Crawl's S3 bucket. If True, uses s5cmd to download. + If False, uses wget. + raw_download_dir: Path to store the raw download files for intermediate processing. + If None, they are stored in a folder named "downloads" under output_path. + keep_raw_download: If True, keeps the compressed WARC files that have not been extracted. + force_download: If False, will skip processing all files in output_paths that already exist and + directly read from them instead. + url_limit: The maximum number of raw files to download from the snapshot. If None, all + files from the range of snapshots are downloaded. + """ + common_crawl_urls = get_common_crawl_urls(starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news) + if url_limit: + common_crawl_urls = common_crawl_urls[:url_limit] + output_paths = list(map(lambda url: os.path.join(output_path, url.split("/")[-1] + f".{output_type}"), common_crawl_urls)) + + if not raw_download_dir: + raw_download_dir = os.path.join(output_path, "downloads") + expand_outdir_and_mkdir(raw_download_dir) + downloader = CommonCrawlWARCDownloader(raw_download_dir, aws=aws) + iterator = CommonCrawlWARCIterator() + extractor = CommonCrawlWARCExtractor() + + output_format = { + "text": str, + "language": str, + "url": str, + "warc_id": str, + "source_id": str, + "filename": str, + } + dataset = download_and_extract(common_crawl_urls, output_paths, downloader, iterator, extractor, output_format, output_type=output_type, keep_raw_download=keep_raw_download, force_download=force_download) + + return dataset \ No newline at end of file diff --git a/nemo_curator/download/doc_builder.py b/nemo_curator/download/doc_builder.py new file mode 100644 index 00000000..03395e91 --- /dev/null +++ b/nemo_curator/download/doc_builder.py @@ -0,0 +1,174 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
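
A hypothetical usage sketch for the download_common_crawl entry point above; the output path is a placeholder and the snapshot range simply reuses the examples from its docstring:

from nemo_curator.download import download_common_crawl

dataset = download_common_crawl(
    output_path="/path/to/common_crawl_output",  # placeholder directory
    start_snapshot="2020-50",                    # CC-MAIN snapshots use YYYY-WeekNumber
    end_snapshot="2021-04",
    output_type="jsonl",
    url_limit=2,                                 # keep the example small: first two WARC URLs only
)
# dataset is a DocumentDataset backed by the extracted jsonl files.
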
+ +import importlib +from abc import ABC, abstractmethod +from typing import List, Tuple +import dask.dataframe as dd +from dask import delayed, compute +import pandas as pd +import os + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import single_partition_write_with_filename, read_single_partition + + +class DocumentDownloader(ABC): + """ Abstract class for downloading remote data to disk """ + + def __init__(self): + super().__init__() + + @abstractmethod + def download(self, url): + pass + + +class DocumentIterator(ABC): + """ + Abstract iterator class for reading in raw records that have been + downloaded to disk + """ + + def __init__(self): + super().__init__() + + @abstractmethod + def iterate(self, file_path): + pass + + +class DocumentExtractor(ABC): + """ Abstract class for extracting text from records read from disk """ + + def __init__(self): + super().__init__() + + @abstractmethod + def extract(self, content): + pass + + +def import_downloader(downloader_path): + module_path, downloader_name = downloader_path.rsplit(".", 1) + downloader_module = importlib.import_module(module_path) + downloader_class = getattr(downloader_module, downloader_name) + if not issubclass(downloader_class, DocumentDownloader): + raise ValueError(f"Input downloader {downloader_class.__name__} " + "must be derived from DocumentDownloader defined in " + "nemo_curator.download.docbuilder") + return downloader_class + + +def import_iterator(iterator_path): + module_path, iterator_name = iterator_path.rsplit(".", 1) + iterator_module = importlib.import_module(module_path) + iterator_class = getattr(iterator_module, iterator_name) + if not issubclass(iterator_class, DocumentIterator): + raise ValueError(f"Input iterator {iterator_class.__name__} " + "must be derived from DocumentIterator " + "defined in nemo_curator.download.docbuilder") + return iterator_class + + +def import_extractor(extractor_path): + module_path, extractor_name = extractor_path.rsplit(".", 1) + extractor_module = importlib.import_module(module_path) + extractor_class = getattr(extractor_module, extractor_name) + if not issubclass(extractor_class, DocumentExtractor): + raise ValueError(f"Input extractor {extractor_class.__name__} " + "must be derived from DocumentExtractor defined " + "in nemo_curator.download.docbuilder") + return extractor_class + +def _download_and_extract_single_partition(paths: List[Tuple[str, str]], downloader: DocumentDownloader, iterator: DocumentIterator, extractor: DocumentExtractor, output_type: str, keep_raw_download: bool, force_download: bool) -> pd.DataFrame: + url, output_path = paths + + if os.path.exists(output_path) and not force_download: + partition = read_single_partition([output_path], backend="pandas", filetype=output_type, add_filename=True) + return partition + + downloaded_file = downloader.download(url) + records = [] + # Iterate over all records in file + for item in iterator.iterate(downloaded_file): + record_meta, content = item + # Extract the text from the record + extracted = extractor.extract(content) + if extracted is not None: + text_meta, text = extracted + if text is not None: + line = { + 'text': text, + **text_meta, + **record_meta, + } + records.append(line) + + partition = pd.DataFrame(records) + filename = os.path.basename(output_path) + output_dir = os.path.dirname(output_path) + partition["filename"] = filename + single_partition_write_with_filename(partition, output_dir, output_type=output_type) + if not keep_raw_download: + 
os.remove(downloaded_file) + + return partition + +def download_and_extract(urls: List[str], output_paths: List[str], downloader: DocumentDownloader, iterator: DocumentIterator, extractor: DocumentExtractor, output_format: dict, output_type: str="jsonl", keep_raw_download=False, force_download=False) -> DocumentDataset: + """ + Downloads and extracts a dataset into a format accepted by the NeMo Curator + + Args: + urls: A list of urls to download the dataset from + output_paths: A list of paths to save the final extracted output to. + The raw output of the downloader will be saved using the path given by downloader.download(url). + downloader: A DocumentDownloader that handles retrieving each file from its url and saving it to storage + iterator: A DocumentIterator that handles iterating through the downloaded file's format + extractor: A DocumentExtractor that handles extracting the data from its raw format into text + output_format: A dictionary mappings columns to datatypes for the fields of each datapoint after extraction. + output_type: The file type to save the dataset as. + keep_raw_download: Whether to keep the pre-extracted download file. + force_download: If False, will skip processing all files in output_paths that already exist and + directly read from them instead. + + Returns: + A DocumentDataset of the downloaded data + """ + if len(urls) != len(output_paths): + raise ValueError("Different number of urls and output_paths") + + output_format = dict(sorted(output_format.items())) + df = dd.from_map( + _download_and_extract_single_partition, + zip(urls, output_paths), + downloader=downloader, + iterator=iterator, + extractor=extractor, + output_type=output_type, + keep_raw_download=keep_raw_download, + force_download=force_download, + enforce_metadata=False, + meta=output_format, + ) + + return DocumentDataset(df) + +def batch_download(urls: List[str], downloader: DocumentDownloader) -> List[str]: + """ + Downloads all the urls using the downloader in parallel + """ + delayed_downloads = [delayed(downloader.download)(url) for url in urls] + + return compute(*delayed_downloads) \ No newline at end of file diff --git a/nemo_curator/download/wikipedia.py b/nemo_curator/download/wikipedia.py new file mode 100644 index 00000000..0df8059d --- /dev/null +++ b/nemo_curator/download/wikipedia.py @@ -0,0 +1,764 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
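+
+# Illustrative usage sketch (not executed here). The output directory is
+# hypothetical, and the example assumes a Dask scheduler/client is available,
+# since extraction runs through Dask and the downloader uses a distributed Lock:
+#
+#   dataset = download_wikipedia(
+#       "/data/wikipedia",        # root output directory
+#       language="en",
+#       dump_date="20240101",     # "YYYYMMDD"; None selects the latest dump
+#       url_limit=5,              # only the first 5 dump files, e.g. for a smoke test
+#   )
+#
+# download_wikipedia() is defined at the bottom of this file and returns a
+# DocumentDataset backed by a Dask DataFrame.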
+ +import os +import re +import subprocess +import bz2 +import codecs +import mwparserfromhell +from urllib.parse import urlparse, quote +import xml.etree.cElementTree as etree + +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentIterator, + DocumentExtractor, + download_and_extract +) +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.download_utils import get_wikipedia_urls +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir +from distributed import Lock + +# The majority of this code is taken from the HuggingFace +# implementation of the Wikipedia dataset preparation: +# https://github.com/huggingface/datasets/blob/7e30308f49f8c85dc7a2ab5aafbff04b5d2f38e2/datasets/wikipedia/wikipedia.py + +MEDIA_ALIASES = { + "ab": ["Медиа", "Файл", "Афаил", "Амедиа", "Изображение"], + "ace": ["Beureukaih", "Gambar", "Alat", "Berkas"], + "ady": ["Медиа"], + "af": ["Lêer", "Beeld"], + "als": ["Medium", "Datei", "Bild"], + "am": ["ፋይል", "ስዕል"], + "an": ["Imachen", "Imagen"], + "ang": ["Ymele", "Biliþ"], + "ar": ["ميديا", "صورة", "وسائط", "ملف"], + "arc": ["ܠܦܦܐ", "ܡܝܕܝܐ"], + "arz": ["ميديا", "صورة", "وسائط", "ملف"], + "as": ["চিত্ৰ", "चित्र", "চিত্র", "মাধ্যম"], + "ast": ["Imaxen", "Ficheru", "Imaxe", "Archivu", "Imagen", "Medios"], + "atj": ["Tipatcimoctakewin", "Natisinahikaniwoc"], + "av": ["Медиа", "Файл", "Изображение"], + "ay": ["Medio", "Archivo", "Imagen"], + "az": ["Mediya", "Şəkil", "Fayl"], + "azb": ["رسانه", "تصویر", "مدیا", "فایل", "رسانه‌ای"], + "ba": ["Медиа", "Рәсем", "Файл", "Изображение"], + "bar": ["Medium", "Datei", "Bild"], + "bat-smg": ["Vaizdas", "Medėjė", "Abruozdielis"], + "bcl": ["Medio", "Ladawan"], + "be": ["Мультымедыя", "Файл", "Выява"], + "be-x-old": ["Мэдыя", "Файл", "Выява"], + "bg": ["Медия", "Файл", "Картинка"], + "bh": ["मीडिया", "चित्र"], + "bjn": ["Barakas", "Gambar", "Berkas"], + "bm": ["Média", "Fichier"], + "bn": ["চিত্র", "মিডিয়া"], + "bpy": ["ছবি", "মিডিয়া"], + "br": ["Skeudenn", "Restr"], + "bs": ["Mediji", "Slika", "Datoteka", "Medija"], + "bug": ["Gambar", "Berkas"], + "bxr": ["Файл", "Меди", "Изображение"], + "ca": ["Fitxer", "Imatge"], + "cbk-zam": ["Medio", "Archivo", "Imagen"], + "cdo": ["文件", "媒體", "圖像", "檔案"], + "ce": ["Хlум", "Медиа", "Сурт", "Файл", "Медйа", "Изображение"], + "ceb": ["Payl", "Medya", "Imahen"], + "ch": ["Litratu"], + "ckb": ["میدیا", "پەڕگە"], + "co": ["Immagine"], + "crh": ["Медиа", "Resim", "Файл", "Fayl", "Ресим"], + "cs": ["Soubor", "Média", "Obrázok"], + "csb": ["Òbrôzk", "Grafika"], + "cu": ["Видъ", "Ви́дъ", "Дѣло", "Срѣдьства"], + "cv": ["Медиа", "Ӳкерчĕк", "Изображение"], + "cy": ["Delwedd"], + "da": ["Billede", "Fil"], + "de": ["Medium", "Datei", "Bild"], + "din": ["Ciɛl", "Apamduööt"], + "diq": ["Medya", "Dosya"], + "dsb": ["Wobraz", "Dataja", "Bild", "Medija"], + "dty": ["चित्र", "मिडिया"], + "dv": ["ފައިލު", "މީޑިއާ", "ފައިލް"], + "el": ["Εικόνα", "Αρχείο", "Μέσο", "Μέσον"], + "eml": ["Immagine"], + "eo": ["Dosiero", "Aŭdvidaĵo"], + "es": ["Medio", "Archivo", "Imagen"], + "et": ["Pilt", "Fail", "Meedia"], + "eu": ["Irudi", "Fitxategi"], + "ext": ["Archivu", "Imagen", "Mediu"], + "fa": ["رسانه", "تصویر", "مدیا", "پرونده", "رسانه‌ای"], + "ff": ["Média", "Fichier"], + "fi": ["Kuva", "Tiedosto"], + "fiu-vro": ["Pilt", "Meediä"], + "fo": ["Miðil", "Mynd"], + "fr": ["Média", "Fichier"], + "frp": ["Émâge", "Fichiér", "Mèdia"], + "frr": ["Medium", "Datei", "Bild"], + "fur": ["Immagine", "Figure"], + "fy": ["Ofbyld"], + "ga": ["Íomhá", "Meán"], + "gag": 
["Mediya", "Medya", "Resim", "Dosya", "Dosye"], + "gan": ["媒体文件", "文件", "文檔", "档案", "媒體", "图像", "圖像", "媒体", "檔案"], + "gd": ["Faidhle", "Meadhan"], + "gl": ["Imaxe", "Ficheiro", "Arquivo", "Imagem"], + "glk": ["رسانه", "تصویر", "پرونده", "فاىل", "رسانه‌ای", "مديا"], + "gn": ["Medio", "Imagen", "Ta'ãnga"], + "gom": ["माध्यम", "मिडिया", "फायल"], + "gor": ["Gambar", "Berkas"], + "got": ["𐍆𐌴𐌹𐌻𐌰"], + "gu": ["દ્રશ્ય-શ્રાવ્ય (મિડિયા)", "દ્રશ્ય-શ્રાવ્ય_(મિડિયા)", "ચિત્ર"], + "gv": ["Coadan", "Meanyn"], + "hak": ["文件", "媒體", "圖像", "檔案"], + "haw": ["Kiʻi", "Waihona", "Pāpaho"], + "he": ["תמונה", "קו", "מדיה", "קובץ"], + "hi": ["मीडिया", "चित्र"], + "hif": ["file", "saadhan"], + "hr": ["Mediji", "DT", "Slika", "F", "Datoteka"], + "hsb": ["Wobraz", "Dataja", "Bild"], + "ht": ["Imaj", "Fichye", "Medya"], + "hu": ["Kép", "Fájl", "Média"], + "hy": ["Պատկեր", "Մեդիա"], + "ia": ["Imagine", "Multimedia"], + "id": ["Gambar", "Berkas"], + "ig": ["Nká", "Midia", "Usòrò", "Ákwúkwó orünotu", "Ákwúkwó_orünotu"], + "ii": ["媒体文件", "文件", "档案", "图像", "媒体"], + "ilo": ["Midia", "Papeles"], + "inh": ["Медиа", "Файл", "Изображение"], + "io": ["Imajo", "Arkivo"], + "is": ["Miðill", "Mynd"], + "it": ["Immagine"], + "ja": ["メディア", "ファイル", "画像"], + "jbo": ["velsku", "datnyvei"], + "jv": ["Barkas", "Medhia", "Gambar", "Médhia"], + "ka": ["მედია", "სურათი", "ფაილი"], + "kaa": ["Swret", "Таспа", "سۋرەت", "Taspa", "Su'wret", "Сурет", "تاسپا"], + "kab": ["Tugna"], + "kbd": ["Медиа", "Файл"], + "kbp": ["Média", "Fichier"], + "kg": ["Fisye"], + "kk": ["Swret", "سۋرەت", "Таспа", "Taspa", "Сурет", "تاسپا"], + "kl": ["Billede", "Fiileq", "Fil"], + "km": ["ឯកសារ", "រូបភាព", "មេឌា", "មីឌា"], + "kn": ["ಚಿತ್ರ", "ಮೀಡಿಯ"], + "ko": ["미디어", "파일", "그림"], + "koi": ["Медиа", "Файл", "Изображение"], + "krc": ["Медиа", "Файл", "Изображение"], + "ks": ["میڈیا", "فَیِل"], + "ksh": [ + "Beld", "Meedije", "Medie", "Belld", "Medium", "Datei", "Meedijum", + "Bild" + ], + "ku": ["میدیا", "پەڕگە", "Medya", "Wêne"], + "kv": ["Медиа", "Файл", "Изображение"], + "kw": ["Restren"], + "ky": ["Медиа", "Файл"], + "la": ["Imago", "Fasciculus"], + "lad": ["Dossia", "Medya", "Archivo", "Dosya", "Imagen", "Meddia"], + "lb": ["Fichier", "Bild"], + "lbe": ["Медиа", "Сурат", "Изображение"], + "lez": ["Медиа", "Mediya", "Файл", "Şəkil", "Изображение"], + "lfn": ["Fix"], + "li": ["Afbeelding", "Plaetje", "Aafbeilding"], + "lij": ["Immaggine", "Immagine"], + "lmo": ["Immagine", "Imàjine", "Archivi"], + "ln": ["Média", "Fichier"], + "lo": ["ສື່ອ", "ສື່", "ຮູບ"], + "lrc": ["رسانه", "تصویر", "رسانه‌ای", "جانیا", "أسگ", "ڤارئسگأر"], + "lt": ["Vaizdas", "Medija"], + "ltg": ["Medeja", "Fails"], + "lv": ["Attēls"], + "mai": ["मेडिया", "फाइल"], + "map-bms": ["Barkas", "Medhia", "Gambar", "Médhia"], + "mdf": ["Медиа", "Няйф", "Изображение"], + "mg": ["Rakitra", "Sary", "Média"], + "mhr": ["Медиа", "Файл", "Изображение"], + "min": ["Gambar", "Berkas"], + "mk": ["Податотека", "Медија", "Медиум", "Слика"], + "ml": ["പ്രമാണം", "ചി", "മീഡിയ", "പ്ര", "ചിത്രം"], + "mn": ["Медиа", "Файл", "Зураг"], + "mr": ["चित्र", "मिडिया"], + "mrj": ["Медиа", "Файл", "Изображение"], + "ms": ["Fail", "Imej"], + "mt": ["Midja", "Medja", "Stampa"], + "mwl": ["Multimédia", "Fexeiro", "Ficheiro", "Arquivo", "Imagem"], + "my": ["ဖိုင်", "မီဒီယာ"], + "myv": ["Медия", "Артовкс", "Изображение"], + "mzn": ["رسانه", "تصویر", "مه‌دیا", "مدیا", "پرونده", "رسانه‌ای"], + "nah": ["Mēdiatl", "Īxiptli", "Imagen"], + "nap": ["Fiùra", "Immagine"], + "nds": ["Datei", "Bild"], + "nds-nl": ["Ofbeelding", "Afbeelding", 
"Bestaand"], + "ne": ["मीडिया", "चित्र"], + "new": ["किपा", "माध्यम"], + "nl": ["Bestand", "Afbeelding"], + "nn": ["Fil", "Bilde", "Filpeikar"], + "no": ["Fil", "Medium", "Bilde"], + "nov": [], + "nrm": ["Média", "Fichier"], + "nso": ["Seswantšho"], + "nv": ["Eʼelyaaígíí"], + "oc": ["Imatge", "Fichièr", "Mèdia"], + "olo": ["Kuva", "Medii", "Failu"], + "or": ["ମାଧ୍ୟମ", "ଫାଇଲ"], + "os": ["Ныв", "Медиа", "Файл", "Изображение"], + "pa": ["ਤਸਵੀਰ", "ਮੀਡੀਆ"], + "pcd": ["Média", "Fichier"], + "pdc": ["Medium", "Datei", "Bild", "Feil"], + "pfl": ["Dadai", "Medium", "Datei", "Bild"], + "pi": ["मीडिया", "पटिमा"], + "pl": ["Plik", "Grafika"], + "pms": ["Figura", "Immagine"], + "pnb": ["میڈیا", "تصویر", "فائل"], + "pnt": ["Εικόνα", "Αρχείον", "Εικόναν", "Μέσον"], + "ps": ["انځور", "رسنۍ", "دوتنه"], + "pt": ["Multimédia", "Ficheiro", "Arquivo", "Imagem"], + "qu": ["Midya", "Imagen", "Rikcha"], + "rm": ["Multimedia", "Datoteca"], + "rmy": ["Fişier", "Mediya", "Chitro", "Imagine"], + "ro": ["Fişier", "Imagine", "Fișier"], + "roa-rup": ["Fişier", "Imagine", "Fișier"], + "roa-tara": ["Immagine"], + "ru": ["Медиа", "Файл", "Изображение"], + "rue": ["Медіа", "Медиа", "Файл", "Изображение", "Зображення"], + "rw": ["Dosiye", "Itangazamakuru"], + "sa": ["चित्रम्", "माध्यमम्", "सञ्चिका", "माध्यम", "चित्रं"], + "sah": ["Миэдьийэ", "Ойуу", "Билэ", "Изображение"], + "sat": ["ᱨᱮᱫ", "ᱢᱤᱰᱤᱭᱟ"], + "sc": ["Immàgini"], + "scn": ["Immagine", "Mmàggini", "Mèdia"], + "sd": ["عڪس", "ذريعات", "فائل"], + "se": ["Fiila"], + "sg": ["Média", "Fichier"], + "sh": ["Mediji", "Slika", "Медија", "Datoteka", "Medija", "Слика"], + "si": ["රූපය", "මාධ්‍යය", "ගොනුව"], + "sk": ["Súbor", "Obrázok", "Médiá"], + "sl": ["Slika", "Datoteka"], + "sq": ["Figura", "Skeda"], + "sr": [ + "Датотека", "Medij", "Slika", "Медија", "Datoteka", "Медиј", "Medija", + "Слика" + ], + "srn": ["Afbeelding", "Gefre"], + "stq": ["Bielde", "Bild"], + "su": ["Média", "Gambar"], + "sv": ["Fil", "Bild"], + "sw": ["Faili", "Picha"], + "szl": ["Plik", "Grafika"], + "ta": ["படிமம்", "ஊடகம்"], + "tcy": ["ಮಾದ್ಯಮೊ", "ಫೈಲ್"], + "te": ["ఫైలు", "దస్త్రం", "బొమ్మ", "మీడియా"], + "tet": ["Imajen", "Arquivo", "Imagem"], + "tg": ["Акс", "Медиа"], + "th": ["ไฟล์", "สื่อ", "ภาพ"], + "ti": ["ፋይል", "ሜድያ"], + "tk": ["Faýl"], + "tl": ["Midya", "Talaksan"], + "tpi": ["Fail"], + "tr": ["Medya", "Resim", "Dosya", "Ortam"], + "tt": ["Медиа", "Рәсем", "Файл", "Räsem", "Изображение"], + "ty": ["Média", "Fichier"], + "tyv": ["Медиа", "Файл", "Изображение"], + "udm": ["Медиа", "Файл", "Суред", "Изображение"], + "ug": ["ۋاسىتە", "ھۆججەت"], + "uk": ["Медіа", "Медиа", "Файл", "Изображение", "Зображення"], + "ur": ["میڈیا", "تصویر", "وسیط", "زریعہ", "فائل", "ملف"], + "uz": ["Mediya", "Tasvir", "Fayl"], + "vec": ["Immagine", "Imàjine", "Mèdia"], + "vep": ["Pilt", "Fail"], + "vi": ["Phương_tiện", "Tập_tin", "Hình", "Tập tin", "Phương tiện"], + "vls": ["Afbeelding", "Ofbeeldienge"], + "vo": ["Ragiv", "Magod", "Nünamakanäd"], + "wa": ["Imådje"], + "war": ["Medya", "Fayl", "Paypay"], + "wo": ["Xibaarukaay", "Dencukaay"], + "wuu": ["文件", "档案", "图像", "媒体"], + "xal": ["Аһар", "Боомг", "Изображение", "Зург"], + "xmf": ["მედია", "სურათი", "ფაილი"], + "yi": ["מעדיע", "תמונה", "טעקע", "בילד"], + "yo": ["Fáìlì", "Amóhùnmáwòrán", "Àwòrán"], + "za": ["媒体文件", "文件", "档案", "图像", "媒体"], + "zea": ["Afbeelding", "Plaetje"], + "zh": ["媒体文件", "F", "文件", "媒體", "档案", "图像", "圖像", "媒体", "檔案"], + "zh-classical": ["文件", "媒體", "圖像", "檔案"], + "zh-min-nan": ["tóng-àn", "文件", "媒體", "Mûi-thé", "圖像", "檔案"], + "zh-yue": ["檔", "档", 
"文件", "图", "媒體", "圖", "档案", "图像", "圖像", "媒体", "檔案"], +} + +CAT_ALIASES = { + "ab": ["Категория", "Акатегориа"], + "ace": ["Kawan", "Kategori"], + "af": ["Kategorie"], + "ak": ["Nkyekyem"], + "als": ["Kategorie"], + "am": ["መደብ"], + "an": ["Categoría"], + "ang": ["Flocc"], + "ar": ["تصنيف"], + "arc": ["ܣܕܪܐ"], + "arz": ["تصنيف"], + "as": ["CAT", "শ্ৰেণী", "श्रेणी", "শ্রেণী"], + "ast": ["Categoría"], + "atj": ["Tipanictawin"], + "av": ["Категория"], + "ay": ["Categoría"], + "az": ["Kateqoriya"], + "azb": ["بؤلمه"], + "ba": ["Төркөм", "Категория"], + "bar": ["Kategorie"], + "bat-smg": ["Kategorija", "Kateguorėjė"], + "bcl": ["Kategorya"], + "be": ["Катэгорыя"], + "be-x-old": ["Катэгорыя"], + "bg": ["Категория"], + "bh": ["श्रेणी"], + "bjn": ["Tumbung", "Kategori"], + "bm": ["Catégorie"], + "bn": ["বিষয়শ্রেণী", "വിഭാഗം"], + "bpy": ["থাক"], + "br": ["Rummad"], + "bs": ["Kategorija"], + "bug": ["Kategori"], + "bxr": ["Категори", "Категория"], + "ca": ["Categoria"], + "cbk-zam": ["Categoría"], + "cdo": ["分類"], + "ce": ["Категори", "Тоба", "Кадегар"], + "ceb": ["Kategoriya"], + "ch": ["Katigoria"], + "ckb": ["پ", "پۆل"], + "co": ["Categoria"], + "crh": ["Категория", "Kategoriya"], + "cs": ["Kategorie"], + "csb": ["Kategòrëjô"], + "cu": ["Катигорї", "Категория", "Катигорїꙗ"], + "cv": ["Категори"], + "cy": ["Categori"], + "da": ["Kategori"], + "de": ["Kategorie"], + "din": ["Bekätakthook"], + "diq": ["Kategoriye", "Kategori"], + "dsb": ["Kategorija"], + "dty": ["श्रेणी"], + "dv": ["ޤިސްމު"], + "el": ["Κατηγορία"], + "eml": ["Categoria"], + "eo": ["Kategorio"], + "es": ["CAT", "Categoría"], + "et": ["Kategooria"], + "eu": ["Kategoria"], + "ext": ["Categoría", "Categoria"], + "fa": ["رده"], + "ff": ["Catégorie"], + "fi": ["Luokka"], + "fiu-vro": ["Katõgooria"], + "fo": ["Bólkur"], + "fr": ["Catégorie"], + "frp": ["Catègorie"], + "frr": ["Kategorie"], + "fur": ["Categorie"], + "fy": ["Kategory"], + "ga": ["Rang", "Catagóir"], + "gag": ["Kategori", "Kategoriya"], + "gan": ["分類", "分类"], + "gd": ["Roinn-seòrsa"], + "gl": ["Categoría"], + "glk": ["جرگه", "رده"], + "gn": ["Ñemohenda"], + "gom": ["वर्ग", "श्रेणी"], + "gor": ["Dalala"], + "got": ["𐌷𐌰𐌽𐍃𐌰"], + "gu": ["શ્રેણી", "CAT", "શ્રે"], + "gv": ["Ronney"], + "hak": ["分類"], + "haw": ["Māhele"], + "he": ["קטגוריה", "קט"], + "hi": ["श्र", "श्रेणी"], + "hif": ["vibhag"], + "hr": ["CT", "KT", "Kategorija"], + "hsb": ["Kategorija"], + "ht": ["Kategori"], + "hu": ["Kategória"], + "hy": ["Կատեգորիա"], + "ia": ["Categoria"], + "id": ["Kategori"], + "ie": ["Categorie"], + "ig": ["Ébéonọr", "Òtù"], + "ii": ["分类"], + "ilo": ["Kategoria"], + "inh": ["ОагӀат"], + "io": ["Kategorio"], + "is": ["Flokkur"], + "it": ["CAT", "Categoria"], + "ja": ["カテゴリ"], + "jbo": ["klesi"], + "jv": ["Kategori"], + "ka": ["კატეგორია"], + "kaa": ["Sanat", "Kategoriya", "Санат", "سانات"], + "kab": ["Taggayt"], + "kbd": ["Категория", "Категориэ"], + "kbp": ["Catégorie"], + "kg": ["Kalasi"], + "kk": ["Sanat", "Санат", "سانات"], + "kl": ["Sumut_atassuseq", "Kategori", "Sumut atassuseq"], + "km": ["ចំនាត់ថ្នាក់ក្រុម", "ចំណាត់ក្រុម", "ចំណាត់ថ្នាក់ក្រុម"], + "kn": ["ವರ್ಗ"], + "ko": ["분류"], + "koi": ["Категория"], + "krc": ["Категория"], + "ks": ["زٲژ"], + "ksh": [ + "Saachjropp", "Saachjrop", "Katejori", "Kategorie", "Saachjrupp", + "Kattejori", "Sachjrop" + ], + "ku": ["Kategorî", "پۆل"], + "kv": ["Категория"], + "kw": ["Class", "Klass"], + "ky": ["Категория"], + "la": ["Categoria"], + "lad": ["Kateggoría", "Katēggoría", "Categoría"], + "lb": ["Kategorie"], + "lbe": ["Категория"], + "lez": 
["Категория"], + "lfn": ["Categoria"], + "li": ["Categorie", "Kategorie"], + "lij": ["Categorîa", "Categoria"], + "lmo": ["Categuria", "Categoria"], + "ln": ["Catégorie"], + "lo": ["ໝວດ"], + "lrc": ["دأسە"], + "lt": ["Kategorija"], + "ltg": ["Kategoreja"], + "lv": ["Kategorija"], + "mai": ["CA", "श्रेणी"], + "map-bms": ["Kategori"], + "mdf": ["Категорие", "Категория"], + "mg": ["Sokajy", "Catégorie"], + "mhr": ["Категория", "Категорий"], + "min": ["Kategori"], + "mk": ["Категорија"], + "ml": ["വിഭാഗം", "വി", "വർഗ്ഗം", "വ"], + "mn": ["Ангилал"], + "mr": ["वर्ग"], + "mrj": ["Категори", "Категория"], + "ms": ["Kategori"], + "mt": ["Kategorija"], + "mwl": ["Catadorie", "Categoria"], + "my": ["ကဏ္ဍ"], + "myv": ["Категория"], + "mzn": ["رج", "رده"], + "nah": ["Neneuhcāyōtl", "Categoría"], + "nap": ["Categurìa", "Categoria"], + "nds": ["Kategorie"], + "nds-nl": ["Categorie", "Kattegerie", "Kategorie"], + "ne": ["श्रेणी"], + "new": ["पुचः"], + "nl": ["Categorie"], + "nn": ["Kategori"], + "no": ["Kategori"], + "nrm": ["Catégorie"], + "nso": ["Setensele"], + "nv": ["Tʼááłáhági_átʼéego", "Tʼááłáhági átʼéego"], + "oc": ["Categoria"], + "olo": ["Kategourii"], + "or": ["ବିଭାଗ", "ଶ୍ରେଣୀ"], + "os": ["Категори"], + "pa": ["ਸ਼੍ਰੇਣੀ"], + "pcd": ["Catégorie"], + "pdc": ["Abdeeling", "Kategorie"], + "pfl": ["Kadegorie", "Sachgrubb", "Kategorie"], + "pi": ["विभाग"], + "pl": ["Kategoria"], + "pms": ["Categorìa"], + "pnb": ["گٹھ"], + "pnt": ["Κατηγορίαν"], + "ps": ["وېشنيزه"], + "pt": ["Categoria"], + "qu": ["Katiguriya"], + "rm": ["Categoria"], + "rmy": ["Shopni"], + "ro": ["Categorie"], + "roa-rup": ["Categorie"], + "roa-tara": ["Categoria"], + "ru": ["Категория", "К"], + "rue": ["Категория", "Катеґорія"], + "rw": ["Ikiciro"], + "sa": ["वर्गः"], + "sah": ["Категория"], + "sat": ["ᱛᱷᱚᱠ"], + "sc": ["Categoria"], + "scn": ["Catigurìa"], + "sd": ["زمرو"], + "se": ["Kategoriija"], + "sg": ["Catégorie"], + "sh": ["Kategorija", "Категорија"], + "si": ["ප්‍රවර්ගය"], + "sk": ["Kategória"], + "sl": ["Kategorija"], + "sq": ["Kategoria", "Kategori"], + "sr": ["Kategorija", "Категорија"], + "srn": ["Categorie", "Guru"], + "stq": ["Kategorie"], + "su": ["Kategori"], + "sv": ["Kategori"], + "sw": ["Jamii"], + "szl": ["Kategoryjo", "Kategoria"], + "ta": ["பகுப்பு"], + "tcy": ["ವರ್ಗೊ"], + "te": ["వర్గం"], + "tet": ["Kategoría", "Kategoria"], + "tg": ["Гурӯҳ"], + "th": ["หมวดหมู่"], + "ti": ["መደብ"], + "tk": ["Kategoriýa"], + "tl": ["Kategorya", "Kaurian"], + "tpi": ["Grup"], + "tr": ["Kategori", "KAT"], + "tt": ["Төркем", "Törkem", "Категория"], + "ty": ["Catégorie"], + "tyv": ["Аңгылал", "Категория"], + "udm": ["Категория"], + "ug": ["تۈر"], + "uk": ["Категория", "Категорія"], + "ur": ["زمرہ"], + "uz": ["Turkum", "Kategoriya"], + "vec": ["Categoria"], + "vep": ["Kategorii"], + "vi": ["Thể_loại", "Thể loại"], + "vls": ["Categorie"], + "vo": ["Klad"], + "wa": ["Categoreye"], + "war": ["Kaarangay"], + "wo": ["Wàll", "Catégorie"], + "wuu": ["分类"], + "xal": ["Янз", "Әәшл"], + "xmf": ["კატეგორია"], + "yi": ["קאטעגאריע", "קאַטעגאָריע"], + "yo": ["Ẹ̀ka"], + "za": ["分类"], + "zea": ["Categorie"], + "zh": ["分类", "分類", "CAT"], + "zh-classical": ["分類", "CAT"], + "zh-min-nan": ["分類", "Lūi-pia̍t"], + "zh-yue": ["分类", "分類", "类", "類"], +} + + +class WikipediaDownloader(DocumentDownloader): + + def __init__(self, download_dir, verbose=False): + super().__init__() + self._download_dir = download_dir + self._verbose = verbose + self._lock = Lock(name="wikipedia_downloader") + + def download(self, url): + urlpath = urlparse(url).path[1:] + 
output_name = urlpath.replace('/', '-') + output_file = os.path.join(self._download_dir, output_name) + if os.path.exists(output_file): + print(f"bz2 file: {output_file} exists. Not downloading") + else: + print(f"Downloading {url} and writing to {output_file}") + # Download with either wget or s5cmd (aws) + cmd = ["wget", url, "-O", output_file] + if self._verbose: + stdout, stderr = None, None + else: + stdout, stderr = subprocess.DEVNULL, subprocess.DEVNULL + with self._lock: + p = subprocess.run( + cmd, + stdout=stdout, + stderr=stderr, + ) + if p.returncode != 0: + print(f"Failed to download {url} to {output_file}") + + return output_file + + +class WikipediaIterator(DocumentIterator): + + def __init__(self, language='en', log_frequency=1000): + super().__init__() + self._language = language + self._log_frequency = log_frequency + self._counter = 0 + + def iterate(self, file_path): + self._counter = 0 + bname = os.path.split(file_path)[-1] + input_file = bz2.BZ2File(filename=file_path) + utf_f = codecs.getreader("utf-8")(input_file) + context = etree.iterparse(utf_f, events=("end",)) + + for i, (unused_event, elem) in enumerate(context): + if not elem.tag.endswith("page"): + continue + if self._counter > 0 and self._counter % self._log_frequency == 0: + print(f"Extracted {self._counter} articles from {file_path}") + self._counter += 1 + + namespace = elem.tag[:-4] + title = elem.find(f"./{namespace}title").text + ns = elem.find(f"./{namespace}ns").text + id_ = elem.find(f"./{namespace}id").text + red_ = elem.find(f"./{namespace}redirect") + + url = f"https://{self._language}.wikipedia.org/wiki/{quote(title)}" + + # Filter pages that are not in the "main" namespace. + if ns != "0": + elem.clear() + continue + + raw_content = elem.find(f"./{namespace}revision/{namespace}text").text + elem.clear() + + # Filter redirects. + if raw_content is None or red_ is not None: + continue + + yield { + 'title': title, + 'id': id_, + 'url': url, + 'language': self._language, + 'source_id': f'{bname}', + }, raw_content + + +class WikipediaExtractor(DocumentExtractor): + + def __init__(self, language='en', parser=mwparserfromhell): + super().__init__() + self._language = language + self._parser = parser + + def extract(self, content): + wikicode = self._parser.parse(content) + + # Filters for magic words that are parser instructions -- e.g., __NOTOC__ + re_rm_magic = re.compile("__[A-Z]*__", flags=re.UNICODE) + + # Filters for file/image links. + media_prefixes = "|".join(["File", "Image", "Media"] + + MEDIA_ALIASES.get(self._language, [])) + re_rm_wikilink = re.compile(f"^(?:{media_prefixes}):", + flags=re.IGNORECASE | re.UNICODE) + + def rm_wikilink(obj): + return bool(re_rm_wikilink.match(str(obj.title))) + + # Filters for references and tables + def rm_tag(obj): + return str(obj.tag) in {"ref", "table"} + + # Leave category links in-place but remove the category prefixes + cat_prefixes = "|".join(["Category"] + CAT_ALIASES.get(self._language, [])) + re_clean_wikilink = re.compile(f"^(?:{cat_prefixes}):", + flags=re.IGNORECASE | re.UNICODE) + + def is_category(obj): + return bool(re_clean_wikilink.match(str(obj.title))) + + def clean_wikilink(obj): + text = obj.__strip__() + text = re.sub(re_clean_wikilink, "", text) + obj.text = text + + def try_replace_obj(obj): + try: + clean_wikilink(obj) + except ValueError: + # For unknown reasons, objects are sometimes not found. 
+ pass + + def try_remove_obj(obj, section): + try: + section.remove(obj) + except ValueError: + # For unknown reasons, objects are sometimes not found. + pass + + section_text = [] + # Filter individual sections to clean. + wiki_code_kwargs = { + 'flat': True, + 'include_lead': True, + 'include_headings': True, + } + for section in wikicode.get_sections(**wiki_code_kwargs): + for obj in section.ifilter_wikilinks(recursive=True): + if rm_wikilink(obj): + try_remove_obj(obj, section) + elif is_category(obj): + try_replace_obj(obj) + for obj in section.ifilter_tags(matches=rm_tag, recursive=True): + try_remove_obj(obj, section) + + section_text.append(re.sub( + re_rm_magic, + "", + section.strip_code().strip(), + )) + # Don't return any meta here + return {}, "\n\n".join(section_text) + + +def download_wikipedia(output_path: str, language: str="en", dump_date=None, output_type: str="jsonl", raw_download_dir=None, keep_raw_download=False, force_download=False, url_limit=None) -> DocumentDataset: + """ + Downloads the latest Wikipedia dumps and extracts them using mwparserfromhell + + Args: + output_path: The path to the root directory of the files + language: The language of the Wikipedia articles to download + dump_date: A string formatted as "YYYYMMDD" for the wikipedia dump to use. + If None, latest dump is used. + output_type: The file type to save the data as. + raw_download_dir: Path to store the raw download files for intermediate processing. + If None, they are stored in a folder named "downloads" under output_path. + keep_raw_download: If True, keeps the bz2 files that have not been extracted. + force_download: If False, will skip processing all files in output_paths that already exist and + directly read from them instead. + url_limit: The maximum number of raw files to download from the snapshot. If None, all + files from the range of snapshots are downloaded. + """ + wikipedia_urls = get_wikipedia_urls(language=language, dump_date=dump_date) + if url_limit: + wikipedia_urls = wikipedia_urls[:url_limit] + output_paths = list(map(lambda url: os.path.join(output_path, url.split("/")[-1] + f".{output_type}"), wikipedia_urls)) + + if not raw_download_dir: + raw_download_dir = os.path.join(output_path, "downloads") + expand_outdir_and_mkdir(raw_download_dir) + + downloader = WikipediaDownloader(raw_download_dir) + iterator = WikipediaIterator(language=language) + extractor = WikipediaExtractor(language=language) + + output_format = { + "text": str, + "title": str, + "id": str, + "url": str, + "language": str, + "source_id": str, + "filename": str, + } + dataset = download_and_extract(wikipedia_urls, output_paths, downloader, iterator, extractor, output_format, output_type=output_type, keep_raw_download=keep_raw_download, force_download=force_download) + + return dataset \ No newline at end of file diff --git a/nemo_curator/filters/__init__.py b/nemo_curator/filters/__init__.py new file mode 100644 index 00000000..18f46afd --- /dev/null +++ b/nemo_curator/filters/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .doc_filter import DocumentFilter, import_filter +from .classifier_filter import FastTextLangId, FastTextQualityFilter, BatchedFastTextQualityFilter +from .heuristic_filter import NonAlphaNumericFilter, SymbolsToWordsFilter, NumbersFilter, UrlsFilter, BulletsFilter, WhiteSpaceFilter, ParenthesesFilter, LongWordFilter, WordCountFilter, BoilerPlateStringFilter, MeanWordLengthFilter, RepeatedLinesFilter, RepeatedParagraphsFilter, RepeatedLinesByCharFilter, RepeatedParagraphsByCharFilter, RepeatingTopNGramsFilter, RepeatingDuplicateNGramsFilter, PunctuationFilter, EllipsisFilter, CommonEnglishWordsFilter, WordsWithoutAlphabetsFilter, PornographicUrlsFilter +from .code import PythonCommentToCodeFilter, GeneralCommentToCodeFilter, NumberOfLinesOfCodeFilter, TokenizerFertilityFilter, XMLHeaderFilter, AlphaFilter, HTMLBoilerplateFilter, PerExtensionFilter + +__all__ = ["BatchedFastTextQualityFilter", "DocumentFilter", "import_filter", "FastTextLangId", "FastTextQualityFilter", "NonAlphaNumericFilter", "SymbolsToWordsFilter", "NumbersFilter", "UrlsFilter", "BulletsFilter", "WhiteSpaceFilter", "ParenthesesFilter", "LongWordFilter", "WordCountFilter", "BoilerPlateStringFilter", "MeanWordLengthFilter", "RepeatedLinesFilter", "RepeatedParagraphsFilter", "RepeatedLinesByCharFilter", "RepeatedParagraphsByCharFilter", "RepeatingTopNGramsFilter", "RepeatingDuplicateNGramsFilter", "PunctuationFilter", "EllipsisFilter", "CommonEnglishWordsFilter", "WordsWithoutAlphabetsFilter", "PornographicUrlsFilter", "PythonCommentToCodeFilter", "GeneralCommentToCodeFilter", "NumberOfLinesOfCodeFilter", "TokenizerFertilityFilter", "XMLHeaderFilter", "AlphaFilter", "HTMLBoilerplateFilter", "PerExtensionFilter"] + diff --git a/nemo_curator/filters/classifier_filter.py b/nemo_curator/filters/classifier_filter.py new file mode 100644 index 00000000..74d54ad7 --- /dev/null +++ b/nemo_curator/filters/classifier_filter.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
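+
+# Illustrative sketch of calling these filters by hand (the model path is
+# hypothetical; within the curator these classes are normally wrapped by a
+# filtering module rather than used directly). Note that outside of a Dask
+# worker, load_object_on_worker raises NoWorkerError and the filters fall back
+# to placeholder scores, as handled below:
+#
+#   lang_id = FastTextLangId(model_path="/models/lid.176.bin")
+#   score = lang_id.score_document("Ceci est un exemple de document.")
+#   # score is a [probability, language_code] pair, e.g. [0.98, "FR"]
+#   keep = lang_id.keep_document(score)   # True when probability >= min_langid_score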
+ +import fasttext +import numpy as np +import pandas as pd + +from nemo_curator.filters import DocumentFilter +from nemo_curator.utils.distributed_utils import load_object_on_worker, NoWorkerError + + +class FastTextQualityFilter(DocumentFilter): + + def __init__(self, model_path=None, label='__label__hq', alpha=3, seed=42): + if model_path is None: + raise ValueError("Must provide a valid path to a FastText model " + "to compute document scores with this filter") + self._model_path = model_path + self._label = label + self._alpha = alpha + self._seed = np.random.seed(seed) + self._name = 'fasttext_quality_filter' + + def score_document(self, text): + text = text.replace('\n', ' ').replace('__label__', ' ') + model_attr = f"{self._name}_{self._model_path}" + # Workers don't exist during type inference + try: + model = load_object_on_worker(model_attr, self._load_model, {}) + except NoWorkerError: + return 1.0 + pred = model.predict(text) + document_score = pred[1][0] + if pred[0][0] != self._label: + document_score = 1 - document_score + + return document_score + + def keep_document(self, score): + return np.random.pareto(self._alpha) > 1 - score + + def _load_model(self): + return fasttext.load_model(self._model_path) + +class BatchedFastTextQualityFilter(DocumentFilter): + + def __init__(self, model_path=None, label='__label__hq', alpha=3, seed=42): + if model_path is None: + raise ValueError("Must provide a valid path to a FastText model " + "to compute document scores with this filter") + self._model_path = model_path + self._label = label + self._alpha = alpha + self._seed = np.random.seed(seed) + self._name = 'fasttext_quality_filter' + + def score_document(self, df): + model_attr = f"{self._name}_{self._model_path}" + try: + model = load_object_on_worker(model_attr, self._load_model, {}) + except NoWorkerError: + return pd.Series(np.ones(len(df)), dtype=float) + + def _score_document(text): + text = text.replace('\n', ' ').replace('__label__', ' ') + pred = model.predict(text) + document_score = pred[1][0] + if pred[0][0] != self._label: + document_score = 1 - document_score + + return document_score + + return df.apply(_score_document) + + def keep_document(self, df): + return np.random.pareto(self._alpha, size=len(df)) > 1 - df + + def _load_model(self): + return fasttext.load_model(self._model_path) + + +class FastTextLangId(DocumentFilter): + + def __init__(self, model_path=None, min_langid_score=0.3): + if model_path is None: + raise ValueError("Must provide a valid path to a FastText model " + "to identify languages with this filter") + self._model_path = model_path + self._lang_code = None + self._cutoff = min_langid_score + self._name = "lang_id" + + def score_document(self, text): + pp = text.strip().replace('\n', ' ') + + model_attr = f"{self._name}_{self._model_path}" + try: + model = load_object_on_worker(model_attr, self._load_model, {}) + except NoWorkerError: + return [1.0, 'N/A'] + label, score = model.predict(pp, k=1) + score = score[0] + lang_code = label[0][-2:].upper() + + return [score, lang_code] + + def keep_document(self, score): + return score[0] >= self._cutoff + + def _load_model(self): + return fasttext.load_model(self._model_path) diff --git a/nemo_curator/filters/code.py b/nemo_curator/filters/code.py new file mode 100644 index 00000000..44868c29 --- /dev/null +++ b/nemo_curator/filters/code.py @@ -0,0 +1,303 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from comment_parser import comment_parser +import numpy as np +import csv +from bs4 import BeautifulSoup +from nemo.collections.common.tokenizers import SentencePieceTokenizer + +from nemo_curator.filters import DocumentFilter, import_filter +from nemo_curator.utils.text_utils import get_comments_and_docstring +from nemo_curator.utils.constants import regex_alpha, regex_alphanum + + +class PythonCommentToCodeFilter(DocumentFilter): + + def __init__( + self, + min_comment_to_code_ratio=0.01, + max_comment_to_code_ratio=0.85, + ): + self._min_threshold = min_comment_to_code_ratio + self._max_threshold = max_comment_to_code_ratio + + self._name = 'python_comment_ratio' + + def score_document(self, source): + docstrings, comments = get_comments_and_docstring(source) + if docstrings is None or comments is None: + return 0 + # No need to be super precise about the way this formatted, + # we just need to count the number of characters. + return (len(comments) + len(docstrings)) / len(source) + + def keep_document(self, score): + return self._min_threshold <= score <= self._max_threshold + + +class GeneralCommentToCodeFilter(DocumentFilter): + + def __init__( + self, + language, + min_comment_to_code_ratio=0.01, + max_comment_to_code_ratio=0.85, + ): + """ + Does not include the comment characters (// or /**/) towards the length of the comment. 
+        Args:
+            language: Mime string of language
+        """
+        self._lang = language
+        self._min_threshold = min_comment_to_code_ratio
+        self._max_threshold = max_comment_to_code_ratio
+
+        self._name = 'comment_ratio'
+
+    def score_document(self, source):
+        try:
+            comments = comment_parser.extract_comments_from_str(
+                source,
+                mime=self._lang,
+            )
+            comments = ' '.join([x.text() for x in comments])
+        except Exception:
+            warnings.warn("Tokenization error, no comments were extracted")
+            return 9999
+        if comments is None:
+            return 0
+        return len(comments) / len(source)
+
+    def keep_document(self, score):
+        return self._min_threshold <= score <= self._max_threshold
+
+
+class NumberOfLinesOfCodeFilter(DocumentFilter):
+
+    def __init__(self, min_lines=10, max_lines=20000):
+        self._min_lines = min_lines
+        self._max_lines = max_lines
+
+        self._name = 'num_lines'
+
+    def score_document(self, source):
+        return len(source.split('\n'))
+
+    def keep_document(self, score):
+        return self._min_lines <= score <= self._max_lines
+
+
+class TokenizerFertilityFilter(DocumentFilter):
+
+    def __init__(self, path_to_tokenizer=None, min_char_to_token_ratio=2.5):
+        if path_to_tokenizer is None:
+            raise ValueError("Must provide a valid path to a SentencePiece "
+                             "tokenizer")
+        self._tokenizer = SentencePieceTokenizer(path_to_tokenizer)
+        self._threshold = min_char_to_token_ratio
+
+        self._name = 'tokenizer_fertility'
+
+    def score_document(self, source):
+        tokens = self._tokenizer.text_to_tokens(source)
+        num_chars = len(source)
+        num_tokens = len(tokens)
+        if num_tokens == 0:
+            return -1
+        return num_chars / num_tokens
+
+    def keep_document(self, score):
+        return score >= self._threshold
+
+
+class XMLHeaderFilter(DocumentFilter):
+    """
+    This filter tries to identify files that have incorrect file extensions.
+    In many cases, these end up being XML files and we try to identify them
+    based on the header.
+    (Source: Starcoder https://arxiv.org/abs/2305.06161)
+    """
+    def __init__(self, char_prefix_search_length=100):
+        self._char_prefix_search_length = char_prefix_search_length
+
+        self._name = 'xml_header'
+
+    def score_document(self, source):
+        source_prefix = source[:self._char_prefix_search_length]
+        if "<?xml version=" in source_prefix:
+            return 1
+        return 0
+
+    def keep_document(self, score):
+        return score != 1
+
+
+class AlphaFilter(DocumentFilter):
+    """
+    This filter keeps files whose fraction of alphabetic characters is above a
+    minimum threshold.
+    (Source: Starcoder https://arxiv.org/abs/2305.06161)
+    """
+    def __init__(self, min_alpha_ratio=0.25):
+        self._min_alpha_ratio = min_alpha_ratio
+
+        self._name = 'alpha_filter'
+
+    def score_document(self, source):
+        return len(regex_alpha.findall(source)) / len(source)
+
+    def keep_document(self, score):
+        return score >= self._min_alpha_ratio
+
+
+class HTMLBoilerplateFilter(DocumentFilter):
+    """
+    This filter tries to identify HTML that is largely boilerplate.
+    """
+    def __init__(self, min_lang_content_ratio=0.2, min_lang_content_num_chars=100):
+        self._min_lang_content_ratio = min_lang_content_ratio
+        self._min_lang_content_num_chars = min_lang_content_num_chars
+
+        self._name = 'html_boilerplate'
+
+    def score_document(self, source):
+        try:
+            soup = BeautifulSoup(source, features="html.parser")
+        except (TypeError, UnboundLocalError):
+            return None
+
+        # Remove all script and style elements
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Get the visible text and its ratio to the raw HTML length
+        text = soup.get_text()
+        ratio = len(text) / len(source)
+
+        if len(text) < self._min_lang_content_num_chars:
+            return 0
+
+        return ratio
+
+    def keep_document(self, score):
+        return score >= self._min_lang_content_ratio
+
+
+class PerExtensionFilter(DocumentFilter):
+    """
+    This filter applies conditions that depend on the file extension.
+ """ + def __init__(self, lang, extension, metadata_file='code_meta.csv'): + self._metadata_file = metadata_file + self._lang = lang + self._extension = extension + self._ext_to_filter = self._load_filter_csv(metadata_file, lang) + self._name = 'per_extension_filter' + + def _load_filter_csv(self, path: str, language: str = None): + """Load csv file that specifies the filter to apply for each (lang, extension). + TODO: add some tests. Check that filters are correctly set.""" + # (Lang, extension) -> filter_args + ext_to_filter = {} + with open(path) as f: + for row in csv.DictReader(f): + # Only take the rows corresponding to the language if specified + if language is None or row["language"] == language: + ext_to_filter[(row["language"], row["extension"])] = self._get_filter_params(row) + assert len(ext_to_filter) > 0, f"Did not find filtering params corresponding to language: `{language}` in: {path}" + + return ext_to_filter + + def _get_filter_params(self, row: dict): + """Extract filter parameters from csv row""" + include = row["Include"] == "1" + try: + line_max = int(row["Long_line_threshold"]) + except ValueError: + line_max = None + line_mean = 100 if line_max else None + try: + alphanum_frac = float(row["Alphanum_threshold"]) + except ValueError: + alphanum_frac = None + try: + alphabetic_frac = float(row["Alpha filter"]) + except ValueError: + alphabetic_frac = None + return include, line_max, line_mean, alphanum_frac, alphabetic_frac + + def _language_format_from_dataset(self, lang: str): + """Convert: Language field in dataset -> language field in csv file that defines the filters.""" + # TODO: other special cases? + if lang == "C#": + return "c-sharp" + if lang == "F#": + return "f-sharp" + return lang.lower().replace(" ", "-") + + def _line_statistics(self, source): + lengths = [len(x) for x in source.split('\n')] + max_length = max(lengths) + mean_length = np.mean(lengths) + + return max_length, mean_length + + def _alphanum_fraction(self, source): + return len(regex_alphanum.findall(source)) / len(source) + + def score_document(self, source): + """Filter files based on line length and % alphanumeric characters. + The filtering parameters depend on the file extension, given by `ext_to_filter`""" + # Get the filter-params we want to use + # extension `None` is an empty string in the csv + try: + (include, line_max, line_mean, alphanum_frac, alphabetic_frac) = self._ext_to_filter[(self._language_format_from_dataset( + self._lang), self._extension if self._extension is not None else "" + )] + except KeyError as e: + # Some extensions are not in the csv. This happens for dockerfiles. 
+ # Exclude these files + print(str(e) + f":{self._extension} not in ext_to_filter") + include = False + + if not include: + return 0 + + max_length, mean_length = self._line_statistics(source) + + if line_max and max_length > line_max: + return 0 + + elif line_mean and mean_length > line_mean: + return 0 + + # Filter files with low percentage of alphanumeric chars + elif alphanum_frac and self._alphanum_fraction(source) < alphanum_frac: + return 0 + + # Filter files with low percentage of alphabetic chars + elif alphabetic_frac and sum(map(str.isalpha, source)) < alphabetic_frac * len(source): + return 0 + + return 1 + + def keep_document(self, score): + if score is None or score == 0: + return False + else: + return True + diff --git a/nemo_curator/filters/doc_filter.py b/nemo_curator/filters/doc_filter.py new file mode 100644 index 00000000..c6013e47 --- /dev/null +++ b/nemo_curator/filters/doc_filter.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +from abc import ABC, abstractmethod + + +class DocumentFilter(ABC): + + def __init__(self): + super().__init__() + self._name = self.__class__.__name__ + self._sentences = None + self._paragraphs = None + self._ngrams = None + + @abstractmethod + def score_document(self, text): + pass + + @abstractmethod + def keep_document(self, scores): + pass + + @property + def name(self): + return self._name + + @property + def sentences(self): + return self._sentences + + @sentences.setter + def sentences(self, sentences): + self._sentences = sentences + + @property + def paragraphs(self): + return self._paragraphs + + @paragraphs.setter + def paragraphs(self, paragraphs): + self._paragraphs = paragraphs + + @property + def ngrams(self): + return self._ngrams + + @ngrams.setter + def ngrams(self, ngrams): + self._ngrams = ngrams + + +def import_filter(filter_path): + module_path, filter_name = filter_path.rsplit(".", 1) + filter_module = importlib.import_module(module_path) + filter_class = getattr(filter_module, filter_name) + if not issubclass(filter_class, DocumentFilter): + raise ValueError(f"Input filter {filter_class.__name__} must be derived " + "from DocumentFilter defined in nemo_curator.filters.doc_filter") + return filter_class diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py new file mode 100644 index 00000000..5936ede9 --- /dev/null +++ b/nemo_curator/filters/heuristic_filter.py @@ -0,0 +1,636 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import regex + +from nemo_curator.filters import DocumentFilter, import_filter +from nemo_curator.utils.constants import ( + regex_alphanum, + regex_hash, + regex_digit, + regex_url, + regex_paren, + regex_alpha, + ellipsis_marks, + bullet_list, + white_space_list, + policy_substrings, + end_marks, + common_english_words, +) +from nemo_curator.utils.text_utils import ( + get_word_splitter, + get_paragraphs, + get_sentences, + get_ngrams, + is_paragraph_indices_in_top_or_bottom_only, +) + + +class NonAlphaNumericFilter(DocumentFilter): + """ + If more than 25% of the document is non-alphanumeric then discard + Intended to be applied only too english text + Source: Adapted from Gopher (Rae et al., 2021) + """ + + def __init__(self, max_non_alpha_numeric_to_text_ratio=0.25): + super().__init__() + self._cutoff = max_non_alpha_numeric_to_text_ratio + self._name = 'alpha_numeric' + + def score_document(self, text): + nchar = len(text) + if nchar > 0: + score = (nchar - len(regex_alphanum.findall(text))) / nchar + else: + # Remove the document if it is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class SymbolsToWordsFilter(DocumentFilter): + """ + Remove any document with symbol-to-word ratio greater than + 0.1 for either the hash symbol or the elipsis + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_symbol_to_word_ratio=0.1, lang='en'): + super().__init__() + self._cutoff = max_symbol_to_word_ratio + self._word_splitter = get_word_splitter(lang) + self._name = 'symbol_to_word' + + def score_document(self, text): + num_symbol_words = 0 + words = self._word_splitter(text.strip()) + for word in words: + word = word.strip() + # Checks if the word is an elipsis or consists mostly of symbols. 
+ symbol_ratio = len(regex_hash.findall(word)) / len(word) + if word in ellipsis_marks or symbol_ratio > 0.5: + num_symbol_words += 1 + return num_symbol_words / len(words) + + def keep_document(self, score): + return score <= self._cutoff + + +class NumbersFilter(DocumentFilter): + """ + If more than 15% of the document contains numbers then discard + """ + + def __init__(self, max_number_to_text_ratio=0.15): + super().__init__() + self._cutoff = max_number_to_text_ratio + self._name = 'numbers_ratio' + + def score_document(self, text): + nchar = len(text) + if nchar > 0: + score = len(regex_digit.findall(text)) / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class UrlsFilter(DocumentFilter): + """ + If more than 20% of the document is comprised of URLs then discard + """ + + def __init__(self, max_url_to_text_ratio=0.2): + super().__init__() + self._cutoff = max_url_to_text_ratio + self._name = 'urls_ratio' + + def score_document(self, text): + all_urls = regex_url.findall(text) + url_chars = sum([len(url) for url in all_urls]) + nchar = len(text) + if nchar > 0: + score = url_chars / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class BulletsFilter(DocumentFilter): + """ + If more than 90% of the lines start with a bullet then discard + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_bullet_lines_ratio=0.9): + super().__init__() + self._cutoff = max_bullet_lines_ratio + self._sentences = None + self._name = 'bullet_ratio' + + def score_document(self, text): + # Get sentences + sentences = self._sentences + if sentences is None: + sentences = get_sentences(text) + num_bullet_lines = 0 + for sentence in sentences: + for bullet in bullet_list: + if sentence.strip().startswith(bullet): + num_bullet_lines += 1 + break + return num_bullet_lines / len(sentences) + + def keep_document(self, score): + return score <= self._cutoff + + +class WhiteSpaceFilter(DocumentFilter): + """ + If the document contains a significant number + of white space characters then discard + """ + + def __init__(self, max_white_space_ratio=0.25): + super().__init__() + self._cutoff = max_white_space_ratio + self._name = 'white_space' + + def score_document(self, text): + # Do not strip the document since we want to + # include leading and trailing whitepsaces as well. + nchar = len(text) + if nchar > 0: + score = len([x for x in text if x in white_space_list]) / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class ParenthesesFilter(DocumentFilter): + """ + If more than 10% of the sentence is in parentheses then discard + """ + + def __init__(self, max_parentheses_ratio=0.1): + super().__init__() + self._max_parentheses_ratio = max_parentheses_ratio + self._name = 'parentheses_ratio' + + def score_document(self, text): + nchar = len(text) + if nchar > 0: + score = len(regex_paren.findall(text)) / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._max_parentheses_ratio + + +class LongWordFilter(DocumentFilter): + """ + If the document contains a word longer than 1000 characters then discard + NOTE: This seems to be catching things like minified `.js` files + that don't have spaces anywhere. 
+ Source: C4 (Google) + """ + + def __init__(self, max_word_length=1000, lang='en'): + super().__init__() + self._max_word_length = max_word_length + self._word_splitter = get_word_splitter(lang) + self._name = 'max_word_length' + + def score_document(self, text): + return max(len(w) for w in self._word_splitter(text.strip())) + + def keep_document(self, score): + return score <= self._max_word_length + + +class WordCountFilter(DocumentFilter): + """ + If a document contains a number of words not + within a specified range then discard + """ + + def __init__(self, min_words=50, max_words=100000, lang='en'): + super().__init__() + self._min_words = min_words + self._max_words = max_words + self._word_splitter = get_word_splitter(lang) + self._name = 'word_count' + + def score_document(self, text): + return len(self._word_splitter(text.strip())) + + def keep_document(self, score): + return self._min_words <= score <= self._max_words + + +class BoilerPlateStringFilter(DocumentFilter): + """ + If more than 40% of paragraphs contain boilerplate strings then discard. + This includes things like "terms of use", "privacy policy", etc. + Source: Adapted significantly from Google C4 processing. + """ + + def __init__( + self, + remove_if_at_top_or_bottom=True, + max_boilerplate_string_ratio=0.4, + ): + super().__init__() + self._remove_if_at_top_or_bottom = remove_if_at_top_or_bottom + self._max_boilerplate_string_ratio = max_boilerplate_string_ratio + self._boilerplate_paragraph_indices = [] + self._max_ratio = 1.0 + self._name = 'boilerplate_string_ratio' + + def score_document(self, text): + # Initialize variables + boilerplate_paragraph_count = 0 + + # Get the paragraphs + paragraphs = get_paragraphs(text) + + # Check each paragraph + for idx, paragraph in enumerate(paragraphs): + paragraph = paragraph.strip().lower() + if 'lorem ipsum' in paragraph: + return self._max_ratio + if any(p in paragraph for p in policy_substrings): + boilerplate_paragraph_count += 1 + + return boilerplate_paragraph_count / len(paragraphs) + + def keep_document(self, score): + return score <= self._max_boilerplate_string_ratio + + +class MeanWordLengthFilter(DocumentFilter): + """ + If the mean word length is not in a specified range then discard + """ + + def __init__( + self, + min_mean_word_length=3, + max_mean_word_length=10, + lang='en', + ): + super().__init__() + self._min_cutoff = min_mean_word_length + self._max_cutoff = max_mean_word_length + self._word_splitter = get_word_splitter(lang) + self._name = 'mean_word_length' + + def score_document(self, text): + word_lens = [ + len(w) for w in self._word_splitter(text.strip()) if len(w) > 0 + ] + return sum(word_lens) / len(word_lens) + + def keep_document(self, score): + return self._min_cutoff <= score <= self._max_cutoff + + +class RepeatedLinesFilter(DocumentFilter): + """ + If the document shrinks by > 30% in terms of number of lines after + removing duplicate lines then discard + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_repeated_line_fraction=0.7): + super().__init__() + self._cutoff = max_repeated_line_fraction + self._name = 'repeated_lines' + + def score_document(self, text): + sentences = self._sentences + if sentences is None: + sentences = get_sentences(text) + return len(set(sentences)) / len(sentences) + + def keep_document(self, score): + return score >= self._cutoff + + +class RepeatedParagraphsFilter(DocumentFilter): + """ + If the document shrinks by > 30% in terms of number of lines after + removing duplicate 
paragraphs then discard. + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_repeated_paragraphs_ratio=0.7): + super().__init__() + self._max_repeated_paragraphs_ratio = max_repeated_paragraphs_ratio + self._name = 'repeated_paragraphs' + + def score_document(self, text): + paragraphs = self._paragraphs + if paragraphs is None: + paragraphs = get_paragraphs(text) + return len(set(paragraphs)) / len(paragraphs) + + def keep_document(self, score): + return score >= self._max_repeated_paragraphs_ratio + + +class RepeatedLinesByCharFilter(DocumentFilter): + """ + If the document shrinks by > 20% in terms of number of lines + after removing duplicate lines then discard + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_repeated_lines_char_ratio=0.8): + super().__init__() + self._cutoff = max_repeated_lines_char_ratio + self._name = 'repeated_lines_char' + + def score_document(self, text): + sentences = self._sentences + if sentences is None: + sentences = get_sentences(text) + + return len(''.join(set(sentences))) / len(''.join(sentences)) + + def keep_document(self, score): + return score >= self._cutoff + + +class RepeatedParagraphsByCharFilter(DocumentFilter): + """ + If the document shrinks by > 10% in terms of number of lines after + removing duplicate paragraphs then discard. + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, max_repeated_paragraphs_char_ratio=0.8): + super().__init__() + self._cutoff = max_repeated_paragraphs_char_ratio + self._name = 'repeated_paragraphs_char' + + def score_document(self, text): + paragraphs = self._paragraphs + if paragraphs is None: + paragraphs = get_paragraphs(text) + + return len(''.join(set(paragraphs))) / len(''.join(paragraphs)) + + def keep_document(self, score): + return score >= self._cutoff + + +class RepeatingTopNGramsFilter(DocumentFilter): + """ + If the document shrinks by > x% in terms of number of characters after + removing the top n-grams then discard. + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, n=2, max_repeating_ngram_ratio=0.2, lang='en'): + super().__init__() + self._n = n + self._cutoff = max_repeating_ngram_ratio + self._max_ratio = 1.0 + self._word_splitter = get_word_splitter(lang) + self._name = f'repeating_top_{n}grams' + + def score_document(self, text): + ngrams = self._ngrams + if ngrams is None: + split_text = self._word_splitter(text.strip()) + if len(split_text) < self._n: + return self._max_ratio + ngrams = get_ngrams(split_text, self._n) + unique_ngrams = set(ngrams) + # Find the most frequent ngram in the zipped ngram list + counts = { + ngram: { + 'freq': 0, + 'num_chars': sum(len(word) for word in ngram) + } for ngram in unique_ngrams + } + for ngram in ngrams: + counts[ngram]['freq'] += 1 + most_frqnt_ngram = ' '.join(max(counts, key=lambda x: counts[x]['freq'])) + # Find the number of characters the most frequent ngram + # contributes to the document + nchar = len(text) + len_diff = nchar - len(text.replace(most_frqnt_ngram, '')) + if nchar > 0: + score = len_diff / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class RepeatingDuplicateNGramsFilter(DocumentFilter): + """ + If the document shrinks by > x% in terms of number of characters + after removing all duplicate n-grams then discard. 
+ Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, n=2, max_repeating_duplicate_ngram_ratio=0.2, lang='en'): + super().__init__() + self._n = n + self._cutoff = max_repeating_duplicate_ngram_ratio + self._max_ratio = 1.0 + self._word_splitter = get_word_splitter(lang) + self._name = f'repeating_dup_{n}gram' + + def score_document(self, text): + ngrams = self._ngrams + if ngrams is None: + split_text = self._word_splitter(text.strip()) + if len(split_text) < self._n: + return self._max_ratio + ngrams = get_ngrams(split_text, self._n) + + counts = {} + duplicated_nchar = 0 + overlapping_ngrams = 0 + for ngram in ngrams: + counts[ngram] = counts.get(ngram, 0) + 1 + if counts[ngram] > 1: + # Count the number of characters in this ngram that haven't been counted already + duplicated_ngrams = sum(len(gram) for gram in ngram[overlapping_ngrams:]) + # Count the spaces between the ngrams + nspaces = min(self._n - overlapping_ngrams, self._n - 1) + duplicated_nchar += duplicated_ngrams + nspaces + overlapping_ngrams = self._n + overlapping_ngrams = max(overlapping_ngrams - 1, 0) + + nchar = len(text) + if nchar > 0: + score = duplicated_nchar / nchar + else: + # Remove if the document is empty + score = 1.0 + return score + + def keep_document(self, score): + return score <= self._cutoff + + +class PunctuationFilter(DocumentFilter): + """ + If more than 85% of the sentences do not end with a + punctuation mark then discard. + Source: Google C4 processing + """ + + def __init__(self, max_num_sentences_without_endmark_ratio=0.85): + super().__init__() + self._cutoff = max_num_sentences_without_endmark_ratio + self._name = 'punctuation' + + def score_document(self, text): + sentences = self._sentences + if sentences is None: + sentences = get_sentences(text) + num_sentence_without_endmarks = len( + [s for s in sentences if not s.strip().endswith(end_marks)]) + return num_sentence_without_endmarks / len(sentences) + + def keep_document(self, score): + return score <= self._cutoff + + +class EllipsisFilter(DocumentFilter): + """ + If more than 30% of the sentences end with an elipsis then discard. + Source: Google C4 processing + """ + + def __init__(self, max_num_lines_ending_with_ellipsis_ratio=0.3): + super().__init__() + self._cutoff = max_num_lines_ending_with_ellipsis_ratio + self._name = 'ellipsis' + + def score_document(self, text): + sentences = self._sentences + if sentences is None: + sentences = get_sentences(text) + num_lines_ending_with_ellipsis = 0 + for sentence in sentences: + for ellipsis in ellipsis_marks: + if sentence.strip().lower().endswith(ellipsis): + num_lines_ending_with_ellipsis += 1 + break + return num_lines_ending_with_ellipsis / len(sentences) + + def keep_document(self, score): + return score <= self._cutoff + + +class CommonEnglishWordsFilter(DocumentFilter): + """ + If the sentence contains at least 2 common english words, keep + NOTE: we purposefully check for the lowercase versions of those common words + to remove documents with over-capitalization. 
+ """ + + def __init__(self, min_num_common_words=2, stop_at_false=True): + super().__init__() + self._cutoff = min_num_common_words + self._stop_at_false = stop_at_false + self._word_splitter = get_word_splitter('en') + self._name = 'common_english_words' + + def score_document(self, text): + common_word_counter = 0 + for word in self._word_splitter(text.strip()): + if word in common_english_words: + common_word_counter += 1 + if self._stop_at_false and common_word_counter >= self._cutoff: + return common_word_counter + + return common_word_counter + + def keep_document(self, score): + return score >= self._cutoff + + +class WordsWithoutAlphabetsFilter(DocumentFilter): + """ + 80% of words in a document must contain at least one alphabetic character + Source: Gopher (Rae et al., 2021) + """ + + def __init__(self, min_words_with_alphabets=0.8, lang='en'): + super().__init__() + self._cutoff = min_words_with_alphabets + self._word_splitter = get_word_splitter(lang) + self._name = 'words_without_alphabets' + + def score_document(self, text): + num_english_alpha = 0 + words = self._word_splitter(text.strip()) + for word in words: + if regex_alpha.search(word): + num_english_alpha += 1 + + return num_english_alpha / len(words) + + def keep_document(self, score): + return score >= self._cutoff + + +class PornographicUrlsFilter(DocumentFilter): + """ + Check if any of the urls within the document point to porn + """ + + def __init__(self): + super().__init__() + + def score_document(self, text): + all_urls = regex_url.findall(text) + for url in all_urls: + if 'porn' in url: + return 1 + + return 0 + + def keep_document(self, score): + return score != 1 diff --git a/nemo_curator/gpu_deduplication/__init__.py b/nemo_curator/gpu_deduplication/__init__.py new file mode 100644 index 00000000..d9155f92 --- /dev/null +++ b/nemo_curator/gpu_deduplication/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_curator/gpu_deduplication/connected_component.py b/nemo_curator/gpu_deduplication/connected_component.py new file mode 100644 index 00000000..d2d8d464 --- /dev/null +++ b/nemo_curator/gpu_deduplication/connected_component.py @@ -0,0 +1,298 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+from time import time
+
+import cudf
+import cugraph
+import cugraph.dask as dcg
+import cugraph.dask.comms.comms as Comms
+import cupy
+import dask_cudf
+import numpy as np
+from dask.dataframe.shuffle import shuffle as dd_shuffle
+from dask.utils import M
+
+from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import (
+    convert_str_pair_adlr_ids_to_int,)
+from nemo_curator.gpu_deduplication.utils import (
+    enable_spilling,
+    get_client,
+    get_num_workers,
+    parse_nc_args,
+    timer,
+)
+
+
+def sort_adlr_id(df):
+  x = df[["adlr_id_x", "adlr_id_y"]].values
+  x = cupy.sort(x, axis=1)
+  df["adlr_id_x"] = x[:, 0]
+  df["adlr_id_y"] = x[:, 1]
+  for i in ["adlr_id_x", "adlr_id_y"]:
+    df[i] = df[i].astype("uint64")
+  return df
+
+
+def thresholding(df, threshold=0.8):
+  mask = df.jaccard > threshold
+  df.loc[mask, "jaccard"] = np.int8(1)
+  df.loc[~mask, "jaccard"] = np.int8(0)
+  return df
+
+
+@timer
+def run_connected_components(jaccard_pairs_path, adlr_id_path, output_path):
+  Comms.initialize(p2p=True)
+  df = dask_cudf.read_parquet(jaccard_pairs_path,
+                              blocksize="1GB",
+                              aggregate_files=True)
+  df = df[df["jaccard"] == 1].reset_index(drop=True)
+
+  labels_df = dask_cudf.read_parquet(adlr_id_path)
+  num_nodes = len(labels_df)
+
+  self_edge_df = labels_df[["uid"]].rename(columns={"uid": "adlr_id_x"})
+  self_edge_df["adlr_id_y"] = self_edge_df["adlr_id_x"]
+
+  df = df[["adlr_id_x", "adlr_id_y"]].astype(np.int64)
+  df = dask_cudf.concat([df, self_edge_df])
+
+  G = cugraph.MultiGraph(directed=False)
+  G.from_dask_cudf_edgelist(df,
+                            source="adlr_id_x",
+                            destination="adlr_id_y",
+                            renumber=False)
+  result = dcg.weakly_connected_components(G)
+  del G
+  max_partitions = min(32, result.npartitions)
+  n_components = len(
+      result[["labels"]].drop_duplicates(split_out=max_partitions))
+  num_labels = len(result)
+  print("# of groups", n_components)
+  print("# of docs removed", num_labels - n_components)
+  labels_df = labels_df.merge(result,
+                              left_on=["uid"],
+                              right_on=["vertex"],
+                              how="inner")
+  labels_df = labels_df[["dataset_id", "doc_id", "labels"]]
+  labels_df = labels_df.rename(columns={"labels": "group"})
+  labels_df = labels_df.persist()
+  # The inner merge above should not change any rows
+  assert num_nodes == len(labels_df)
+  print(f"assert num_nodes:{num_nodes}==labels_df:{len(labels_df)} passed")
+  labels_df.to_parquet(output_path, write_index=False)
+  Comms.destroy()
+
+
+def attach_args(parser=None):
+  description = """Computes connected components"""
+  if not parser:
+    parser = parse_nc_args(description=description)
+
+  parser.add_argument(
+      "--jaccard-pairs-path",
+      type=str,
+      help="The directory containing the jaccard results",
+  )
+  parser.add_argument(
+      "--output-dir",
+      type=str,
+      help="The output directory to write results to",
+  )
+  parser.add_argument(
+      "--cache-dir",
+      type=str,
+      help="The cache directory to write intermediate results to",
+  )
+  return parser
+
+
+def delete_cache_data(path):
+  if "cache" not in path:
+    return
+  cmd = f"rm -rf {path}"
+  print(cmd)
+  os.system(cmd)
+
+
+def write_output(ddf, output_path):
+  if not isinstance(output_path, str):
+    raise TypeError(f"output_path should be str.
got {type(output_path)}") + print(f"write {output_path} ...") + ddf.to_parquet(output_path, write_index=False) + + +def get_unique_ids_per_partition(df): + unique_df_ls = [] + for tag in ["x", "y"]: + subset_df = df[[f"dataset_id_{tag}", f"doc_id_{tag}"]].drop_duplicates() + subset_df = subset_df.rename(columns={ + f"dataset_id_{tag}": "dataset_id", + f"doc_id_{tag}": "doc_id" + }) + unique_df_ls.append(subset_df) + unique_df = cudf.concat(unique_df_ls, ignore_index=True) + unique_df = unique_df.drop_duplicates() + return unique_df + + +@timer +def write_dedup_parsed_adlr_id(args): + dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" + ddf = dask_cudf.read_parquet( + args.jaccard_pairs_path, + columns=["adlr_id_x", "adlr_id_y"], + blocksize="1GB", + aggregate_files=True, + ) + ddf = ddf.map_partitions( + convert_str_pair_adlr_ids_to_int, + meta={ + "dataset_id_x": "uint32", + "doc_id_x": "int64", + "dataset_id_y": "uint32", + "doc_id_y": "int64", + }, + ) + + unique_docs = ddf.map_partitions(get_unique_ids_per_partition) + unique_docs = unique_docs.drop_duplicates(split_out=ddf.npartitions // 4) + unique_docs["uid"] = np.uint64(1) + unique_docs["uid"] = unique_docs["uid"].cumsum() + unique_docs["uid"] = unique_docs["uid"] - 1 + write_output(unique_docs, dedup_parsed_adlr_id_path) + return dedup_parsed_adlr_id_path + + +def batched_merge_and_write(ddf, ddf_adlr_id, output_path, batch_size=32): + total_batches = (ddf.npartitions + batch_size - 1) // batch_size + for batch_id, offset in enumerate(range(0, ddf.npartitions, batch_size)): + st = time() + subset_ddf = ddf.partitions[offset:offset + batch_size] + for tag in ["x", "y"]: + subset_ddf = subset_ddf.merge( + ddf_adlr_id, + left_on=[f"dataset_id_{tag}", f"doc_id_{tag}"], + right_on=["dataset_id", "doc_id"], + how="inner", + broadcast=True, + ) + subset_ddf = subset_ddf.rename(columns={"uid": f"adlr_id_{tag}"}) + subset_ddf = subset_ddf.drop( + columns=[f"dataset_id_{tag}", f"doc_id_{tag}"]) + + subset_ddf = subset_ddf[["adlr_id_x", "adlr_id_y", "jaccard"]] + output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") + subset_ddf.to_parquet(output_batch_path, write_index=False) + + et = time() + print(f"batch_id = {batch_id}/{total_batches}, time = {et - st}", + flush=True) + + +@timer +def write_encoded_jaccard_pair(args, client): + dedup_parsed_adlr_id_path = f"{args.cache_dir}/dedup_parsed_adlr_id.parquet" + output_path = f"{args.cache_dir}/encoded_jaccard_pair/" + ddf_adlr_id = dask_cudf.read_parquet(dedup_parsed_adlr_id_path, + blocksize="2GB", + aggregate_files=True) + ddf_adlr_id = ddf_adlr_id.persist() + len(ddf_adlr_id) + ddf = dask_cudf.read_parquet( + args.jaccard_pairs_path, + blocksize="256MB", + aggregate_files=True, + ) + ddf = ddf.map_partitions( + convert_str_pair_adlr_ids_to_int, + meta={ + "jaccard": "float32", + "dataset_id_x": "uint32", + "doc_id_x": "int64", + "dataset_id_y": "uint32", + "doc_id_y": "int64", + }, + ) + num_workers = get_num_workers(client) + batched_merge_and_write(ddf, ddf_adlr_id, output_path, num_workers) + + +@timer +def write_dedup_encoded_jaccard_pair(args, client): + input_path = f"{args.cache_dir}/encoded_jaccard_pair" + output_path = f"{args.cache_dir}/final_dedup_encoded_jaccard_pair.parquet" + + ddf = dask_cudf.read_parquet(input_path, + blocksize="512MB", + aggregate_files=True) + meta = {"adlr_id_x": "uint64", "adlr_id_y": "uint64", "jaccard": "float32"} + ddf = ddf.map_partitions(sort_adlr_id, meta=meta) + ddf = ddf.map_partitions(thresholding, 
meta=meta) + ddf = ddf.map_partitions( + M.drop_duplicates, + meta=ddf._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + ddf = dd_shuffle( + ddf, + ["adlr_id_x", "doc_id"], + ignore_index=True, + shuffle="tasks", + ) + ddf = ddf.map_partitions( + M.drop_duplicates, + meta=ddf._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + + write_output(ddf, output_path) + return output_path + + +def main(args): + description = """Takes a dataset consisting of document pairs + and their corresponding jaccard similarity to compute connected + components of docuements across pairs to find similar docuemnt + after applying a given threshold. The result is a dataset + consisting of all documents that are similar (above the threshold) + and the component they belong to.""" + start = time() + output_path = os.path.join(args.output_dir, "connected_components.parquet") + + client = get_client(args) + enable_spilling() + client.run(enable_spilling) + adlr_id_path = write_dedup_parsed_adlr_id(args) + write_encoded_jaccard_pair(args, client) + jaccard_pairs_path = write_dedup_encoded_jaccard_pair(args, client) + run_connected_components(jaccard_pairs_path, adlr_id_path, output_path) + print(f"All done in {time()-start:.1f} seconds") + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) + \ No newline at end of file diff --git a/nemo_curator/gpu_deduplication/ioutils.py b/nemo_curator/gpu_deduplication/ioutils.py new file mode 100644 index 00000000..88c18a17 --- /dev/null +++ b/nemo_curator/gpu_deduplication/ioutils.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Sequence + +import cudf +import dask_cudf +from dask import dataframe as dd +from tqdm import tqdm + +# TODO: +# Combine this with +# nemo_curator.distributed_utils.read_cudf_jsonl +def read_json_func(files, + engine="cudf", + include_path_column=False, + columns=None): + """ + Reads multiple Json Lines files into a cuDF + dataframe with an additional `path` column denoting the path + of the input file. + """ + if not include_path_column: + if columns: + return cudf.read_json(files, engine="cudf", lines=True)[columns] + else: + return cudf.read_json(files, engine="cudf", lines=True) + + dfs = [] + for file in files: + if columns: + df = cudf.read_json(file, engine=engine, lines=True)[columns] + else: + df = cudf.read_json(file, engine=engine, lines=True) + df["path"] = file + dfs.append(df) + return cudf.concat(dfs, ignore_index=True) + + +def bucketed_read(files, func=read_json_func, b_size=2, meta=None, **kwargs): + """ + Read files with `b_size` number of files per bucket. 
+  Users can specify their own read function via the `func` argument.
+  """
+  filepaths = [
+      files[i:i + b_size] for i in range(0, len(files), b_size)  # noqa: E203
+  ]
+  if meta:
+    return dd.from_map(func, filepaths, meta=meta, **kwargs)
+  else:
+    return dd.from_map(func, filepaths, **kwargs)
+
+
+# TODO: Remove this function
+def regular_read_json(files, include_path_column=False):
+  return dask_cudf.read_json(files,
+                             engine="cudf",
+                             lines=True,
+                             include_path_column=include_path_column)
+
+
+def batched_writing(
+    dask_df: dask_cudf.DataFrame,
+    output_path: str,
+    partition_on: Sequence[str],
+    parts_per_batch: int = 32,
+):
+  """
+  Write a dask dataframe to parquet in batches.
+  This allows us to do batched execution and prevent OOMs.
+  Args:
+    dask_df: dask dataframe to write
+    output_path: path to write to
+    partition_on: columns to partition on
+    parts_per_batch: number of partitions per batch
+  """
+
+  total_partitions = dask_df.npartitions
+  for batch_id, part_offset in tqdm(
+      enumerate(range(0, dask_df.npartitions, parts_per_batch))):
+    print(f"\nStarted processing batch id = {batch_id}", flush=True)
+    df = dask_df.partitions[part_offset:part_offset + parts_per_batch]
+    if partition_on:
+      df.to_parquet(
+          output_path,
+          partition_on=partition_on,
+          name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet",
+          write_metadata_file=False,
+      )
+    else:
+      df.to_parquet(
+          output_path,
+          name_function=lambda x: f"batch_{batch_id}_part_{x}.parquet",
+          write_metadata_file=False,
+      )
+    print(
+        f"Part {part_offset+parts_per_batch}/{total_partitions} completed",
+        flush=True,
+    )
+
+
+def strip_trailing_sep(path: str):
+  """
+  Strips a path string of trailing path separators like `/`, if any.
+  """
+  return path.rstrip(os.path.sep)
diff --git a/nemo_curator/gpu_deduplication/jaccard_compute.py b/nemo_curator/gpu_deduplication/jaccard_compute.py
new file mode 100644
index 00000000..674df14c
--- /dev/null
+++ b/nemo_curator/gpu_deduplication/jaccard_compute.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
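+
+# Example invocation (a sketch only: the paths below are placeholders, and
+# parse_nc_args from nemo_curator.gpu_deduplication.utils may add further
+# required flags such as Dask cluster options):
+#
+#   python -m nemo_curator.gpu_deduplication.jaccard_compute \
+#     --shuffled-docs-path /path/to/shuffled_docs.parquet \
+#     --output-dir /path/to/output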
+ +import os +import time + +import cudf +import dask.dataframe as dd +import numpy as np + +from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( + compute_jaccard_and_create_pair_df,) +from nemo_curator.gpu_deduplication.utils import ( + enable_spilling, + get_client, + get_num_workers, + parse_nc_args, +) + + +def create_bins(path_dicts, max_size): + path_dicts.sort(key=lambda x: x["str_bytes"], reverse=True) + bins, bin_sizes = [], [] + for path_d in path_dicts: + new_path, new_size = path_d["path"], path_d["str_bytes"] + for i, bin_size in enumerate(bin_sizes): + if bin_size + new_size <= max_size: + bins[i].append(new_path) + bin_sizes[i] += new_size + new_size = 0 + break + if new_size: + bins.append([new_path]) + bin_sizes.append(new_size) + return bins + + +def get_anchor_docs_and_string_size(path): + df = cudf.read_parquet(path) + str_bytes = df["text"].str.byte_count().sum() + is_anchor_flag = (df["adlr_id"] == df["anchor_1_adlr_id"]) | ( + df["adlr_id"] == df["anchor_0_adlr_id"]) + anchor_df = df[is_anchor_flag].reset_index(drop=True) + return anchor_df, {"path": path, "str_bytes": str_bytes} + + +def compute_jaccard_on_1_partition(path): + try: + df = cudf.read_parquet(path) + pair_df = compute_jaccard_and_create_pair_df(df) + except OverflowError: + paths = [entry.path for entry in os.scandir(os.path.join(path))] + anchor_df_str_size_ls = [ + get_anchor_docs_and_string_size(path) for path in paths + ] + anchor_df = cudf.concat( + [anchor_doc for anchor_doc, _ in anchor_df_str_size_ls], + ignore_index=True).drop_duplicates() + df_str_size = [str_size for _, str_size in anchor_df_str_size_ls] + paths = create_bins(df_str_size, np.iinfo(np.int32).max // 10) + pair_dfs = [] + for path in paths: + print(path) + df = cudf.read_parquet(path).reset_index(drop=True) + df = cudf.concat([df, anchor_df], ignore_index=True) + pair_df = compute_jaccard_and_create_pair_df(df) + pair_dfs.append(pair_df) + pair_df = cudf.concat(pair_dfs, ignore_index=True) + return pair_df + + +def run_jaccard_compute(shuffled_docs_path, output_final_results_path): + print("Starting Jaccard Computation", flush=True) + st = time.time() + paths = [ + entry.path + for entry in os.scandir(shuffled_docs_path) + if not entry.path.endswith(".txt") + ] + meta_df = cudf.DataFrame({ + "adlr_id_x": ["x"], + "adlr_id_y": ["y"], + "jaccard": np.float32([0.0]), + }) + result_df = dd.from_map(compute_jaccard_on_1_partition, paths, + meta=meta_df).reset_index(drop=True) + + result_df.to_parquet( + output_final_results_path, + write_index=False, + write_metadata_file=False, + ) + print(f"Jaccard Computing+Writing time: {time.time() - st:.1f} seconds") + + +def main(args): + description = """Computes the Jaccard similarity between document pairs + from partitioned parquet dataset. Result is a parquet dataset consiting of + document id pair along with their Jaccard similarity score. 
+ """ + OUTPUT_PATH = args.output_dir + shuffled_docs_path = args.shuffled_docs_path + output_final_results_path = os.path.join(OUTPUT_PATH, + "dedup_final_results.parquet") + client = get_client(args) + enable_spilling() + client.run(enable_spilling) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard compute script", flush=True) + + # Run actual computation + run_jaccard_compute(shuffled_docs_path, output_final_results_path) + + +def attach_args(parser=None): + description = """Computes jaccard similarity""" + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--shuffled-docs-path", + type=str, + help="The directory containing the shuffled documents", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results to", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/jaccard_map_buckets.py b/nemo_curator/gpu_deduplication/jaccard_map_buckets.py new file mode 100644 index 00000000..092d9225 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_map_buckets.py @@ -0,0 +1,194 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import time + +from dask.dataframe.shuffle import shuffle as dd_shuffle +from dask.utils import M + +from nemo_curator.gpu_deduplication.jaccard_utils.get_anchor_utils import ( + add_anchor_docs,) +from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( + get_output_map_based_on_str_bytes,) +from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( + get_bucket_ddf_from_parquet_path, + get_text_ddf_from_json_path_with_blocksize, +) +from nemo_curator.gpu_deduplication.utils import ( + get_client, + get_num_workers, + parse_nc_args, +) + + +def get_anchor_and_output_map_info( + input_data_paths, + input_bucket_path, + text_ddf_blocksize, + num_files, + num_workers, + shuffle_type, +): + """ + Get anchor docs with bucket info + Args: + input_data_paths: list of paths to input data + input_bucket_path: path to input buckets + text_ddf_blocksize: blocksize for text ddf + num_files: number of files to read + num_workers: number of workers + shuffle_type: type of shuffle to use + Returns: + ddf_anchor_docs_with_bk + """ + ddf_text = get_text_ddf_from_json_path_with_blocksize( + input_data_paths=input_data_paths, + num_files=num_files, + blocksize=text_ddf_blocksize, + ) + ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, + num_workers=num_workers) + output_map_df = get_output_map_based_on_str_bytes(ddf_bk=ddf_bk, + ddf_text=ddf_text) + ddf_anchor_docs_with_bk = ddf_bk.map_partitions(add_anchor_docs) + print("output_map_df is based on string bytes", flush=True) + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.merge(output_map_df, + on=["bucket"]) + # Bucket is no longer needed + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.drop(columns=["bucket"]) + # Below removes any duplicates lying around after dropping buckets + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.map_partitions( + M.drop_duplicates, + meta=ddf_anchor_docs_with_bk._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + ddf_anchor_docs_with_bk = dd_shuffle( + ddf_anchor_docs_with_bk, + ["dataset_id", "doc_id"], + ignore_index=True, + shuffle=shuffle_type, + ).map_partitions( + M.drop_duplicates, + meta=ddf_anchor_docs_with_bk._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + del output_map_df + return ddf_anchor_docs_with_bk + + +def attach_args(parser=None): + description = """Takes the buckets generated from minhashes and uses + document length information to create a coarse mapping of mapping multiple + buckets to a logical partition by using a modified bin packing algorithm. 
+ """ + if not parser: + parser = parse_nc_args(description=description) + parser.add_argument( + "--input-bucket-dir", + type=str, + help="The directory containing bucket information files", + ) + parser.add_argument( + "--text-ddf-blocksize", + type=int, + default=256, + help="The block size for chunking jsonl files for text ddf in mb", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results in", + ) + parser.add_argument( + "--shuffle-type", + type=str, + default="tasks", + help="Type of shuffle to use before writing to parquet", + ) + return parser + + +def jaccard_get_output_map_workflow( + client, + input_data_paths, + input_bucket_path, + output_anchor_docs_with_bk_path, + text_ddf_blocksize, + num_files, + shuffle_type, +): + """ + Workflow for jaccard shuffle + Args: + client: dask client + input_data_paths: list of paths to input data + input_bucket_path: path to input buckets + output_anchor_docs_with_bk_path: path to save anchor docs with bucket info + text_ddf_blocksize: blocksize for text ddf + num_files: number of files to read + parts_per_worker: number of parts per worker + shuffle_type: type of shuffle to use before writing to parquet + """ + num_workers = get_num_workers(client) + ddf_anchor_docs_with_bk = get_anchor_and_output_map_info( + input_data_paths, + input_bucket_path, + text_ddf_blocksize, + num_files, + num_workers, + shuffle_type, + ) + ddf_anchor_docs_with_bk.to_parquet( + output_anchor_docs_with_bk_path, + write_index=False, + ) + + +def main(args): + input_data_paths = args.input_data_dirs + input_bucket_path = args.input_bucket_dir + OUTPUT_PATH = args.output_dir + output_anchor_docs_with_bk_path = os.path.join(OUTPUT_PATH, + "anchor_docs_with_bk.parquet") + client = get_client(args) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard map buckets script", flush=True) + print(f"Args = {args}") + st = time.time() + jaccard_get_output_map_workflow( + client, + input_data_paths, + input_bucket_path, + output_anchor_docs_with_bk_path, + args.text_ddf_blocksize, + args.num_files, + args.shuffle_type, + ) + et = time.time() + print(f"Bucket Mapping time taken = {et-st} s") + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) + \ No newline at end of file diff --git a/nemo_curator/gpu_deduplication/jaccard_shuffle.py b/nemo_curator/gpu_deduplication/jaccard_shuffle.py new file mode 100644 index 00000000..3f1d7f5e --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_shuffle.py @@ -0,0 +1,395 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import time + +import cudf +from tqdm import tqdm + +from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( + text_bytes_aware_shuffle,) +from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( + combine_back_adlr_ids,) +from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( + aggregated_anchor_docs_with_bk_read, + get_restart_offsets, + get_text_ddf_from_json_path_with_blocksize, + update_restart_offsets, +) +from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( + extract_partitioning_index, + filter_text_rows_by_bucket_batch, + merge_left_to_shuffled_right, +) +from nemo_curator.gpu_deduplication.utils import ( + get_client, + get_num_workers, + parse_nc_args, + performance_report_if, +) + + +def write_partitioned_file(df, output_path, partition_on, batch_id): + if len(df) == 0: + return cudf.Series([True]) + + cudf.io.parquet.write_to_dataset( + df, + output_path, + partition_cols=[partition_on], + filename=f"batch_{batch_id}.parquet", + ) + return cudf.Series([True]) + + +def batched_merge_and_write( + left_df, + right_df, + merge_on, + partition_on, + output_path, + parts_per_text_batch, + parts_per_bucket_batch, + bk_mapping, + num_workers=None, +): + + total_text_partitions = left_df.npartitions + total_bucket_partitions = right_df.npartitions + + # Extract global partitioning index + left_df, global_partitioning_index = extract_partitioning_index( + left_df, + merge_on, + bk_mapping, + parts_per_bucket_batch, + total_bucket_partitions, + ) + + # Set start offsets + bucket_part_start_offset, text_part_start_offset = get_restart_offsets( + output_path) + + # Set end offsets + # NOTE: These end offsets are always set to the end + # of the data. However, we may want to be able to set + # both the start and end offsets from the command line + # in the future. + bucket_part_end_offset = total_bucket_partitions + text_part_end_offset = total_text_partitions + + # Check that offsets are valid + assert bucket_part_start_offset % parts_per_bucket_batch == 0 + assert bucket_part_end_offset > bucket_part_start_offset + assert text_part_end_offset > text_part_start_offset + + # Initialize "retry" variables + # + # - retry_count: The number of successive batches that + # we have already performed at a reduced batch size. + # - retry_threshold: The number of successive batches + # for which we should keep the batch size low + # before attempting the default batch size again. + # Every time we return to the default batch size + # and immediately fail, retry_threshold will double. 
+ parts_per_text_batch_retry = None + retry_count, retry_threshold = 0, 1 + + print( + f"Starting at bucket-map partition {bucket_part_start_offset}" + f" and text-df partition {text_part_start_offset}", + flush=True, + ) + + for bucket_part_offset in tqdm( + range(bucket_part_start_offset, bucket_part_end_offset, + parts_per_bucket_batch)): + + # Outer loop over batches of "bucket-map" partitions + end_bucket_offset = min(bucket_part_offset + parts_per_bucket_batch, + bucket_part_end_offset) + print( + f"\nStarted processing bucket-map partitions {bucket_part_offset} " + f"through {end_bucket_offset} of {bucket_part_end_offset}", + flush=True, + ) + st_bucket = time.time() + + # Select our bucket-mapping batch + subset_bucket_df = right_df.partitions[bucket_part_offset:end_bucket_offset] + subset_bucket_df = subset_bucket_df.persist() + + # Filter out rows of left_df that we know cannot + # align with any rows of subset_bucket_df + left_df_use = filter_text_rows_by_bucket_batch( + left_df, + global_partitioning_index, + bucket_part_offset, + bucket_part_end_offset, + total_bucket_partitions, + ) + + text_part_offset = text_part_start_offset + while text_part_offset < text_part_end_offset: + + # Check if we are "retrying" with a smaller "parts_per_text_batch" + if parts_per_text_batch_retry: + parts_per_text_batch_use = parts_per_text_batch_retry + else: + st_text = time.time() + parts_per_text_batch_use = parts_per_text_batch + print(f"Using {parts_per_text_batch_use} text partitions.", flush=True) + + # Select partitions for our text batch + end_text_offset = min(text_part_offset + parts_per_text_batch_use, + text_part_end_offset) + subset_text_df = left_df_use.partitions[text_part_offset:end_text_offset] + + try: + # NOTE: If we have more text-df partitions than bucket-map + # partitions, we are more likely to see an OverflowError + output_df = text_bytes_aware_shuffle( + merge_left_to_shuffled_right( + subset_text_df, + subset_bucket_df, + merge_on, + ), + partition_on, + num_workers=num_workers, + ) + except OverflowError as err: + # We encountered an overflow error! + # Let's try again with less text data + parts_per_text_batch_retry = int(parts_per_text_batch_use / 2) + if parts_per_text_batch_retry < 1: + raise err + print( + f"\nWe encountered an OverflowError and will retry " + f"the current batch with {parts_per_text_batch_retry} " + f"text partitions instead of {parts_per_text_batch_use}.", + flush=True, + ) + continue + + output_df = output_df.map_partitions(combine_back_adlr_ids) + batch_label = f"{end_bucket_offset}_{end_text_offset}" + written_files = output_df.map_partitions( + write_partitioned_file, + output_path, + partition_on, + batch_label, + meta=cudf.Series([True]), + ) + written_files = written_files.compute() + update_restart_offsets(output_path, bucket_part_offset, end_text_offset) + del output_df + + print( + "Text-df partition ", + f"{end_text_offset}/{text_part_end_offset} " + f"completed in {time.time()-st_text}", + flush=True, + ) + + # Update loop control-flow variables + if parts_per_text_batch_use == parts_per_text_batch: + # We succeeded at the default batch size. 
+ # Reset the retry count + retry_count, retry_threshold = 0, 1 + else: + # We succeeded at a lower batch size + retry_count += 1 + if retry_count >= retry_threshold: + # Go back to the default text-batch size, + # but increase the retry_threshold in + # case we fail again + parts_per_text_batch_retry = None + retry_count, retry_threshold = 0, min(retry_threshold * 2, 16) + text_part_offset += parts_per_text_batch_use + + update_restart_offsets(output_path, end_bucket_offset, end_text_offset) + print( + "Bucket partition ", + f"{end_bucket_offset}/{bucket_part_end_offset} " + f"completed in {time.time()-st_bucket}", + flush=True, + ) + + # Need to reset text_part_start_offset to 0 after + # a single bucket-batch pass (only matters if we are + # breaking the bucket-mapping df into multiple batches) + text_part_start_offset = 0 + + +def jaccard_shuffling_workflow( + client, + input_data_paths, + input_anchor_docs_with_bk_dir, + output_shuffled_docs_path, + text_ddf_blocksize, + bucket_mapping_ddf_blocksize, + num_files, + parts_per_worker, + profile_path, + bucket_parts_per_worker, +): + """' + Args: + client: dask client + input_data_paths: paths to input data + input_anchor_docs_with_bk_dir: path to input anchor docs with buckets + output_shuffled_docs_path: path to output shuffled docs + text_ddf_blocksize: block size for chunking jsonl files for text ddf + bucket_mapping_ddf_blocksize: block size for chunking parquet files + for anchor_docs_with_bk ddf + num_files: number of files to process + parts_per_worker: parts per worker to process in a batch + profile_path: dask profile path + bucket_parts_per_worker: bucket parts per worker to process in a batch + """ + # Part1. Reading+Shuffling Data + # Read Text from Data from jsonl files + + text_ddf = get_text_ddf_from_json_path_with_blocksize( + input_data_paths=input_data_paths, + num_files=num_files, + blocksize=text_ddf_blocksize, + ) + print( + "Graph creation for get_text_ddf_from_json_path_with_blocksize" + " complete.", + flush=True) + print(f"text_ddf.npartitions = {text_ddf.npartitions}", flush=True) + st = time.time() + ddf_anchor_docs_with_bk, bk_mapping = aggregated_anchor_docs_with_bk_read( + input_anchor_docs_with_bk_dir, + blocksize=bucket_mapping_ddf_blocksize, + ) + print("Getting ddf_anchor_docs_with_bk completed") + print( + f"ddf_anchor_docs_with_bk.npartitions = {ddf_anchor_docs_with_bk.npartitions}", + flush=True, + ) + st = time.time() + num_workers = get_num_workers(client) + parts_per_batch = num_workers * parts_per_worker + print(f"parts_per_batch = {parts_per_batch}") + parts_per_bucket_batch = num_workers * bucket_parts_per_worker + print(f"parts_per_bucket_batch = {parts_per_bucket_batch}") + dask_profile_name = f"blocksize-{text_ddf_blocksize}" + dask_profile_name = dask_profile_name + f"parts_per_batch-{parts_per_batch}" + dask_profile_name = (dask_profile_name + + f"-parts_per_bucket_batch-{parts_per_bucket_batch}") + dask_profile_name = dask_profile_name + f"-jaccard-n_input_files-{num_files}.html" + + text_ddf = text_ddf[["dataset_id", "doc_id", "text"]] + + with performance_report_if(profile_path, dask_profile_name): + # Merge and write the dataframes + batched_merge_and_write( + text_ddf, + ddf_anchor_docs_with_bk, + output_path=output_shuffled_docs_path, + merge_on=["dataset_id", "doc_id"], + partition_on="output_partition_id", + parts_per_text_batch=parts_per_batch, + parts_per_bucket_batch=parts_per_bucket_batch, + bk_mapping=bk_mapping, + num_workers=num_workers, + ) + print(f"Writing+Shuffling data 
took = {time.time()-st} s", flush=True) + + +def main(args): + input_data_paths = args.input_data_dirs + input_anchor_docs_with_bk_dir = args.input_bucket_mapping_dir + OUTPUT_PATH = args.output_dir + output_anchor_docs_with_bk_path = os.path.join(OUTPUT_PATH, + "anchor_docs_with_bk.parquet") + output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") + client = get_client(args) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard shuffle script", flush=True) + print(f"Args = {args}") + st = time.time() + jaccard_shuffling_workflow( + client=client, + input_data_paths=input_data_paths, + input_anchor_docs_with_bk_dir=input_anchor_docs_with_bk_dir, + output_shuffled_docs_path=output_shuffled_docs_path, + text_ddf_blocksize=args.text_ddf_blocksize, + bucket_mapping_ddf_blocksize=args.bucket_mapping_ddf_blocksize, + num_files=args.num_files, + parts_per_worker=args.parts_per_worker, + profile_path=args.profile_path, + bucket_parts_per_worker=args.bucket_parts_per_worker, + ) + et = time.time() + print(f"Jaccard Shuffle E2E time taken = {et-st} s") + + +def attach_args(parser=None): + description = """Shuffles input text documents based on the given bucket + map. The output is a partitioned parquet dataset with the documents + shuffled by buckets + """ + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--input-bucket-mapping-dir", + type=str, + help="The directory containing anchor docs with bk files", + ) + parser.add_argument( + "--text-ddf-blocksize", + type=int, + default=256, + help="The block size for chunking jsonl files for text ddf in mb", + ) + parser.add_argument( + "--bucket-mapping-ddf-blocksize", + type=int, + default=256, + help="The block size for for anchor_docs_with_bk ddf in mb", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results in", + ) + parser.add_argument( + "--parts-per-worker", + default=2, + type=int, + help="The number of parts to process per worker per batch", + ) + parser.add_argument( + "--bucket-parts-per-worker", + default=8, + type=int, + help="The number of bucket parts to process per worker per batch", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) + diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py b/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py new file mode 100644 index 00000000..5bc6c1f7 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/batch_shuffle_utils.py @@ -0,0 +1,132 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cudf +import dask_cuda +import numpy as np +from dask import config +from dask.dataframe.shuffle import rearrange_by_column +from dask_cuda.explicit_comms.dataframe.shuffle import ( + shuffle as explicit_comms_shuffle, +) +from packaging.version import Version + +from nemo_curator.gpu_deduplication.jaccard_utils.get_output_map_utils import ( + build_partition, + get_agg_text_bytes_df, +) + +USE_EXCOMMS = Version(dask_cuda.__version__) >= Version("23.10") + + +def rearange_by_column_direct( + df, + col, + npartitions, + ignore_index, + excomms_default=USE_EXCOMMS, +): + # Execute a "direct" shuffle operation without staging + if config.get("explicit-comms", excomms_default): + # Use explicit comms unless the user has + # disabled it with the dask config system, + # or we are using an older version of dask-cuda + return explicit_comms_shuffle( + df, + [col], + npartitions=npartitions, + ignore_index=ignore_index, + ) + else: + return rearrange_by_column( + df, + col=col, + shuffle="tasks", + # Prevent staged shuffling by setting max_branch + # to the number of input partitions + 1 + max_branch=npartitions + 1, + npartitions=npartitions, + ignore_index=ignore_index, + ) + + +def get_shuffle_part_ids_df(agg_df, partition_on, num_workers=None): + sizes = agg_df[f"{partition_on}_text_bytes"].values + max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) + + # Adjust max_text_bytes_per_part if the number of output + # partitions is small compared to the number of workers. 
+ # Sometimes we just have very few output partitions to + # deal with, and just need a larger batch + npartitions_min = int(num_workers * 0.8) + while True: + output_ar = build_partition(sizes.get(), max_text_bytes_per_part) + if output_ar.max() > npartitions_min or max_text_bytes_per_part < 2**24: + break + max_text_bytes_per_part = int(max_text_bytes_per_part // 2.0) + + df = cudf.DataFrame() + df[partition_on] = agg_df[partition_on] + df["_partitions"] = output_ar + return df + + +def get_shuffle_partition_info(df, partition_on, num_workers=None): + df["text_bytes"] = df["text"].map_partitions(lambda s: s.str.byte_count()) + agg_df = get_agg_text_bytes_df(df, partition_on, 1) + del df + + agg_df = agg_df.reset_index(drop=True) + shuffle_part_ids = agg_df.map_partitions( + get_shuffle_part_ids_df, partition_on, num_workers=num_workers + ).persist() + return shuffle_part_ids + + +def text_bytes_aware_shuffle(df, partition_on, num_workers=None): + """ + This shuffle takes into account the text bytes of each partition + and tries to make sure that the output partitions do not exceed + the char limit of cuDF + + Args: + df: dask_cudf dataframe + partition_on: column name to partition on + + + Returns: + dask_cudf dataframe with _partitions columns + """ + print("Starting text bytes aware shuffle", flush=True) + df = df.persist() + shuffle_part_ids = get_shuffle_partition_info( + df, partition_on, num_workers=num_workers + ) + n_output_partitions = shuffle_part_ids["_partitions"].max().compute() + 1 + n_output_partitions = int(n_output_partitions) + df = df.merge(shuffle_part_ids, on=partition_on, how="inner").persist() + + df = ( + rearange_by_column_direct( + df, + col="_partitions", + npartitions=n_output_partitions, + ignore_index=True, + excomms_default=True, + ) + .drop(columns=["_partitions"]) + .persist() + ) + print(f"Will write {len(df)} rows to disk", flush=True) + return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py b/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py new file mode 100644 index 00000000..27c25f63 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/doc_id_mapping.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
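+
+# Illustrative example of the ID conversions defined below (the literal IDs are
+# made up; the real dataset_id values come from hash_values(), so they depend
+# on the hash of the dataset name):
+#
+#   import cudf
+#   df = cudf.DataFrame({"id": ["c4-0000034", "c4-0000035"]})
+#   df = convert_str_id_to_int(df, id_column="id")
+#   # df now has "doc_id" == [34, 35] and "dataset_id" == an integer hash of "c4"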
+ +def convert_str_id_to_int(df, id_column="id"): + """ + Converts the legacy id format "dataset_name-0000034" + type of ID into 2 int based ID's + """ + dx = df[id_column].str.rsplit("-", n=1, expand=True) + df["doc_id"] = dx[1].astype("int64").values + df["dataset_id"] = dx[0].hash_values() + df.drop(columns=[id_column], inplace=True) + return df + + +def convert_str_pair_adlr_ids_to_int(df): + assert "adlr_id_x" in df.columns + assert "adlr_id_y" in df.columns + + for tag in ["x", "y"]: + dx = df[f"adlr_id_{tag}"].str.rsplit("-", n=1, expand=True) + df[f"dataset_id_{tag}"] = dx[0].astype("uint32").values + df[f"doc_id_{tag}"] = dx[1].astype("int64").values + # See the above convert_adlr_id_to_int function + df = df.drop(columns=[f"adlr_id_{tag}"]) + return df + + +def combine_back_adlr_ids(df): + df["adlr_id"] = df["dataset_id"].astype(str) + "-" + df["doc_id"].astype(str) + df.drop(columns=["dataset_id", "doc_id"], inplace=True) + + if "anchor_0_dataset_id" in df.columns: + df["anchor_0_adlr_id"] = ( + df["anchor_0_dataset_id"].astype(str) + + "-" + + df["anchor_0_doc_id"].astype(str) + ) + df.drop(columns=["anchor_0_dataset_id", "anchor_0_doc_id"], inplace=True) + + if "anchor_1_dataset_id" in df.columns: + df["anchor_1_adlr_id"] = ( + df["anchor_1_dataset_id"].astype(str) + + "-" + + df["anchor_1_doc_id"].astype(str) + ) + df.drop(columns=["anchor_1_dataset_id", "anchor_1_doc_id"], inplace=True) + return df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py new file mode 100644 index 00000000..de8fef50 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/get_anchor_utils.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def random_select_anchor(df_bk, n=2): + """ + Randomly select `n` anchors from each bucket. + """ + df_bk = df_bk.copy() + df_bk["hash"] = df_bk[["doc_id", "dataset_id"]].hash_values() + df_bk = df_bk.sort_values(["bucket", "hash"]) + df_bk["order_in_bucket"] = df_bk.groupby("bucket").cumcount() + df_bk["is_anchor"] = df_bk["order_in_bucket"] < n + for i in range(0, n): + df_bk[f"is_anchor_id_{i}"] = df_bk["order_in_bucket"] == i + df_bk = df_bk.drop(columns=["hash", "order_in_bucket"], axis=1) + df_bk = df_bk.reset_index(drop=True) + df_bk = df_bk[df_bk.is_anchor] + return df_bk + + +def add_anchor_docs(df_bk): + """ + Get anchor documents for each bucket. 
+ """ + num_anchors = 2 + df_anchor_bk = random_select_anchor(df_bk=df_bk, n=num_anchors) + df_anchor_bk_0 = df_anchor_bk[df_anchor_bk["is_anchor_id_0"]][ + ["bucket", "dataset_id", "doc_id"] + ].reset_index(drop=True) + df_anchor_bk_0 = df_anchor_bk_0.rename( + columns={"doc_id": "anchor_0_doc_id", "dataset_id": "anchor_0_dataset_id"} + ) + + df_anchor_bk_1 = df_anchor_bk[df_anchor_bk["is_anchor_id_1"]][ + ["bucket", "dataset_id", "doc_id"] + ].reset_index(drop=True) + df_anchor_bk_1 = df_anchor_bk_1.rename( + columns={"doc_id": "anchor_1_doc_id", "dataset_id": "anchor_1_dataset_id"} + ) + + df_anchor_docs = df_anchor_bk_1.merge(df_anchor_bk_0, on=["bucket"], how="inner") + df_anchor_docs_with_bk = df_bk.merge(df_anchor_docs, on=["bucket"], how="inner") + return df_anchor_docs_with_bk diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py new file mode 100644 index 00000000..2c492943 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/get_output_map_utils.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cudf +import dask_cudf +import numba +import numpy as np +from nemo_curator._compat import DASK_SHUFFLE_METHOD_ARG + + +# next-fit-descending bin packing +# https://en.wikipedia.org/wiki/Next-fit-decreasing_bin_packing +@numba.jit(nopython=True) +def build_partition(sizes: np.ndarray, max_size): + i: int = 0 + count: int = 0 + current: int = 0 + size: int = 0 + partition = np.empty(sizes.shape, dtype=np.int32) + for i in range(len(sizes)): + size = sizes[i] + if current + size < max_size: + partition[i] = count + current += size + else: + count += 1 + current = size + partition[i] = count + return partition + + +def update_id(df, lower_bound): + df["output_partition_id"] += lower_bound + return df + + +def get_output_part_ids_with_approx_equal_sum( + bucket_text_bytes_df, max_text_bytes_per_part: int +): + """' + Create a output_series that maps the ser.index into `nparts` + so that the total sum of bucket_val_counts_df + for each output id are all most equal and + less than max_text_bytes_per_part + This is used downstream for creating equal output_ids + """ + sizes = bucket_text_bytes_df["bucket_text_bytes"].values + bucket_output_ar = build_partition(sizes.get(), max_text_bytes_per_part) + df = cudf.DataFrame() + df["bucket"] = bucket_text_bytes_df["bucket"] + df["output_partition_id"] = bucket_output_ar + return df + + +def get_agg_text_bytes_df(df, agg_column, n_partitions, shuffle=False): + shuffle_arg = "shuffle_method" if DASK_SHUFFLE_METHOD_ARG else "shuffle" + agg_df = ( + df[[agg_column, "text_bytes"]] + .groupby([agg_column]) + .agg({"text_bytes": "sum"}, split_out=n_partitions, **{shuffle_arg: shuffle}) + ) + agg_df = agg_df.rename(columns={"text_bytes": f"{agg_column}_text_bytes"}) + agg_df = agg_df.reset_index(drop=False) + # Doing a per partition sort + # seems to cause issues 
with + # jaccard shuffle (Overflow errors) + # which are caught and then + # retried with text_bytes_aware_merge + agg_df = agg_df.persist() + agg_df = agg_df.sort_values( + by=[f"{agg_column}_text_bytes"], ascending=False, ignore_index=True + ) + agg_df = agg_df.persist() + # Added length to force computation + # after persist + print(f"Agg_df computed of length = {len(agg_df)}", flush=True) + return agg_df + + +def get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes): + # String bytes limit for cuDF + max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) + print(f"max_text_bytes_per_part = {max_text_bytes_per_part}") + + # Increasing in an attempt to prevent hitting + # ulimits + output_map_df_meta = cudf.DataFrame({"bucket": [0], "output_partition_id": [1]}) + output_map_df_meta["bucket"] = output_map_df_meta["bucket"].astype(np.uint64) + output_map_df_meta["output_partition_id"] = output_map_df_meta[ + "output_partition_id" + ].astype(np.int32) + output_map_df = ddf_bk_text_bytes.map_partitions( + get_output_part_ids_with_approx_equal_sum, + max_text_bytes_per_part, + meta=output_map_df_meta, + ) + output_map_df = output_map_df.persist() + print(f"Step 1 of output_map_df of len: {len(output_map_df)} computed") + lower_bounds = ( + output_map_df["output_partition_id"] + .map_partitions(lambda s: (s.max() + 1)) + .compute() + ) + lower_bounds = np.cumsum(lower_bounds) + + updated_parts = [ + output_map_df.get_partition(i).map_partitions(update_id, lower_bounds[i - 1]) + for i in range(1, len(lower_bounds)) + ] + updated_parts.append(output_map_df.get_partition(0)) + output_map_df = dask_cudf.concat(updated_parts) + output_map_df = output_map_df.persist() + print(f"All steps of output_map_df of len: {len(output_map_df)} computed") + return output_map_df + + +def get_output_map_based_on_str_bytes(ddf_bk, ddf_text): + """ + Add output_partition_id to ddf_bk + """ + print("Getting text bytes", flush=True) + ddf_text["text_bytes"] = ddf_text["text"].map_partitions( + lambda s: s.str.byte_count() + ) + n_partitions = ddf_bk.npartitions + ddf_text = ddf_text.drop(columns=["text"]).repartition(npartitions=n_partitions) + ddf_bk = ddf_bk.merge(ddf_text).repartition(npartitions=n_partitions) + del ddf_text + ddf_bk_text_bytes = get_agg_text_bytes_df( + ddf_bk, + agg_column="bucket", + n_partitions=n_partitions, + shuffle=True, + ) + del ddf_bk + output_map_df = get_output_map_from_text_bytes_per_bucket(ddf_bk_text_bytes) + return output_map_df diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py new file mode 100644 index 00000000..0c1d4895 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/io_utils.py @@ -0,0 +1,188 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
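For intuition, here is a minimal CPU-only sketch of the next-fit-decreasing packing that `build_partition` above applies to the per-bucket byte counts (the real code JIT-compiles it with numba and feeds it sizes pulled from cuDF); the byte sizes below are made up for illustration.

```python
import numpy as np


def next_fit_decreasing(sizes: np.ndarray, max_size: int) -> np.ndarray:
    """Assign each bucket to an output partition so that the summed
    bytes per output partition stay roughly under ``max_size``."""
    partition = np.empty(len(sizes), dtype=np.int32)
    count, current = 0, 0
    for i, size in enumerate(sizes):
        if current + size < max_size:
            current += size
        else:
            # Current bin is full: open a new output partition.
            count += 1
            current = size
        partition[i] = count
    return partition


# Bucket byte counts, already sorted descending as in `get_agg_text_bytes_df`.
bucket_bytes = np.array([900, 800, 500, 400, 300, 200, 100])
print(next_fit_decreasing(bucket_bytes, max_size=1000))
# -> [0 1 2 2 3 3 3]; no output partition exceeds ~1000 bytes
```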
+ +import os +from glob import glob + +import cudf +import dask_cudf +import numpy as np +from dask import dataframe as dd + +from nemo_curator.gpu_deduplication.ioutils import ( + bucketed_read, + read_json_func, +) +from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import ( + convert_adlr_id_to_int, +) + + +def get_bucket_ddf_from_parquet_path(input_bucket_path, num_workers): + # Read parquet-formatted parquet files + ddf_bk = dask_cudf.read_parquet( + input_bucket_path, + blocksize="512MiB", + aggregate_files=True, + ) + # Repartition to ensure we at least have num_workers partitions + npartitions = max(ddf_bk.npartitions, num_workers) + ddf_bk = ddf_bk.repartition(npartitions=npartitions) + print(f"Number of ddf_bk partitions = {ddf_bk.npartitions}", flush=True) + return ddf_bk + + +def aggregated_anchor_docs_with_bk_read(path, blocksize): + from dask.utils import natural_sort_key + from pyarrow.dataset import dataset + + ds = dataset( + sorted(glob(f"{path}/*.parquet"), key=natural_sort_key), + format="parquet", + ) + chunks = chunk_files(ds.get_fragments(), blocksize) + + # Record mapping between file indices and partition indices. + # We need to do this, because our anchor_docs_with_bk data + # should be shuffled on disk. + assert len(chunks) + part_id = np.repeat( + np.arange(len(chunks), dtype="int32"), + np.fromiter(map(len, chunks), dtype="int32"), + ) + file_id = np.arange(len(part_id), dtype="int32") + mapping_df = cudf.DataFrame({"file_id": file_id, "part_id": part_id}) + + meta = cudf.DataFrame.from_arrow(ds.schema.empty_table()) + return dd.from_map(cudf.read_parquet, chunks, meta=meta), mapping_df + + +def get_text_ddf_from_json_path(input_data_paths, num_files, files_per_input_partition): + data_paths = [ + entry.path for data_path in input_data_paths for entry in os.scandir(data_path) + ] + data_paths = [f for f in data_paths if f.endswith(".jsonl")] + if num_files != -1: + data_paths = data_paths[:num_files] + meta_df = cudf.DataFrame( + { + "text": ["x"], + "adlr_id": ["x"], + } + ) + print( + f"Number of files being read for jaccard shuffling= {len(data_paths)}", + flush=True, + ) + + text_ddf = bucketed_read( + data_paths, + b_size=files_per_input_partition, + columns=list(meta_df.columns), + meta=meta_df, + ) + text_ddf = text_ddf.map_partitions( + convert_adlr_id_to_int, + meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), + ) + return text_ddf + + +def get_file_size(file_path): + return os.path.getsize(file_path) + + +def get_frag_size(frag): + # Pyarrow dataset fragment + return sum(rg.total_byte_size for rg in frag.row_groups) + + +def chunk_files(file_list, max_size_mb): + """ + Chunk files into lists of files that are less than max_size_mb + """ + + max_size_bytes = max_size_mb * 1024 * 1024 + chunks = [] + current_chunk = [] + current_size = 0 + + for frag_or_path in file_list: + if isinstance(frag_or_path, str): + file_path = frag_or_path + file_size = get_file_size(file_path) + else: + file_path = frag_or_path.path + file_size = get_frag_size(frag_or_path) + + if current_size + file_size <= max_size_bytes: + current_chunk.append(file_path) + current_size += file_size + else: + # Handle case when the first + # file is larger than max_size_mb + if current_chunk: + chunks.append(current_chunk) + current_chunk = [file_path] + current_size = file_size + + if current_chunk: + chunks.append(current_chunk) + + return chunks + + +def get_text_ddf_from_json_path_with_blocksize(input_data_paths, num_files, blocksize): + 
data_paths = [ + entry.path for data_path in input_data_paths for entry in os.scandir(data_path) + ] + data_paths = [f for f in data_paths if f.endswith(".jsonl")] + data_paths.sort() + if num_files != -1: + data_paths = data_paths[:num_files] + meta_df = cudf.DataFrame( + { + "text": ["x"], + "adlr_id": ["x"], + } + ) + print( + f"Number of files being read for jaccard calculation = {len(data_paths)}", + flush=True, + ) + filepaths_ls = chunk_files(data_paths, blocksize) + text_ddf = dd.from_map( + read_json_func, filepaths_ls, columns=list(meta_df.columns), meta=meta_df + ) + text_ddf = text_ddf.map_partitions( + convert_adlr_id_to_int, + meta=cudf.DataFrame({"text": ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), + ) + return text_ddf + + +def get_restart_offsets(output_path): + bucket_offset, text_offset = 0, 0 + fn = f"{output_path}/_restart_offset.txt" + if os.path.exists(fn): + with open(fn, "r") as f: + offsets = f.readline().strip("\n").split(",") + bucket_offset = int(offsets[0]) + text_offset = int(offsets[1]) + return bucket_offset, text_offset + + +def update_restart_offsets(output_path, bucket_offset, text_offset): + with open(f"{output_path}/_restart_offset.txt", "w") as f: + f.write(f"{bucket_offset},{text_offset}\n") diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py new file mode 100644 index 00000000..be3f4d7b --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/jaccard_similarity_utils.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
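As a rough illustration of what `chunk_files` above does when handed plain file paths, here is a self-contained variant that works on hard-coded `(name, size)` pairs instead of calling `os.path.getsize`; the paths and sizes are invented for the example.

```python
def chunk_by_size(files, max_size_mb):
    """Greedily group files, in the given order, into chunks whose total
    size stays at or under ``max_size_mb`` (mirrors `chunk_files`)."""
    max_bytes = max_size_mb * 1024 * 1024
    chunks, current, current_size = [], [], 0
    for name, size in files:
        if current_size + size <= max_bytes:
            current.append(name)
            current_size += size
        else:
            if current:  # flush the chunk built so far
                chunks.append(current)
            current, current_size = [name], size
    if current:
        chunks.append(current)
    return chunks


files = [
    ("a.jsonl", 600 << 20),  # 600 MiB
    ("b.jsonl", 300 << 20),
    ("c.jsonl", 900 << 20),
    ("d.jsonl", 100 << 20),
]
print(chunk_by_size(files, max_size_mb=1024))
# -> [['a.jsonl', 'b.jsonl'], ['c.jsonl', 'd.jsonl']]
```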
+ +import cudf +import numpy as np + + +def compute_jaccard_partition(df): + df["jaccard"] = df["text_x"].str.jaccard_index(df["text_y"], width=5) + df.drop(columns=["text_x", "text_y"], inplace=True) + return df + + +def get_max_num_rows_to_process_once(df): + nbytes = df["text"].str.byte_count().sum() + # Number of exmploded bytes + exploded_bytes = nbytes * 5 * 2 + max_chars_allowed = 2_147_483_647 + byte_ratio = int(exploded_bytes) // max_chars_allowed + if byte_ratio > 1: + nrows_at_once = len(df) // byte_ratio + else: + nrows_at_once = len(df) + + nrows_at_once = max(1, nrows_at_once) + return nrows_at_once + + +def create_empty_jaccard_result(): + df = cudf.DataFrame() + df["adlr_id_x"] = "x" + df["adlr_id_y"] = "y" + df["jaccard"] = np.empty(shape=0, dtype=np.float32) + return df + + +def compute_jaccard_pair(docs_df, anchor_df): + nrows_at_once = get_max_num_rows_to_process_once(docs_df) + result_ls = [] + for i in range(0, docs_df.shape[0], nrows_at_once): + pair_df = docs_df[i : i + nrows_at_once] + pair_df = pair_df.merge(anchor_df, on="anchor_adlr_id") + pair_df = pair_df.rename( + columns={"adlr_id": "adlr_id_x", "anchor_adlr_id": "adlr_id_y"} + ) + mask = pair_df.adlr_id_x != pair_df.adlr_id_y + pair_df = pair_df[mask].reset_index(drop=True) + if len(pair_df) == 0: + result_df = create_empty_jaccard_result() + else: + result_df = compute_jaccard_partition(pair_df) + result_ls.append(result_df) + if len(result_ls) == 0: + return create_empty_jaccard_result() + df_pair = cudf.concat(result_ls) + return df_pair + + +def get_anchor_df(df, anchor_col): + anchor_df = df[df["adlr_id"] == df[anchor_col]] + anchor_df = anchor_df.reset_index(drop=True) + anchor_df = anchor_df[[anchor_col, "text"]] + anchor_df = anchor_df.rename(columns={anchor_col: "anchor_adlr_id"}) + return anchor_df + + +def compute_jaccard_and_create_pair_df(df): + df = df.drop_duplicates( + subset=["adlr_id", "anchor_1_adlr_id", "anchor_0_adlr_id"], ignore_index=True + ) + anchor_columns = ["anchor_0_adlr_id", "anchor_1_adlr_id"] + result_ls = [] + try: + for anchor_col in anchor_columns: + doc_df = df[["adlr_id", "text", anchor_col]] + doc_df = doc_df.rename(columns={anchor_col: "anchor_adlr_id"}) + doc_df = doc_df[doc_df["adlr_id"] != doc_df["anchor_adlr_id"]] + anchor_df = get_anchor_df(df, anchor_col) + result_df = compute_jaccard_pair(doc_df, anchor_df) + result_ls.append(result_df) + + return cudf.concat(result_ls) + except OverflowError as e: + print( + "Failed with OverflowError in compute_jaccard_and_create_pair_df", + flush=True, + ) + print(df, flush=True) + print("--" * 30) + print("Error") + print("---" * 30) + raise e diff --git a/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py b/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py new file mode 100644 index 00000000..08fcea53 --- /dev/null +++ b/nemo_curator/gpu_deduplication/jaccard_utils/merge_utils.py @@ -0,0 +1,216 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
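To make the Jaccard computation above concrete, this is a pure-Python sketch of the character 5-gram Jaccard score that the cuDF call `str.jaccard_index(..., width=5)` evaluates on the GPU; cuDF's handling of edge cases (very short or empty strings) may differ slightly, so treat this as intuition rather than a reference implementation.

```python
def char_ngrams(text: str, width: int = 5) -> set:
    """Set of character n-grams of the given width."""
    if len(text) < width:
        return {text}
    return {text[i : i + width] for i in range(len(text) - width + 1)}


def jaccard(a: str, b: str, width: int = 5) -> float:
    """|intersection| / |union| of the two n-gram sets."""
    ngrams_a, ngrams_b = char_ngrams(a, width), char_ngrams(b, width)
    union = ngrams_a | ngrams_b
    return len(ngrams_a & ngrams_b) / len(union) if union else 0.0


print(jaccard("the quick brown fox", "the quick brown dog"))  # ~0.67, near-duplicates
print(jaccard("the quick brown fox", "lorem ipsum dolor"))    # 0.0, unrelated
```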
+ +import math +from operator import getitem + +import numpy as np +from dask.base import tokenize +from dask.dataframe.core import new_dd_object +from dask.dataframe.shuffle import partitioning_index +from dask.highlevelgraph import HighLevelGraph +from dask.utils import M + +from nemo_curator.gpu_deduplication.jaccard_utils.batch_shuffle_utils import ( + rearange_by_column_direct, +) + + +def _split_part(part, nsplits): + total_rows = len(part) + stride = math.ceil(total_rows / nsplits) + return { + i: part.iloc[offset : offset + stride] + for i, offset in enumerate(range(0, total_rows, stride)) + } + + +def text_bytes_aware_merge(text_df, right_df, broadcast=True, *, on): + if not isinstance(on, list): + on = [on] + + string_size = text_df[on + ["text"]] + string_size["text"] = string_size["text"].map_partitions( + lambda s: s.str.byte_count() + ) + post_merge_size = ( + string_size.merge(right_df[on], on=on, how="inner", broadcast=broadcast)["text"] + .map_partitions(lambda s: s.sum()) + .compute() + ) + limit = 2**30 + splits = np.ceil(post_merge_size.divide(limit)).astype("int") + + # Check if we need to split any partitions + if (splits <= 1).all(): + # No need to split + left = text_df + right = right_df + else: + # Split partitions of "large" DataFrame. + # The split functionality requires us to target + # the input collection with more partitions. + if text_df.npartitions >= right_df.npartitions: + large_size = "left" + large_frame = text_df + else: + large_size = "right" + large_frame = right_df + + # Build graph to split large partitions + dsk = {} + npartitions = 0 + token = tokenize(large_frame, splits) + name = f"get-split-{token}" + split_name = f"split-large-{token}" + for i, nsplits in enumerate(splits): + input_partition = (large_frame._name, i) + if nsplits > 1: + dsk[(split_name, i)] = (_split_part, input_partition, nsplits) + for _i in range(nsplits): + dsk[(name, npartitions)] = (getitem, (split_name, i), _i) + npartitions += 1 + else: + dsk[(name, npartitions)] = input_partition + npartitions += 1 + divisions = (None,) * (npartitions + 1) + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[large_frame]) + + if large_size == "left": + left = new_dd_object(graph, name, large_frame._meta, divisions) + right = right_df + else: + left = text_df + right = new_dd_object(graph, name, large_frame._meta, divisions) + + return left.merge(right, on=on, how="inner", broadcast=broadcast) + + +def blockwise_merge(left, right, on, how="inner"): + return left.map_partitions( + M.merge, + right, + on=on, + how=how, + meta=left._meta.merge(right._meta, on=on, how=how), + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + + +def apply_bk_mapping(part, bk_map): + # Need "sort" to preserve order after merge + part["sort"] = range(len(part)) + return part.merge(bk_map, on="file_id", how="left").sort_values( + "sort", ignore_index=True + )["part_id"] + + +def extract_partitioning_index( + left_df, + merge_on, + bk_mapping, + parts_per_bucket_batch, + total_bucket_partitions, +): + # We know `right_df` is already shuffled by `merge_on`. + # It is easy to calculate the partitioning index that each + # row of `left_df` would need to move to for us to apply + # a partition-wise merge between `left_df` and `right_df`. 
+ # We call this `global_partitioning_index`: + + num_bucket_files = bk_mapping.file_id.max() + 1 + global_partitioning_index = left_df[merge_on].map_partitions( + partitioning_index, + npartitions=num_bucket_files, + meta=left_df._meta._constructor_sliced([0]), + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + + if total_bucket_partitions < num_bucket_files: + # Our bucket-map files were aggregated. + # Use bk_mapping to adjust `global_partitioning_index` + global_partitioning_index = global_partitioning_index.to_frame( + name="file_id" + ).map_partitions( + apply_bk_mapping, + bk_mapping, + meta=global_partitioning_index._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + + # Since we are iterating over `right_df`, we do not really + # want to send the rows of `left_df` to the partition + # indices encoded in `global_partitioning_index`. Instead, we + # need to take a modulus with `parts_per_bucket_batch` to + # define a `"_partitoins"` column. + left_df["_partitions"] = global_partitioning_index % parts_per_bucket_batch + + return left_df, global_partitioning_index + + +def filter_text_rows_by_bucket_batch( + left_df, + global_partitioning_index, + bucket_part_offset, + bucket_part_end_offset, + total_bucket_partitions, +): + # Drop rows that don't apply to this "bucket" batch. + # Since we are not using ALL of `right_df`, we do not + # need to bother transferring data that will not align + # with the target partition + if (bucket_part_end_offset - bucket_part_offset) < total_bucket_partitions: + return left_df[ + global_partitioning_index.isin( + list( + range( + bucket_part_offset, + bucket_part_end_offset, + ) + ) + ) + ] + else: + # No need to filter left_df + return left_df + + +def merge_left_to_shuffled_right( + subset_text_df, + subset_bucket_df, + merge_on, +): + # We are merging an unshuffled batch of "left" partitions + # with a shuffled batch of "right" partitions. To minimize + # data movement, we can manaully rerrange the "left" batch + # by the "_partitions" column, and perform a partition-wise + # inner merge. + shuffled_text_df = rearange_by_column_direct( + subset_text_df, + col="_partitions", + npartitions=subset_bucket_df.npartitions, + ignore_index=True, + ).drop(columns=["_partitions"]) + + return blockwise_merge( + shuffled_text_df, + subset_bucket_df, + on=merge_on, + ) diff --git a/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py b/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py new file mode 100644 index 00000000..23278940 --- /dev/null +++ b/nemo_curator/gpu_deduplication/prepare_fuzzy_ids.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
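The batching arithmetic in `extract_partitioning_index` and `filter_text_rows_by_bucket_batch` above can be hard to follow inside the Dask graph, so here is a small NumPy-only sketch of the same bookkeeping on made-up partition ids: rows are mapped to a global bucket partition, filtered to the current bucket batch, and re-addressed modulo the batch size (the `"_partitions"` column).

```python
import numpy as np

total_bucket_partitions = 8
parts_per_bucket_batch = 4

# Pretend hash-based global bucket-partition ids for ten text rows.
global_idx = np.array([0, 5, 3, 7, 2, 6, 1, 4, 5, 0])

for bucket_part_offset in range(0, total_bucket_partitions, parts_per_bucket_batch):
    bucket_part_end_offset = bucket_part_offset + parts_per_bucket_batch
    # Keep only rows whose bucket partition falls inside this batch ...
    in_batch = (global_idx >= bucket_part_offset) & (global_idx < bucket_part_end_offset)
    # ... and address them with a batch-local id, as the "_partitions" column does.
    local_idx = global_idx[in_batch] % parts_per_bucket_batch
    print(
        f"batch [{bucket_part_offset}, {bucket_part_end_offset}): "
        f"rows {np.flatnonzero(in_batch).tolist()} -> local ids {local_idx.tolist()}"
    )
```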
+ +import argparse +import cudf +import json +from dask.distributed import Client +from dask import dataframe as dd + + +def main(args): + # Create the ID mapping + df = cudf.DataFrame() + df['base_id'] = [base_id for base_id in args.base_ids.split(",")] + df['dataset_id'] = df['base_id'].hash_values() + df_pd = df.to_pandas() + + output_dict = { + hashed_id: base_id + for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id']) + } + + # Write out the mapping to disk + with open(args.output_id_mapping, 'w') as output_file: + json.dump(output_dict, output_file) + + # Index the parquet files by group + client = Client() + ddf = dd.read_parquet(args.path_to_connected_components) + ddf = ddf.set_index("group") + ddf.to_parquet(args.output_indexed_connected_components) + + +def attach_args( + parser=argparse.ArgumentParser( + """ +Prepares the output connected components from dedup for +extraction to .txt and .jsonl files + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + )): + parser.add_argument( + "--base-ids", + type=str, + default="doc_id", + help="A comma-delimited list of base-ids that were used for " + "different datasets during dedup. For example, " + "if you were deduplicating Wikipedia and Common Crawl, you might " + "have adlr_ids such has wiki-000001 and cc-000001. " + "The base-ids in this case would be 'wiki,cc'", + ) + parser.add_argument( + "--path-to-connected-components", + type=str, + default=None, + help="Path to the connected components that is created " + "at the last step of the fuzzy dedup.", + ) + parser.add_argument( + "--output-indexed-connected-components", + type=str, + default=None, + help="Path to the output connected components " + "that have been prepared for " + "extraction to .txt and .jsonl files", + ) + parser.add_argument( + "--output-id-mapping", + type=str, + default="mapping.json", + help="A mapping between each of the strings specified " + "in '--base-ids' and their respective hashes", + ) + return parser + + +if __name__ == "__main__": + main(attach_args().parse_args()) + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/utils.py b/nemo_curator/gpu_deduplication/utils.py new file mode 100644 index 00000000..90e7c9e0 --- /dev/null +++ b/nemo_curator/gpu_deduplication/utils.py @@ -0,0 +1,225 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
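For a sense of what the `--base-ids` mapping built by `prepare_fuzzy_ids.py` above looks like on disk, here is a CPU-only sketch: each base id string is hashed and the JSON file maps the hash back to the readable prefix. `toy_hash` is only a stand-in for cuDF's `Series.hash_values`, so the numbers it produces will not match a real run; the "wiki,cc" input follows the help text above.

```python
import json
from zlib import crc32


def toy_hash(base_id: str) -> int:
    # Stand-in hash; the real pipeline must reuse the hash that generated dataset_id.
    return crc32(base_id.encode())


base_ids = "wiki,cc".split(",")
mapping = {toy_hash(b): b for b in base_ids}  # hashed id -> base id

with open("mapping.json", "w") as output_file:
    json.dump(mapping, output_file)  # integer keys are serialized as strings

print(mapping)
```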
+ +import argparse +import logging +import os +import socket +from contextlib import nullcontext +from time import time + +import cudf +from dask_cuda import LocalCUDACluster +from distributed import Client, performance_report + + +def create_logger(rank, log_file, name="logger", log_level=logging.INFO): + # Create the logger + logger = logging.getLogger(name) + logger.setLevel(log_level) + + myhost = socket.gethostname() + + extra = {"host": myhost, "rank": rank} + formatter = logging.Formatter( + "%(asctime)s | %(host)s | Rank %(rank)s | %(message)s") + + # File handler for output + file_handler = logging.FileHandler(log_file, mode="a") + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + logger = logging.LoggerAdapter(logger, extra) + + return logger + + +#TODO: Remove below to use nemo_curator.distributed_utils.get_client +def get_client(args) -> Client: + if args.scheduler_address: + if args.scheduler_file: + raise ValueError( + "Only one of scheduler_address or scheduler_file can be provided") + else: + return Client(address=args.scheduler_address, timeout="30s") + elif args.scheduler_file: + return Client(scheduler_file=args.scheduler_file, timeout="30s") + else: + extra_kwargs = ({ + "enable_tcp_over_ucx": True, + "enable_nvlink": True, + "enable_infiniband": False, + "enable_rdmacm": False, + } if args.nvlink_only and args.protocol == "ucx" else {}) + + cluster = LocalCUDACluster( + rmm_pool_size=args.rmm_pool_size, + protocol=args.protocol, + rmm_async=True, + **extra_kwargs, + ) + return Client(cluster) + + +def performance_report_if(path=None, report_name="dask-profile.html"): + if path is not None: + return performance_report(os.path.join(path, report_name)) + else: + return nullcontext() + + +#TODO: Remove below to use nemo_curator.distributed_utils._enable_spilling +def enable_spilling(): + """ + Enables spilling to host memory for cudf + """ + cudf.set_option("spill", True) + + +def get_num_workers(client): + """ + Returns the number of workers in the cluster + """ + worker_list = list(client.scheduler_info()["workers"].keys()) + return len(worker_list) + + +def get_list_of_lists(lst, nchunks): + """ + Splits a list into nchunks lists + """ + return [lst[i::nchunks] for i in range(nchunks)] + + +def parse_nc_args( + description="Default gpu dedup nemo_curator argument parser", +) -> argparse.ArgumentParser: + """ + Adds default set of arguments that are common to multiple stages + of the pipeline + """ + parser = argparse.ArgumentParser( + description, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-data-dirs", + type=str, + nargs="+", + default=None, + required=False, + help="Input directories consisting of .jsonl files that are accessible " + "to all nodes. This path must be accessible by all machines in the cluster", + ) + parser.add_argument( + "--scheduler-address", + type=str, + default=None, + help="Address to the scheduler of a created dask cluster. If not provided" + "a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--scheduler-file", + type=str, + default=None, + help="Path to the scheduler file of a created dask cluster. If not provided" + " a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--rmm-pool-size", + type=str, + default=None, + help="Initial pool size to use for the RMM Pool Memory allocator" + "Note: This only applies to the localCUDACluster. 
If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 + ) + parser.add_argument( + "--protocol", + type=str, + default="tcp", + help="Protcol to use for dask cluster" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 + ) + parser.add_argument( + "--nvlink-only", + action="store_true", + help="Start a local cluster with only NVLink enabled." + "Only applicable when protocol=ucx and no scheduler file/address is specified", + ) + parser.add_argument( + "--input-json-text-field", + type=str, + default="text", + help="The name of the field within each json object of the jsonl " + "file that contains the text from which minhashes will be computed. ", + ) + parser.add_argument( + "--input-json-id-field", + type=str, + default="adlr_id", + help="The name of the field within each json object of the jsonl " + "file that assigns a unqiue ID to each document. " + "Can be created by running the script " + "'./prospector/add_id.py' which adds the field 'adlr_id' " + "to the documents in a distributed fashion", + ) + parser.add_argument( + "--log-dir", + type=str, + default="./logs/", + help="The output log directory where node and local", + ) + parser.add_argument( + "--files-per-partition", + type=int, + default=2, + help="Number of jsonl files to combine into single partition", + ) + parser.add_argument( + "--num-files", + type=int, + default=None, + help="Upper limit on the number of json files to process", + ) + parser.add_argument( + "--log-frequency", + type=int, + default=500, + help="The frequency with which to write log messages when " + "computing MinHashses. By default a log message will " + "be written every 500 partitions", + ) + parser.add_argument( + "--profile-path", + type=str, + default=None, + help="Path to save dask profile", + ) + return parser + + +def timer(func): + + def wrapper(*args, **kw): + print(f"function {func.__name__} started...") + start = time() + res = func(*args, **kw) + duration = time() - start + timing = f"function {func.__name__} finished in {duration:.1f} seconds" + print(timing) + return res + + return wrapper diff --git a/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py b/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py new file mode 100644 index 00000000..36aea14d --- /dev/null +++ b/nemo_curator/gpu_deduplication/verify_all_pairs_jaccard.py @@ -0,0 +1,166 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
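Two of the helpers above, `timer` and `get_list_of_lists`, are plain Python and easy to try in isolation. The snippet below assumes the `nemo_curator` package from this patch is installed (importing the module also pulls in cudf/dask_cuda, even though neither helper touches the GPU).

```python
from nemo_curator.gpu_deduplication.utils import get_list_of_lists, timer


@timer
def fake_stage(files):
    # Round-robin split of the "files" into three chunks, one per worker.
    return get_list_of_lists(files, nchunks=3)


chunks = fake_stage(list(range(10)))
print(chunks)  # [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
```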
+ +from functools import partial +from time import time + +import cudf +import dask_cudf + +from nemo_curator.gpu_deduplication.jaccard_utils.jaccard_similarity_utils import ( + compute_jaccard_partition, + create_empty_jaccard_result, +) +from nemo_curator.gpu_deduplication.utils import get_client, parse_nc_args + + +def num_ngram(ds): + return ds.str.character_ngrams(5, True).list.unique().list.len() + + +def write_eligible_pairs(dedup_with_text_path, cache_dir): + df = cudf.read_parquet(dedup_with_text_path) + df["num_ngram"] = num_ngram(df["text"]) + df.drop(columns="text", inplace=True) + df["group"] = 0 + B = 8_000 + rm = 0 + for s in range(0, df.shape[0], B): + e = min(s + B, df.shape[0]) + da = df.iloc[s:e] + db = da.merge(df, on="group") + mask = db["adlr_id_x"] < db["adlr_id_y"] + db = db[mask] + mask = (db["num_ngram_x"] < db["num_ngram_y"] * + 0.8) | (db["num_ngram_y"] < db["num_ngram_x"] * 0.8) + print(db.shape, mask.sum()) + rm += mask.sum() + db = db[~mask] + db.drop(columns=["group", "num_ngram_x", "num_ngram_y"], inplace=True) + db.to_parquet(f"{cache_dir}/pair_{s}.parquet") + del da, db + print("total pairs removed", rm) + + +def merge_text(df, dedup_with_text_path): + dg = cudf.read_parquet(dedup_with_text_path) + for i in "xy": + df = df.merge(dg, left_on=f"adlr_id_{i}", right_on="adlr_id") + df.drop(columns="adlr_id", inplace=True) + return df + + +def get_max_num_rows_to_process_once(df): + nbytes = max(df["text_x"].str.byte_count().sum(), + df["text_y"].str.byte_count().sum()) + + # TODO: fix below + # to 4x + exploded_bytes = nbytes * 5 * 4 + max_chars_allowed = 2_147_483_647 + byte_ratio = int(exploded_bytes) // max_chars_allowed + if byte_ratio > 1: + nrows_at_once = len(df) // byte_ratio + else: + nrows_at_once = len(df) + + nrows_at_once = max(1, nrows_at_once) + return nrows_at_once + + +def compute_jaccard_pair(docs_df): + nrows_at_once = get_max_num_rows_to_process_once(docs_df) + result_ls = [] + for i in range(0, docs_df.shape[0], nrows_at_once): + pair_df = docs_df[i:i + nrows_at_once] + if len(pair_df) == 0: + result_df = create_empty_jaccard_result() + else: + result_df = compute_jaccard_partition(pair_df) + result_ls.append(result_df) + if len(result_ls) == 0: + return create_empty_jaccard_result() + df_pair = cudf.concat(result_ls) + return df_pair + + +def run_verify_all_pairs_jaccard(dedup_with_text_path, cache_dir, output_dir): + ddf = dask_cudf.read_parquet(f"{cache_dir}/pair_*.parquet") + ddf = ddf.repartition(npartitions=2048) + + meta_df = cudf.DataFrame({ + "adlr_id_x": [0], + "adlr_id_y": [0], + "text_x": ["x"], + "text_y": ["x"], + }) + + ddf = ddf.map_partitions(partial(merge_text, + dedup_with_text_path=dedup_with_text_path), + meta=meta_df) + + meta_df = cudf.DataFrame({ + "adlr_id_x": [0], + "adlr_id_y": [0], + "jaccard": [1.0], + }) + + ddf = ddf.map_partitions(compute_jaccard_pair, meta=meta_df) + mask = ddf["jaccard"] > 0.8 + dup_pairs = ddf[mask].compute() + print("# of duplicated pairs with jaccard>0.8", dup_pairs.shape[0]) + dup_pairs.to_parquet(f"{output_dir}/duplicated_pairs.parquet") + + +def main(args): + start = time() + description = """Verify correctness of deduped results by calculating all pairs""" + dedup_with_text_path = f"{args.output_dir}/dedup_with_text.parquet" + + write_eligible_pairs(dedup_with_text_path, args.cache_dir) + client = get_client(args) + + # Run actual computation + run_verify_all_pairs_jaccard( + dedup_with_text_path, + args.cache_dir, + args.output_dir, + ) + print(f"All done in {time()-start:.1f} 
seconds") + + +def attach_args(parser=None): + description = """verify all pairs jaccard""" + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results to", + ) + parser.add_argument( + "--cache-dir", + type=str, + help="The cache directory to write intermediate results to", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py b/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py new file mode 100644 index 00000000..851f3bd2 --- /dev/null +++ b/nemo_curator/gpu_deduplication/write_deduped_result_with_text.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial + +import cudf + +from nemo_curator.gpu_deduplication.jaccard_utils.io_utils import ( + get_text_ddf_from_json_path,) +from nemo_curator.gpu_deduplication.utils import parse_nc_args + + +def merge_text_partition(df, connected_components_path): + res = cudf.read_parquet(connected_components_path).drop(columns="dataset_id") + res = res.drop_duplicates("group") + res = res.drop(columns=["group"]) + df = res.merge(df, on="doc_id", how="left") + df = df.rename(columns={"doc_id": "adlr_id"}) + return df.drop(columns="dataset_id") + + +def write_result_text_parquet(original_path, output_dir): + ddf = get_text_ddf_from_json_path(original_path, + num_files=-1, + files_per_input_partition=10) + + connected_components_path = f"{output_dir}/connected_components.parquet" + print(ddf.head()) + merge_func = partial(merge_text_partition, + connected_components_path=connected_components_path) + ddf = ddf.map_partitions(merge_func, meta={"adlr_id": "uint32", "text": "O"}) + + mask = ddf.text.isnull() + ddf = ddf[~mask] + + df = ddf.compute() + df = df.reset_index(drop=True) + df.to_parquet(f"{output_dir}/dedup_with_text.parquet") + +def main(args): + write_result_text_parquet(original_path=[args.original_path], + output_dir=args.output_dir) + +def attach_args(parser=None): + description = """verify all pairs jaccard""" + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results to", + ) + parser.add_argument( + "--original-path", + type=str, + help="The path of original jsonl files", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + args = attach_args().parse_args() diff --git a/nemo_curator/log.py b/nemo_curator/log.py new file mode 100644 index 00000000..33f1461e --- /dev/null +++ b/nemo_curator/log.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging +import socket + +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir + + +def create_logger(rank, log_file, name='logger', log_level=logging.INFO): + # Create the logger + logger = logging.getLogger(name) + logger.setLevel(log_level) + + myhost = socket.gethostname() + + extra = {'host': myhost, 'rank': rank} + formatter = logging.Formatter( + '%(asctime)s | %(host)s | Rank %(rank)s | %(message)s') + + # File handler for output + file_handler = logging.FileHandler(log_file, mode='a') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + logger = logging.LoggerAdapter(logger, extra) + + return logger + + +def create_rank_logger( + rank, + log_dir, + name='node_logger', + log_level=logging.INFO, +): + # Make the log directory if it does not exist + log_dir = expand_outdir_and_mkdir(log_dir) + + # Create the rank subdirectory + rank_tag = str(rank).rjust(3, '0') + rank_dir = os.path.join(log_dir, f'rank_{rank_tag}') + rank_dir = expand_outdir_and_mkdir(rank_dir) + + log_file = os.path.join(rank_dir, f'rank_{rank_tag}.log') + return create_logger(rank, log_file, name=name, log_level=log_level) + + +def create_local_logger( + rank, + local_id, + log_dir, + name='local_logger', + log_level=logging.INFO, +): + # Create the logger + logger = logging.getLogger(name) + logger.setLevel(log_level) + + # Make tags + rank_tag = str(rank).rjust(3, '0') + local_id_tag = str(local_id).rjust(3, '0') + + myhost = socket.gethostname() + extra = {'host': myhost, 'node': rank_tag, 'local': local_id_tag} + formatter = logging.Formatter('%(asctime)s | %(host)s | Node rank %(node)s ' + '| Local rank %(local)s | %(message)s') + + # Output log file + rank_dir = os.path.join(log_dir, f'rank_{rank_tag}') + log_file = os.path.join(rank_dir, f'local_{local_id_tag}.log') + + # File handler for output + file_handler = logging.FileHandler(log_file, mode='a') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + logger = logging.LoggerAdapter(logger, extra) + + return logger diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py new file mode 100644 index 00000000..e6790d23 --- /dev/null +++ b/nemo_curator/modifiers/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
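A minimal usage sketch of `create_logger` from `nemo_curator/log.py` above, writing to a temporary log file; the rank value and logger name are arbitrary, and the snippet assumes the `nemo_curator` package from this patch is importable.

```python
import os
import tempfile

from nemo_curator.log import create_logger

with tempfile.TemporaryDirectory() as log_dir:
    log_file = os.path.join(log_dir, "rank_000.log")
    logger = create_logger(rank=0, log_file=log_file, name="example_logger")
    logger.info("hello from rank 0")

    with open(log_file) as f:
        # Formatted as "<timestamp> | <hostname> | Rank 0 | hello from rank 0"
        print(f.read(), end="")
```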
+ +from .doc_modifier import DocumentModifier +from .c4 import BoilerPlateStringModifier +from .fasttext import FastTextLabelModifier +from .unicode_reformatter import UnicodeReformatter + +__all__ = ["DocumentModifier", "BoilerPlateStringModifier", "FastTextLabelModifier", "UnicodeReformatter"] \ No newline at end of file diff --git a/nemo_curator/modifiers/c4.py b/nemo_curator/modifiers/c4.py new file mode 100644 index 00000000..631bc71a --- /dev/null +++ b/nemo_curator/modifiers/c4.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + +from nemo_curator.utils.constants import policy_substrings +from nemo_curator.utils.text_utils import ( + get_paragraphs, + is_paragraph_indices_in_top_or_bottom_only, +) + +class BoilerPlateStringModifier(DocumentModifier): + """ + If the sentence contains any of the boilerplate strings then discard. + This includes things like "terms of use", "privacy policy", etc. + Source: Adapted significantly from Google C4 processing. + """ + + def __init__( + self, + remove_if_at_top_or_bottom=True, + ): + super().__init__() + self._remove_if_at_top_or_bottom = remove_if_at_top_or_bottom + self._top_or_bottom_only = False + self._boilerplate_paragraph_indices = [] + self._name = 'boilerplate_string_ratio' + + def modify_document(self, text): + # Initialize variables + self._boilerplate_paragraph_indices = [] + + # Return an empty string when the document should be removed entirely + empty_string = "" + + # Get the paragraphs + paragraphs = self._paragraphs + if paragraphs is None: + paragraphs = get_paragraphs(text) + + # Check each paragraph + for idx, paragraph in enumerate(paragraphs): + paragraph = paragraph.strip().lower() + if 'lorem ipsum' in paragraph: + return empty_string + if any(p in paragraph for p in policy_substrings): + if not self._remove_if_at_top_or_bottom: + return empty_string + else: + self._boilerplate_paragraph_indices.append(idx) + + # Keep the document if we did not find any boilerplate + if len(self._boilerplate_paragraph_indices) == 0: + return text + + # Mark if boilerplate is only at top or bottom + self._top_or_bottom_only = is_paragraph_indices_in_top_or_bottom_only( + self._boilerplate_paragraph_indices, + len(paragraphs), + ) + + if self._top_or_bottom_only: + # In case paragraphs is None, recompute it + if self._paragraphs is None: + self._paragraphs = get_paragraphs(text) + modified_doc = '\n\n'.join([ + p for idx, p in enumerate(self._paragraphs) + if idx not in self._boilerplate_paragraph_indices + ]) + # Set the paragraphs back to None as the document has been + # changed + else: + modified_doc = text + + self._paragraphs = None + return modified_doc \ No newline at end of file diff --git a/nemo_curator/modifiers/doc_modifier.py b/nemo_curator/modifiers/doc_modifier.py new file mode 100644 index 00000000..16b4eddf --- /dev/null +++ b/nemo_curator/modifiers/doc_modifier.py @@ -0,0 +1,28 @@ +# 
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod + + +class DocumentModifier(ABC): + def __init__(self): + super().__init__() + self._name = self.__class__.__name__ + self._sentences = None + self._paragraphs = None + self._ngrams = None + + @abstractmethod + def modify_document(self, text): + pass \ No newline at end of file diff --git a/nemo_curator/modifiers/fasttext.py b/nemo_curator/modifiers/fasttext.py new file mode 100644 index 00000000..8aa4d1ef --- /dev/null +++ b/nemo_curator/modifiers/fasttext.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + +class FastTextLabelModifier(DocumentModifier): + def __init__(self, label): + super().__init__() + self.label = label + + def modify_document(self, text): + text = text.replace('\n', ' ').replace('__label__', ' ') + return f"{self.label} {text}" \ No newline at end of file diff --git a/nemo_curator/modifiers/pii_modifier.py b/nemo_curator/modifiers/pii_modifier.py new file mode 100644 index 00000000..4316f5c1 --- /dev/null +++ b/nemo_curator/modifiers/pii_modifier.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Dict + +import pandas as pd + +from nemo_curator.modifiers import DocumentModifier +from nemo_curator.pii.algorithm import DEFAULT_LANGUAGE +from nemo_curator.utils.distributed_utils import load_object_on_worker + +__all__ = ["PiiModifierBatched"] + +DEFAULT_BATCH_SIZE = 2000 + + +class PiiModifierBatched(DocumentModifier): + """ + This class is the entry point to using the PII de-identification module on documents stored as CSV, JSONL or + other formats. 
It works with the `Modify` functionality as shown below: + + dataframe = pd.DataFrame({'text': ['Sarah and Ryan went out to play', 'Jensen is the CEO of NVIDIA']}) + dd = dask.dataframe.from_pandas(dataframe, npartitions=1) + dataset = DocumentDataset(dd) + + modifier = PiiModifierBatched( + batch_size=2000, + language='en', + supported_entities=['PERSON', "EMAIL_ADDRESS"], + anonymize_action='replace') + + modify = Modify(modifier, batched=True) + modified_dataset = modify(dataset) + modified_dataset.df.to_json('output_files/*.jsonl', lines=True, orient='records') + + """ + + def __init__(self, + language: str = DEFAULT_LANGUAGE, + supported_entities: List[str] = None, + anonymize_action: str = "redact", + batch_size: int = DEFAULT_BATCH_SIZE, + device: str = 'gpu', + **kwargs): + super().__init__() + + self.language = language + self.supported_entities = supported_entities + self.anonymize_action = anonymize_action + self.kwargs = kwargs + + self.batch_size = batch_size + self.device = device + + def modify_document(self, text: pd.Series, partition_info: Dict = None): + import logging + logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S") + + deidentifier = load_object_on_worker( + "deidentifier", + self.load_deidentifier, + {} + ) + try: + output: List[str] = deidentifier.deidentify_text_batch(text.tolist(), self.batch_size) + except Exception as e: + logging.error(f"Encountered error {str(e)} in partition {partition_info['number']}") + return pd.Series([True]) + output: pd.Series = pd.Series(output) + return output + + def load_deidentifier(self): + """ + Helper function to load the de-identifier + """ + import spacy + if self.device == 'gpu': + spacy.require_gpu() + from nemo_curator.pii.algorithm import PiiDeidentifier, DEFAULT_MAX_DOC_SIZE + + deidentifier: PiiDeidentifier = PiiDeidentifier( + language=self.language, + supported_entities=self.supported_entities, + anonymize_action=self.anonymize_action, + **self.kwargs + ) + deidentifier.analyzer.nlp_engine.nlp[deidentifier.language].max_length = DEFAULT_MAX_DOC_SIZE + + return deidentifier diff --git a/nemo_curator/modifiers/unicode_reformatter.py b/nemo_curator/modifiers/unicode_reformatter.py new file mode 100644 index 00000000..806934da --- /dev/null +++ b/nemo_curator/modifiers/unicode_reformatter.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ftfy +from nemo_curator.modifiers import DocumentModifier + +class UnicodeReformatter(DocumentModifier): + def __init__(self): + super().__init__() + + def modify_document(self, text): + return ftfy.fix_text(text) \ No newline at end of file diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py new file mode 100644 index 00000000..01f483ae --- /dev/null +++ b/nemo_curator/modules/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .distributed_data_classifier import DomainClassifier, QualityClassifier +from .exact_dedup import ExactDuplicates +from .filter import Filter, Score, ScoreFilter +from .fuzzy_dedup import LSH, MinHash +from .meta import Sequential +from .modify import Modify +from .task import TaskDecontamination +from .add_id import AddId + +__all__ = [ + "DomainClassifier", + "ExactDuplicates", + "Filter", + "LSH", + "MinHash", + "Modify", + "QualityClassifier", + "Score", + "ScoreFilter", + "Sequential", + "TaskDecontamination", + "AddId", +] diff --git a/nemo_curator/modules/add_id.py b/nemo_curator/modules/add_id.py new file mode 100644 index 00000000..168b1836 --- /dev/null +++ b/nemo_curator/modules/add_id.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dask.dataframe as dd +from dask import delayed +import numpy as np + +from nemo_curator.datasets import DocumentDataset + + +class AddId: + def __init__(self, id_field, id_prefix="doc_id", start_index=0) -> None: + self.id_field = id_field + self.id_prefix = id_prefix + self.start_index = start_index + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + original_meta = dataset.df.dtypes.to_dict() + original_meta[self.id_field] = "object" + delayed_dataset = dataset.df.to_delayed() + + parition_lengths = [0] + for partition in delayed_dataset[:-1]: + parition_lengths.append(delayed(len)(partition)) + + lower_id_bounds = delayed(np.cumsum)(parition_lengths) + delayed_id_dataset = [] + for i, partition in enumerate(delayed_dataset): + delayed_id_dataset.append(delayed(self._add_id_to_partition)(partition, lower_id_bounds[i])) + + id_dataset = DocumentDataset(dataset_df=dd.from_delayed(delayed_id_dataset, meta=original_meta)) + + return id_dataset + + def _add_id_to_partition(self, partition, partition_start_id): + id_column = [f"{self.id_prefix}-{int(i + self.start_index):010d}" for i in range(partition_start_id, len(partition) + partition_start_id)] + partition[self.id_field] = id_column + + return partition \ No newline at end of file diff --git a/nemo_curator/modules/distributed_data_classifier.py b/nemo_curator/modules/distributed_data_classifier.py new file mode 100644 index 00000000..ae65a498 --- /dev/null +++ b/nemo_curator/modules/distributed_data_classifier.py @@ -0,0 +1,329 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +import os +from packaging import version + +import torch +from transformers import __version__ as TRANSFORMERS_VERSION +from transformers.models.deberta_v2 import DebertaV2TokenizerFast + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.distributed_data_classification.pytorch_utils import ( + CFG, + CustomModel, + TestDataset, + collate, +) +from nemo_curator.utils.distributed_utils import ( + load_object_on_worker, + process_all_batches, +) + + +class DistributedDataClassifier(ABC): + """ Abstract class for running multi-node multi-GPU data classification """ + + def __init__( + self, + model_file_name, + labels, + filter_by, + batch_size, + out_dim, + pred_column, + max_chars, + num_workers, + device_type, + autocast, + ): + self.model_file_name = model_file_name + self.labels = labels + self.filter_by = filter_by + self.batch_size = batch_size + self.out_dim = out_dim + self.pred_column = pred_column + self.max_chars = max_chars + self.num_workers = num_workers + self.device_type = device_type + self.autocast = autocast + + def __call__(self, dataset: DocumentDataset): + result_doc_dataset = self._run_classifier(dataset) + + if self.filter_by is not None: + return self._filter_documents(result_doc_dataset) + + return result_doc_dataset + + @abstractmethod + def _run_classifier(self): + pass + + def _cfg_per_partition(self): + return load_object_on_worker( + "cfg_with_tokenizer", + self._load_cfg_with_tokenizer, + {}, + ) + + def _filter_documents( + self, + dataset: DocumentDataset, + ): + df = dataset.df + + filter_by = self.filter_by + if type(filter_by) == str: + filtered_df = df[df[self.pred_column].astype(str) == filter_by] + return DocumentDataset(filtered_df) + elif type(filter_by) == list: + filtered_df = df[df[self.pred_column].isin(filter_by)] + return DocumentDataset(filtered_df) + + raise TypeError("filter_by must be a string or list type") + + +class DomainClassifier(DistributedDataClassifier): + def __init__( + self, + model_file_name, + labels, + filter_by=None, + batch_size=256, + out_dim=None, + pred_column="pred", + max_chars=2000, + num_workers=0, + device_type="cuda", + autocast=True, + ): + if out_dim is None: + out_dim = len(labels) + + super().__init__( + model_file_name=model_file_name, + labels=labels, + filter_by=filter_by, + batch_size=batch_size, + out_dim=out_dim, + pred_column=pred_column, + max_chars=max_chars, + num_workers=num_workers, + device_type=device_type, + autocast=autocast, + ) + + def _run_classifier(self, dataset: DocumentDataset): + print("Starting domain classifier inference", flush=True) + + df = dataset.df + + meta_df = df._meta.copy() + meta_df[self.pred_column] = [0] * len(meta_df) + + df = df.map_partitions( + self._inference_per_partition, + meta=meta_df, + enforce_metadata=False, + ) + + return DocumentDataset(df) + + def _inference_per_partition(self, df): + cfg = self._cfg_per_partition() + + dataset_valid = TestDataset(cfg, df, 
self.max_chars) + loader_valid = torch.utils.data.DataLoader( + dataset_valid, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + + device = torch.device(self.device_type) + load_model_kwargs = {"cfg": cfg, "device": device} + + preds = process_all_batches( + loader_valid, + self._load_model, + load_model_kwargs, + self._run_inference, + {}, + ) + preds = preds.cpu().numpy() + df[self.pred_column] = [self.labels[i] for i in preds] + + return df + + def _load_cfg_with_tokenizer(self): + cfg = CFG() + tokenizer = DebertaV2TokenizerFast.from_pretrained(cfg.model) + cfg.tokenizer = tokenizer + return cfg + + def _load_model(self, cfg, device): + model = CustomModel( + cfg, out_dim=self.out_dim, config_path=None, pretrained=True + ) + model = model.to(device) + sd = torch.load(os.path.join(self.model_file_name), map_location="cpu") + sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} + if version.parse(TRANSFORMERS_VERSION) >= version.parse("4.31.0"): + sd.pop("model.embeddings.position_ids", None) + + model.load_state_dict(sd, strict=True) + model.eval() + return model + + def _run_inference(self, batch, model): + with torch.no_grad(): + batch = collate(batch) + if self.autocast: + with torch.autocast(device_type=self.device_type): + out = model(batch)[:, 0, :] + else: + out = model(batch)[:, 0, :] + pred_idx = torch.sigmoid(out).argmax(1) + + return pred_idx + + +# TODO: Implement MultipleModelQualityClassifier class +class QualityClassifier(DistributedDataClassifier): + def __init__( + self, + model_file_name, + labels, + filter_by=None, + batch_size=256, + out_dim=None, + pred_column="quality_pred", + prob_column="quality_prob", + max_chars=6000, + num_workers=0, + device_type="cuda", + autocast=True, + max_len=1024, + ): + # Binary case + if len(labels) == 2: + out_dim = 1 + self.binary_classification = True + else: + if out_dim is None: + out_dim = len(labels) + self.binary_classification = False + + self.prob_column = prob_column + self.max_len = max_len + + super().__init__( + model_file_name=model_file_name, + labels=labels, + filter_by=filter_by, + batch_size=batch_size, + out_dim=out_dim, + pred_column=pred_column, + max_chars=max_chars, + num_workers=num_workers, + device_type=device_type, + autocast=autocast, + ) + + def _run_classifier(self, dataset: DocumentDataset): + print("Starting quality classifier inference", flush=True) + + df = dataset.df + + meta_df = df._meta.copy() + meta_df[self.pred_column] = ["low"] * len(meta_df) + meta_df[self.prob_column] = [[0, 0, 1]] * len(meta_df) + + df = df.map_partitions( + self._inference_per_partition, + meta=meta_df, + enforce_metadata=False, + ) + + return DocumentDataset(df) + + def _inference_per_partition(self, df): + cfg = self._cfg_per_partition() + + dataset_valid = TestDataset(cfg, df, self.max_chars) + loader_valid = torch.utils.data.DataLoader( + dataset_valid, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + device = torch.device(self.device_type) + if len(self.labels) == 1: + raise ValueError("Labels must be more than 1") + + load_model_kwargs = { + "cfg": cfg, + "device": device, + } + + probs = process_all_batches( + loader_valid, + self._load_model, + load_model_kwargs, + self._run_inference, + {}, + ) + + if self.binary_classification: + preds = (probs > 0.5).to(torch.int64).squeeze() + else: + preds = torch.argmax(probs, dim=1) + + df[self.pred_column] = [self.labels[i] for i in preds.to("cpu").numpy().tolist()] + 
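# Keep the raw probabilities alongside the predicted labels so downstream consumers can threshold on confidence.
+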
df[self.prob_column] = probs.to("cpu").numpy().tolist() + + return df + + def _load_cfg_with_tokenizer(self): + cfg = CFG(max_len=self.max_len) + tokenizer = DebertaV2TokenizerFast.from_pretrained(cfg.model) + cfg.tokenizer = tokenizer + return cfg + + def _load_model(self, cfg, device): + model = CustomModel(cfg, out_dim=self.out_dim, config_path=None, pretrained=True) + model = model.to(device) + sd = torch.load(self.model_file_name, map_location="cpu") + if "model_state_dict" in sd: + sd = sd["model_state_dict"] + sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} + model.load_state_dict(sd, strict=True) + model.eval() + return model + + def _run_inference(self, batch, model): + with torch.no_grad(): + batch = collate(batch) + if self.autocast: + with torch.autocast(device_type=self.device_type): + out = model(batch)[:, 0, :] + else: + out = model(batch)[:, 0, :] + if self.binary_classification: + probs = torch.sigmoid(out) + else: + probs = torch.softmax(out, dim=1) + return probs diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py new file mode 100644 index 00000000..271ba136 --- /dev/null +++ b/nemo_curator/modules/exact_dedup.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import time +import warnings +from contextlib import nullcontext +from datetime import datetime +from hashlib import md5 +from typing import Union + +import pandas as pd +from dask import dataframe as dd, config +from nemo_curator._compat import DASK_P2P_ERROR +from nemo_curator.datasets import DocumentDataset +from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if +from nemo_curator.utils.gpu_utils import is_cudf_type + + +class ExactDuplicates: + """Find exact duplicates in a document corpus""" + + SUPPORTED_HASHES = {"md5"} + + def __init__( + self, + logger: Union[logging.LoggerAdapter, str] = "./", + id_field: str = "id", + text_field: str = "text", + hash_method: str = "md5", + profile_dir: str = None, + cache_dir: str = None, + ): + """ + Parameters + ---------- + logger: Existing logger to log to, or a path to a log directory. + id_field: Column in the Dataset denoting document ID. + text_field: Column in the Dataset denoting document content. + hash_method: The hashing algorithm used for identifying exact duplicates. Currently supports {"md5"} + profile_dir: str, Default None + If specified directory to write dask profile + cache_dir: str, Default None + If specified, will compute & write duplicate id's to cache directory. + """ + + if hash_method not in self.SUPPORTED_HASHES: + raise ValueError( + f"{hash_method} not in supported hash_methods. 
Choose a hash_method from {self.SUPPORTED_HASHES}" + ) + self.hash_method = hash_method + self.id_field = id_field + self.text_field = text_field + if cache_dir is None and profile_dir is not None: + warnings.warn( + "cache_dir for intermediate outputs is required to generate profiles" + ) + self.cache_dir = cache_dir + self.profile_dir = profile_dir + + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "ExactDuplicates.log"), + name="ExactDuplicates", + ) + else: + self._logger = logger + + def _exact_dup_ids(self, df: dd.DataFrame): + """ + Get the id's for text/documents that are exact duplicates + Parameters + ---------- + df: dask.dataframe.core.DataFrame + A dataframe with the following requirements: + * A column where each row is the text from one document + * A unique ID column for each document + """ + hash_df = self._compute_hashes(df) + shuffle_context = ( + config.set({"dataframe.shuffle.method": "tasks"}) + if DASK_P2P_ERROR + else nullcontext() + ) + with shuffle_context: + dup_ids = hash_df.shuffle( + on=["_hashes"], + ignore_index=True, + npartitions=max(1, (hash_df.npartitions // 3)), + ).map_partitions(lambda x: x[x["_hashes"].duplicated(keep=False)]) + return dup_ids + + def _compute_hashes( + self, + df: dd.DataFrame, + ) -> dd.DataFrame: + """ + Computes the hash of the text_column provided and returns a dataframe + containing the id_column and relevant hashes in the _hashes column. + """ + self._logger.info("Starting lazy hash generation") + res = df[[self.id_field]] + res["_hashes"] = df[self.text_field].map_partitions(self.hash_documents) + self._logger.info( + f"Lazy hash generation complete for {res.npartitions} partitions" + ) + return res + + def hash_documents( + self, df: Union[cudf.Series, pd.Series] + ) -> Union[cudf.Series, pd.Series]: + """ + Compute hashes for a Series containing documents + """ + if is_cudf_type(df): + return df.hash_values(method=self.hash_method) + elif isinstance(df, pd.Series): + # TODO: Generalize ty using self.hash_method + return df.apply(lambda x: md5(x.encode()).hexdigest()) + + def __call__(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]: + """ + Find document ID's for exact duplicates in a given DocumentDataset + Parameters + ---------- + dataset: DocumentDataset + The input datset to find exact duplicates + Returns + ------- + DocumentDataset containing ID's and hashes of all duplicate documents + """ + result = self._exact_dup_ids(df=dataset.df) + + if self.cache_dir is None: + return DocumentDataset(result) + + t0 = time.time() + self._logger.info("Starting execution for ExactDedup") + write_path = os.path.join(self.cache_dir, "_exact_duplicates.parquet") + if os.path.exists(write_path): + warnings.warn( + f"Output path f{write_path} already exists and will be overwritten" + ) + with performance_report_if( + self.profile_dir, + f"exact-dedup-profile-{datetime.now().strftime('%Y%m%d_%H%M%S')}.html", + ): + result.to_parquet(write_path, write_index=False, overwrite=True) + self._logger.info( + f"Exact dedup computation for dataset took {time.time() - t0}s complete at {write_path}" # noqa:E501 + ) + if is_cudf_type(result): + import dask_cudf + + result_dataset = dask_cudf.read_parquet(write_path, split_row_groups=False) + else: + result_dataset = dd.read_parquet(write_path) + return DocumentDataset(result_dataset) diff --git a/nemo_curator/modules/filter.py b/nemo_curator/modules/filter.py new file mode 100644 index 00000000..563d42b8 --- /dev/null +++ 
b/nemo_curator/modules/filter.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dask.typing import no_default
+
+from nemo_curator.datasets import DocumentDataset
+
+
+class Score:
+    def __init__(self, score_fn, score_field, text_field="text", batched=False, score_type=None):
+        """
+        Args:
+            score_fn: The scoring function. It receives a single document string (or, when
+                batched=True, a whole Series of documents per partition) and returns the score(s).
+            score_field: The field the score will be stored in.
+            text_field: The field the documents will be read from.
+            batched: Whether score_fn operates on a per-partition Series instead of a single document.
+            score_type: The datatype of the score, used as the Dask meta for the new column.
+                If None, Dask will infer it.
+        """
+        self.score_fn = score_fn
+        self.score_field = score_field
+        self.text_field = text_field
+        self.batched = batched
+        self.score_type = score_type
+
+    def __call__(self, dataset):
+        # Set the metadata for the function calls if provided
+        if self.score_type:
+            meta = (None, self.score_type)
+        else:
+            meta = no_default
+
+        if self.batched:
+            dataset.df[self.score_field] = dataset.df[self.text_field].map_partitions(self.score_fn, meta=meta)
+        else:
+            dataset.df[self.score_field] = dataset.df[self.text_field].apply(self.score_fn, meta=meta)
+
+        return dataset
+
+
+class Filter:
+    def __init__(self, filter_fn, filter_field, invert=False, batched=False):
+        """
+        Args:
+            filter_fn: A function that returns True if the document is to be kept.
+            filter_field: The field(s) to be passed into the filter function.
+            invert: Whether to invert the filter condition.
+            batched: Whether filter_fn operates on a per-partition Series instead of a single value.
+        """
+        self.filter_fn = filter_fn
+        self.filter_field = filter_field
+        self.invert = invert
+        self.batched = batched
+
+    def __call__(self, dataset):
+        if self.batched:
+            bool_mask = dataset.df[self.filter_field].map_partitions(self.filter_fn, meta=(None, bool))
+        else:
+            bool_mask = dataset.df[self.filter_field].apply(self.filter_fn, meta=(None, bool))
+
+        if self.invert:
+            bool_mask = ~bool_mask
+
+        return DocumentDataset(dataset.df[bool_mask])
+
+
+class ScoreFilter:
+    def __init__(self, filter_obj, text_field="text", score_field=None, score_type=None, invert=False, batched=False):
+        """
+        Args:
+            filter_obj: The filter object used to score and filter documents. It must implement
+                score_document (document -> score) and keep_document (score -> bool).
+            text_field: The field the documents will be read from.
+            score_field: The field to which the scores will be written. If None, scores will be
+                immediately discarded after use.
+            score_type: The datatype of the score, used as the Dask meta for the score column.
+                If None, Dask will infer it.
+            invert: Whether to invert the filter condition.
+            batched: Whether score_document and keep_document operate on per-partition Series
+                instead of single values.
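+
+        Example (illustrative sketch; MinLengthFilter is a hypothetical object that
+        implements the score_document/keep_document interface described above):
+
+            class MinLengthFilter:
+                def score_document(self, text):
+                    return len(text)
+
+                def keep_document(self, score):
+                    return score >= 100
+
+            keep_long_docs = ScoreFilter(MinLengthFilter(), score_field="doc_length", score_type=int)
+            dataset = keep_long_docs(dataset)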
+ """ + self.filter_obj = filter_obj + self.text_field = text_field + self.score_field = score_field + self.score_type = score_type + self.invert = invert + self.batched = batched + + def __call__(self, dataset): + # Set the metadata for the function calls if provided + if self.score_type: + meta = (None, self.score_type) + else: + meta = no_default + + if self.batched: + scores = dataset.df[self.text_field].map_partitions(self.filter_obj.score_document, meta=meta) + else: + scores = dataset.df[self.text_field].apply(self.filter_obj.score_document, meta=meta) + + if self.score_field is not None: + dataset.df[self.score_field] = scores + + if self.batched: + bool_mask = scores.map_partitions(self.filter_obj.keep_document, meta=(None, bool)) + else: + bool_mask = scores.apply(self.filter_obj.keep_document, meta=(None, bool)) + if self.invert: + bool_mask = ~bool_mask + + return DocumentDataset(dataset.df[bool_mask]) diff --git a/nemo_curator/modules/fuzzy_dedup.py b/nemo_curator/modules/fuzzy_dedup.py new file mode 100644 index 00000000..99a3bb84 --- /dev/null +++ b/nemo_curator/modules/fuzzy_dedup.py @@ -0,0 +1,1331 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import math +import os +import time +import warnings +from datetime import datetime +from typing import List, Tuple, Union + +import cudf +import cugraph +import cugraph.dask as dcg +import cugraph.dask.comms.comms as Comms +import cupy as cp +import dask_cudf +import numpy as np +from dask import dataframe as dd +from dask.dataframe.shuffle import shuffle as dd_shuffle +from dask.utils import M +from tqdm import tqdm + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.gpu_deduplication.jaccard_utils.merge_utils import ( + extract_partitioning_index, + filter_text_rows_by_bucket_batch, + merge_left_to_shuffled_right, +) +from nemo_curator.gpu_deduplication.utils import create_logger, performance_report_if +from nemo_curator.utils.distributed_utils import get_current_client, get_num_workers +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int, int_ids_to_str +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( + aggregated_anchor_docs_with_bk_read, + get_restart_offsets, + update_restart_offsets, +) +from nemo_curator.utils.fuzzy_dedup_utils.output_map_utils import ( + build_partition, + get_agg_text_bytes_df, +) +from nemo_curator.utils.fuzzy_dedup_utils.shuffle_utils import ( + text_bytes_aware_shuffle, + write_partitioned_file, +) + + +class MinHash: + """ + Computes minhash signatures of a document corpus + """ + + def __init__( + self, + seed: int = 42, + num_hashes: int = 260, + char_ngrams: int = 5, + use_64bit_hash: bool = False, + logger: Union[logging.LoggerAdapter, str] = "./", + id_field: str = "id", + text_field: str = "text", + profile_dir: str = None, + cache_dir: str = None, + ): + """ + Parameters + ---------- + seed: Seed for minhash permutations + 
num_hashes: Length of minhash signature (No. of minhash permutations) + char_ngrams: Width of text window (in characters) while computing minhashes. + use_64bit_hash: Whether to use a 64 bit hash function. + logger: Existing logger to log to, or a path to a log directory. + id_field: Column in the Dataset denoting document ID. + text_field: Column in the Dataset denoting document content. + profile_dir: str, Default None + If specified directory to write dask profile + cache_dir: str, Default None + If specified, will compute & write id,minhash pairs to directory + """ + self.num_hashes = num_hashes + self.char_ngram = char_ngrams + self.seeds = self.generate_seeds(n_seeds=self.num_hashes, seed=seed) + self.minhash_method = self.minhash64 if use_64bit_hash else self.minhash32 + self.id_field = id_field + self.text_field = text_field + + if cache_dir is None and profile_dir is not None: + warnings.warn( + "cache_dir for intermediate outputs is required to generate profiles" + ) + self.cache_dir = cache_dir + self.profile_dir = profile_dir + + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "Minhash.log"), + name="Minhash", + ) + else: + self._logger = logger + + def generate_seeds(self, n_seeds: int = 260, seed: int = 0) -> np.ndarray: + """ + Generate seeds for all minhash permutations based on the given seed. + """ + gen = np.random.RandomState(seed) + return gen.randint(0, 1e6, size=n_seeds) + + def minhash32( + self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int + ) -> cudf.Series: + """ + Compute 32bit minhashes based on the MurmurHash3 algorithm + """ + if not isinstance(ser, cudf.Series): + raise TypeError("Expected data of type cudf.Series") + seeds = cudf.Series(seeds, dtype="uint32") + return ser.str.minhash(seeds=seeds, width=char_ngram) + + def minhash64( + self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int + ) -> cudf.Series: + """ + Compute 64bit minhashes based on the MurmurHash3 algorithm + """ + if not isinstance(ser, cudf.Series): + raise TypeError("Expected data of type cudf.Series") + seeds = cudf.Series(seeds, dtype="uint64") + return ser.str.minhash64(seeds=seeds, width=char_ngram) + + def __call__(self, dataset: DocumentDataset) -> Union[str, DocumentDataset]: + """ + Computes the MinHash Signatures for a given dataset. + Parameters + ---------- + dataset: DocumentDataset + The input datset to compute MinHashes. 
+ Returns + ------- + DocumentDataset containing IDs of all documents and the corresponding MinHash Signature + """ + result = dataset.df[[self.id_field]] + result["_minhash_signature"] = dataset.df[self.text_field].map_partitions( + self.minhash_method, + seeds=self.seeds, + char_ngram=self.char_ngram, + ) + + if self.cache_dir is None: + return DocumentDataset(result) + + t0 = time.time() + self._logger.info("Starting execution for Minhashes") + write_path = os.path.join(self.cache_dir, "_minhashes.parquet") + if os.path.exists(write_path): + warnings.warn( + f"Output path {write_path} already exists and will be overwritten" + ) + with performance_report_if( + self.profile_dir, + f"minhash-profile-{datetime.now().strftime('%Y%m%d_%H%M%S')}.html", + ): + result.to_parquet(write_path, write_index=False, overwrite=True) + self._logger.info( + f"Minhash signature computation for dataset took {time.time() - t0}s complete at {write_path}" # noqa:E501 + ) + return DocumentDataset( + dask_cudf.read_parquet(write_path, blocksize="2GB", aggregate_files=True) + ) + + +class LSH: + """ + Performs LSH on a MinhashSignatures + """ + + def __init__( + self, + cache_dir: str, + minhash_length: int, + num_buckets: int, + buckets_per_shuffle: int = 1, + logger: Union[logging.LoggerAdapter, str] = "./", + id_fields: Union[str, list] = "id", + minhash_field: str = "_minhash_signature", + profile_dir: str = None, + ): + """ + Parameters + ---------- + cache_dir: str + Needs to be specified, will compute & write duplicate id, bucket pairs to cache directory. + minhash_length: Length of minhash signature + num_buckets: Number of bands/buckets to create from the minhash signature. + Hashes_per_signature = minhash_length / num_buckets + buckets_per_shuffle: Number of bands/buckets to shuffle concurrently. + Larger values process larger batches by processing multiple bands + but might lead to memory pressures and related errors. + logger: Existing logger to log to, or a path to a log directory. + id_field: Columns in the Dataset denoting document ID. + minhash_field: Column in the Dataset denoting minhash signature. + profile_dir: str, Default None + If specified directory to write dask profile + """ + self.minhash_length = minhash_length + self.num_buckets = num_buckets + self.id_fields = [id_fields] if isinstance(id_fields, str) else id_fields + self.minhash_field = minhash_field + self.buckets_per_shuffle = buckets_per_shuffle + self.bucket_ranges = self._generate_bucket_ranges( + self.num_buckets, self.minhash_length + ) + + if cache_dir is None: + raise ValueError( + "cache_dir for intermediate outputs is required for this stage" + ) + self.cache_dir = cache_dir + self.profile_dir = profile_dir + + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "LSH.log"), + name="LSH", + ) + else: + self._logger = logger + + def _generate_bucket_ranges( + self, num_buckets: int, minhash_length: int + ) -> List[List[int]]: + """ + Generates a list of indices for the minhash ranges given num_bands & + minhash_length. 
+ eg: num_bands=3, minhash_length=6 + [[0, 1], [2, 3], [4, 5]] + """ + minhashes_per_bucket = minhash_length // num_buckets + + bucket_ranges = [ + list( + range( + bucket * minhashes_per_bucket, (bucket + 1) * minhashes_per_bucket + ) + ) + for bucket in range(num_buckets) + ] + return bucket_ranges + + def minhash_to_buckets( + self, + df: cudf.DataFrame, + bucket_ranges: List[List[int]], + ) -> cudf.DataFrame: + df2 = df[self.id_fields] + for i, h in enumerate(bucket_ranges): + indices = cudf.Series([h]).repeat(len(df2)) + df2[f"_bucket_{i}"] = f"b{i}_" + df[self.minhash_field].list.take( + indices + ).hash_values(method="md5") + return df2 + + def bucket_id_to_int( + self, + bucket_ddf: dask_cudf.DataFrame, + bucket_col_name: str = "bucket_id", + start_id: int = 0, + ) -> Tuple[dask_cudf.DataFrame, int]: + """ + Maps bucket ids to a contigious integer range from starting from start_id. + """ + unique_bucket_df = ( + bucket_ddf[[bucket_col_name]] + .map_partitions(lambda x: x.drop_duplicates(ignore_index=True)) + .persist() + ) + end_bucket_id = len(unique_bucket_df) - 1 + start_id + unique_bucket_df["bucket_int_id"] = np.uint64(1) + unique_bucket_df["bucket_int_id"] = unique_bucket_df["bucket_int_id"].cumsum() + unique_bucket_df["bucket_int_id"] = ( + unique_bucket_df["bucket_int_id"] - 1 + start_id + ) + bucket_ddf = bucket_ddf.merge(unique_bucket_df, on=[bucket_col_name]) + bucket_ddf = bucket_ddf.drop(columns=[bucket_col_name]) + bucket_ddf = bucket_ddf.rename(columns={"bucket_int_id": "_bucket_id"}) + bucket_ddf["_bucket_id"] = bucket_ddf["_bucket_id"].astype(np.uint64) + return (bucket_ddf, end_bucket_id) + + def _minhash_to_bucket_meta( + self, df: dask_cudf.DataFrame + ) -> Tuple[cudf.DataFrame, int]: + meta = df._meta_nonempty[self.id_fields] + meta[self.minhash_field] = [np.ones(self.minhash_length)] * len(meta) + return self.minhash_to_buckets(meta, self.bucket_ranges) + + def lsh( + self, + write_path: str, + df: dask_cudf.DataFrame, + ) -> None: + """ + Computes buckets and writes them as parquet files to the write_path + """ + meta = self._minhash_to_bucket_meta(df) + df = df.map_partitions( + self.minhash_to_buckets, + bucket_ranges=self.bucket_ranges, + meta=meta, + ) + + bucket_start_id = 0 + for i in range(0, self.num_buckets, self.buckets_per_shuffle): + value_vars = [ + f"_bucket_{i}" + for i in range(i, min(self.num_buckets, i + self.buckets_per_shuffle)) + ] + df2 = df.melt( + id_vars=self.id_fields, value_name="_bucket_id", value_vars=value_vars + )[self.id_fields + ["_bucket_id"]] + + df2 = df2.shuffle( + on=["_bucket_id"], + ignore_index=True, + npartitions=max(1, 2 ** math.floor(math.log2(df2.npartitions))), + ).map_partitions(lambda x: x[x["_bucket_id"].duplicated(keep=False)]) + + df2 = df2.reset_index(drop=True) + df2, end_id = self.bucket_id_to_int( + df2, bucket_col_name="_bucket_id", start_id=bucket_start_id + ) + # If bucketing return empty dataframe + if end_id < bucket_start_id: + continue + bucket_start_id = end_id + 1 + + # Workaround for dtype mismatches with empty partitions + dtypes = df2.dtypes.to_dict() + df2 = df2.map_partitions(lambda x: x.astype(dtypes)) + + if i == 0: + if os.path.exists(write_path): + warnings.warn( + f"Output path {write_path} already exists and will be overwritten" + ) + df2.to_parquet(write_path, write_index=False, overwrite=True) + else: + df2.to_parquet(write_path, write_index=False, append=True) + + self._logger.info(f"Wrote data for buckets: {value_vars}") + + def __call__(self, dataset: DocumentDataset) -> 
DocumentDataset: + df = dataset.df + + write_path = os.path.join(self.cache_dir, "_buckets.parquet") + t0 = time.time() + with performance_report_if( + self.profile_dir, + f"lsh-profile-{datetime.now().strftime('%Y%m%d_%H%M%S')}.html", + ): + self.lsh(write_path=write_path, df=df) + self._logger.info(f"Computing and writing buckets took {time.time() - t0} s") + + buckets_df = dask_cudf.read_parquet(write_path, split_row_groups=False) + return DocumentDataset(buckets_df) + + +class _MapBuckets: + """ + buckets to a logical partition by using a modified bin packing algorithm. + Combines buckets generated from LSH (typically high cardinality) + to more coarse lower cardinality bucket groups by mapping multiple buckets + to a logical partition using document length information and a modified bin + packing algorithm. + Only needed if running False Postive check to remove false positives. + """ + + def __init__( + self, + id_fields: Union[list, str] = "id", + text_field: str = "text", + bucket_field: str = "_bucket_id", + num_anchors: int = 2, + logger: Union[logging.LoggerAdapter, str] = "./", + ): + """ + id_fields: list or str + id fields of df + text_field: str = "text", + bucket_column: str = "bucket_column", + num_anchors: int = 2, + logger: Union[logging.LoggerAdapter, str] = "./", + """ + self.id_fields = [id_fields] if isinstance(id_fields, str) else id_fields + self.text_field = text_field + self.num_anchors = num_anchors + self.bucket_field = bucket_field + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "Map_Buckets.log"), + name="Map_Buckets", + ) + else: + self._logger = logger + + @staticmethod + def _get_output_part_ids_with_approx_equal_sum( + bucket_text_bytes_df: cudf.DataFrame, + max_text_bytes_per_part: int, + buckets_column: str, + bytes_column: str, + output_partition_column: str, + ) -> cudf.DataFrame: + """ + Create a output_series that maps the ser.index into `nparts` + so that the total sum of bucket_val_counts_df + for each output id are all most equal and + less than max_text_bytes_per_part + This is used downstream for creating equal output_ids + """ + sizes = bucket_text_bytes_df[bytes_column].values + bucket_output_ar = build_partition( + sizes=sizes.get(), max_size=max_text_bytes_per_part + ) + df = cudf.DataFrame() + df[buckets_column] = bucket_text_bytes_df[buckets_column] + df[output_partition_column] = bucket_output_ar + return df + + def _get_output_map_from_text_bytes_per_bucket( + self, + ddf_bk_text_bytes, + bytes_column, + output_partition_column="_output_partition_id", + ): + # String bytes limit for cuDF + # https://github.com/rapidsai/cudf/issues/13733 + max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) + + self._logger.info(f"max_text_bytes_per_part = {max_text_bytes_per_part}") + # Increasing in an attempt to prevent hitting + # ulimits + output_map_df_meta = cudf.DataFrame( + {self.bucket_field: [0], output_partition_column: [1]} + ) + output_map_df_meta = output_map_df_meta.astype( + {self.bucket_field: np.uint64, output_partition_column: np.int32} + ) + + output_map_df = ddf_bk_text_bytes.map_partitions( + _MapBuckets._get_output_part_ids_with_approx_equal_sum, + max_text_bytes_per_part=max_text_bytes_per_part, + buckets_column=self.bucket_field, + bytes_column=bytes_column, + output_partition_column=output_partition_column, + meta=output_map_df_meta, + ) + output_map_df = output_map_df.persist() + self._logger.info( + f"Step 1 of output_map_df of len: {len(output_map_df)} 
computed" + ) + lower_bounds = ( + output_map_df[output_partition_column] + .map_partitions(lambda s: (s.max() + 1)) + .compute() + ) + lower_bounds = np.cumsum(lower_bounds) + + def update_id(df, lower_bound): + df[output_partition_column] += lower_bound + return df + + updated_parts = [ + output_map_df.get_partition(i).map_partitions( + update_id, lower_bounds[i - 1] + ) + for i in range(1, len(lower_bounds)) + ] + updated_parts.append(output_map_df.get_partition(0)) + output_map_df = dask_cudf.concat(updated_parts) + output_map_df = output_map_df.persist() + self._logger.info( + f"All steps of output_map_df of len: {len(output_map_df)} computed" + ) + return output_map_df + + def _get_output_map_based_on_str_bytes( + self, buckets_df, documents_df, bytes_column="_text_bytes" + ): + """ + Add output_partition_id to buckets_ddf + """ + documents_df[bytes_column] = documents_df[self.text_field].map_partitions( + lambda s: s.str.byte_count() + ) + n_partitions = buckets_df.npartitions + documents_df = documents_df.drop(columns=[self.text_field]).repartition( + npartitions=n_partitions + ) + buckets_df = buckets_df.merge(documents_df).repartition( + npartitions=n_partitions + ) + del documents_df + ddf_bk_text_bytes, agg_df_len = get_agg_text_bytes_df( + df=buckets_df, + agg_column=self.bucket_field, + bytes_column=bytes_column, + n_partitions=n_partitions, + shuffle=True, + ) + self._logger.info(f"Agg_df computed of length = {agg_df_len}") + del buckets_df + output_map_df = self._get_output_map_from_text_bytes_per_bucket( + ddf_bk_text_bytes=ddf_bk_text_bytes, + bytes_column=bytes_column, + ) + return output_map_df + + def _random_select_anchor(self, buckets_df, n=2): + """ + Randomly select `n` anchors from each bucket. + """ + buckets_df = buckets_df.copy() + buckets_df["_id_hash"] = buckets_df[self.id_fields].hash_values() + buckets_df = buckets_df.sort_values([self.bucket_field, "_id_hash"]) + buckets_df["_order_in_bucket"] = buckets_df.groupby( + self.bucket_field + ).cumcount() + buckets_df["is_anchor"] = buckets_df["_order_in_bucket"] < n + for i in range(0, n): + buckets_df[f"is_anchor_id_{i}"] = buckets_df["_order_in_bucket"] == i + buckets_df = buckets_df.drop(columns=["_id_hash", "_order_in_bucket"], axis=1) + buckets_df = buckets_df.reset_index(drop=True) + buckets_df = buckets_df[buckets_df.is_anchor] + return buckets_df + + def _add_anchor_docs(self, buckets_df, num_anchors): + """ + Get anchor documents for each bucket. 
+ """ + df_anchor_bk = self._random_select_anchor(buckets_df=buckets_df, n=num_anchors) + df_anchor_docs = None + for i in range(num_anchors): + df_anchor_bk_i = df_anchor_bk[df_anchor_bk[f"is_anchor_id_{i}"]][ + [self.bucket_field] + self.id_fields + ].reset_index(drop=True) + column_mapping = {id: f"anchor_{i}_{id}" for id in self.id_fields} + df_anchor_bk_i = df_anchor_bk_i.rename(columns=column_mapping) + if i == 0: + df_anchor_docs = df_anchor_bk_i + else: + df_anchor_docs = df_anchor_bk_i.merge( + df_anchor_docs, on=[self.bucket_field], how="inner" + ) + + df_anchor_docs_with_bk = buckets_df.merge( + df_anchor_docs, on=[self.bucket_field], how="inner" + ) + return df_anchor_docs_with_bk + + def map_buckets_with_anchors( + self, + documents_df: dask_cudf.DataFrame, + buckets_df: dask_cudf.DataFrame, + shuffle_type: Union[str, bool, None] = "tasks", + ) -> dask_cudf.DataFrame: + """ + Get anchor docs with bucket info + Args: + input_data_paths: list of paths to input data + input_bucket_path: path to input buckets + text_ddf_blocksize: blocksize for text ddf + num_files: number of files to read + num_workers: number of workers + shuffle_type: type of shuffle to use + Returns: + ddf_anchor_docs_with_bk + """ + output_map_df = self._get_output_map_based_on_str_bytes( + buckets_df=buckets_df, documents_df=documents_df + ) + ddf_anchor_docs_with_bk = buckets_df.map_partitions( + self._add_anchor_docs, num_anchors=self.num_anchors + ) + self._logger.info("output_map_df is based on string bytes") + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.merge( + output_map_df, on=self.bucket_field + ) + # Bucket is no longer needed + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.drop( + columns=[self.bucket_field] + ) + # Below removes any duplicates lying around after dropping buckets + ddf_anchor_docs_with_bk = ddf_anchor_docs_with_bk.map_partitions( + M.drop_duplicates, + meta=ddf_anchor_docs_with_bk._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + ddf_anchor_docs_with_bk = dd_shuffle( + ddf_anchor_docs_with_bk, + self.id_fields, + ignore_index=True, + shuffle=shuffle_type, + ).map_partitions( + M.drop_duplicates, + meta=ddf_anchor_docs_with_bk._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + del output_map_df + return ddf_anchor_docs_with_bk + + +class _Shuffle: + def __init__( + self, + id_fields: Union[str, list] = "id", + text_field: str = "text", + logger: Union[logging.LoggerAdapter, str] = "./", + profile_dir: str = None, + int_to_str_id: str = None, + ): + if isinstance(logger, str): + self._logger = create_logger( + rank=0, + log_file=os.path.join(logger, "LSH.log"), + name="LSH", + ) + else: + self._logger = logger + + self.id_fields = id_fields + self.text_field = text_field + self.profile_dir = profile_dir + self.int_to_str_id = int_to_str_id + + def shuffle_docs_on_buckets( + self, + documents_df: dask_cudf.DataFrame, + bucket_w_anchors_path: str, + output_shuffled_docs_path: str, + bucket_mapping_df_blocksize, + parts_per_worker: int = 1, + bucket_parts_per_worker: int = 8, + partition_on: str = "_output_partition_id", + ): + ddf_anchor_docs_with_bk, bk_mapping = aggregated_anchor_docs_with_bk_read( + path=bucket_w_anchors_path, + blocksize=bucket_mapping_df_blocksize, + ) + self._logger.info("Getting ddf_anchor_docs_with_bk completed") + self._logger.debug( + f"ddf_anchor_docs_with_bk.npartitions = {ddf_anchor_docs_with_bk.npartitions}" + ) + st = time.time() + num_workers = 
get_num_workers(get_current_client()) + parts_per_batch = num_workers * parts_per_worker + self._logger.debug(f"parts_per_batch = {parts_per_batch}") + parts_per_bucket_batch = num_workers * bucket_parts_per_worker + self._logger.debug(f"parts_per_bucket_batch = {parts_per_bucket_batch}") + + dask_profile_name = "suffle_docs" + dask_profile_name = dask_profile_name + f"-parts_per_batch-{parts_per_batch}" + dask_profile_name = ( + dask_profile_name + f"-parts_per_bucket_batch-{parts_per_bucket_batch}" + ) + dask_profile_name = ( + dask_profile_name + f"-{datetime.now().strftime('%Y%m%d_%H%M%S')}.html" + ) + documents_df = documents_df[self.id_fields + [self.text_field]] + + with performance_report_if(self.profile_dir, dask_profile_name): + self._batched_merge_and_write( + left_df=documents_df, + right_df=ddf_anchor_docs_with_bk, + output_path=output_shuffled_docs_path, + merge_on=self.id_fields, + partition_on=partition_on, + parts_per_text_batch=parts_per_batch, + parts_per_bucket_batch=parts_per_bucket_batch, + bk_mapping=bk_mapping, + num_workers=num_workers, + ) + self._logger.info(f"Writing+Shuffling data took = {time.time()-st} s") + + def _batched_merge_and_write( + self, + left_df: dask_cudf.DataFrame, + right_df: dask_cudf.DataFrame, + output_path: str, + merge_on: List[str], + partition_on: str, + parts_per_text_batch: int, + parts_per_bucket_batch: int, + bk_mapping, + num_workers: int = None, + ): + total_text_partitions = left_df.npartitions + total_bucket_partitions = right_df.npartitions + + # Extract global partitioning index + left_df, global_partitioning_index = extract_partitioning_index( + left_df, + merge_on, + bk_mapping, + parts_per_bucket_batch, + total_bucket_partitions, + ) + + # Set start offsets + bucket_part_start_offset, text_part_start_offset = get_restart_offsets( + output_path + ) + + # Set end offsets + # NOTE: These end offsets are always set to the end + # of the data. However, we may want to be able to set + # both the start and end offsets from the command line + # in the future. + bucket_part_end_offset = total_bucket_partitions + text_part_end_offset = total_text_partitions + + # Check that offsets are valid + assert bucket_part_start_offset % parts_per_bucket_batch == 0 + assert bucket_part_end_offset > bucket_part_start_offset + assert text_part_end_offset > text_part_start_offset + + # Initialize "retry" variables + # + # - retry_count: The number of successive batches that + # we have already performed at a reduced batch size. + # - retry_threshold: The number of successive batches + # for which we should keep the batch size low + # before attempting the default batch size again. + # Every time we return to the default batch size + # and immediately fail, retry_threshold will double. 
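+        #
+        # Example: with a default batch of 32 text partitions, an OverflowError
+        # halves the next attempt to 16 partitions. After retry_threshold
+        # successful reduced-size batches, the default of 32 is tried again; if
+        # that fails immediately, the already-doubled retry_threshold (capped at
+        # 16) keeps subsequent batches at the reduced size for longer.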
+ parts_per_text_batch_retry = None + retry_count, retry_threshold = 0, 1 + + self._logger.info( + f"Starting at bucket-map partition {bucket_part_start_offset}" + f" and text-df partition {text_part_start_offset}", + ) + + for bucket_part_offset in tqdm( + range( + bucket_part_start_offset, bucket_part_end_offset, parts_per_bucket_batch + ) + ): + + # Outer loop over batches of "bucket-map" partitions + end_bucket_offset = min( + bucket_part_offset + parts_per_bucket_batch, bucket_part_end_offset + ) + print( + f"\nStarted processing bucket-map partitions {bucket_part_offset} " + f"through {end_bucket_offset} of {bucket_part_end_offset}", + flush=True, + ) + st_bucket = time.time() + + # Select our bucket-mapping batch + subset_bucket_df = right_df.partitions[bucket_part_offset:end_bucket_offset] + subset_bucket_df = subset_bucket_df.persist() + + # Filter out rows of left_df that we know cannot + # align with any rows of subset_bucket_df + left_df_use = filter_text_rows_by_bucket_batch( + left_df, + global_partitioning_index, + bucket_part_offset, + bucket_part_end_offset, + total_bucket_partitions, + ) + + text_part_offset = text_part_start_offset + while text_part_offset < text_part_end_offset: + + # Check if we are "retrying" with a smaller "parts_per_text_batch" + if parts_per_text_batch_retry: + parts_per_text_batch_use = parts_per_text_batch_retry + else: + st_text = time.time() + parts_per_text_batch_use = parts_per_text_batch + print(f"Using {parts_per_text_batch_use} text partitions.", flush=True) + + # Select partitions for our text batch + end_text_offset = min( + text_part_offset + parts_per_text_batch_use, text_part_end_offset + ) + subset_text_df = left_df_use.partitions[ + text_part_offset:end_text_offset + ] + + try: + # NOTE: If we have more text-df partitions than bucket-map + # partitions, we are more likely to see an OverflowError + output_df = text_bytes_aware_shuffle( + df=merge_left_to_shuffled_right( + subset_text_df, + subset_bucket_df, + merge_on, + ), + partition_on=partition_on, + text_column=self.text_field, + num_workers=num_workers, + ) + except OverflowError as err: + # We encountered an overflow error! + # Let's try again with less text data + parts_per_text_batch_retry = int(parts_per_text_batch_use / 2) + if parts_per_text_batch_retry < 1: + raise err + print( + f"\nWe encountered an OverflowError and will retry " + f"the current batch with {parts_per_text_batch_retry} " + f"text partitions instead of {parts_per_text_batch_use}.", + flush=True, + ) + continue + + if self.int_to_str_id is not None: + output_df = output_df.map_partitions( + int_ids_to_str, id_column=self.int_to_str_id + ) + batch_label = f"{end_bucket_offset}_{end_text_offset}" + written_files = output_df.map_partitions( + write_partitioned_file, + output_path, + partition_on, + batch_label, + meta=cudf.Series([True]), + ) + written_files = written_files.compute() + update_restart_offsets(output_path, bucket_part_offset, end_text_offset) + del output_df + + print( + "Text-df partition ", + f"{end_text_offset}/{text_part_end_offset} " + f"completed in {time.time()-st_text}", + flush=True, + ) + + # Update loop control-flow variables + if parts_per_text_batch_use == parts_per_text_batch: + # We succeeded at the default batch size. 
+ # Reset the retry count + retry_count, retry_threshold = 0, 1 + else: + # We succeeded at a lower batch size + retry_count += 1 + if retry_count >= retry_threshold: + # Go back to the default text-batch size, + # but increase the retry_threshold in + # case we fail again + parts_per_text_batch_retry = None + retry_count, retry_threshold = 0, min(retry_threshold * 2, 16) + text_part_offset += parts_per_text_batch_use + + update_restart_offsets(output_path, end_bucket_offset, end_text_offset) + print( + "Bucket partition ", + f"{end_bucket_offset}/{bucket_part_end_offset} " + f"completed in {time.time()-st_bucket}", + flush=True, + ) + + # Need to reset text_part_start_offset to 0 after + # a single bucket-batch pass (only matters if we are + # breaking the bucket-mapping df into multiple batches) + text_part_start_offset = 0 + + +class JaccardSimilarity: + def __init__( + self, + id_field="id", + anchor_id_fields=["anchor_0_id", "anchor_1_id"], + text_field="text", + ngram_width=5, + ): + self.id_field = id_field + self.anchor_id_fields = anchor_id_fields + self.text_field = text_field + self.anchor_id = f"anchor_{id_field}" + self.left_id = f"{self.id_field}_x" + self.right_id = f"{self.id_field}_y" + self.ngram_width = ngram_width + + def __call__(DocumentDataset): + raise NotImplementedError + + def jaccard_compute(self, shuffled_docs_path): + paths = [ + entry.path + for entry in os.scandir(shuffled_docs_path) + if not entry.path.endswith(".txt") + ] + meta_df = cudf.DataFrame( + { + self.left_id: ["x"], + self.right_id: ["y"], + "jaccard": np.float32([0.0]), + } + ) + result_df = dd.from_map( + self._compute_jaccard_on_1_partition, paths, meta=meta_df + ).reset_index(drop=True) + return result_df + + def _compute_jaccard_on_1_partition(self, path): + try: + df = cudf.read_parquet(path) + pair_df = self._compute_jaccard_and_create_pair_df(df) + except OverflowError: + paths = [entry.path for entry in os.scandir(os.path.join(path))] + anchor_df_str_size_ls = [ + self._get_anchor_docs_and_string_size(path) for path in paths + ] + anchor_df = cudf.concat( + [anchor_doc for anchor_doc, _ in anchor_df_str_size_ls], + ignore_index=True, + ).drop_duplicates() + df_str_size = [str_size for _, str_size in anchor_df_str_size_ls] + paths = JaccardSimilarity._create_bins( + df_str_size, np.iinfo(np.int32).max // 10 + ) + pair_dfs = [] + for path in paths: + print(path) + df = cudf.read_parquet(path).reset_index(drop=True) + df = cudf.concat([df, anchor_df], ignore_index=True) + pair_df = self._compute_jaccard_and_create_pair_df(df) + pair_dfs.append(pair_df) + pair_df = cudf.concat(pair_dfs, ignore_index=True) + return pair_df + + def _get_anchor_docs_and_string_size(self, path): + df = cudf.read_parquet(path) + str_bytes = df[self.text_field].str.byte_count().sum() + is_anchor_flag = df[self.id_field] == df[self.anchor_id_fields[0]] + for anchor_id in self.anchor_id_fields[1:]: + is_anchor_flag = is_anchor_flag | (df[self.id_field] == df[anchor_id]) + anchor_df = df[is_anchor_flag].reset_index(drop=True) + return anchor_df, {"path": path, "str_bytes": str_bytes} + + @staticmethod + def _create_bins(path_dicts, max_size): + path_dicts.sort(key=lambda x: x["str_bytes"], reverse=True) + bins, bin_sizes = [], [] + for path_d in path_dicts: + new_path, new_size = path_d["path"], path_d["str_bytes"] + for i, bin_size in enumerate(bin_sizes): + if bin_size + new_size <= max_size: + bins[i].append(new_path) + bin_sizes[i] += new_size + new_size = 0 + break + if new_size: + bins.append([new_path]) + 
bin_sizes.append(new_size) + return bins + + def _compute_jaccard_and_create_pair_df(self, df): + df = df.drop_duplicates( + subset=[self.id_field] + self.anchor_id_fields, ignore_index=True + ) + anchor_columns = self.anchor_id_fields + id_field = self.id_field + result_ls = [] + try: + for anchor_col in anchor_columns: + doc_df = df[[id_field, self.text_field, anchor_col]] + doc_df = doc_df.rename(columns={anchor_col: self.anchor_id}) + doc_df = doc_df[doc_df[id_field] != doc_df[self.anchor_id]] + anchor_df = self._get_anchor_df(df, anchor_col) + result_df = self._compute_jaccard_pair(doc_df, anchor_df) + result_ls.append(result_df) + + return cudf.concat(result_ls) + except OverflowError as e: + print( + "Failed with OverflowError in compute_jaccard_and_create_pair_df", + flush=True, + ) + print(df, flush=True) + print("--" * 30) + print("Error") + print("---" * 30) + raise e + + def _get_anchor_df(self, df, anchor_col): + anchor_df = df[df[self.id_field] == df[anchor_col]] + anchor_df = anchor_df.reset_index(drop=True) + anchor_df = anchor_df[[anchor_col, self.text_field]] + anchor_df = anchor_df.rename(columns={anchor_col: self.anchor_id}) + return anchor_df + + def _compute_jaccard_pair(self, docs_df, anchor_df): + nrows_at_once = JaccardSimilarity._get_max_num_rows_to_process_once( + df=docs_df, text_field=self.text_field + ) + result_ls = [] + for i in range(0, docs_df.shape[0], nrows_at_once): + pair_df = docs_df[i : i + nrows_at_once] + pair_df = pair_df.merge(anchor_df, on=self.anchor_id) + pair_df = pair_df.rename( + columns={self.id_field: self.left_id, self.anchor_id: self.right_id} + ) + mask = pair_df[self.left_id] != pair_df[self.right_id] + pair_df = pair_df[mask].reset_index(drop=True) + if len(pair_df) == 0: + result_df = self._create_empty_jaccard_result() + else: + result_df = self._compute_jaccard_partition(pair_df) + result_ls.append(result_df) + if len(result_ls) == 0: + return self._create_empty_jaccard_result() + df_pair = cudf.concat(result_ls) + return df_pair + + def _create_empty_jaccard_result(self): + df = cudf.DataFrame() + df[self.left_id] = "x" + df[self.right_id] = "y" + df["jaccard"] = np.empty(shape=0, dtype=np.float32) + return df + + def _compute_jaccard_partition(self, df): + text_x = f"{self.text_field}_x" + text_y = f"{self.text_field}_y" + df["jaccard"] = df[text_x].str.jaccard_index(df[text_y], width=self.ngram_width) + df.drop(columns=[text_x, text_y], inplace=True) + return df + + @staticmethod + def _get_max_num_rows_to_process_once(df, text_field): + nbytes = df["text"].str.byte_count().sum() + # Number of exmploded bytes + exploded_bytes = nbytes * 5 * 2 + max_chars_allowed = 2_147_483_647 + byte_ratio = int(exploded_bytes) // max_chars_allowed + if byte_ratio > 1: + nrows_at_once = len(df) // byte_ratio + else: + nrows_at_once = len(df) + + nrows_at_once = max(1, nrows_at_once) + return nrows_at_once + + +class ConnectedComponents: + def __init__( + self, + cache_dir: str, + jaccard_pairs_path: str, + id_column="id", + convert_str_ids=False, + jaccard_threshold: int = 0.8, + ): + self.cache_dir = cache_dir + self.jaccard_pairs_path = jaccard_pairs_path + self.id_column = id_column + self.left_id = f"{id_column}_x" + self.right_id = f"{id_column}_y" + self.convert_str_ids = convert_str_ids + self.jaccard_threshold = jaccard_threshold + + def cc_workflow(self, output_path): + deduped_parsed_id_path = self._write_dedup_parsed_id() + encoded_jaccard_pair_path = self._write_encoded_jaccard_pair( + deduped_parsed_id_path + ) + 
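# With integer ids in place, canonicalize each pair, apply the Jaccard threshold,
+        # and drop duplicate edges before running connected components.
+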
deduped_encoded_jaccard_path = self._write_dedup_encoded_jaccard_pair( + encoded_jaccard_pair_path + ) + cc_path = self._run_connected_components( + deduped_encoded_jaccard_path, deduped_parsed_id_path, output_path + ) + return cc_path + + def _run_connected_components( + self, + deduped_encoded_jaccard_path, + deduped_parsed_id_path, + output_path, + ): + Comms.initialize(p2p=True) + df = dask_cudf.read_parquet( + deduped_encoded_jaccard_path, blocksize="1GB", aggregate_files=True + ) + df = df[df["jaccard"] == 1].reset_index(drop=True) + + labels_df = dask_cudf.read_parquet(deduped_parsed_id_path) + num_nodes = len(labels_df) + self_edge_df = labels_df[["uid"]].rename(columns={"uid": self.left_id}) + self_edge_df[self.right_id] = self_edge_df[self.left_id] + + df = df[[self.left_id, self.right_id]].astype(np.int64) + df = dask_cudf.concat([df, self_edge_df]) + + G = cugraph.MultiGraph(directed=False) + G.from_dask_cudf_edgelist( + df, source=self.left_id, destination=self.right_id, renumber=False + ) + result = dcg.weakly_connected_components(G) + del G + max_partitions = min(32, result.npartitions) + n_components = len(result[["labels"]].drop_duplicates(split_out=max_partitions)) + num_labels = len(result) + print("# of groups", n_components) + print("# of docs removed", num_labels - n_components) + labels_df = labels_df.merge( + result, left_on=["uid"], right_on=["vertex"], how="inner" + ) + id_columns = ( + ["dataset_id", "doc_id"] if self.convert_str_ids else [self.id_column] + ) + labels_df = labels_df[id_columns + ["labels"]] + labels_df = labels_df.rename(columns={"labels": "group"}) + labels_df = labels_df.persist() + # Doing an inner merge above + # should not change any rows + + assert num_nodes == len(labels_df) + print(f"assert num_nodes:{num_nodes}==labels_df:{len(labels_df)} passed") + labels_df.to_parquet(output_path, write_index=False) + Comms.destroy() + + @staticmethod + def _sort_ids(df, id_columns): + x = df[id_columns].values + x = cp.sort(x, axis=1) + for i, id_column in enumerate(id_columns): + df[id_column] = x[:, i] + df[id_column] = df[id_column].astype("uint64") + return df + + @staticmethod + def thresholding(df, threshold, column_to_threshold): + mask = df[column_to_threshold] > threshold + df.loc[mask, column_to_threshold] = np.int8(1) + df.loc[~mask, column_to_threshold] = np.int8(0) + return df + + def _write_dedup_encoded_jaccard_pair(self, encoded_jaccard_pair_path): + output_path = f"{self.cache_dir}/final_dedup_encoded_jaccard_pair.parquet" + + ddf = dask_cudf.read_parquet( + encoded_jaccard_pair_path, blocksize="512MB", aggregate_files=True + ) + meta = {self.left_id: "uint64", self.right_id: "uint64", "jaccard": "float32"} + ddf = ddf.map_partitions( + ConnectedComponents._sort_ids, + id_columns=[self.left_id, self.right_id], + meta=meta, + ) + ddf = ddf.map_partitions( + ConnectedComponents.thresholding, + threshold=self.jaccard_threshold, + column_to_threshold="jaccard", + meta=meta, + ) + ddf = ddf.map_partitions( + M.drop_duplicates, + meta=ddf._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + + ddf = dd_shuffle( + ddf, + [self.left_id, self.right_id], + ignore_index=True, + shuffle="tasks", + ) + ddf = ddf.map_partitions( + M.drop_duplicates, + meta=ddf._meta, + enforce_metadata=False, + transform_divisions=False, + align_dataframes=False, + ) + ddf.to_parquet(output_path, write_index=False) + return output_path + + def _convert_str_id_pair_to_int(self, df): + for id, tag in zip([self.left_id, 
self.right_id], ["x", "y"]): + dx = df[id].str.rsplit("-", n=1, expand=True) + df[f"dataset_id_{tag}"] = dx[0].astype("uint32").values + df[f"doc_id_{tag}"] = dx[1].astype("int64").values + df = df.drop(columns=[id]) + return df + + def _write_dedup_parsed_id(self): + dedup_parsed_id_path = f"{self.cache_dir}/dedup_parsed_id.parquet" + ddf = dask_cudf.read_parquet( + self.jaccard_pairs_path, + columns=[self.left_id, self.right_id], + blocksize="1GB", + aggregate_files=True, + ) + + id_columns = [self.id_column] + if self.convert_str_ids: + ddf = ddf.map_partitions( + self._convert_str_id_pair_to_int, + meta={ + "dataset_id_x": "uint32", + "doc_id_x": "int64", + "dataset_id_y": "uint32", + "doc_id_y": "int64", + }, + ) + id_columns = ["dataset_id", "doc_id"] + + unique_docs = ddf.map_partitions( + ConnectedComponents._get_unique_ids_per_partition, id_columns=id_columns + ) + unique_docs = unique_docs.drop_duplicates(split_out=ddf.npartitions // 4) + unique_docs["uid"] = np.uint64(1) + unique_docs["uid"] = unique_docs["uid"].cumsum() + unique_docs["uid"] = unique_docs["uid"] - 1 + unique_docs.to_parquet(dedup_parsed_id_path, write_index=False) + return dedup_parsed_id_path + + def _write_encoded_jaccard_pair(self, dedup_parsed_id_path): + output_path = f"{self.cache_dir}/encoded_jaccard_pair/" + ddf_id = dask_cudf.read_parquet( + dedup_parsed_id_path, blocksize="2GB", aggregate_files=True + ) + ddf_id = ddf_id.persist() + len(ddf_id) + ddf = dask_cudf.read_parquet( + self.jaccard_pairs_path, + blocksize="256MB", + aggregate_files=True, + ) + id_columns = [self.id_column] + if self.convert_str_ids: + ddf = ddf.map_partitions( + self._convert_str_id_pair_to_int, + meta={ + "jaccard": "float32", + "dataset_id_x": "uint32", + "doc_id_x": "int64", + "dataset_id_y": "uint32", + "doc_id_y": "int64", + }, + ) + id_columns = ["dataset_id", "doc_id"] + + num_workers = get_num_workers(get_current_client()) + self._batched_merge_and_write( + ddf=ddf, + ddf_id=ddf_id, + output_path=output_path, + id_columns=id_columns, + batch_size=num_workers, + ) + return output_path + + def _batched_merge_and_write( + self, ddf, ddf_id, output_path, id_columns, batch_size=32 + ): + total_batches = (ddf.npartitions + batch_size - 1) // batch_size + for batch_id, offset in enumerate(range(0, ddf.npartitions, batch_size)): + st = time.time() + subset_ddf = ddf.partitions[offset : offset + batch_size] + for tag in ["x", "y"]: + pair_ids = [] + for id_col in id_columns: + pair_ids.append(f"{id_col}_{tag}") + subset_ddf = subset_ddf.merge( + ddf_id, + left_on=pair_ids, + right_on=id_columns, + how="inner", + broadcast=True, + ) + subset_ddf = subset_ddf.rename( + columns={"uid": f"{self.id_column}_{tag}"} + ) + subset_ddf = subset_ddf.drop( + columns=[f"dataset_id_{tag}", f"doc_id_{tag}"] + ) + + subset_ddf = subset_ddf[[self.left_id, self.right_id, "jaccard"]] + output_batch_path = os.path.join(output_path, f"{batch_id}.parquet") + subset_ddf.to_parquet(output_batch_path, write_index=False) + + et = time.time() + print( + f"batch_id = {batch_id}/{total_batches}, time = {et - st}", flush=True + ) + + @staticmethod + def _get_unique_ids_per_partition(df, id_columns): + unique_df_ls = [] + for tag in ["x", "y"]: + cols_to_drop = [] + for id_col in id_columns: + cols_to_drop.append(f"{id_col}_{tag}") + + subset_df = df[cols_to_drop].drop_duplicates() + subset_df = subset_df.rename( + columns={f"{id_col}_{tag}": f"{id_col}" for id_col in id_columns} + ) + unique_df_ls.append(subset_df) + unique_df = cudf.concat(unique_df_ls, 
ignore_index=True)
+        unique_df = unique_df.drop_duplicates()
+        return unique_df
diff --git a/nemo_curator/modules/meta.py b/nemo_curator/modules/meta.py
new file mode 100644
index 00000000..ab5b2774
--- /dev/null
+++ b/nemo_curator/modules/meta.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Sequential:
+    def __init__(self, modules):
+        """
+        Chains a list of curator modules together, applying each one to the dataset in order.
+        Args:
+            modules: A list of callables that each take and return a DocumentDataset.
+        """
+        self.modules = modules
+
+    def __call__(self, dataset):
+        for module in self.modules:
+            dataset = module(dataset)
+        return dataset
diff --git a/nemo_curator/modules/modify.py b/nemo_curator/modules/modify.py
new file mode 100644
index 00000000..d02ce9c7
--- /dev/null
+++ b/nemo_curator/modules/modify.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_curator.datasets import DocumentDataset
+from nemo_curator.modifiers import DocumentModifier
+
+
+class Modify:
+    def __init__(self, modifier: DocumentModifier, text_field="text", batched=False):
+        """
+        Args:
+            modifier: A DocumentModifier whose modify_document method rewrites each document.
+            text_field: The field the documents will be read from and written back to.
+            batched: Whether modify_document operates on a per-partition Series instead of a single document.
+        """
+        self.modifier = modifier
+        self.text_field = text_field
+        self.batched = batched
+
+    def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
+        if self.batched:
+            dataset.df[self.text_field] = dataset.df[self.text_field].map_partitions(self.modifier.modify_document, meta=(None, str))
+        else:
+            dataset.df[self.text_field] = dataset.df[self.text_field].apply(self.modifier.modify_document, meta=(None, str))
+
+        return dataset
diff --git a/nemo_curator/modules/task.py b/nemo_curator/modules/task.py
new file mode 100644
index 00000000..f3e0181b
--- /dev/null
+++ b/nemo_curator/modules/task.py
@@ -0,0 +1,430 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
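+
+# Example usage (illustrative sketch): given a concrete DownstreamTask
+# implementation, here called MyTask (hypothetical), and a DocumentDataset
+# named dataset, decontamination is a single call:
+#
+#     decontaminate = TaskDecontamination(MyTask())
+#     dataset = decontaminate(dataset)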
+ +from typing import Iterable, Union, List +from collections import defaultdict +from functools import reduce, partial + +from dask import delayed +import dask.dataframe as dd + +from nemo_curator.tasks.downstream_task import DownstreamTask +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.text_utils import get_words +from nemo_curator.utils.distributed_utils import single_partition_write_with_filename + +class TaskDecontamination: + def __init__(self, tasks: Union[DownstreamTask, Iterable[DownstreamTask]], text_field="text", max_ngram_size=13, max_matches=10, min_document_length=200, remove_char_each_side=200, max_splits=10, removed_dir=None) -> None: + """ + Removes segments of downstream evaluation tasks from a dataset + Args: + max_ngram_size: The maximum n-gram size (in words) to consider when searching for contamination. + max_matches: If an ngram is found more than max_matches times, it is considered too common and will not be removed from the dataset. + min_document_length: When a document is split, if a split falls below this character length it is discarded. + remove_char_each_side: The number of characters to remove on either side of the matching ngram + max_splits: The maximum number of times a document may be split before being entirely discarded. + removed_dir: If not None, the documents split too many times will be written to this directory using the filename in the dataset. + """ + if isinstance(tasks, DownstreamTask): + tasks = [tasks] + self.tasks = tasks + self.text_field = text_field + self.max_ngram_size = max_ngram_size + self.max_matches = max_matches + self.min_document_length = min_document_length + self.remove_char_each_side = remove_char_each_side + self.max_splits = max_splits + self.removed_dir = removed_dir + + def __call__(self, dataset: DocumentDataset) -> DocumentDataset: + + # Convert the dataframe to delayed objects for complex operations + original_meta = dataset.df.dtypes.to_dict() + delayed_dataset = dataset.df.to_delayed() + + # Perform task decontamination + task_ngrams = self.prepare_task_ngram_count() + found_result = self._find_matching_ngrams(task_ngrams, delayed_dataset) + matched_ngrams, ngram_freq = found_result['matched-ngrams'], found_result['ngrams-freq'] + delayed_removed_dataset = self._remove_matching_ngrams(matched_ngrams, ngram_freq, delayed_dataset) + + # Restore the dataset to its original format + removed_dataset = DocumentDataset(dataset_df=dd.from_delayed(delayed_removed_dataset, meta=original_meta)) + + return removed_dataset + + @staticmethod + def _merge_task_ngrams(first: dict, second: dict): + first.update(second) + return first + + def prepare_task_ngram_count(self) -> dict: + """ + Computes a dictionary of all ngrams in each task as keys and each value set to 0.
+ """ + delayed_ngrams = [delayed(task.generate_ngrams)() for task in self.tasks] + aggregated_ngrams = delayed(reduce)(TaskDecontamination._merge_task_ngrams, delayed_ngrams) + + return aggregated_ngrams + + @staticmethod + def _compute_ngram_freq_sorted(task_ngrams): + ngrams_freq = defaultdict(int) + for ngram_key in task_ngrams.keys(): + ngram_words, _ = get_words(ngram_key) + length = len(ngram_words) + ngrams_freq[length] += 1 + + ngrams_freq_sorted = sorted(ngrams_freq.items(), key=lambda item: item[0]) + + return ngrams_freq_sorted + + def find_matching_ngrams(self, task_ngrams: dict, dataset: DocumentDataset) -> dict: + delayed_dataset = dataset.df.to_delayed() + + return self._find_matching_ngrams(task_ngrams, delayed_dataset) + + def _find_matching_ngrams(self, task_ngrams: dict, delayed_dataset) -> dict: + task_ngrams_frequency_sorted = delayed(self._compute_ngram_freq_sorted)(task_ngrams) + delayed_counts = [delayed(self._find_ngrams_partition)(partition, task_ngrams, task_ngrams_frequency_sorted) for partition in delayed_dataset] + combined_counts = delayed(reduce)(self._merge_counts, delayed_counts) + formatted_result = delayed(self._format_matching_ngrams_result)(combined_counts, task_ngrams_frequency_sorted) + + return formatted_result + + def _find_ngrams_partition(self, dataset_partition, task_ngrams, ngrams_freq_sorted): + partition_count = defaultdict(int) + for document in dataset_partition[self.text_field]: + doc_result = self._find_ngrams(document, task_ngrams, ngrams_freq_sorted) + partition_count = TaskDecontamination._merge_counts(partition_count, doc_result) + + return partition_count + + @staticmethod + def _merge_counts(first: dict, second: dict): + for ngram, count in second.items(): + first[ngram] = first.get(ngram, 0) + count + + return first + + @staticmethod + def _format_matching_ngrams_result(matched_ngrams, ngram_freq): + return { + "matched-ngrams": matched_ngrams, + "ngrams-freq": ngram_freq, + } + + def _find_ngrams(self, document, task_ngrams, ngrams_freq_sorted): + """ + Searches for matching n-grams in a document + """ + text_buf = [document] + + local_ngram = defaultdict(int) + while len(text_buf) > 0: + + # get the first one from the buffer + text = text_buf.pop(0) + words, positions = get_words(text) + + ngram_free = True + # First, loop over all n-grams in document + for i in range(len(words) - self.max_ngram_size + 1): + # Check if we found a matching n-gram + check_ngram_free = TaskDecontamination._check_text( + words[i:i + self.max_ngram_size], + task_ngrams, + text, + positions[i], + text_buf, + local_ngram, + ) + + # If we found a match, break + # the remainder of the text is appended to text_buf + # for futher processing + if not check_ngram_free: + ngram_free = False + break + + # Continue searching for the remaining dominant n-grams + for ngram_len, _ in ngrams_freq_sorted: + # Check if we found a matching n-gram + check_ngram_free = TaskDecontamination._check_text( + words[i:i + ngram_len], + task_ngrams, + text, + positions[i], + text_buf, + local_ngram, + ) + + # Again, if we find match, break + # the remainder of the text is appended to text_buf + # for futher processing + if not check_ngram_free: + ngram_free = False + break + + # Additional break to break out of both loops + if not ngram_free: + break + + # If did not find a match for the max_ngram_size + # check the ending n-gram + if ngram_free and len(words) - self.max_ngram_size > 0: + # get the last words of the lax max ngram + last_seq_words = words[len(words) - 
self.max_ngram_size:len(words)] + last_seq_start_position = len(words) - self.max_ngram_size + + # check all n-grams lower than max ngram-len + for pos, (ngram_len, _) in enumerate(ngrams_freq_sorted): + + # ignore the max ngram as has been considered already + if ngram_len == self.max_ngram_size: + continue + + # find each ngram of ngram_len in max n-grams and check + for i in range(len(last_seq_words) - ngram_len + 1): + # Check for matching n-grams + check_ngram_free = TaskDecontamination._check_text( + last_seq_words[i:i + ngram_len], + task_ngrams, + text, + positions[last_seq_start_position + i], + text_buf, + local_ngram, + ) + + # If we find a match, break + if not check_ngram_free: + ngram_free = False + break + + # Break from both loops + if not ngram_free: + break + + return local_ngram + + @staticmethod + def _check_text(words, task_ngrams, text, start_position, text_buf, local_ngram): + seq = " ".join(words) + if seq in task_ngrams: + print(" [matched]: {}".format(seq), flush=True) + # If this flag is set, we just look for matching n-grams + # we don't remove any matching n-grams + # Count the matched n-gram and consider it later + local_ngram[seq] += 1 + if (start_position + len(seq) + 1) < len(text): + text_buf.append(text[start_position + len(seq) + 1:len(text)]) + return False + + return True + + def remove_matching_ngrams(self, matched_ngrams: dict, ngram_freq: List[tuple], dataset: DocumentDataset): + original_meta = dataset.df.dtypes.to_dict() + delayed_dataset = dataset.df.to_delayed() + delayed_removed_dataset = self._remove_matching_ngrams(matched_ngrams, ngram_freq, delayed_dataset) + removed_dataset = DocumentDataset(dataset_df=dd.from_delayed(delayed_removed_dataset, meta=original_meta)) + + return removed_dataset + + def _remove_matching_ngrams(self, matched_ngrams: dict, ngram_freq: List[tuple], delayed_dataset): + threshhold_ngrams = delayed(self._threshold_ngram_count)(matched_ngrams) + delayed_removed_dataset = [delayed(self._remove_ngrams_partition)(partition, threshhold_ngrams, ngram_freq) for partition in delayed_dataset] + + return delayed_removed_dataset + + def _threshold_ngram_count(self, matched_ngrams: dict) -> set: + filtered_ngrams = set() + for ngram, count in matched_ngrams.items(): + if count <= self.max_matches: + filtered_ngrams.add(ngram) + + return filtered_ngrams + + def _remove_ngrams_partition(self, partition, task_ngrams, ngrams_freq_sorted): + document_fn = partial(self._remove_ngrams, task_ngrams=task_ngrams, ngrams_freq_sorted=ngrams_freq_sorted) + split_text = partition[self.text_field].apply(document_fn) + num_splits = split_text.apply(len) + + valid_documents_mask = (1 <= num_splits) & (num_splits <= self.max_splits) + + if self.removed_dir: + removed_docs = partition[~valid_documents_mask] + single_partition_write_with_filename(removed_docs, self.removed_dir) + + partition[self.text_field] = split_text + filtered_partition = partition[valid_documents_mask] + return filtered_partition.explode(self.text_field, ignore_index=True) + + + def _remove_ngrams(self, document, task_ngrams, ngrams_freq_sorted): + """ + Searches for matching n-grams in a document + """ + text_buf = [document] + + text_buf_ngram_free = [] + while len(text_buf) > 0: + # get the first one from the buffer + text = text_buf.pop(0) + words, positions = get_words(text) + + ngram_free = True + # First, loop over all n-grams in document + for i in range(len(words) - self.max_ngram_size + 1): + # Check if we found a matching n-gram + check_ngram_free = 
self._clean_text( + words[i:i + self.max_ngram_size], + task_ngrams, + text, + positions[i], + text_buf, + text_buf_ngram_free, + ) + + # If we found a match, break + # the remainder of the text is appended to text_buf + # for futher processing + if not check_ngram_free: + ngram_free = False + break + + # Continue searching for the remaining dominant n-grams + for ngram_len, _ in ngrams_freq_sorted: + # Check if we found a matching n-gram + check_ngram_free = self._clean_text( + words[i:i + ngram_len], + task_ngrams, + text, + positions[i], + text_buf, + text_buf_ngram_free, + ) + + # Again, if we find match, break + # the remainder of the text is appended to text_buf + # for futher processing + if not check_ngram_free: + ngram_free = False + break + + # Additional break to break out of both loops + if not ngram_free: + break + + # If did not find a match for the max_ngram_size + # check the ending n-gram + if ngram_free and len(words) - self.max_ngram_size > 0: + # get the last words of the lax max ngram + last_seq_words = words[len(words) - self.max_ngram_size:len(words)] + last_seq_start_position = len(words) - self.max_ngram_size + + # check all n-grams lower than max ngram-len + for pos, (ngram_len, _) in enumerate(ngrams_freq_sorted): + + # ignore the max ngram as has been considered already + if ngram_len == self.max_ngram_size: + continue + + # find each ngram of ngram_len in max n-grams and check + for i in range(len(last_seq_words) - ngram_len + 1): + # Check for matching n-grams + check_ngram_free = self._clean_text( + last_seq_words[i:i + ngram_len], + task_ngrams, + text, + positions[last_seq_start_position + i], + text_buf, + text_buf_ngram_free, + ) + + # If we find a match, break + if not check_ngram_free: + ngram_free = False + break + + # Break from both loops + if not ngram_free: + break + + # texts are ngram free + if ngram_free: + text_buf_ngram_free.append(text) + + # check if the text has only been trimmed + trimmed = 0 + if len(text_buf_ngram_free) == 1: + if len(text_buf_ngram_free[0]) < len(text): + trimmed = 1 + + return text_buf_ngram_free + + def _clean_text(self, words, matched_ngrams, text, start_position, text_buf, text_buf_ngram_free, nosplit_remove=False): + seq = " ".join(words) + if seq in matched_ngrams: + print(" [matched]: {}".format(seq), flush=True) + + # for NMT data we want to completely remove the sample + # which has a match + if nosplit_remove: + return False + + # split the text + text_first, text_second = TaskDecontamination._split_text( + text, + start_position, + self.remove_char_each_side, + seq, + ) + + # Free up the first part of matching n-grams + if len(text_first) > self.min_document_length: + text_buf_ngram_free.append(text_first) + + # The second part of the text is added to the output buffer + # and will be processed later + if len(text_second) > self.min_document_length: + text_buf.append(text_second) + + # Is not free of matching ngrams + return False + + # Free of matching n-grams + return True + + @staticmethod + def _split_text(text, start_pos, remove_char_each_side, seq): + # first part of the text + punctuations = ".!?" 
+ pos = start_pos - remove_char_each_side + text_first = "" + while pos > 0 and not text[pos] in punctuations: + pos -= 1 + if pos > 0: + text_first = text[0:pos + 1] + + # add length of seq and remove_char_each_side + pos = start_pos + len(seq) + remove_char_each_side + + # last part of the text + text_second = "" + while pos < len(text) and not text[pos] in punctuations: + pos += 1 + if pos + 1 < len(text): + text_second = text[pos + 1:len(text)] + + return text_first, text_second \ No newline at end of file diff --git a/nemo_curator/pii/__init__.py b/nemo_curator/pii/__init__.py new file mode 100644 index 00000000..d9155f92 --- /dev/null +++ b/nemo_curator/pii/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_curator/pii/algorithm.py b/nemo_curator/pii/algorithm.py new file mode 100644 index 00000000..fd6ed5e8 --- /dev/null +++ b/nemo_curator/pii/algorithm.py @@ -0,0 +1,285 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
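The core mechanic of `_split_text` above is easy to miss inside the larger class: the document is cut on sentence punctuation around a matched n-gram so that roughly `remove_char_each_side` characters are dropped on both sides of the match. A standalone sketch of that behaviour follows; the function name and sample text are illustrative only.

```python
def split_around_match(text, start_pos, match, remove_char_each_side=200):
    """Return the text before and after a matched n-gram, cut on sentence punctuation."""
    punctuation = ".!?"

    # Walk left from the removal window until we hit end-of-sentence punctuation.
    pos = start_pos - remove_char_each_side
    first = ""
    while pos > 0 and text[pos] not in punctuation:
        pos -= 1
    if pos > 0:
        first = text[: pos + 1]

    # Walk right past the match and the removal window the same way.
    pos = start_pos + len(match) + remove_char_each_side
    second = ""
    while pos < len(text) and text[pos] not in punctuation:
        pos += 1
    if pos + 1 < len(text):
        second = text[pos + 1:]

    return first, second


doc = "Intro sentence. Some benchmark question appears here! Trailing sentence."
left, right = split_around_match(
    doc, doc.index("benchmark"), "benchmark question", remove_char_each_side=5
)
print(repr(left), repr(right))
```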
+ +from pathlib import Path +from typing import ( + List, + Mapping, + Any, + Union +) + +import yaml +from presidio_analyzer import ( + AnalyzerEngine, + RecognizerRegistry +) +from presidio_analyzer.nlp_engine import NerModelConfiguration +from presidio_analyzer.nlp_engine.ner_model_configuration import LABELS_TO_IGNORE +from presidio_analyzer.predefined_recognizers import ( + EmailRecognizer, + PhoneRecognizer, + SpacyRecognizer, + UsSsnRecognizer, + CreditCardRecognizer, + IpRecognizer +) +from presidio_anonymizer import ( + AnonymizerEngine, + BatchAnonymizerEngine +) +from presidio_anonymizer.entities import OperatorConfig + +from nemo_curator.pii.custom_batch_analyzer_engine import CustomBatchAnalyzerEngine +from nemo_curator.pii.custom_nlp_engine import CustomNlpEngine +from nemo_curator.pii.recognizers.address_recognizer import AddressRecognizer + +__all__ = ['DEFAULT_LANGUAGE', 'SUPPORTED_ENTITIES', 'DEFAULT_MAX_DOC_SIZE', 'PiiDeidentifier'] + + +DEFAULT_LANGUAGE = 'en' +SUPPORTED_ENTITIES = [ + "ADDRESS", + "CREDIT_CARD", + "EMAIL_ADDRESS", + "DATE_TIME", + "IP_ADDRESS", + "LOCATION", + "PERSON", + "URL", + "US_SSN", + "US_PASSPORT", + "US_DRIVER_LICENSE", + "PHONE_NUMBER", +] +DEFAULT_MAX_DOC_SIZE = 2000000 + + +class PiiDeidentifier(object): + """Cleans PII from an unstructured text""" + + def __init__( + self, + language: str = DEFAULT_LANGUAGE, + supported_entities: List[str] = None, + anonymize_action: str = "replace", + **kwargs + + ): + """ + Parameters: + :param language (str): 2-digit language code + :param supported_entities (List[str]): List of entities to consider while doing deidentification + :param anonymize_action (str): String that determines what to do for anonymization. Options are: + redact, hash, replace, mask and custom + + kwargs are additional anonymization related arguments. 
+ if anonymize_action is 'replace', 'new_value' can be provided as a substitution string + if anonymize_action is 'hash', 'hash_type' can be provided (sha256, sha512 or md5) + if anonymize_action is 'mask', 'chars_to_mask' and 'masking_char' can be provided + if anonymize_action is 'custom', 'lambda' function can be provided + """ + additional_labels_to_ignore = { + "GPE", + "NORP", + "AGE", + "ID", + "PATIENT", + "HOSP", + "PATORG", + "HCW", + "HOSPITAL", + "FAC" + } + LABELS_TO_IGNORE.update(additional_labels_to_ignore) + + recognizer_registry = RecognizerRegistry(recognizers=[ + EmailRecognizer(), + PhoneRecognizer(), + SpacyRecognizer(), + UsSsnRecognizer(), + CreditCardRecognizer(), + IpRecognizer() + ]) + + self.language = language + ner_model_configuration = NerModelConfiguration(labels_to_ignore=LABELS_TO_IGNORE) + self.analyzer = AnalyzerEngine( + registry=recognizer_registry, + nlp_engine=CustomNlpEngine(ner_model_configuration=ner_model_configuration), + ) + self.anonymizer = AnonymizerEngine() + self.batch_analyzer = CustomBatchAnalyzerEngine(self.analyzer) + self.batch_anonymizer = BatchAnonymizerEngine(self.anonymizer) + self.operators = {} + + if anonymize_action == "redact": + self.operators["DEFAULT"] = OperatorConfig("redact", {}) + + elif anonymize_action == "hash": + self.operators["DEFAULT"] = OperatorConfig( + "hash", {"hash_type": kwargs.get("hash_type")} + ) + + elif anonymize_action == "mask": + self.operators["DEFAULT"] = OperatorConfig( + "mask", {"chars_to_mask": kwargs.get("chars_to_mask", 100), + "masking_char": kwargs.get("masking_char", "*"), + "from_end": False} + ) + + elif anonymize_action == 'lambda': + self.operators["DEFAULT"] = OperatorConfig( + "custom", {"lambda": kwargs.get("lambda")} + ) + + else: + self.operators["DEFAULT"] = OperatorConfig( + "replace", {"new_value": kwargs.get("new_value")} + ) + + self.supported_entities = ( + supported_entities or SUPPORTED_ENTITIES + ) + + if "ADDRESS" in self.supported_entities: + self.add_custom_recognizer( + AddressRecognizer(supported_entities=["ADDRESS"]) + ) + + @staticmethod + def from_config(config: Mapping[str, Any]): + config = config.get('pii_config') + language = config.get('language') + supported_entities = config.get('supported_entities') + operator_config = config.get('anonymize', {}) + operator_name = operator_config.get('action') + if operator_name: + del operator_config['action'] + + return PiiDeidentifier( + language=language, + supported_entities=supported_entities, + anonymize_action=operator_name, + **operator_config + ) + + @staticmethod + def from_yaml_file(path: Union[Path, str]): + with open(path) as f: + return PiiDeidentifier.from_config(yaml.safe_load(f)) + + @staticmethod + def from_default_config(): + return PiiDeidentifier( + PiiDeidentifier.DEFAULT_LANGUAGE, + supported_entities=SUPPORTED_ENTITIES, anonymize_action='replace' + ) + + def list_supported_entities(self): + """List all entities that are detected while cleaning a text""" + return self.supported_entities.copy() + + def list_operators(self): + """List all operators used to clean PII entities""" + return self.operators.copy() + + def add_custom_recognizer(self, recognizer): + """Add a custom recognizer to detect entities based on user-defined logic""" + self.supported_entities.extend(recognizer.get_supported_entities()) + self.analyzer.registry.add_recognizer(recognizer) + + def add_custom_operator(self, entity, operator): + """Use a custom cleaning operation for a specific entity types""" + self.operators[entity] = 
operator + + def analyze_text(self, + text, + entities: List[str] = None, + language: str = 'en' + ): + if not entities: + entities = self.supported_entities + return self.analyzer.analyze(text, language, entities=entities) + + def analyze_text_batch(self, + texts: List[str], + entities: List[str] = None, + language: str = 'en', + batch_size: int = 32 + ): + """ + For processing batches, use batch analyzer + + Parameters: + texts (List[str]): List of texts to perform deidentification on + batch_size (int): The number of texts to handle in a batch. This + parameter is useful when using spacy models. + + Returns: + List(str): list of deidentified text + """ + if not entities: + entities = self.supported_entities + + return self.batch_analyzer.analyze_iterator(texts, language, entities=entities, batch_size=batch_size) + + def deidentify_text_batch(self, texts: List[str], batch_size: int = 32): + """ + For processing batches, use batch analyzer + + Parameters: + texts (List[str]): List of texts to perform deidentification on + batch_size (int): The number of texts to handle in a batch. This + parameter is useful when using spacy models. + + Returns: + List(str): list of deidentified text + """ + analyzer_results_list = self.batch_analyzer.analyze_iterator( + texts, self.language, entities=self.supported_entities, batch_size=batch_size + ) + + anonymized_results_list = self.batch_anonymizer.anonymize_list( + texts, analyzer_results_list, operators=self.operators + ) + return anonymized_results_list + + def deidentify_text(self, text: str): + """ + Cleans PII data from text + + Parameters: + text (str): Text that may contain personally-identifiable information + + Returns: + str: Returns anonymized text + """ + analyzer_results = self.analyzer.analyze( + text=text, entities=self.supported_entities, language=self.language + ) + anonymized_results = self.anonymizer.anonymize( + text=text, analyzer_results=analyzer_results, operators=self.operators + ) + return anonymized_results.text + + +if __name__ == "__main__": + txt = "Hello, I am John. I was born on December 5, 1983. My email is john.doe@gmail.com and " \ + "you can call me on (814) 566 4637" + piid = PiiDeidentifier("en", ["DATE_TIME"]) + print(piid.deidentify_text(txt)) + + piid = PiiDeidentifier("en", ["ADDRESS", "PERSON"], anonymize_action='replace') + print(piid.deidentify_text_batch([txt])) diff --git a/nemo_curator/pii/custom_batch_analyzer_engine.py b/nemo_curator/pii/custom_batch_analyzer_engine.py new file mode 100644 index 00000000..a56d4b64 --- /dev/null +++ b/nemo_curator/pii/custom_batch_analyzer_engine.py @@ -0,0 +1,180 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
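From the `from_config` reader above, the expected configuration shape can be reconstructed: a top-level `pii_config` key with `language`, `supported_entities`, and an `anonymize` block whose `action` selects the operator and whose remaining keys are passed through as operator kwargs. A hedged sketch of that shape as a Python dict (the masking values are only an example):

```python
from nemo_curator.pii.algorithm import PiiDeidentifier

config = {
    "pii_config": {
        "language": "en",
        "supported_entities": ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"],
        "anonymize": {
            "action": "mask",        # selects the OperatorConfig branch above
            "chars_to_mask": 100,    # remaining keys become operator kwargs
            "masking_char": "*",
        },
    }
}

deidentifier = PiiDeidentifier.from_config(config)
print(deidentifier.deidentify_text("Call John at (814) 566 4637"))
```

The same structure, written as YAML, is what `from_yaml_file` loads before delegating to `from_config`.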
+ +import logging +from typing import List, Iterable, Dict, Union, Any, Optional, Iterator + +from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine, BatchAnalyzerEngine, \ + EntityRecognizer +from presidio_analyzer.nlp_engine import NlpArtifacts + +from nemo_curator.pii.custom_nlp_engine import CustomNlpEngine + +logger = logging.getLogger("presidio-analyzer") + + +class CustomBatchAnalyzerEngine(BatchAnalyzerEngine): + """ + Batch analysis of documents (tables, lists, dicts). + + Wrapper class to run Presidio Analyzer Engine on multiple values, + either lists/iterators of strings, or dictionaries. + + :param: analyzer_engine: AnalyzerEngine instance to use + for handling the values in those collections. + """ + + def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): + super().__init__(analyzer_engine) + + def analyze_batch( + self, + texts: Iterable[str], + language: str, + entities: Optional[List[str]] = None, + correlation_id: Optional[str] = None, + score_threshold: Optional[float] = None, + return_decision_process: Optional[bool] = False, + ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None, + context: Optional[List[str]] = None, + allow_list: Optional[List[str]] = None, + nlp_artifacts_batch: Optional[Iterable[NlpArtifacts]] = None, + ): + all_fields = not entities + + recognizers = self.analyzer_engine.registry.get_recognizers( + language=language, + entities=entities, + all_fields=all_fields, + ad_hoc_recognizers=ad_hoc_recognizers, + ) + + if all_fields: + # Since all_fields=True, list all entities by iterating + # over all recognizers + entities = self.analyzer_engine.get_supported_entities(language=language) + + all_results = [] + + for text, nlp_artifacts in nlp_artifacts_batch: + results = [] + for recognizer in recognizers: + current_results = recognizer.analyze( + text=text, entities=entities, nlp_artifacts=nlp_artifacts + ) + if current_results: + # add recognizer name to recognition metadata inside results + # if not exists + results.extend(current_results) + + all_results.append(results) + + return all_results + + def analyze_iterator( + self, + texts: Iterable[Union[str, bool, float, int]], + language: str, + batch_size: int = 32, + **kwargs, + ) -> List[List[RecognizerResult]]: + """ + Analyze an iterable of strings. + + :param texts: An list containing strings to be analyzed. + :param language: Input language + :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method. + :param batch_size + """ + # Process the texts as batch for improved performance + nlp_engine: CustomNlpEngine = self.analyzer_engine.nlp_engine + nlp_artifacts_batch: Iterable[NlpArtifacts] = nlp_engine.process_batch( + texts=texts, + language=language, + batch_size=batch_size, + as_tuples=kwargs.get('as_tuples', False) + ) + + results = self.analyze_batch( + texts=texts, nlp_artifacts_batch=nlp_artifacts_batch, language=language, **kwargs + ) + + return results + + def analyze_dict( + self, + input_dict: Dict[str, Union[Any, Iterable[Any]]], + language: str, + keys_to_skip: Optional[List[str]] = None, + **kwargs, + ) -> Iterator[DictAnalyzerResult]: + """ + Analyze a dictionary of keys (strings) and values/iterable of values. + + Non-string values are returned as is. + + :param input_dict: The input dictionary for analysis + :param language: Input language + :param keys_to_skip: Keys to ignore during analysis + :param kwargs: Additional keyword arguments + for the `AnalyzerEngine.analyze` method. 
+ Use this to pass arguments to the analyze method, + such as `ad_hoc_recognizers`, `context`, `return_decision_process`. + See `AnalyzerEngine.analyze` for the full list. + """ + + context = [] + if "context" in kwargs: + context = kwargs["context"] + del kwargs["context"] + + if not keys_to_skip: + keys_to_skip = [] + + for key, value in input_dict.items(): + if not value or key in keys_to_skip: + yield DictAnalyzerResult(key=key, value=value, recognizer_results=[]) + continue # skip this key as requested + + # Add the key as an additional context + specific_context = context[:] + specific_context.append(key) + + if type(value) in (str, int, bool, float): + results: List[RecognizerResult] = self.analyzer_engine.analyze( + text=str(value), language=language, context=[key], **kwargs + ) + elif isinstance(value, dict): + new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip) + results = self.analyze_dict( + input_dict=value, + language=language, + context=specific_context, + keys_to_skip=new_keys_to_skip, + **kwargs, + ) + elif isinstance(value, Iterable): + # Recursively iterate nested dicts + + results: List[List[RecognizerResult]] = self.analyze_iterator( + texts=value, + language=language, + context=specific_context, + **kwargs, + ) + else: + raise ValueError(f"type {type(value)} is unsupported.") + + yield DictAnalyzerResult(key=key, value=value, recognizer_results=results) + diff --git a/nemo_curator/pii/custom_nlp_engine.py b/nemo_curator/pii/custom_nlp_engine.py new file mode 100644 index 00000000..705c9f5f --- /dev/null +++ b/nemo_curator/pii/custom_nlp_engine.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Optional, List, Dict, Union, Tuple, Iterator + +import spacy +from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration, NlpArtifacts +from spacy import Language + +logger = logging.getLogger("presidio-analyzer") + + +class CustomNlpEngine(SpacyNlpEngine): + + def __init__( + self, + models: Optional[List[Dict[str, str]]] = None, + ner_model_configuration: Optional[NerModelConfiguration] = None, + ): + super().__init__(models, ner_model_configuration) + self.nlp: Dict[str, Language] = None + + def load(self) -> None: + """Load the spaCy NLP model.""" + logger.debug(f"Loading SpaCy models: {self.models}") + + self.nlp = {} + # Download spaCy model if missing + for model in self.models: + self._validate_model_params(model) + self._download_spacy_model_if_needed(model["model_name"]) + self.nlp[model["lang_code"]] = spacy.load(model["model_name"], enable=["ner"]) + + def process_batch( + self, + texts: Union[List[str], List[Tuple[str, object]]], + language: str, + as_tuples: bool = False, + batch_size: int = 32 + ) -> Iterator[Optional[NlpArtifacts]]: + """Execute the NLP pipeline on a batch of texts using spacy pipe. + + :param texts: A list of texts to process. + :param language: The language of the texts. 
+ :param as_tuples: If set to True, inputs should be a sequence of + (text, context) tuples. Output will then be a sequence of + (doc, context) tuples. Defaults to False. + :param batch_size: The batch size. + """ + + if not self.nlp: + raise ValueError("NLP engine is not loaded. Consider calling .load()") + + texts = [str(text) for text in texts] + docs = self.nlp[language].pipe(texts, as_tuples=as_tuples, batch_size=batch_size) + for doc in docs: + yield doc.text, self._doc_to_nlp_artifact(doc, language) + + diff --git a/nemo_curator/pii/recognizers/address_recognizer.py b/nemo_curator/pii/recognizers/address_recognizer.py new file mode 100644 index 00000000..80c0126f --- /dev/null +++ b/nemo_curator/pii/recognizers/address_recognizer.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import usaddress +from presidio_analyzer import LocalRecognizer, RecognizerResult + + +class AddressRecognizer(LocalRecognizer): + """ + Address Detector based on usaddress library + + Typical US address format: + * The recipient’s first and last name + * Street number and name (address line 1) + * Apartment or unit and its number (address line 2) + * City, state and zip code (include all of this on one line with a comma between city and state, but not zip code) + * Country + """ + + def load(self) -> None: + pass + + def analyze(self, text, entities, nlp_artifacts): + output = usaddress.parse(text) + curr_pos = 0 + results = [] + + for token, _type in output: + token_pos_start = text.find(token, curr_pos) + token_pos_end = token_pos_start + len(token) + curr_pos = token_pos_end + 1 + if _type != "Recipient" and not _type.startswith("USPS"): + result = RecognizerResult( + entity_type="ADDRESS", + start=token_pos_start, + end=token_pos_end, + score=1, + ) + results.append(result) + return results diff --git a/nemo_curator/sample_dataframe.py b/nemo_curator/sample_dataframe.py new file mode 100644 index 00000000..d8341dc0 --- /dev/null +++ b/nemo_curator/sample_dataframe.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
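Because its `analyze` method ignores `nlp_artifacts`, the `AddressRecognizer` above can be exercised on its own, outside the full analyzer registry. A small sketch, assuming `presidio-analyzer` and `usaddress` are installed; the sample address is arbitrary.

```python
from nemo_curator.pii.recognizers.address_recognizer import AddressRecognizer

recognizer = AddressRecognizer(supported_entities=["ADDRESS"])
text = "Ship it to 1600 Pennsylvania Avenue NW, Washington, DC 20500, USA."

# Each non-recipient usaddress token is reported as an ADDRESS span.
for result in recognizer.analyze(text, entities=["ADDRESS"], nlp_artifacts=None):
    print(result.entity_type, repr(text[result.start:result.end]), result.score)
```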
+ +import argparse +import time + +from nemo_curator.distributed_data_classification.arg_utils import ( + add_cluster_args, + add_input_output_args, +) +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +def sample_dataframe(df, num_samples): + """ + This function samples a specified number of rows from a DataFrame. + + Args: + df: A DataFrame. + num_samples: The number of rows to randomly sample from the DataFrame. + Returns: + The sampled DataFrame. + + """ + len_df = len(df) + print(f"Total length = {len_df}", flush=True) + sampled_df = df.sample(frac=(num_samples / len_df), random_state=42) + return sampled_df + + +if __name__ == "__main__": + """ + This script is useful if a user wishes to sample a very large dataset for smaller scale testing, + for example, a dataset written as a directory containing thousands of jsonl files. + """ + parser = argparse.ArgumentParser(description="Sample rows and write them to disk") + + parser = add_cluster_args(parser) + parser = add_input_output_args(parser) + parser.add_argument( + "--num_samples", + type=int, + help="The number of rows to sample", + required=True, + ) + args = parser.parse_args() + print(f"Arguments parsed = {args}", flush=True) + client = get_client(args, cluster_type="gpu") + + print("Starting sampling workflow", flush=True) + st = time.time() + df = read_data( + input_files=get_all_files_paths_under(args.input_file_path, recurse_subdirecties=False), + file_type=args.input_file_type, + add_filename=True, + ) + input_files = get_all_files_paths_under(args.input_file_path, recurse_subdirecties=False) + sampled_df = sample_dataframe(df, num_samples=args.num_samples) + write_to_disk( + df=sampled_df, + output_file_dir=args.output_file_path, + write_to_filename=True, + ) + et = time.time() + print(f"Sampling workflow completed in {et-st}", flush=True) + client.close() diff --git a/nemo_curator/scripts/__init__.py b/nemo_curator/scripts/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/nemo_curator/scripts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py new file mode 100644 index 00000000..433e1d79 --- /dev/null +++ b/nemo_curator/scripts/add_id.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import random +import nemo_curator +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.script_utils import ( + add_distributed_args, + attach_bool_arg, +) +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_all_files_paths_under, +) +from nemo_curator.utils.distributed_utils import ( + get_client, + read_data, + write_to_disk +) + + +def main(args): + client = get_client(args, args.device) + + output_dir = expand_outdir_and_mkdir(args.output_data_dir) + files = get_all_files_paths_under(args.input_data_dir) + if args.shuffle: + random.seed(args.seed) + random.shuffle(files) + + dataset = DocumentDataset(read_data(files, file_type=args.input_file_type, backend="pandas", add_filename=True)) + add_id = nemo_curator.AddId(args.id_field_name, id_prefix=args.id_prefix, start_index=args.starting_index) + id_dataset = add_id(dataset) + + write_to_disk(id_dataset.df, output_dir, write_to_filename=True, output_type=args.output_file_type) + + +def attach_args(parser=argparse.ArgumentParser( + """ +Adds unique identifiers to each document in the dataset. +Creates a new ID field with name specified by the argument +"--id-field-name" within each json. + +This script essentially works by counting the total +number of documents within the dataset and then, in parallel +assigns unique sequential ids to each document in the dataset. + +If a document identifier does not already exist for each document, then +these ids must be added prior to performing fuzzy/exact deduplication +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--starting-index", + type=int, + default=0, + help="Starting index from which to start indexing the documents", + ) + parser.add_argument( + "--output-data-dir", + type=str, + default=None, + help="The output directory to where the jsonl " + "files with ids will be written. If not specified, the ids will " + "be written in-place", + ) + parser.add_argument( + "--id-field-name", + type=str, + default='adlr_id', + help="The name of the field that will contain the id value. " + "Default is 'adlr_id'", + ) + parser.add_argument( + "--id-prefix", + type=str, + default="doc_id", + help="The prefix to the id number that will be assigned to the " + "document. When performing deduplication jointly with different" + "datasets, it is helpful to provide a prefix that denotes that a " + "document belongs to a particular dataset (e.g., wiki for documents" + "that come from the wikipedia dataset)", + ) + attach_bool_arg( + parser, + "shuffle", + help_str="Shuffle the order of files before assigning IDs." + "Useful for creating a copy dataset with different IDs", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="If shuffling is specified, use this random seed to " + "perform the random shuffling", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/compute_minhashes.py b/nemo_curator/scripts/compute_minhashes.py new file mode 100644 index 00000000..c7a7e68b --- /dev/null +++ b/nemo_curator/scripts/compute_minhashes.py @@ -0,0 +1,165 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +from nemo_curator import MinHash +from nemo_curator.datasets import DocumentDataset +from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep +from nemo_curator.gpu_deduplication.utils import ( + create_logger, + parse_nc_args, + performance_report_if, +) +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + logger = create_logger( + rank=0, log_file=os.path.join(args.log_dir, "rank_000.log"), name="minhash_log" + ) + logger.info(f"Starting workflow with args:\n {args}") + + assert args.hash_bytes in {4, 8}, "Currently only 32bit/64bit hashes are supported" + assert args.device == "gpu" + + args.set_torch_to_use_rmm = False + client = get_client(args, cluster_type=args.device) + logger.info(f"Client Created {client}") + client.run(pre_imports) + logger.info("Pre imports complete") + + data_paths = args.input_data_dirs + id_field = args.input_json_id_field + text_field = args.input_json_text_field + num_files = args.num_files + + minhasher = MinHash( + seed=args.seed, + num_hashes=args.minhash_length, + char_ngrams=args.char_ngram, + use_64bit_hash=False if args.hash_bytes == 4 else True, + logger=logger, + id_field=id_field, + text_field=text_field, + ) + + t0 = time.time() + for data_path in data_paths: + print(f"Computing minhashes for {data_path}", flush=True) + data_path = strip_trailing_sep(data_path) + if num_files is not None and num_files <= 0: + print(f"Processed {args.num_files}... 
quitting") + break + + files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False) + files = [f for f in files if f.endswith(".jsonl")] + df = read_data( + files[:num_files] if num_files else files, + file_type="jsonl", + backend="cudf", + files_per_partition=args.files_per_partition, + add_filename=False, + )[[id_field, text_field]] + + if num_files is not None: + num_files -= len(files) + + res = minhasher(DocumentDataset(df)).df + logger.info( + f"Lazy minhash generation complete for {res.npartitions} partitions" + ) + logger.info(f"Starting execution for {data_path}") + write_path = os.path.join( + args.output_minhash_dir, os.path.basename(data_path), "minhashes.parquet" + ) + + t1 = time.time() + with performance_report_if( + args.profile_path, f"{os.path.basename(data_path)}-minhash-profile.html" + ): + res.to_parquet(write_path, write_index=False) + logger.info( + f"Minhash computation for f{data_path} took {time.time() - t1}s complete at {write_path}" # noqa:E501 + ) + logger.info( + f"Minhash computation across datasets took {time.time() - t0}s complete at {args.output_minhash_dir}" # noqa:E501 + ) + + +def attach_args(parser=None): + description = """Computes minhash signatures from an input directory of documents + contained within jsonl files. For each document a dataframe of document-ids + -minhash signatures is created. This dataframe is written to file after processing + """ + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--minhash-length", + type=int, + default=260, + help="The number of minhashes to compute for each document.", + ) + parser.add_argument( + "--char-ngram", + type=int, + default=5, + help="The number of consecutive characters to include in a sliding " + "window when creating the document shingles for computing " + "MinHash signatures.", + ) + parser.add_argument( + "--hash-bytes", + type=int, + default=4, + help="Number of bytes per computed minhash " + "(default is an unsigned 32-bit integer)", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed used for intializing the hash " + "functions used to compute the MinHashes", + ) + parser.add_argument( + "--output-minhash-dir", + type=str, + required=True, + help="Output directory where minhashes will be written. " + "Each file is a parquet file that contains two series, the document ids, " + "and a series of lists, each list denoting the minhash signature for that document id.", + ) + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Type of cluster to start up", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/connected_components.py b/nemo_curator/scripts/connected_components.py new file mode 100644 index 00000000..dfafe04d --- /dev/null +++ b/nemo_curator/scripts/connected_components.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os +from nemo_curator.modules.fuzzy_dedup import ConnectedComponents +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.gpu_deduplication.utils import ( + enable_spilling, + parse_nc_args, +) + + +def main(args): + """ + Takes a dataset consisting of document pairs + and their corresponding jaccard similarity to compute connected + components of docuements across pairs to find similar docuemnt + after applying a given threshold. The result is a dataset + consisting of all documents that are similar (above the threshold) + and the component they belong to. + """ + st = time.time() + output_path = os.path.join(args.output_dir, "connected_components.parquet") + args.set_torch_to_use_rmm = False + client = get_client(args, cluster_type="gpu") + enable_spilling() + client.run(enable_spilling) + components_stage = ConnectedComponents( + cache_dir=args.cache_dir, + jaccard_pairs_path=args.jaccard_pairs_path, + id_column=args.input_json_id_field, + convert_str_ids=True, + jaccard_threshold=args.jaccard_threshold, + ) + components_stage.cc_workflow(output_path=output_path) + print(f"All done in {time.time()-st:.1f} seconds") + print(f"Results written to {output_path}") + + +def attach_args(parser=None): + description = """Computes connected component""" + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--jaccard-pairs-path", + type=str, + help="The directory containing the jaccard results", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results to", + ) + parser.add_argument( + "--cache-dir", + type=str, + help="The cache directory to write intermediate results to", + ) + parser.add_argument( + "--jaccard-threshold", + type=int, + default=0.8, + help="Jaccard threshold below which we don't consider documents" + " to be duplicate", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/download_and_extract.py b/nemo_curator/scripts/download_and_extract.py new file mode 100644 index 00000000..38c1b696 --- /dev/null +++ b/nemo_curator/scripts/download_and_extract.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
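The connected-components script above is a thin CLI wrapper; the same workflow can be driven directly from Python. A minimal sketch under the assumption of a single-node GPU cluster started with `dask_cuda` (all paths are hypothetical, and the spilling setup done by the script is omitted for brevity):

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from nemo_curator.modules.fuzzy_dedup import ConnectedComponents

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    cc = ConnectedComponents(
        cache_dir="./cc_cache",                                     # hypothetical path
        jaccard_pairs_path="./jaccard_similarity_results.parquet",  # hypothetical path
        id_column="adlr_id",
        convert_str_ids=True,
        jaccard_threshold=0.8,
    )
    # Writes one row per document with the component it belongs to.
    cc.cc_workflow(output_path="./connected_components.parquet")
    client.close()
```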
+ +import argparse +import os +from nemo_curator.download import batch_download, download_and_extract +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import attach_bool_arg, add_distributed_args +from nemo_curator.utils.config_utils import build_downloader +from nemo_curator.utils.file_utils import get_all_files_paths_under, expand_outdir_and_mkdir + +def read_urls(file_path): + with open(file_path, 'r') as fp: + urls = fp.readlines() + return [url.strip() for url in urls] + +def main(args): + client = get_client(args, args.device) + + if args.input_url_file: + urls = read_urls(args.input_url_file) + outdir = os.path.abspath(os.path.expanduser(args.output_json_dir)) + output_paths = list(map(lambda url: os.path.join(outdir, url.split("/")[-1] + ".jsonl"), urls)) + elif args.input_data_dir: + # If input_data_dir is specified, we operate in extraction only mode. + urls = get_all_files_paths_under(args.input_data_dir) + output_paths = urls + else: + raise ValueError("One of --input-url-file or --input-data-dir must be specified") + + expand_outdir_and_mkdir(args.output_json_dir) + if args.output_download_dir: + raw_download_dir = args.output_download_dir + else: + raw_download_dir = os.path.join(args.output_json_dir, "downloads") + + downloader, iterator, extractor, output_format = build_downloader(args.builder_config_file, default_download_dir=raw_download_dir) + + if args.download_only: + output_paths = batch_download(urls, downloader) + print(f"{len(output_paths)} were downloaded") + return + + dataset = download_and_extract(urls, output_paths, downloader, iterator, extractor, output_format, keep_raw_download=args.keep_downloaded_files, force_download=args.overwrite_existing_json) + + # Sample to trigger the dask computation + sample = dataset.df.sample(frac=10 / len(dataset)).compute() + +def attach_args(parser=argparse.ArgumentParser( + """ +Takes an input list of urls and downloads the data +and then extracts the text from the downloaded data. Using +the --builder-config-file argument, users must provide a YAML file +that points to implementations of the downloader, iterator and extractor +classes that will be used to construct the documents that will " +"make up the dataset. Examples of these config files for the " +"CommonCrawl, Wikipedia and ArXiv datasets can be ound in the root " +"config directory of this repository. + +In the case that users have data that have been pre-downloaded, +this utility can also be used for "extraction-only" purposes. +For this scenario, users should either provide a valid path +to the "--input-data-dir/--input-local-data-dir" directories +(depending on if the data are globally or locally available to each +MPI rank). Additionally, the downloader class should be implemented +such that it simply returns the pre-downloaded file +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-url-file", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. 
Use this for a distributed file system", + ) + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + required=False, + help="Path to input data directory", + ) + parser.add_argument( + "--output-json-dir", + type=str, + default=None, + help="Output directory to store the extracted text in jsonl files", + ) + attach_bool_arg( + parser, + "download-only", + help_str="Specify this flag if you desire to only download the data" + "files and not extract text from the downloaded files", + ) + parser.add_argument( + "--builder-config-file", + type=str, + default=None, + required=True, + help="YAML file that contains paths to implementations of a downloader, " + "iterator and extractor that will be used in this program " + "to build the documents that make up the output dataset", + ) + attach_bool_arg( + parser, + "keep-downloaded-files", + help_str="If this flag is set to true, the downloaded data files " + "will be kept on disk and not removed after extraction", + ) + parser.add_argument( + "--output-download-dir", + type=str, + default=None, + required=False, + help="The directory to where data files will be written " + "in 'download-only' mode. Specify this argument only when " + "the '--download-only flag is specified'.", + ) + attach_bool_arg( + parser, + "overwrite-existing-json", + help_str="If this flag is specified, then the json data will be " + "overwritten if downloading from the the same file.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/filter_documents.py b/nemo_curator/scripts/filter_documents.py new file mode 100644 index 00000000..e662410f --- /dev/null +++ b/nemo_curator/scripts/filter_documents.py @@ -0,0 +1,271 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
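For the download-only path above, the moving parts are `build_downloader`, which turns the YAML builder config into a `(downloader, iterator, extractor, output_format)` tuple, and `batch_download`, which fetches every URL with that downloader. A hedged sketch of just that path; the config path and URL list file are hypothetical.

```python
from nemo_curator.download import batch_download
from nemo_curator.utils.config_utils import build_downloader

# Newline-delimited list of source URLs (hypothetical file).
with open("cc_urls.txt") as fp:
    urls = [url.strip() for url in fp.readlines()]

downloader, iterator, extractor, output_format = build_downloader(
    "config/cc_warc_builder.yaml",        # hypothetical builder config
    default_download_dir="./downloads",
)

paths = batch_download(urls, downloader)
print(f"{len(paths)} files downloaded")
```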
+ +import os +import argparse + +import nemo_curator +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_batched_files, +) +from nemo_curator.utils.script_utils import attach_bool_arg, add_distributed_args +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client +from nemo_curator.utils.config_utils import build_filter_pipeline + + +def get_dataframe_complement(original_df, filtered_df): + def partition_complement(part_original_df, partition_info=None): + if not partition_info: + return part_original_df + part_filtered_df = filtered_df.get_partition(partition_info["number"]) + complement_mask = ~part_original_df.index.isin(part_filtered_df.index.persist()) + complement_df = part_original_df[complement_mask] + return complement_df + + return original_df.map_partitions(partition_complement) + +def get_score_fields(pipeline): + score_fields = [] + for nc_module in pipeline.modules: + if isinstance(nc_module, nemo_curator.Score) or isinstance(nc_module, nemo_curator.ScoreFilter): + if nc_module.score_field: + score_fields.append(nc_module.score_field) + + return score_fields + +def write_scores(df, output_dir): + for column in df.columns: + output_path = os.path.join(output_dir, f"{column}.txt") + df[column].to_csv(output_path, single_file=True, encoding="utf-8", header=False, index=False, mode="a") + +def main(args): + client = get_client(args, args.device) + if args.device == "cpu": + backend = "pandas" + elif args.device == "gpu": + backend = "cudf" + else: + raise ValueError(f"Invalid device '{args.device}'. Please specify either 'cpu' or 'gpu'.") + + # Make the output directories + kept_document_dir = args.output_retained_document_dir + removed_document_dir = args.output_removed_document_dir + if kept_document_dir: + expand_outdir_and_mkdir(kept_document_dir) + if removed_document_dir: + expand_outdir_and_mkdir(removed_document_dir) + + filter_pipeline = build_filter_pipeline(args.filter_config_file) + score_fields = get_score_fields(filter_pipeline) + + for files in get_batched_files(args.input_data_dir, kept_document_dir, args.input_file_type, batch_size=args.batch_size): + # Load the data and filter + dataset = DocumentDataset(read_data(files, file_type=args.input_file_type, backend=backend, add_filename=True)) + curr_dataset = prev_dataset = dataset + + # Process each filter individually so we can track which documents are removed at each step + for filter_module in filter_pipeline.modules: + curr_dataset = filter_module(curr_dataset).persist() + + filter_field = None + if isinstance(filter_module, nemo_curator.Filter): + filter_field = filter_module.filter_field + elif isinstance(filter_module, nemo_curator.ScoreFilter): + filter_field = filter_module.score_field + + # Save the documents removed by the filter + if removed_document_dir and filter_field: + removed_df = get_dataframe_complement(prev_dataset.df, curr_dataset.df) + removed_filter_dir = os.path.join(removed_document_dir, filter_field) + expand_outdir_and_mkdir(removed_filter_dir) + write_to_disk(removed_df, removed_filter_dir, write_to_filename=True, output_type=args.output_file_type) + prev_dataset = curr_dataset + filtered_dataset = curr_dataset + filtered_dataset = filter_pipeline(dataset).persist() + + # Write scores to separate directory + if args.output_document_score_dir: + if args.id_field in filtered_dataset.df.columns: + output_df = filtered_dataset.df[[args.id_field, *score_fields]] + else: + output_df = 
filtered_dataset.df[score_fields] + write_scores(output_df, args.output_document_score_dir) + + # Remove scores if not logged + if not args.log_scores: + filtered_dataset = DocumentDataset(filtered_dataset.df.drop(columns=score_fields)) + + # If kept_document_dir is specified, then create it + if kept_document_dir is not None: + write_to_disk(filtered_dataset.df, kept_document_dir, write_to_filename=True, output_type=args.output_file_type) + else: + # Overwrite the existing files + write_to_disk(filtered_dataset.df, args.input_data_dir, write_to_filename=True, output_type=args.output_file_type) + + client.close() + +def attach_args(parser=argparse.ArgumentParser( + """ + Main driver script for applying filters to documents distributed + across dataset files. Inputs are an input directory consisting + of dataset files and a configuration file defining the filter + to be applied to the documents (see the config directory for some + example configs). This script will then compute scores + on each document within the corpus and then, if specified by + the user, separate the documents based on the threshold + specified for the filter. + + For an example of how to use this script + (and apply to a corpus in distributed fashion), please see + the examples directory of this repository + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of dataset files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-local-data-dir", + type=str, + default=None, + help="Input directory consisting of dataset files. " + "Use this argument when a distributed file system is not available.", + ) + parser.add_argument( + "--filter-config-file", + type=str, + required=True, + help="The input filter configuration file that contains the " + "path to the filter module as well as the filter parameters", + ) + parser.add_argument( + "--output-retained-document-dir", + type=str, + default=None, + help="The output directory to where documents that are " + "retained during filtering will be written. If this argument " + "is not specified, then the document scores from the " + "filter(s) will be written to the document meta data in place", + ) + parser.add_argument( + "--output-removed-document-dir", + type=str, + default=None, + help="The output directory to where documents that are removed during " + "filtering will be written. This argument is mainly for quality control " + "in order examine documents that are not preserved during filtering. " + "If it is not specified and the retained-document-dir is specified, " + "then only the retained documents will be written to disk", + ) + attach_bool_arg( + parser, + "filter-only", + default=False, + help_str="Specifying this flag will indicate to the code that only the " + "filtering operation should be performed and that scores should not be " + "computed. This flag should be specified if scores have been " + "pre-computed on the documents (e.g., the code was run without the " + "'--output-retained-document-dir' argument) and users desire to apply " + "the filter using the pre-computed scores", + ) + attach_bool_arg( + parser, + "log-scores", + default=False, + help_str="Specifying this flag will cause the computed scores to be " + "logged as additional keys for each document. This only applies to " + "filters with 'log_score: True' in the config. 
This can aid in " + "performing an interactive quality check of the documents.", + ) + parser.add_argument( + "--output-document-score-dir", + type=str, + default=None, + help="The output directory to where the computed document scores will " + "be written. For each filter, its score will be written to a separate " + "file where each line of the file corresponds to the score computed " + "for each document in the corpus within this directory. This only applies to " + "filters with 'log_score: True' in the config. If this directory is not " + "specified, then filter scores will not be written", + ) + parser.add_argument( + "--id-field", + type=str, + default='adlr_id', + help="The name of the field within each object of the dataset " + "file that assigns a unique ID to each document. " + "If this is specified and found within the object, a list of all " + "ids will be written to the output score directory such that each line " + "is consistent with the lines of the written score files ") + attach_bool_arg( + parser, + "keep-node-scores-tmp-dir", + default=False, + help_str="If multiple nodes are used when computing scores, " + "each node will write out its scores to a temporary directory " + "shared across all nodes. Then, the rank 0 node will " + "concatenate all of the scores creating the output file. " + "By default, this directory is removed after concatenation, " + "however users can keep this temporary directory by specifying " + "the flag --keep-node-scores-tmp-dir ", + ) + parser.add_argument( + "--log-frequency", + type=int, + default=10000, + help="The frequency with which to write log messages when " + "computing scores. By default a log message will " + "be written every 10000 documents in a file", + ) + parser.add_argument( + "--log-dir", + type=str, + default="./log/filter_docs", + help="The output log directory where node and local" + " ranks will write their respective log files", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + help="Number of files to read into memory at a time.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py new file mode 100644 index 00000000..a7c84ce2 --- /dev/null +++ b/nemo_curator/scripts/find_exact_duplicates.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
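The `find_exact_duplicates.py` script that follows is a thin wrapper over the `ExactDuplicates` module. A minimal sketch of the same flow via the Python API, assuming a GPU-capable Dask client is already connected; the data directory, id/text field names, and cache directory are illustrative.

```python
# Sketch of GPU exact dedup, assuming a cuDF-backed Dask cluster; the paths
# and the "adlr_id"/"text" field names are illustrative examples.
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules import ExactDuplicates
from nemo_curator.utils.distributed_utils import read_data
from nemo_curator.utils.file_utils import get_all_files_paths_under

files = [f for f in get_all_files_paths_under("data/shards") if f.endswith(".jsonl")]
df = read_data(
    files, file_type="jsonl", backend="cudf", files_per_partition=2, add_filename=False
)[["adlr_id", "text"]]

exact_dups = ExactDuplicates(
    id_field="adlr_id",           # document id column
    text_field="text",            # column that gets hashed
    hash_method="md5",            # currently the only supported method
    cache_dir="exact_dedup_out",  # duplicate ids are written here
)
exact_dups(dataset=DocumentDataset(df))
```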
+ +import os +import time + +import dask_cudf +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import ExactDuplicates +from nemo_curator.gpu_deduplication.ioutils import strip_trailing_sep +from nemo_curator.gpu_deduplication.utils import (create_logger, parse_nc_args) +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.distributed_utils import read_data, get_client + + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + logger = create_logger(rank=0, + log_file=os.path.join(args.log_dir, "rank_000.log"), + name="exact_dedup") + logger.info(f"Starting workflow with args:\n {args}") + + assert args.hash_method == "md5", "Currently only md5 hash is supported" + args.set_torch_to_use_rmm = False + client = get_client(args, cluster_type="cpu" if args.no_gpu else "gpu") + logger.info(f"Client Created {client}") + if not args.no_gpu: + client.run(pre_imports) + logger.info("Pre imports complete") + + data_paths = args.input_data_dirs + id_field = args.input_json_id_field + text_field = args.input_json_text_field + num_files = args.num_files + t0 = time.time() + dfs = [] + for data_path in data_paths: + data_path = strip_trailing_sep(data_path) + if num_files is not None and num_files <= 0: + logger.info(f"Processed {num_files}... quitting") + break + files = get_all_files_paths_under(root=data_path, + recurse_subdirectories=False) + files = [f for f in files if f.endswith(".jsonl")] + df = read_data( + files[:num_files] if num_files else files, + file_type="jsonl", + backend="pandas" if args.no_gpu else "cudf", + files_per_partition=args.files_per_partition, + add_filename=False, + )[[id_field, text_field]] + if num_files is not None: + num_files -= len(files) + dfs.append(df) + logger.info(f"Lazy read complete for {dfs[-1].npartitions} partitions") + + input_df = dask_cudf.concat(dfs, ignore_unknown_divisions=True) + exact_dups = ExactDuplicates(logger=logger, + id_field=id_field, + text_field=text_field, + hash_method=args.hash_method, + profile_dir=args.profile_path, + cache_dir=args.output_dir) + exact_dups(dataset=DocumentDataset(input_df)) + logger.info( + f"Exact dedup computation across datasets took {time.time() - t0}s complete at {args.output_dir}" # noqa:E501 + ) + + +def attach_args(parser=None): + description = """Compute Exact duplicates in a given dataset. + """ + if not parser: + parser = parse_nc_args(description=description) + parser.add_argument( + "--hash-method", + type=str, + default="md5", + help="Hash Method to use for exact dedup", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Output directory where duplicate docs will be written. " + "Each file is a pickle file that contains a dictionary of numpy arrays. " + "The keys are the document ids and the values are the duplicate docs", + ) + parser.add_argument("--no-gpu", + action="store_true", + help="Use CPU based exact dedup") + + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/find_matching_ngrams.py b/nemo_curator/scripts/find_matching_ngrams.py new file mode 100644 index 00000000..76a3e4a5 --- /dev/null +++ b/nemo_curator/scripts/find_matching_ngrams.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import pickle + +import nemo_curator +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.script_utils import add_distributed_args +from nemo_curator.utils.distributed_utils import get_client, read_data + + +def main(args): + client = get_client(args, args.device) + + # Each rank reads in the task data + with open(args.input_task_ngrams, 'rb') as fp: + task_ngrams = pickle.load(fp) + + decontaminator = nemo_curator.TaskDecontamination([], text_field=args.input_text_field, max_ngram_size=args.max_ngram_size) + + files = get_all_files_paths_under(args.input_data_dir) + dataset = DocumentDataset(read_data(files, file_type=args.input_file_type, backend="pandas")) + + result = decontaminator.find_matching_ngrams(task_ngrams, dataset).compute() + print(f"Found a total of {len(result['matched-ngrams'])} matching n-grams") + + output = { + 'matched-ngrams': result['matched-ngrams'], + 'ngrams-freq': result['ngrams-freq'], + 'max-ngram-size': args.max_ngram_size, + 'min-ngram-size': args.min_ngram_size, + } + with open(args.output_matched_ngram_data, 'wb') as fp: + pickle.dump(output, fp) + + +def attach_args(parser=argparse.ArgumentParser( + """ + Searches for matching task n-grams in the input dataset + and writes out a list of n-grams that were found. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-text-field", + type=str, + default='text', + help="The name of the field within each datapoint object of the input " + "file that contains the text.", + ) + parser.add_argument( + "--output-matched-ngram-data", + type=str, + default=None, + help="Output dictionary that contains the output matched n-grams " + "and the frequency of their matches, min-ngram size, max-ngram " + "size and the frequencies of n-gram sizes. All of this data will be " + "used by remove_matching_ngrams for which this program is a prerequisite", + ) + parser.add_argument( + "--input-task-ngrams", + type=str, + default=None, + help="Path to the pickle file of task n-gram counts computed by prepare_task_data", + ) + parser.add_argument( + "--max-ngram-size", + type=int, + default=13, + help="The maximum n-gram size to consider within the dataset", + ) + parser.add_argument( + "--min-ngram-size", + type=int, + default=8, + help="The minimum n-gram size to consider within the dataset", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/find_pii_and_deidentify.py b/nemo_curator/scripts/find_pii_and_deidentify.py new file mode 100644 index 00000000..7682849f --- /dev/null +++ b/nemo_curator/scripts/find_pii_and_deidentify.py @@ -0,0 +1,202 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import time +from pathlib import Path + +from nemo_curator.modules.modify import Modify +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers.pii_modifier import PiiModifierBatched +# from nemo_curator.pii.algorithm import DEFAULT_LANGUAGE +from nemo_curator.utils.distributed_utils import read_data, write_to_disk, get_client +from nemo_curator.utils.file_utils import get_batched_files +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + """Main function that performs PII de-identifcation given a batch of files""" + logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, + datefmt="%Y-%m-%d %H:%M:%S") + + logging.debug("Beginning PII job") + start_time = time.time() + Path(args.output_data_dir).mkdir(parents=True, exist_ok=True) + + supported_entities = args.supported_entities.split(',') if args.supported_entities else None + + modifier = PiiModifierBatched( + language=args.language, + supported_entities=supported_entities, + anonymize_action=args.anonymize_action, + hash_type=args.hash_type, + chars_to_mask=args.chars_to_mask, + masking_char=args.masking_char, + new_value=args.new_value, + batch_size=args.batch_size, + device=args.device) + + for file_names in get_batched_files( + args.input_data_dir, + args.output_data_dir, + args.input_file_type, + args.n_workers + ): + logging.info("Reading input files....") + source_data = read_data(file_names, file_type=args.input_file_type, backend='pandas', add_filename=True) + dataset = DocumentDataset(source_data) + logging.debug(f"Dataset has {source_data.npartitions} partitions") + + modify = Modify(modifier, batched=True) + modified_dataset = modify(dataset) + write_to_disk(modified_dataset.df, + args.output_data_dir, + write_to_filename=True, + output_type=args.output_file_type + ) + + end_time = time.time() + logging.debug("Total time taken in PII job: %0.3f seconds" % (end_time - start_time)) + + +def attach_args( + parser=argparse.ArgumentParser( + """ + Main driver script for applying PII redaction on documents. Inputs are in the input-data-dir directory. + This script will then perform PII detection and de-identification on each document within the corpus. 
+ """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) +): + + parser.add_argument( + "--language", + type=str, + default='en', + required=False, + help="Language of input documents" + ) + + parser.add_argument( + "--supported-entities", + type=str, + default=None, + required=False, + help="Comma separated list of PII entity types. None implies all supported types" + ) + + parser.add_argument( + "--anonymize-action", + type=str, + default='replace', + required=False, + help="Anonymization action. Choose from among: redact, hash, mask and replace" + ) + + parser.add_argument( + "--hash-type", + type=str, + default=None, + required=False, + help="The hash type. Choose from among: sha256, sha512 or md5" + ) + + parser.add_argument( + "--chars-to-mask", + type=int, + default=100, + required=False, + help="The number of characters to mask. Only applicable if anonymize action is mask" + ) + + parser.add_argument( + "--masking-char", + type=str, + default='*', + required=False, + help="The masking character. Only applicable if anonymize action is mask" + ) + + parser.add_argument( + "--new-value", + type=str, + default=None, + required=False, + help="The new value to replace with. Only applicable if anonymize action is replace" + ) + + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + required=True, + help="Directory containing the input files" + ) + + parser.add_argument( + "--input-file-type", + type=str, + default='jsonl', + required=True, + choices=['jsonl', 'csv', 'text'], + help="The input file type (only jsonl is currently supported)", + ) + + parser.add_argument( + "--output-file-type", + type=str, + default='jsonl', + required=True, + choices=['jsonl', 'csv', 'text'], + help="The output file type (only jsonl is currently supported)", + ) + + parser.add_argument( + "--text-field", + type=str, + default="text", + help="The input field within each JSONL or CSV object on which the PII redactor will " + "operate. By default, the redactor will operate on the 'text' " + "field but other fields can be specified such as 'url' or 'id'.", + ) + + parser.add_argument( + "--batch-size", + type=int, + default=2000, + help="The batch size for processing multiple texts together.", + ) + + parser.add_argument( + "--output-data-dir", + type=str, + default=None, + required=True, + help="The output directory to where redacted documents will be written.", + ) + + return parser + + +def console_script(): + arguments = add_distributed_args(attach_args()).parse_args() + client = get_client(arguments, arguments.device) + if not arguments.n_workers: + arguments.n_workers = len(client.scheduler_info()['workers']) + main(arguments) + + +if __name__ == "__main__": + console_script() \ No newline at end of file diff --git a/nemo_curator/scripts/get_common_crawl_urls.py b/nemo_curator/scripts/get_common_crawl_urls.py new file mode 100644 index 00000000..aaf5b728 --- /dev/null +++ b/nemo_curator/scripts/get_common_crawl_urls.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo_curator.utils.script_utils import attach_bool_arg +from nemo_curator.utils.download_utils import get_common_crawl_urls + +def main(args): + urls = get_common_crawl_urls(args.starting_snapshot, args.ending_snapshot, data_domain_prefix=args.cc_data_domain_prefix, index_prefix=args.cc_index_prefix, news=args.cc_news) + + with open(args.output_warc_url_file, "w") as fp: + for url in urls: + fp.write(url) + fp.write('\n') + + +def attach_args(parser=argparse.ArgumentParser( + """ +Pulls URLs of WARC files stored within the common crawl data repository +and writes them to file so that they can be used to subsequently +download the WARC files. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--cc-data-domain-prefix", + type=str, + default="https://data.commoncrawl.org", + help="The prefix that will be prepended to each WARC " + "file to create the URL. By default this value is " + " 'https://data.commoncrawl.org'", + ) + parser.add_argument( + "--cc-index-prefix", + type=str, + default="https://index.commoncrawl.org", + help="The prefix of the URL to the Common Crawl index. " + "By default this value is 'https://index.commoncrawl.org'", + ) + parser.add_argument( + "--output-warc-url-file", + type=str, + default=None, + required=True, + help="The output file to which the WARC urls will be written", + ) + parser.add_argument( + "--starting-snapshot", + type=str, + default="2020-50", + help="The starting snapshot to download. All WARC urls will be written " + "between the dates specified by --starting-snapshot " + "and --ending-snapshot. Snapshots must be specified by YYYY-WeekNumber " + "(e.g., '2020-50' or '2021-04'). For the CC-NEWS dataset, " + "(specified with the '--cc-news' flag) this changes to " + "Year-Month (YYYY-MM)", + ) + parser.add_argument( + "--ending-snapshot", + type=str, + default="2020-50", + help="The last snapshot for which WARC urls will be retrieved. " + "Snapshots must be specified by YYYY-WeekNumber " + "(e.g., '2020-50' or '2021-04')", + ) + attach_bool_arg( + parser, + "cc-news", + help_str="Specify --cc-news in order to download WARC URLs for " + "the CC-NEWS dataset instead of the CC-MAIN datasets. If this " + "is specified, then it is assumed that the format for the start " + "and end snapshots is 'YYYY-MM' (Year-Month). All WARC URLs between " + "the specified years and months will be download", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/get_wikipedia_urls.py b/nemo_curator/scripts/get_wikipedia_urls.py new file mode 100644 index 00000000..b216851d --- /dev/null +++ b/nemo_curator/scripts/get_wikipedia_urls.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
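The `get_common_crawl_urls.py` script above reduces to a single helper call plus writing the URL list to disk. A small sketch, with the snapshot range, output file name, and explicit prefix/news values mirroring the script's defaults (all illustrative):

```python
# Sketch of the helper that get_common_crawl_urls.py wraps; the snapshot range
# and output file name are illustrative.
from nemo_curator.utils.download_utils import get_common_crawl_urls

urls = get_common_crawl_urls(
    "2020-50",                                          # starting snapshot (YYYY-WeekNumber)
    "2021-04",                                          # ending snapshot
    data_domain_prefix="https://data.commoncrawl.org",
    index_prefix="https://index.commoncrawl.org",
    news=False,                                         # set True for CC-NEWS (YYYY-MM snapshots)
)
with open("warc_urls.txt", "w") as fp:
    fp.write("\n".join(urls) + "\n")
```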
+ +import argparse +from nemo_curator.utils.download_utils import get_wikipedia_urls + +def main(args): + wikipedia_urls = get_wikipedia_urls(language=args.language, wikidumps_index_prefix=args.wikidumps_index_baseurl) + with open(args.output_url_file, 'w') as output_file: + for url in wikipedia_urls: + output_file.write(url) + output_file.write('\n') + + +def attach_args(parser=argparse.ArgumentParser( + """ +Pulls urls pointing to the latest Wikipedia dumps +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--language", + type=str, + default='en', + help="Desired language of the Wikipedia dump", + ) + parser.add_argument( + "--wikidumps-index-baseurl", + type=str, + default='https://dumps.wikimedia.org', + help="The base url for all Wikipedia dumps", + ) + parser.add_argument( + "--output-url-file", + type=str, + default="wikipedia_urls_latest.txt", + help="The output file to which the urls containing " + "the latest dump data will be written", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/jaccard_compute.py b/nemo_curator/scripts/jaccard_compute.py new file mode 100644 index 00000000..41400772 --- /dev/null +++ b/nemo_curator/scripts/jaccard_compute.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os + +from nemo_curator.gpu_deduplication.utils import ( + enable_spilling, + parse_nc_args, +) +from nemo_curator.utils.distributed_utils import get_num_workers, get_client +from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity + + +def main(args): + description = """Computes the Jaccard similarity between document pairs + from partitioned parquet dataset. Result is a parquet dataset consiting of + document id pair along with their Jaccard similarity score. 
+ """ + OUTPUT_PATH = args.output_dir + shuffled_docs_path = args.shuffled_docs_path + output_final_results_path = os.path.join(OUTPUT_PATH, "jaccard_similarity_results.parquet") + client = get_client(args, "gpu") + enable_spilling() + client.run(enable_spilling) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard compute script", flush=True) + st = time.time() + jaccard = JaccardSimilarity( + id_field=args.input_json_id_field, + text_field=args.input_json_text_field, + anchor_id_fields=[f"anchor_{i}_{args.input_json_id_field}" for i in range(2)], + ngram_width=args.ngram_size, + ) + # Run actual computation + result_df = jaccard.jaccard_compute(shuffled_docs_path) + + result_df.to_parquet( + output_final_results_path, + write_index=False, + write_metadata_file=False, + ) + print(f"Jaccard Computing+Writing time: {time.time() - st:.1f} seconds") + + +def attach_args(parser=None): + description = """Computes jaccard similarity""" + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--shuffled-docs-path", + type=str, + help="The directory containing the shuffled documents", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results to", + ) + parser.add_argument( + "--ngram-size", + type=int, + default=5, + help="Size of ngram to use during jaccard similarity", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/jaccard_shuffle.py b/nemo_curator/scripts/jaccard_shuffle.py new file mode 100644 index 00000000..441d99f4 --- /dev/null +++ b/nemo_curator/scripts/jaccard_shuffle.py @@ -0,0 +1,124 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
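The Jaccard computation in `jaccard_compute.py` above boils down to one module call over the shuffled parquet output of the shuffle stage. A minimal sketch on an already-running GPU cluster, with illustrative paths and id/text field names:

```python
# Sketch of the Jaccard similarity step driven by jaccard_compute.py, assuming
# the shuffled parquet dataset from the shuffle stage already exists; paths and
# field names are illustrative.
from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity

jaccard = JaccardSimilarity(
    id_field="adlr_id",
    text_field="text",
    anchor_id_fields=[f"anchor_{i}_adlr_id" for i in range(2)],
    ngram_width=5,
)
result_df = jaccard.jaccard_compute("fuzzy_dedup/shuffled_docs.parquet")
result_df.to_parquet(
    "fuzzy_dedup/jaccard_similarity_results.parquet",
    write_index=False,
    write_metadata_file=False,
)
```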
+ +import os +import time + +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( + get_text_ddf_from_json_path_with_blocksize, +) +from nemo_curator.gpu_deduplication.utils import get_client, get_num_workers, parse_nc_args +from nemo_curator.modules.fuzzy_dedup import _Shuffle + + +def func(): + import cudf + from nemo_curator.modules.fuzzy_dedup import _Shuffle + + +def main(args): + input_data_paths = args.input_data_dirs + input_anchor_docs_with_bk_dir = args.input_bucket_mapping_dir + OUTPUT_PATH = args.output_dir + output_shuffled_docs_path = os.path.join(OUTPUT_PATH, "shuffled_docs.parquet") + + client = get_client(args) + client.run(func) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard shuffle script", flush=True) + print(f"Args = {args}") + st = time.time() + text_ddf = get_text_ddf_from_json_path_with_blocksize( + input_data_paths=input_data_paths, + num_files=args.num_files, + blocksize=args.text_ddf_blocksize, + id_column=args.input_json_id_field, + text_column=args.input_json_text_field, + ) + print( + "Graph creation for get_text_ddf_from_json_path_with_blocksize" " complete.", + flush=True, + ) + print(f"text_ddf.npartitions = {text_ddf.npartitions}", flush=True) + shuffle = _Shuffle( + id_fields=["dataset_id", "doc_id"], + text_field=args.input_json_text_field, + profile_dir=args.profile_path, + int_to_str_id="adlr_id" + ) + shuffle.shuffle_docs_on_buckets( + documents_df=text_ddf, + bucket_w_anchors_path=input_anchor_docs_with_bk_dir, + output_shuffled_docs_path=output_shuffled_docs_path, + bucket_mapping_df_blocksize=args.bucket_mapping_ddf_blocksize, + parts_per_worker=args.parts_per_worker, + bucket_parts_per_worker=args.bucket_parts_per_worker, + partition_on="_output_partition_id", + ) + et = time.time() + print(f"Jaccard Shuffle E2E time taken = {et-st} s") + + +def attach_args(parser=None): + description = """Shuffles input text documents based on the given bucket + map. The output is a partitioned parquet dataset with the documents + shuffled by buckets + """ + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--input-bucket-mapping-dir", + type=str, + help="The directory containing anchor docs with bk files", + ) + parser.add_argument( + "--text-ddf-blocksize", + type=int, + default=256, + help="The block size for chunking jsonl files for text ddf in mb", + ) + parser.add_argument( + "--bucket-mapping-ddf-blocksize", + type=int, + default=256, + help="The block size for for anchor_docs_with_bk ddf in mb", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results in", + ) + parser.add_argument( + "--parts-per-worker", + default=1, + type=int, + help="The number of parts to process per worker per batch", + ) + parser.add_argument( + "--bucket-parts-per-worker", + default=8, + type=int, + help="The number of bucket parts to process per worker per batch", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/make_data_shards.py b/nemo_curator/scripts/make_data_shards.py new file mode 100644 index 00000000..c9dc2281 --- /dev/null +++ b/nemo_curator/scripts/make_data_shards.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo_curator.utils.script_utils import add_distributed_args +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.file_utils import reshard_jsonl + +def main(args): + client = get_client(args, args.device) + + reshard_jsonl(args.input_data_dir, args.output_resharded_dir, output_file_size=args.output_file_size, start_index=args.start_index, file_prefix=args.prefix) + + +def attach_args(parser=argparse.ArgumentParser( + """ +Makes balanced text files of output size "--block-size" from +a directory of input files. The output files will be renamed +as output_dir/000.jsonl, output_dir/001.jsonl, ... etc. Users +may specify the desired number of output files and the block size +will be computed from this specified quantity. + +The size of the input files must be larger than the specified +"--block-size" +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + required=True, + help="Input directory consisting of .jsonl file(s)", + ) + parser.add_argument( + "--output-resharded-dir", + type=str, + default=None, + required=True, + help="Output directory to where the sharded " + ".jsonl files will be written", + ) + parser.add_argument( + "--output-file-size", + type=str, + default="100M", + help="Approximate size of output files. Must specify with a string and " + "with the unit K, M or G for kilo, mega or gigabytes", + ) + parser.add_argument( + "--start-index", + type=int, + default=0, + help="Starting index for naming the output files", + ) + parser.add_argument( + "--prefix", + type=str, + default="", + help="Prefix to use to prepend to output file number", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/map_buckets.py b/nemo_curator/scripts/map_buckets.py new file mode 100644 index 00000000..e584af3b --- /dev/null +++ b/nemo_curator/scripts/map_buckets.py @@ -0,0 +1,183 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
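The resharding in `make_data_shards.py` above is a thin wrapper over one file utility. A minimal sketch, assuming a Dask client is already running; the directory names and prefix are hypothetical.

```python
# Sketch of the resharding utility that make_data_shards.py wraps; directory
# names are hypothetical.
from nemo_curator.utils.file_utils import reshard_jsonl

reshard_jsonl(
    "data/raw_jsonl",         # input directory of .jsonl files
    "data/resharded",         # output directory for the balanced shards
    output_file_size="100M",  # approximate size of each output file
    start_index=0,
    file_prefix="shard_",
)
```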
+ +import os +import time + +from nemo_curator.utils.fuzzy_dedup_utils.io_utils import ( + get_bucket_ddf_from_parquet_path, + get_text_ddf_from_json_path_with_blocksize, +) +from nemo_curator.gpu_deduplication.utils import get_client, get_num_workers, parse_nc_args +from nemo_curator.modules.fuzzy_dedup import _MapBuckets + + +def get_anchor_and_output_map_info( + input_data_paths, + input_bucket_path, + text_ddf_blocksize, + num_files, + num_workers, + shuffle_type, + input_bucket_field, + input_id_field, + input_text_field, +): + """ + Get anchor docs with bucket info + Args: + input_data_paths: list of paths to input data + input_bucket_path: path to input buckets + text_ddf_blocksize: blocksize for text ddf + num_files: number of files to read + num_workers: number of workers + shuffle_type: type of shuffle to use + Returns: + ddf_anchor_docs_with_bk + """ + ddf_text = get_text_ddf_from_json_path_with_blocksize( + input_data_paths=input_data_paths, + num_files=num_files, + blocksize=text_ddf_blocksize, + id_column=input_id_field, + text_column=input_text_field, + ) + ddf_bk = get_bucket_ddf_from_parquet_path( + input_bucket_path=input_bucket_path, num_workers=num_workers + ) + map_buckets = _MapBuckets( + id_fields=["dataset_id", "doc_id"], + bucket_field=input_bucket_field, + ) + ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors( + documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type + ) + return ddf_anchor_docs_with_bk + + +def attach_args(parser=None): + description = """Takes the buckets generated from minhashes and uses + document length information to create a coarse mapping of mapping multiple + buckets to a logical partition by using a modified bin packing algorithm. + """ + if not parser: + parser = parse_nc_args(description=description) + parser.add_argument( + "--input-bucket-dir", + type=str, + help="The directory containing bucket information files", + ) + parser.add_argument( + "--text-ddf-blocksize", + type=int, + default=256, + help="The block size for chunking jsonl files for text ddf in mb", + ) + parser.add_argument( + "--output-dir", + type=str, + help="The output directory to write results in", + ) + parser.add_argument( + "--shuffle-type", + type=str, + default="tasks", + help="Type of shuffle to use before writing to parquet", + ) + parser.add_argument( + "--input-bucket-field", + type=str, + default="_bucket_id", + help="Name of the column containing minhashes", + ) + return parser + + +def jaccard_get_output_map_workflow( + client, + input_data_paths, + input_bucket_path, + output_anchor_docs_with_bk_path, + text_ddf_blocksize, + num_files, + shuffle_type, + input_bucket_field, + input_id_field, + input_text_field, +): + """ + Workflow for jaccard shuffle + Args: + client: dask client + input_data_paths: list of paths to input data + input_bucket_path: path to input buckets + output_anchor_docs_with_bk_path: path to save anchor docs with bucket info + text_ddf_blocksize: blocksize for text ddf + num_files: number of files to read + parts_per_worker: number of parts per worker + shuffle_type: type of shuffle to use before writing to parquet + """ + num_workers = get_num_workers(client) + ddf_anchor_docs_with_bk = get_anchor_and_output_map_info( + input_data_paths, + input_bucket_path, + text_ddf_blocksize, + num_files, + num_workers, + shuffle_type, + input_bucket_field, + input_id_field, + input_text_field, + ) + ddf_anchor_docs_with_bk.to_parquet( + output_anchor_docs_with_bk_path, + write_index=False, + ) + + +def main(args): + 
input_data_paths = args.input_data_dirs + input_bucket_path = args.input_bucket_dir + OUTPUT_PATH = args.output_dir + output_anchor_docs_with_bk_path = os.path.join( + OUTPUT_PATH, "anchor_docs_with_bk.parquet" + ) + client = get_client(args) + print(f"Num Workers = {get_num_workers(client)}", flush=True) + print("Connected to dask cluster", flush=True) + print("Running jaccard map buckets script", flush=True) + print(f"Args = {args}") + st = time.time() + jaccard_get_output_map_workflow( + client, + input_data_paths, + input_bucket_path, + output_anchor_docs_with_bk_path, + args.text_ddf_blocksize, + args.num_files, + args.shuffle_type, + args.input_bucket_field, + args.input_json_id_field, + args.input_json_text_field, + ) + et = time.time() + print(f"Bucket Mapping time taken = {et-st} s") + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/minhash_lsh.py b/nemo_curator/scripts/minhash_lsh.py new file mode 100644 index 00000000..ecaeacce --- /dev/null +++ b/nemo_curator/scripts/minhash_lsh.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import cudf +import dask_cudf +import numpy as np +from nemo_curator import LSH +from nemo_curator.datasets import DocumentDataset +from nemo_curator.gpu_deduplication.jaccard_utils.doc_id_mapping import \ + convert_str_id_to_int +from nemo_curator.gpu_deduplication.utils import create_logger, parse_nc_args +from nemo_curator.utils.distributed_utils import get_client + + +def pre_imports(): + import cudf # noqa: F401 + + +def main(args): + + logger = create_logger( + rank=0, log_file=os.path.join(args.log_dir, "rank_000.log"), name="lsh_log" + ) + logger.info(f"Starting workflow with args:\n {args}") + + assert args.device == "gpu" + args.set_torch_to_use_rmm = False + client = get_client(args, cluster_type=args.device) + logger.info(f"Client Created {client}") + client.run(pre_imports) + logger.info("Pre imports complete") + + data_paths = args.input_data_dirs + id_field = args.input_json_id_field + minhash_field = args.input_minhash_field + + dfs = [] + for data_path in data_paths: + dfs.append( + dask_cudf.read_parquet(data_path, blocksize="2GB", aggregate_files=True) + ) + df = dask_cudf.concat(dfs, ignore_unknown_divisions=True) + df = df.map_partitions( + convert_str_id_to_int, + id_column=id_field, + meta=cudf.DataFrame( + {minhash_field: [[1, 2, 3]], "doc_id": [1], "dataset_id": np.uint32(1)} + ), + ) + lsh = LSH( + cache_dir=args.output_bucket_dir, + minhash_length=args.minhash_length, + num_buckets=args.num_bands, + buckets_per_shuffle=args.buckets_per_shuffle, + id_fields=["dataset_id", "doc_id"], + profile_dir=args.profile_path, + minhash_field=minhash_field, + logger=logger, + ) + t1 = time.time() + _ = lsh(DocumentDataset(df)) + logger.info(f"Computing and writing buckets took {time.time() - t1} s") + + +def 
attach_args(parser=None): + description = """Computes buckets from existing minhashes and writes the output + to files. Each row corresponds to a document id followed by the columns + denoting the bucket ids that document belongs to. + """ + if not parser: + parser = parse_nc_args(description=description) + + parser.add_argument( + "--minhash-length", + type=int, + default=260, + help="The minhash signature length of each input document", + ) + parser.add_argument( + "--input-minhash-field", + type=str, + default="_minhash_signature", + help="Name of the column containing minhashes", + ) + parser.add_argument( + "--num-bands", + type=int, + default=20, + help="The number of bands to use when bucketing the minhash signatures for LSH.", + ) + parser.add_argument( + "--buckets-per-shuffle", + type=int, + required=True, + help="Number of buckets to shuffle per batch", + ) + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Type of cluster to start up", + ) + parser.add_argument( + "--output-bucket-dir", + type=str, + required=True, + help="Output directory where the computed buckets will be written. " + "Each parquet file consists of document and bucket IDs", + ) + + return parser + + +def console_script(): + main(attach_args().parse_args()) + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/prepare_fasttext_training_data.py b/nemo_curator/scripts/prepare_fasttext_training_data.py new file mode 100644 index 00000000..29dd6e41 --- /dev/null +++ b/nemo_curator/scripts/prepare_fasttext_training_data.py @@ -0,0 +1,125 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
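The bucketing step in `minhash_lsh.py` above centers on the `LSH` module. A minimal sketch, assuming the minhash signatures were already written to parquet with integer `dataset_id`/`doc_id` columns (as produced by the id conversion in that script); the paths and parameter values are illustrative, and optional arguments such as `logger` and `profile_dir` are omitted.

```python
# Sketch of the LSH bucketing step wrapped by minhash_lsh.py; assumes a GPU
# Dask cluster and precomputed minhash parquet files with integer
# "dataset_id"/"doc_id" columns. Paths and parameters are illustrative.
import dask_cudf
from nemo_curator import LSH
from nemo_curator.datasets import DocumentDataset

df = dask_cudf.read_parquet("fuzzy_dedup/minhashes", blocksize="2GB", aggregate_files=True)
lsh = LSH(
    cache_dir="fuzzy_dedup/buckets",   # bucket parquet files land here
    minhash_length=260,
    num_buckets=20,
    buckets_per_shuffle=1,
    id_fields=["dataset_id", "doc_id"],
    minhash_field="_minhash_signature",
)
buckets = lsh(DocumentDataset(df))
```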
+ +import csv +import argparse +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import Modify +from nemo_curator.modifiers import FastTextLabelModifier +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.script_utils import add_distributed_args + +def sample_rows(df, n, seed): + samples = df.sample(frac=n / len(df) + 0.05, random_state=seed) + + return samples.head(n=n, compute=False) + + +def main(args): + client = get_client(args, args.device) + # Get local path + files = list(get_all_files_paths_under(args.input_data_dir)) + raw_data = read_data(files, file_type="jsonl", backend="pandas") + dataset = DocumentDataset(raw_data) + text_field = args.input_json_field + + # fastText requires each document to be prepended with a special label for training + preprocessing = Modify(FastTextLabelModifier(args.label), text_field=text_field) + labeled_data = preprocessing(dataset) + + samples = sample_rows(labeled_data.df, args.output_num_samples, args.seed) + + samples[text_field].to_csv(args.output_train_file, single_file=True, encoding="utf-8", header=False, index=False, quoting=csv.QUOTE_NONE, sep="\n") + client.close() + + +def attach_args(parser=argparse.ArgumentParser( + """ +Prepare data for training skip-gram classifier with FastText + +Takes as input a directory of .jsonl files, and writes an output +file of samples prepared in order to train a skip-gram classifier +with FastText. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-local-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files. " + "Use this argument when a distributed file system is not available.", + ) + parser.add_argument( + "--input-json-field", + type=str, + default='text', + help="The input field within each JSON object on which the filter will " + "operate. By default, the filter will operate on the 'text' " + "field but other fields can be specified such as 'url' or 'id'.", + ) + parser.add_argument( + "--output-num-samples", + type=int, + default=None, + required=True, + help="The number of documents to randomly sample from the dataset and" + " use as training and validation samples to train the" + " skip-gram classifier", + ) + parser.add_argument( + "--output-train-file", + type=str, + default=None, + help="The output file containing prepared samples to train a " + "skip-gram classifier with FastText", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="The random seed to use for sampling from the dataset", + ) + parser.add_argument( + "--label", + type=str, + default=None, + required=True, + help="The label to be used at the beginning of each sample " + "in the output file. 
For example '__label__hq' could be " + "used for the high-quality (positive) samples", + ) + parser.add_argument( + "--log-dir", + type=str, + default="./log/prepare_filter_data", + help="The output log directory where node and local" + " ranks will write their respective log files", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/prepare_task_data.py b/nemo_curator/scripts/prepare_task_data.py new file mode 100644 index 00000000..2cc017b5 --- /dev/null +++ b/nemo_curator/scripts/prepare_task_data.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import yaml +import pickle + +import nemo_curator +from nemo_curator.tasks.downstream_task import import_task +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + client = get_client(args, args.device) + # Read in config file + with open(args.task_config_file, 'r') as config_file: + task_params = yaml.load(config_file, Loader=yaml.FullLoader) + + # Generate n-grams for all tasks + task_list = [] + for task in task_params['tasks']: + print(f"Generating N-grams for task {task['name']}") + task_class = import_task(task['name']) + task_object = task_class(**task['params']) + task_list.append(task_object) + + decontaminator = nemo_curator.TaskDecontamination(task_list) + all_ngrams = decontaminator.prepare_task_ngram_count() + + with open(args.output_task_ngrams, 'wb') as fp: + pickle.dump(all_ngrams, fp) + + +def attach_args(parser=argparse.ArgumentParser( + """ + Computes N-grams from input downstream task validation datasets. + Takes in an input configuration file (defaults can be found under the + config directory under the root directory of the repository) and + writes out the computed N-grams to a pickle file where they can be + used by the program find_matching_ngrams which will search for + matching N-grams in the input training dataset. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--task-config-file", + type=str, + default=None, + required=True, + help="YAML configuration file that contains task information. " + "YAML files for already implemented tasks can be found in the config " + "directory that is located in the root directory of this repository.", + ) + parser.add_argument( + "--output-task-ngrams", + type=str, + default="./task_ngrams.pkl", + help="N-grams computed from input task data. 
N-grams are stored " + "as keys to a dictionary and the values of the dictionary " + "are the frequencies with which the n-grams occur within a " + "training dataset (they are initialized to zero within this program)", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/remove_matching_ngrams.py b/nemo_curator/scripts/remove_matching_ngrams.py new file mode 100644 index 00000000..10a59c41 --- /dev/null +++ b/nemo_curator/scripts/remove_matching_ngrams.py @@ -0,0 +1,163 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import pickle + +import nemo_curator +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import ( + read_data, + write_to_disk, + get_client +) +from nemo_curator.utils.file_utils import ( + get_all_files_paths_under, + expand_outdir_and_mkdir, + get_batched_files +) +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + client = get_client(args, args.device) + + output_tdd_dir = expand_outdir_and_mkdir(args.output_task_deduped_dir) + output_rm_doc_dir = None + if args.output_removed_doc_dir is not None: + output_rm_doc_dir = expand_outdir_and_mkdir(args.output_removed_doc_dir) + + # Each rank reads in the task data + print(f"Reading in matched n-grams from {args.input_matched_ngrams}") + with open(args.input_matched_ngrams, 'rb') as fp: + matched_ngram_data = pickle.load(fp) + + # Unpack the results from find_matching_ngrams + matched_ngrams = matched_ngram_data['matched-ngrams'] + ngrams_freq = matched_ngram_data['ngrams-freq'] + max_ngram_size = matched_ngram_data['max-ngram-size'] + + decontaminator = nemo_curator.TaskDecontamination([], + text_field=args.input_text_field, + max_ngram_size=max_ngram_size, + max_matches=args.match_threshold, + max_splits=args.max_document_splits, + removed_dir=output_rm_doc_dir + ) + + files = list(get_all_files_paths_under(args.input_data_dir)) + for files in get_batched_files(args.input_data_dir, output_tdd_dir, args.input_file_type, batch_size=args.batch_size): + dataset = DocumentDataset(read_data(files, file_type=args.input_file_type, backend="pandas", add_filename=True)) + decontaminated_dataset = decontaminator.remove_matching_ngrams(matched_ngrams, ngrams_freq, dataset) + write_to_disk(decontaminated_dataset.df, output_tdd_dir, write_to_filename=True, output_type=args.output_file_type) + print(f"Finished decontaminating {len(files)} files") + + print("Finished decontaminating all files") + + +def attach_args(parser=argparse.ArgumentParser( + """ + Using the matching n-grams found by + nemo_curator/scripts/find_matching_ngrams.py + (provided by the argument --input-matched-ngrams), + passes over all documents and removes matching n-grams from the corpus by + splitting documents containing the match. 
If a document is split more than + --max-splits times, it is removed from the corpus. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-text-field", + type=str, + default='text', + help="The name of the field within each datapoint object of the input " + "file that contains the text.", + ) + parser.add_argument( + "--input-matched-ngrams", + type=str, + default=None, + required=True, + help="Input dictionary (.pkl file), that contains matched " + "n-gram data from the find_matching_ngrams code", + ) + parser.add_argument( + "--output-task-deduped-dir", + type=str, + default=None, + required=True, + help="Output directory to where task-deduplicated (split) " + "documents will be written", + ) + parser.add_argument( + "--output-removed-doc-dir", + type=str, + default=None, + help="Output directory to where removed documents will be written. " + "Documents will be removed from the corpus if they are split more " + "than --max-document-splits number of times, or if the user specifies " + "that they be removed via the flag, --remove-split-docs", + ) + parser.add_argument( + "--match-threshold", + type=int, + default=10, + help="A threshold that determines if a matched n-gram will be " + "considered for removal in remove_matching_ngrams. N-grams that " + "exceed this number of matches in the training dataset will not be " + "considered during the removal stage", + ) + parser.add_argument( + "--max-document-splits", + type=int, + default=10, + help="A threshold used to determine if a document should be removed " + "from the corpus if it is split more than " + "--max-document-splits number of times", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + help="Number of files to read into memory at a time.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/separate_by_metadata.py b/nemo_curator/scripts/separate_by_metadata.py new file mode 100644 index 00000000..617ace79 --- /dev/null +++ b/nemo_curator/scripts/separate_by_metadata.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
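The decontamination flow in `remove_matching_ngrams.py` above can also be driven directly from Python. A minimal sketch, assuming the matched n-gram pickle from `find_matching_ngrams.py` already exists; all file and directory names are illustrative, and the thresholds mirror the script's defaults.

```python
# Sketch of the removal step driven by remove_matching_ngrams.py; assumes the
# matched n-gram pickle from find_matching_ngrams.py exists. Paths are
# illustrative.
import pickle

import nemo_curator
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import read_data, write_to_disk

with open("task_ngrams_matched.pkl", "rb") as fp:
    matched = pickle.load(fp)

decontaminator = nemo_curator.TaskDecontamination(
    [],
    text_field="text",
    max_ngram_size=matched["max-ngram-size"],
    max_matches=10,   # --match-threshold default
    max_splits=10,    # --max-document-splits default
)
dataset = DocumentDataset(
    read_data(["data/train_00.jsonl"], file_type="jsonl", backend="pandas", add_filename=True)
)
clean = decontaminator.remove_matching_ngrams(
    matched["matched-ngrams"], matched["ngrams-freq"], dataset
)
write_to_disk(clean.df, "data/decontaminated", write_to_filename=True, output_type="jsonl")
```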
+ +import argparse +import shutil +import json + +from nemo_curator.utils.file_utils import separate_by_metadata, get_all_files_paths_under, expand_outdir_and_mkdir +from nemo_curator.utils.distributed_utils import get_client, read_data +from nemo_curator.utils.script_utils import ( + attach_bool_arg, + add_distributed_args, +) + + +def main(args): + client = get_client(args, args.device) + + files = get_all_files_paths_under(args.input_data_dir) + input_data = read_data(files, file_type=args.input_file_type, backend="pandas", add_filename=True) + + output_dir = expand_outdir_and_mkdir(args.output_data_dir) + + metadata_field = args.input_metadata_field + print(f"Beginning metadata separation for {metadata_field}") + metadata_distribution = separate_by_metadata(input_data, output_dir, metadata_field, remove_metadata=args.remove_metadata_field, output_type=args.output_file_type).compute() + print(f"Finished metadata separation for {metadata_field}") + + with open(args.output_metadata_distribution, 'w') as fp: + json.dump(metadata_distribution, fp) + + if args.remove_input_dir: + print(f"Removing all files in {args.input_data_dir}") + shutil.rmtree(args.input_data_dir) + print(f"Finished removing all files in {args.input_data_dir}") + + + +def attach_args(parser=argparse.ArgumentParser( + """ + Spits a dataset into subdirectories based on metadata values +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-metadata-field", + type=str, + default='language', + help="The name of the field within each datapoint object of the input " + "file that the dataset should be separated by.", + ) + parser.add_argument( + "--output-metadata-distribution", + type=str, + help="Output json file containing the frequency of documents " + "that occur for a particular metadata.", + ) + parser.add_argument( + "--output-data-dir", + type=str, + required=True, + help="The output directory to where the metadata-separated " + "files will be written. Each file will be written to its " + "respective metadata directory that is a sub-directory " + "of this directory", + ) + attach_bool_arg( + parser, + "remove-metadata-field", + default=False, + help_str="Option of whether to remove the metadata field " + "after filtering. Useful only in the case in which one metadata " + "is desired to be separated from the others", + ) + attach_bool_arg( + parser, + "remove-input-dir", + default=False, + help_str="Specify '--remove-input-dir' to remove the original " + "input directory. This is false by default.", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/text_cleaning.py b/nemo_curator/scripts/text_cleaning.py new file mode 100644 index 00000000..3a535cd3 --- /dev/null +++ b/nemo_curator/scripts/text_cleaning.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import nemo_curator +from nemo_curator.modifiers import UnicodeReformatter +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.file_utils import ( + expand_outdir_and_mkdir, + get_batched_files, +) +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.script_utils import add_distributed_args + + +def main(args): + client = get_client(args, args.device) + + # Make the output directories + output_clean_dir = expand_outdir_and_mkdir(args.output_clean_dir) + + cleaner = nemo_curator.Modify(UnicodeReformatter(), text_field=args.input_text_field) + + for files in get_batched_files(args.input_data_dir, output_clean_dir, args.input_file_type, batch_size=args.batch_size): + dataset = DocumentDataset(read_data(files, file_type=args.input_file_type, backend="pandas", add_filename=True)) + cleaned_dataset = cleaner(dataset) + write_to_disk(cleaned_dataset.df, output_clean_dir, write_to_filename=True, output_type=args.output_file_type) + print(f"Finished reformatting {len(files)} files") + + print("Finished reformatting all files") + +def attach_args(parser=argparse.ArgumentParser( + """ +Text cleaning and language filtering + +Takes as input a directory consisting of .jsonl files with one +document per line and outputs to a separate directory the text +with fixed unicode. Also, performs language filtering using +the 'language' field within each JSON object. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--input-data-dir", + type=str, + default=None, + help="Input directory consisting of .jsonl files that are accessible " + "to all nodes. Use this for a distributed file system", + ) + parser.add_argument( + "--input-text-field", + type=str, + default='text', + help="The name of the field within each datapoint object of the input " + "file that contains the text.", + ) + parser.add_argument( + "--input-file-type", + type=str, + default="jsonl", + help="File type of the dataset to be read in. Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--output-clean-dir", + type=str, + default=None, + required=True, + help="The output directory to where the cleaned " + "jsonl files will be written", + ) + parser.add_argument( + "--output-file-type", + type=str, + default="jsonl", + help="File type the dataset will be written to. 
Supported file formats" + " include 'jsonl' (default), 'pickle', or 'parquet'.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=64, + help="Number of files to read into memory at a time.", + ) + + parser = add_distributed_args(parser) + + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/scripts/train_fasttext.py b/nemo_curator/scripts/train_fasttext.py new file mode 100644 index 00000000..33b0a06c --- /dev/null +++ b/nemo_curator/scripts/train_fasttext.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import random +import fasttext +import json +from sklearn.metrics import confusion_matrix +from tqdm import tqdm + +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +def main(args): + + # Set the random seed for shuffling + random.seed(args.seed) + + # Read in all samples into a list and shuffle the samples + documents = [] + for ifile in get_all_files_paths_under(args.fasttext_files_dir): + if os.path.splitext(ifile)[-1] == '.txt': + with open(ifile, 'r') as fp: + documents += fp.readlines() + random.shuffle(documents) + + # Get total number of samples + nlines = len(documents) + + num_train = round(nlines * args.validation_split) + + # Split into training and validation samples + train_samples = documents[:num_train] + valid_samples = documents[num_train:] + + # Write out the training and validation samples + with open(args.output_train_file, 'w') as fp: + for document in train_samples: + fp.write(document) + + with open(args.output_validation_file, 'w') as fp: + for document in valid_samples: + fp.write(document) + + # Train the model + model = fasttext.train_supervised( + input=args.output_train_file, + lr=args.learning_rate, + dim=args.word_vector_dim, + epoch=args.num_epochs, + wordNgrams=args.wordNgrams, + ) + + # Save the classifier as a FastText model + model.save_model(args.output_model) + + if args.output_predictions is not None: + fout = open(args.output_predictions, 'wb') + + # Read in the model and compute accuracy and other metrics on the data + hq_label = args.high_quality_label + prds, lbls = [], [] + with open(args.output_validation_file, 'r') as f: + for line in tqdm(f.readlines()): + # Split the text and the label + label_t, doc = line.split(' ', 1) + doc = doc.rstrip() + labels_p, scores = model.predict(doc, k=2) + # Write the predictions to file + if args.output_predictions is not None: + line = { + 'text': doc, + 'label': label_t, + f'{labels_p[0]}': scores[0], + f'{labels_p[1]}': scores[1], + } + myjson = json.dumps(line, ensure_ascii=False) + fout.write(myjson.encode('utf-8')) + fout.write('\n'.encode('utf-8')) + # Save predictions and labels + prds.append(1) if labels_p[0] == hq_label else prds.append(0) + lbls.append(1) if label_t == hq_label else lbls.append(0) + + # Print out the metrics computed on the validation data + tn, fp, fn, tp = confusion_matrix(prds, 
lbls).ravel() + print(f"TN={tn} FP={fp} FN={fn} TP={tp}") + + accuracy = (tp + tn) / (tp + tn + fp + fn) + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f1 = 2 * tp / (2 * tp + fp + fn) + + print(f"Acc={accuracy} Prec={precision} Rec={recall} f1={f1}") + + +def attach_args(parser=argparse.ArgumentParser( + """ +Train a skip-gram quality classifier with FastText + +Takes as input files with prepared samples for training +a skip-gram classifier with FastText, trains a skip-gram +classifier on the input samples and writes the trained classifier +out to disk as a FastText model. +""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, +)): + parser.add_argument( + "--fasttext-files-dir", + type=str, + default=None, + required=True, + help="The input directory containing the file(s) " + "containing the prepared FastText samples", + ) + parser.add_argument( + "--high-quality-label", + type=str, + default="__label__hq", + help="The label assigned to the high quality samples " + "when preparing the data", + ) + parser.add_argument( + "--seed", + type=int, + default=1992, + help="The seed used for randomly shuffling the documents", + ) + parser.add_argument( + "--output-train-file", + type=str, + default="./fasttext_samples.train", + help="The concatenated, shuffled samples used " + "to train the skip-gram classifier", + ) + parser.add_argument( + "--output-validation-file", + type=str, + default="./fasttext_samples.valid", + help="The concatenated, shuffled samples used to " + "for computing validation metrics", + ) + parser.add_argument( + "--validation-split", + type=float, + default=0.9, + help="The training validation split", + ) + parser.add_argument( + "--output-model", + type=str, + default=None, + required=True, + help="The output trained skip-gram classifier written " + "as a FastText model", + ) + parser.add_argument( + "--wordNgrams", + type=int, + default=2, + help="The size of the word n-gram used to train the classifier " + "(default is bigram)", + ) + parser.add_argument( + "--learning-rate", + type=float, + default=0.1, + help="The learning rate used to train the classifier", + ) + parser.add_argument( + "--num-epochs", + type=int, + default=5, + help="Number of epochs used to train the classifier", + ) + parser.add_argument( + "--word-vector-dim", + type=int, + default=100, + help="Size of word vectors to be computed by the model", + ) + parser.add_argument( + "--output-predictions", + type=str, + default=None, + help="The output predictions on the validation data. " + "If a file is not specified, the predictions are not " + "written to file", + ) + return parser + + +def console_script(): + main(attach_args().parse_args()) diff --git a/nemo_curator/tasks/__init__.py b/nemo_curator/tasks/__init__.py new file mode 100644 index 00000000..6091d2bc --- /dev/null +++ b/nemo_curator/tasks/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
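+ # Note (illustrative, not part of the original module): tasks can be imported directly,
+ # e.g. `from nemo_curator.tasks import Squad`, or resolved from a dotted path with
+ # `import_task`, for example:
+ #   task_class = import_task("nemo_curator.tasks.metrics.Squad")
+ #   ngrams = task_class().generate_ngrams()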
+ +from .downstream_task import DownstreamTask, import_task +from .metrics import Race, Squad, ArcEasy, ArcChallenge, OpenBookQA, BoolQ, Copa, RTE, MultiRC, WSC, CB, ANLI, Record, COQA, TriviaQA, Quac, WebQA, Drop, WiC, MMLU, BigBenchHard, BigBenchLight, Multilingual, PIQA, Winogrande, Lambada, NumDasc, StoryCloze + +__all__ = ["DownstreamTask", "import_task", "Race", "Squad", "ArcEasy", "ArcChallenge", "OpenBookQA", "BoolQ", "Copa", "RTE", "MultiRC", "WSC", "CB", "ANLI", "Record", "COQA", "TriviaQA", "Quac", "WebQA", "Drop", "WiC", "MMLU", "BigBenchHard", "BigBenchLight", "Multilingual", "PIQA", "Winogrande", "Lambada", "NumDasc", "StoryCloze"] \ No newline at end of file diff --git a/nemo_curator/tasks/downstream_task.py b/nemo_curator/tasks/downstream_task.py new file mode 100644 index 00000000..3e534398 --- /dev/null +++ b/nemo_curator/tasks/downstream_task.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +from abc import ABC, abstractmethod + +from nemo_curator.utils.text_utils import get_words + + +class DownstreamTask(ABC): + + def __init__(self): + super().__init__() + self._task_name = None + self._ngrams = {} + + @abstractmethod + def generate_ngrams(self): + pass + + @property + def ngrams(self): + return self._ngrams + + def _update_ngrams(self, text, min_ngram_size=8, max_ngram_size=13): + words, positions = get_words(text) + if len(words) < min_ngram_size: + return + + if len(words) < max_ngram_size: + seq = " ".join(words) + if seq not in self._ngrams: + self._ngrams[seq] = 0 + + for i in range(len(words) - max_ngram_size + 1): + seq = " ".join(words[i:i + max_ngram_size]) + if seq not in self._ngrams: + self._ngrams[seq] = 0 + + +def import_task(task_path): + module_path, task_name = task_path.rsplit(".", 1) + task_module = importlib.import_module(module_path) + task_class = getattr(task_module, task_name) + if not issubclass(task_class, DownstreamTask): + raise ValueError(f"Input iterator {task_class.__name__} " + "must be derived from DownstreamTask" + "defined in nemo_curator.tasks.downstream_task") + return task_class diff --git a/nemo_curator/tasks/metrics.py b/nemo_curator/tasks/metrics.py new file mode 100644 index 00000000..74fec482 --- /dev/null +++ b/nemo_curator/tasks/metrics.py @@ -0,0 +1,563 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
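+ # Each task class below wraps one evaluation set and implements generate_ngrams(),
+ # which returns a dict mapping n-grams drawn from the task's text fields to
+ # zero-initialized counts. Minimal usage sketch (illustrative; requires the
+ # `datasets` library and network access to fetch the benchmark):
+ #   task = Squad(min_ngram_size=8, max_ngram_size=13)
+ #   ngrams = task.generate_ngrams()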
+ +import json +from datasets import load_dataset +from nemo_curator.tasks.downstream_task import DownstreamTask +from nemo_curator.utils.file_utils import get_all_files_paths_under + + +class Race(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'race' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset(self._task_name, 'all', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Squad(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'squad' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('squad_v2', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class ArcEasy(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'arceasy' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('ai2_arc', 'ARC-Easy', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class ArcChallenge(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'arcchallenge' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('ai2_arc', 'ARC-Challenge', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class OpenBookQA(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'openbookqa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('openbookqa', 'main', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question_stem'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class BoolQ(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'boolq' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'boolq', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Copa(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'copa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'copa', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['premise'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class 
RTE(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'rte' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('glue', 'rte', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line["sentence1"] + '\n' + line["sentence2"] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class MultiRC(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'multirc' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'multirc', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class WSC(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'wsc' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'multirc', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class CB(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'cb' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'cb', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['premise'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class ANLI(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'anli' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('anli') + self._keys = ['test_r1', 'test_r2', 'test_r3'] + + def generate_ngrams(self): + for key in self._keys: + data = self._dataset[key] + for line in data: + try: + text = line['premise'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + text = line['hypothesis'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + except Exception as e: + print('Error:', e) + + return self.ngrams + + +class Record(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'record' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('super_glue', 'record', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['query'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class COQA(DownstreamTask): + + def __init__(self, file_path, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'coqa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + if file_path is None: + raise Exception("Must provide a path to the coqa.json file") + self._dataset = json.load(open(file_path))['data'] + + def generate_ngrams(self): + for line in self._dataset: + all_questions = line['questions'] + 
for question in all_questions: + self._update_ngrams( + question['input_text'], + self._min_ngram_size, + self._max_ngram_size, + ) + story = line['story'] + self._update_ngrams(story, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class TriviaQA(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'trivia_qa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('trivia_qa', 'unfiltered', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Quac(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'quac' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('quac', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + all_questions = line['questions'] + for question in all_questions: + self._update_ngrams( + question, + self._min_ngram_size, + self._max_ngram_size, + ) + + return self.ngrams + + +class WebQA(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'webqa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('web_questions', split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Drop(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'drop' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset('drop', split='validation') + + def generate_ngrams(self): + for line in self._dataset: + text = line['question'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class WiC(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'wic' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset( + path='super_glue', + name='wic', + split='validation', + ) + + def generate_ngrams(self): + for line in self._dataset: + text = line["sentence1"] + '\n' + line["sentence2"] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class MMLU(DownstreamTask): + + def __init__(self, path=None, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'mmlu' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._path = path + if self._path is None: + raise Exception("Must provide path that contain " + "MMLU task data in JSONL format") + + def generate_ngrams(self): + for ifile in get_all_files_paths_under(self._path): + for iline in open(ifile, 'rb'): + document = json.loads(iline) + text = document['text'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class BigBenchHard(DownstreamTask): + + def __init__(self, path=None, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'bigbench_hard' + self._min_ngram_size = min_ngram_size 
+ self._max_ngram_size = max_ngram_size + self._path = path + if self._path is None: + raise Exception("Must provide path that contain " + "BigBenchHard task data in JSONL format") + + def generate_ngrams(self): + for ifile in get_all_files_paths_under(self._path): + for iline in open(ifile, 'rb'): + document = json.loads(iline) + text = document['text'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class BigBenchLight(DownstreamTask): + + def __init__(self, path=None, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'bigbench_light' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._path = path + if self._path is None: + raise Exception("Must provide path that contain " + "BigBenchLight task data in JSONL format") + + def generate_ngrams(self): + for ifile in get_all_files_paths_under(self._path): + for iline in open(ifile, 'rb'): + document = json.loads(iline) + text = document['text'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Multilingual(DownstreamTask): + + def __init__(self, path=None, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'multilingual' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._path = path + if self._path is None: + raise Exception("Must provide path to " + "multilingual task data in JSONL format") + + def generate_ngrams(self): + for ifile in get_all_files_paths_under(self._path): + for iline in open(ifile, 'rb'): + document = json.loads(iline) + text = document['text'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class PIQA(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'piqa' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset(self._task_name, split='test') + + def generate_ngrams(self): + for line in self._dataset: + text = line['goal'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Winogrande(DownstreamTask): + + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'winogrande' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = load_dataset( + path='winogrande', + name='winogrande_xl', + split='validation', + ) + + def generate_ngrams(self): + for line in self._dataset: + text = line['sentence'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + + +class Lambada(DownstreamTask): + + def __init__(self, file_path, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'lambada' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._file_path = file_path + + def generate_ngrams(self): + with open(self._file_path, 'r') as f: + for line in f: + try: + myjson = json.loads(line) + text = myjson['text'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + except Exception as e: + print(f"Error {e}") + + return self.ngrams + + +class NumDasc(DownstreamTask): + + def __init__(self, n, file_path, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._n = n + self._task_name = '{n}dasc' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = 
max_ngram_size + self._file_path = file_path + + def generate_ngrams(self): + with open(self._file_path, 'r') as f: + for line in f: + try: + myjson = json.loads(line) + text = myjson['context'] + myjson['completion'] + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + except Exception as e: + print(f"Error {e}") + + return self.ngrams + + +class StoryCloze(DownstreamTask): + + def __init__(self, file_path, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'story_cloze' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._file_path = file_path + + def generate_ngrams(self): + with open(self._file_path, 'r') as f: + for line in f: + try: + myjson = json.loads(line) + text = " ".join([ + myjson["InputSentence1"], myjson["InputSentence2"], + myjson["InputSentence3"], myjson["InputSentence4"] + ]) + self._update_ngrams(text, self._min_ngram_size, self._max_ngram_size) + except Exception as e: + print(f"Error {e}") + + return self.ngrams \ No newline at end of file diff --git a/nemo_curator/utils/__init__.py b/nemo_curator/utils/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/nemo_curator/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_curator/utils/code_meta.csv b/nemo_curator/utils/code_meta.csv new file mode 100644 index 00000000..7dc2134e --- /dev/null +++ b/nemo_curator/utils/code_meta.csv @@ -0,0 +1,306 @@ +,extension,language,count,low_alphanum_count,long_lines_count,non_lexable_count,XML_detected,Data_detected,Name,Overall quality,Alphanum filter,Long line filter,Lexer filter,Other comments,Include,Alphanum_threshold,Long_line_threshold,,XML filter,Alpha filter,Near-dedup settings +2,ada,ada,1000,0,4,31,0,2,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +0,adb,ada,1000,0,1,85,75,4,Harm,LGTM,LGTM,False positive,Mostly xml; few false positives,,1,0.25,1000,,1,, +1,ads,ada,1000,1,2,20,0,12,Harm,LGTM,False positive,TBD; breaks space,,,1,0.25,1000,,1,0.25, +3,agda,agda,1000,0,3,49,0,1,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +4,als,alloy,1000,0,1,39,0,2,Harm,LGTM,LGTM,False positive,,,1,0.25,1000,,1,, +5,g4,antlr,1000,3,0,443,0,8,Harm,LGTM,False positive; lower to 0.2,LGTM,,,1,0.25,1000,,1,, +7,applescript,applescript,1000,0,38,113,0,9,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +6,scpt,applescript,1000,3,22,57,0,3,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +10,a51,assembly,28,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +8,asm,assembly,1000,2,115,0,0,318,Evgenii,LGTM,Some false positives,Many similar false postives with a long comment line containing only numbers,,,1,0.25,Custom! Can we remove comments for files with long lines? 
,,1,remove, +9,nasm,assembly,159,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +11,aug,augeas,255,0,0,0,19,6,Evgenii,"LGTM, some minor misclassifications: xml or data",,,,,1,0.25,1000,,1,0.25, +16,auk,awk,3,0,0,3,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +12,awk,awk,1000,5,2,255,2,27,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +13,gawk,awk,225,0,1,103,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +14,mawk,awk,22,0,0,13,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +15,nawk,awk,8,0,0,1,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +18,bat,batchfile,1000,1,37,13,0,2,Evgenii,LGTM,,Some are false positives,,,1,0.25,1000,,1,, +17,cmd,batchfile,1000,0,163,14,0,3,Evgenii,LGTM,,Some are false positives,,,1,0.25,1000,,1,, +19,bison,bison,176,0,1,0,0,0,Harm,Exclude (too many parse error file logs),,,,,0,0.25,1000,,1,, +20,bsv,bluespec,1000,3,7,0,0,6,Harm,LGTM,LGTM,"False positives, remove filter",,,1,0.25,remove,,1,, +21,c,c,1000,5,4,11,0,15,Harm,LGTM,False positive; but let's keep filter,LGTM,,,1,0.25,1000,,1,, +23,cats,c,3,0,0,2,0,1,Harm,Exclude,,,,,0,,,,1,, +22,h,c,1000,1,3,192,0,5,Evgenii,LGTM,,,"Non-lexable files have @property and similar in them, can be used to filter Objective-C headers",,1,0.25,1000,,1,, +24,w,c,5,0,0,2,0,1,Harm,Exclude,,,,,0,,,,1,, +40,cake,c-sharp,9,0,0,8,0,1,Qian,LGTM,LGTM,LGTM,Cake extensions cannot be well recoginzed. False positive,,1,0.25,1000,,1,, +38,cs,c-sharp,1000,0,2,31,0,1,Qian,LGTM,LGTM,LGTM,False positive,,1,0.25,1000,,1,, +39,cshtml,c-sharp,585,0,9,429,0,0,Qian,LGTM,LGTM,LGTM,Mostly html pages with some C# lex. False positive,,1,0.25,1000,,1,, +41,csx,c-sharp,8,0,0,6,0,6,Qian,"Only few examples are meaningful - most are command plus path, may consider drop this extension?",,,,,0,,,,1,, +36,c++,c++,15,0,0,0,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +27,cc,c++,1000,0,1,3,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +31,cp,c++,10,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +25,cpp,c++,1000,2,3,3,0,5,Zhihan Zhang,LGTM,,"2 cases in total, they are false positives",,,1,0.25,1000,,1,, +30,cxx,c++,389,0,2,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +37,h++,c++,1,0,0,0,0,1,Harm,LGTM,,,,,1,0.25,1000,,1,, +32,hh,c++,197,0,0,5,0,2,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +28,hpp,c++,1000,1,7,3,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +34,hxx,c++,140,0,1,0,0,0,Harm,LGTM,,False positive but let's keep as is,,,1,0.25,1000,,1,, +29,inl,c++,91,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +26,ipp,c++,20,0,2,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +33,tcc,c++,19,0,0,0,0,1,Ejiro,LGTM,,,,,1,0.25,1000,,1,, +35,tpp,c++,3,0,0,0,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +45,boot,clojure,121,0,1,23,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +46,cl2,clojure,7,0,0,0,0,0,Harm,Exclude (very short meaningless files),,,,,0,,,,1,, +42,clj,clojure,1000,5,10,9,0,11,Harm,LGTM,LGTM,TBD; breaks space,,,1,0.25,1000,,1,, +44,cljc,clojure,1000,0,21,11,0,4,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +43,cljs,clojure,1000,2,2,3,0,4,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +47,cljx,clojure,8,0,0,0,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +48,cmake,cmake,1000,0,29,41,0,0,Harm,LGTM,LGTM,False positives; Remove filter,,,1,0.25,remove,,1,, +53,_coffee,coffeescript,4,0,0,0,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +51,cjsx,coffeescript,185,0,1,8,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +49,coffee,coffeescript,1000,2,20,47,1,7,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +50,cson,coffeescript,1000,3,13,0,1,4,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, 
+52,iced,coffeescript,92,0,0,9,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +55,asd,common-lisp,1000,0,1,6,6,0,Harm,LGTM,LGTM,LGTM;,,,1,0.25,1000,,1,, +54,lisp,common-lisp,1000,2,21,32,0,7,Harm,LGTM,LGTM,LGTM; removes auto-generated,,,1,0.25,1000,,1,, +56,lsp,common-lisp,1000,0,0,28,0,3,Harm,LGTM,,,,,1,0.25,1000,,1,, +58,ny,common-lisp,52,0,5,6,0,0,Harm,LGTM,,LGTM,,,1,0.25,1000,,1,, +57,sexp,common-lisp,197,0,107,63,0,8,Harm,Exclude,,,,,0,,,,1,, +59,css,css,1000,0,154,273,0,0,Nour Fahmy,25% not lexable,LGTM,lots of examples are one singular line of code; recommend to remove filter as CSS code can be accordingly compressed,,,1,0.25,1000,,1,, +61,cu,cuda,1000,0,4,2,0,1,Evgenii,LGTM,,Some small amount of legit code is filterered,,,1,0.25,1000,,1,, +60,cuh,cuda,1000,1,3,0,0,3,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +62,dart,dart,1000,0,3,17,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +64,1,dockerfile,1,0,0,0,0,0,Evgenii,"LGTM, only one file",,,,,1,0.25,1000,,1,, +66,3,dockerfile,1,0,0,0,0,0,Evgenii,"LGTM, only one file",,,,,1,0.25,1000,,1,, +65,dockerfile,dockerfile,334,0,0,1,0,0,Marco Zocca,LGTM,LGTM,,,,1,0.25,1000,,1,, +67,mustache,dockerfile,1,0,0,0,0,0,Evgenii,"LGTM, only one file",,,,,1,0.25,1000,,1,, +63,,dockerfile,1000,0,0,29,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +68,ex,elixir,1000,0,7,379,0,1,Raymond,LGTM,LGTM,False positives mostly,,,1,0.25,1000,,1,, +69,exs,elixir,1000,0,2,57,0,1,Raymond,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +70,elm,elm,1000,2,10,82,0,16,Raymond,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +71,el,emacs-lisp,1000,1,31,109,0,3,Marco Zocca,LGTM,,,LGTM,,1,0.25,1000,,1,, +72,emacs,emacs-lisp,142,0,4,10,0,0,Jason Stillerman,"LG, one bash script got in",LGTM,LGTM. LL files were encoding stuff,LGTM,,1,0.25,1000,,1,, +73,erl,erlang,1000,6,3,35,0,9,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +77,escript,erlang,71,0,1,6,0,0,Evgenii,"LGTM, ~10% not erlang",,,,,1,0.25,1000,,1,, +74,hrl,erlang,1000,2,8,13,1,13,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +76,xrl,erlang,30,0,0,30,0,0,Evgenii,"This is not Erlang, rather some lex-like language definition, not sure if we want to include it",,,,,0,,,,,, +75,yrl,erlang,57,0,0,6,0,0,Evgenii,"This is not Erlang, rather some yacc-like language definition, not sure if we want to include it",,,,,1,0.25,1000,Ask Arjun ,1,, +78,fs,f-sharp,1000,3,13,39,0,8,Claire Schlesinger,LGTM,LGTM,LGTM,LGTM but most are false positives,"Seems quite a few files end up being setting up types or other parameters, might be unhelpful when writing functions, but useful for more tasks like type inference.",1,0.25,1000,,1,, +80,fsi,f-sharp,516,0,0,5,0,1,Claire Schlesinger,LGTM,LGTM,LGTM,LGTM,,1,0.25,1000,,1,, +79,fsx,f-sharp,1000,1,32,31,0,4,Claire Schlesinger,LGTM,LGTM,LGTM,LGTM but most are false positives,,1,0.25,1000,,1,, +81,f,fortran,1000,6,38,559,0,25,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +83,f03,fortran,145,0,0,1,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +86,f08,fortran,65,0,1,4,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +88,f77,fortran,5,0,0,0,0,0,Manan Dey,Maybe exclude? 
(most are short meaningless files),,,,,0,,,,1,, +82,f90,fortran,1000,9,1,14,0,16,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +85,f95,fortran,216,0,0,4,0,1,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +84,for,fortran,549,1,0,226,0,8,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +87,fpp,fortran,75,1,0,55,0,2,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +98,fp,glsl,221,2,37,84,1,87,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,0.25, +90,frag,glsl,1000,1,2,120,21,4,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +94,frg,glsl,35,0,0,28,0,25,Harm,Exclude (mostly auto generated) ,,,,,0,,,,1,, +93,fsh,glsl,692,1,2,165,0,1,Harm,OK,LGTM,set to 500,,,1,0.25,500,,1,, +101,fshader,glsl,5,0,0,0,0,0,Harm,LGTM,,,,,1,0.25,1000,,1,, +97,geo,glsl,289,0,2,215,0,33,Harm,exclude,,,,,0,,,,1,, +96,geom,glsl,149,0,0,12,1,7,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +89,glsl,glsl,1000,2,22,119,0,11,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +99,glslv,glsl,35,0,0,2,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +92,shader,glsl,1000,0,45,908,0,0,Evgenii,Requires filtering,,Many are autogenerated but long lines are only comments,,"I've looked at few, they are not only glsl, e.g., Unity shaders",1,0.25,1000,,1,, +91,vert,glsl,1000,0,1,44,0,2,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +102,vrx,glsl,2,0,2,2,2,0,Manan Dey,Maybe exclude (only 2 files and looks like xml),,,,,0,,,,1,, +95,vsh,glsl,326,0,1,39,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +100,vshader,glsl,8,0,0,0,0,0,Harm,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +103,go,go,1000,0,13,0,1,1,Evgenii,"LGTM, small amount of autogen",,False positives: long comments or string constants,,,1,0.25,1000,,1,, +104,groovy,groovy,1000,0,3,16,0,2,Zhihan Zhang,LGTM,LGTM,false positives (long strings),,,1,0.25,1000,,1,, +107,grt,groovy,2,0,0,0,0,2,Zhihan Zhang,should be excluded: data instead of code,,,,,0,,,,1,, +105,gtpl,groovy,21,0,0,13,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +106,gvy,groovy,2,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +108,hs,haskell,1000,1,3,39,6,3,Zhihan Zhang,LGTM,"false positive (only 1 case in total, the script used too many indents)",false positives (only 2 cases in total),,,1,0.25,1000,,1,, +109,hsc,haskell,105,0,0,8,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +111,htm,html,541,1,241,51,13,4,Zhihan Zhang,LGTM,LGTM,"many false positives, since HTML does not require line breaks and many may compress their HTML code, I suggest remove the length limit for HTML",,,1,0.25,1000,,1,, +110,html,html,1000,0,240,131,14,3,Zhihan Zhang,LGTM,LGTM,"many false positives, since HTML does not require line breaks and many may compress their HTML code, I suggest remove the length limit for HTML",,,1,0.25,1000,,1,, +113,xht,html,30,0,0,0,1,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +112,xhtml,html,149,0,60,1,75,0,Zhihan Zhang,LGTM,LGTM,"many false positives, since HTML does not require line breaks and many may compress their HTML code, I suggest remove the length limit for HTML",,,1,0.25,1000,,1,, +114,idr,idris,1000,1,2,195,0,1,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +115,lidr,idris,291,0,0,23,0,0,Evgenii,"LGTM, but other languages use separate literal- version?",,,,,1,0.25,1000,,1,, +116,thy,isabelle,1000,0,20,399,2,2,Evgenii,LGTM,,"Some false positives, some autogenerated",,,1,0.25,1000,,1,, +117,java,java,1000,0,3,10,0,0,Nour Fahmy,LGTM,LGTM,data quality improves when max_length <= 100,,,1,0.25,1000,,1,, +118,jsp,java-server-pages,1000,1,9,362,4,2,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +120,es6,javascript,9,0,0,1,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, 
+119,js,javascript,1000,3,113,151,0,13,Nour Fahmy,LGTM - 15% is non-lexable,"3 files remaining, all FP","LGTM - lots of one lineliners, but common as JS can be compressed, I would suggest to remove the filter altogether",,,1,0.25,1000,,1,, +121,jsm,javascript,1,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +122,pac,javascript,2,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +124,sjs,javascript,3,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +123,xsjslib,javascript,1,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +126,jl,julia,1000,0,12,15,0,5,Evgenii,LGTM,,Some false positives: long documentation comments,,,1,0.25,1000,,1,, +127,kt,kotlin,1000,1,0,186,0,1,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +128,kts,kotlin,669,0,1,16,0,15,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +130,hlean,lean,268,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +129,lean,lean,1000,0,0,6,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +131,lagda,literate-agda,1000,0,0,12,0,1,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +132,litcoffee,literate-coffeescript,1000,0,4,0,0,2,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +133,lhs,literate-haskell,1000,0,2,114,0,1,Marco Zocca,LGTM,LGTM,LGTM,some false positives,,1,0.25,1000,,1,, +134,lua,lua,1000,1,15,15,0,10,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +135,nse,lua,91,0,0,1,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +136,wlua,lua,8,0,0,0,0,1,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +141,cmake,makefile,1,0,0,0,0,0,Evgenii,"LGTM, but single file",,,,,1,0.25,1000,,1,, +139,mak,makefile,273,1,11,111,0,1,Evgenii,"Includes mako templates, autogenerated nmake from VS and make from configure",,,,,1,0.25,1000,,1,, +138,mk,makefile,1000,1,20,104,0,6,Evgenii,"LGTM, some autogenerated files",,,,,1,0.25,1000,,1,, +140,txt,makefile,1,0,0,0,0,0,Evgenii,"LGTM, but single file",,,,,1,0.25,1000,,1,, +137,,makefile,1000,2,9,80,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +142,mpl,maple,1000,1,6,0,544,10,Evgenii,"Includes fair amount of xml, should be filtered",,,,,1,0.25,1000,,1,0.25, +144,markdown,markdown,244,0,32,2,0,0,Urvashi,"some false positives, should be filtered",0 file remaining,23 files. All look good,,,1,0.25,remove,Add language filter?,1,, +143,md,markdown,1000,2,73,20,0,2,Urvashi,A lot of non-English code/language,2 files remaining. ,"37 files left, 2 false positives",,,1,0.25,remove,,1,, +145,mkd,markdown,5,0,0,0,0,0,Urvashi,looks good,0 files remaining,0 files remain,,,1,0.25,remove,,1,, +147,mkdn,markdown,1,0,0,0,0,0,Urvashi,"only 1 file, looks good",0 files remaining,0 files remain,,,1,0.25,remove,,1,, +146,ron,markdown,2,0,0,0,0,0,Urvashi,"only 2 eamples, can be excluded",0 files remaining,0 files remain,,,1,0.25,remove,,1,, +149,cdf,mathematica,610,1,8,207,2,188,Evgenii,"Includes some common data format, quartus chain description files and other non-mathematica files. Should be filtered.",,,,,0,,,,1,, +148,ma,mathematica,739,0,390,680,0,120,Evgenii,"Includes Maya 3D files, some functional programming language (miniagda?), some assembly. 
Requires filtering.",,,,,1,,,,1,0.25, +155,mathematica,mathematica,1,0,0,1,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +151,mt,mathematica,766,6,61,657,2,54,Evgenii,"Significant amount of non-mathematica files, requires filtering",,,,,1,0.25,1000,,1,, +150,nb,mathematica,1000,0,720,301,0,81,Evgenii,"very large amount of near-duplicates, consider tuning near-dedup?",,False positives,,,1,,remove,,1,0.25,"ngram_size: 3, num_perm: 256, threshold: 0.7" +154,nbp,mathematica,30,0,0,30,0,0,Evgenii,"has many autogenerated CPROVER files, should be filtered",,,,,0,,,,1,, +152,wl,mathematica,686,1,13,503,0,16,Evgenii,LGTM,,"Small amount of non-passing legitimate examples, maybe should be tuned ",,,1,0.25,1000,,1,0.25, +153,wlt,mathematica,113,1,1,103,0,3,Evgenii,"Has unity3d layouts, willet language (?) files",,,,,0,,,,1,, +156,matlab,matlab,1000,0,338,2,0,918,Evgenii,"Many data files (something like tsv), requires filtering",,,,,1,0.25,remove,,1,0.25, +161,eliom,ocaml,10,0,0,0,0,0,Urvashi,2 generated files,0 files remain,0 files remain,,,1,0.25,1000,,1,, +162,eliomi,ocaml,2,0,0,0,0,0,Urvashi,1 generated file,0 files remain,0 files remain,,,1,0.25,1000,,1,, +157,ml,ocaml,1000,1,6,22,0,26,Urvashi,All good files,Only 1 file left. True positive,6 files left. Most look good,,,1,0.25,1000,,1,, +163,ml4,ocaml,11,0,0,0,0,0,Urvashi,looks good,0 files remain,0 files remain,,,1,0.25,1000,,1,, +158,mli,ocaml,1000,0,3,0,0,1,Urvashi,All good files,0 files remain,3 files left. All good,,,1,0.25,1000,,1,, +159,mll,ocaml,163,2,2,0,0,3,Urvashi,1 or 2 false positives. Looks good,"2 files remain, all good",2 files remain. All good,,,1,0.25,1000,,1,, +160,mly,ocaml,149,0,0,2,0,0,Urvashi,looks good. 1 auto-generated file,0 files remain,0 files remain,,,1,0.25,1000,,1,, +165,dfm,pascal,1000,0,0,3,1,65,Evgenii,LGTM,,,,,1,0.25,1000,,1,,remove +166,dpr,pascal,1000,0,0,6,0,0,Evgenii,"Look like form configuration, not sure we want those",,,,,0,,,,,, +167,lpr,pascal,339,0,1,18,10,0,Evgenii,LGTM but many samples with barely any code,,,,,1,0.25,1000,,1,, +164,pas,pascal,1000,0,4,62,0,6,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +171,al,perl,788,1,1,35,1,225,Zhihan Zhang,about 30% of files are data instead of code,LGTM,"false positive, but only 1 case in total, so no significant impact",,,0,,,,1,, +175,perl,perl,62,0,1,8,0,2,Zhihan Zhang,LGTM,LGTM,"false positive, but only 1 case in total, so no significant impact",,,1,0.25,1000,,1,, +173,ph,perl,62,0,7,4,0,1,Zhihan Zhang,20 out of 61 (33%) looks like data or auto-generated code,LGTM,LGTM,,,0,,,,1,, +168,pl,perl,1000,2,6,119,0,32,Zhihan Zhang,LGTM,LGTM,"false positives, but only 4 cases in total, so no significant impact",,,1,0.25,1000,,1,, +172,plx,perl,28,0,6,0,13,0,Zhihan Zhang,should be excluded: many looks like auto-generated,,,,,0,,,,1,, +170,pm,perl,1000,0,8,32,0,10,Zhihan Zhang,LGTM,LGTM,"false positives, but only 6 cases in total, so no significant impact",,,1,0.25,1000,,1,, +174,psgi,perl,11,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +169,t,perl,1000,1,5,130,0,12,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +178,ctp,php,23,0,1,0,0,0,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +176,php,php,1000,1,15,0,0,6,Urvashi,looks good,"1 file in total, low quality file","9 files in total after applying filter, 1 false positive",,,1,0.25,1000,,1,, +177,phpt,php,92,0,0,0,0,0,Manan Dey,"Overall looks good, but many files contain the output as well such as warning messages, etc.",,,,,1,0.25,1000,,1,, +179,ps1,powershell,1000,0,6,15,0,2,suriya gunasekar,,,,,,1,0.25,1000,,1,, 
+181,psd1,powershell,999,1,9,1,0,1,suriya gunasekar,,,,,,1,0.25,1000,,1,, +180,psm1,powershell,1000,1,7,8,1,2,suriya gunasekar,,,,,,1,0.25,1000,,1,, +183,prolog,prolog,480,1,36,67,0,5,Evgenii,"LGTM, small amout of autogenerated files",,,,,1,0.25,1000,,1,, +182,yap,prolog,1000,2,18,36,0,17,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +184,proto,protocol-buffer,1000,0,17,0,0,25,Evgenii,"LGTM, small amout of autogenerated files",,Mostly false positives,,,1,0.25,1000,,1,, +186,bzl,python,50,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +188,gyp,python,15,0,0,0,0,0,Zhihan Zhang,"I think these are configuration files, so I suppose we should exclude them. not too familiar with .gyp things though",LGTM,LGTM,,,1,0.25,1000,,1,, +185,py,python,1000,0,11,1,0,2,Zhihan Zhang,LGTM,LGTM,6 out of 9 are false positives; these 9 examples still exist even though I set to 1200,,,1,0.25,1000,,1,, +189,pyde,python,2,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +187,pyw,python,6,0,0,0,0,0,Zhihan Zhang,2 out of 6 are very short,LGTM,LGTM,,,1,0.25,1000,,1,, +190,r,r,1000,4,20,18,1,23,Carolyn Anderson,LGTM,,Many false positives; increase threshold to 2000,,,1,0.25,remove,,1,, +191,rd,r,912,0,8,67,16,11,Carolyn Anderson,this is a text processing language like LaTex; not sure it should be grouped with r,,,,,1,0.25,1000,,1,, +192,rsx,r,38,1,2,0,0,1,Carolyn Anderson,3 non-r files,,,,,1,0.25,1000,,1,, +,rkt,racket,,,,,,,Carolyn Anderson,this is the main racket file extension and should be included,,,,,,,,,1,, +194,rktd,racket,281,1,132,0,0,60,Carolyn Anderson,exclude (mostly auto-generated),,,,,0,,,,1,, +195,rktl,racket,396,3,0,0,0,24,Carolyn Anderson,exclude,,,,,0,,,,1,, +193,scrbl,racket,1000,0,10,85,0,0,Carolyn Anderson,LGTM,,,,,1,0.25,1000,,1,, +197,rest,restructuredtext,41,0,2,0,0,0,Harm,Exclude (requests to REST servers),,,,,0,,,,1,, +196,rst,restructuredtext,1000,6,21,7,3,13,Harm,LGTM,,,,,1,0.25,remove,,1,, +198,rmd,rmarkdown,1000,0,72,0,0,1,Carolyn Anderson,LGTM,,Many false positives; increase threshold to 2000,,,1,0.25,remove,,1,, +205,builder,ruby,14,0,0,1,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +200,gemspec,ruby,600,0,24,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,"note: rspec files are often auto-generated, and may not need to be filled by the user",1,0.25,1000,,1,, +203,jbuilder,ruby,102,0,2,1,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +201,podspec,ruby,430,0,0,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,"note: podspec files are often auto-generated, and may not need to be filled by the user",1,0.25,1000,,1,, +207,rabl,ruby,12,0,0,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +202,rake,ruby,134,0,0,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +199,rb,ruby,1000,0,6,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +209,rbw,ruby,1,0,0,0,0,0,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +204,ru,ruby,59,0,12,24,0,1,Nour Fahmy,LGTM but lots of files are in Russian?,LGTM,LGTM,,,1,0.25,1000,,1,, +206,ruby,ruby,3,0,0,0,0,0,Nour Fahmy,"LGTM, but only 3 files",LGTM,LGTM,,,1,0.25,1000,,1,, +208,thor,ruby,2,0,0,0,0,0,Nour Fahmy,LGTM but only 2 files?,LGTM,LGTM,,,1,0.25,1000,,1,, +210,rs,rust,1000,0,3,1,0,1,Nour Fahmy (WIP),LGTM,LGTM,data tool lags :(,,,1,0.25,1000,,1,, +211,sas,sas,1000,200,43,230,0,210,Evgenii,LGTM,"A single false positive in multiple variations, can tune near dedup?",,,,1,remove,1000,,1,remove,"ngram_size: 5, num_perm: 256, threshold: 0.6" +213,sbt,scala,913,0,1,3,4,1,Jenny,Most instances are sbt-specific rather than just scala,LGTM,LGTM,LGTM,,1,0.25,1000,,1,, 
+212,scala,scala,1000,0,5,25,0,0,Jenny,LGTM,LGTM,"LGTM; the few false positives are from hashes, test data, comments","False positives mostly, a few auto generated files (that explicitly state they were ""Automatically generated"" in comments)",,1,0.25,1000,,1,, +214,scm,scheme,1000,5,9,86,3,20,Evgenii,LGTM,Some false positives,,,,1,0.25,1000,,1,0.25, +215,sld,scheme,1000,0,99,60,521,34,Evgenii,"Half of data is xml, should be filtered",,,,,1,0.25,1000,,1,0.25, +216,sps,scheme,446,2,9,87,64,25,Evgenii,"Many xmls, sql, and other non-scheme data",,,,,1,0.25,1000,,1,0.25, +218,bash,shell,312,1,4,0,0,1,Zhihan Zhang,LGTM,LGTM,"Three cases left after this filter, and they all look like human-written",,,1,0.25,1000,,1,, +220,bats,shell,127,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +223,command,shell,24,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +222,ksh,shell,26,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +217,sh,shell,1000,0,20,1,0,0,Zhihan Zhang,LGTM,LGTM,"On the inspection site, only 1 case has lines longer than 1000 and it comes from a long URL",,,1,0.25,1000,,1,, +219,tmux,shell,7,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +224,tool,shell,1,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +221,zsh,shell,232,0,1,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +225,st,smalltalk,1000,0,10,416,0,2,Arjun (WIP),"25% is not lexable, o/w",,,LGTM -- I think,,1,0.25,1000,,1,, +226,sol,solidity,1000,8,22,575,0,25,Evgenii,"LGTM, minor amount of data",,,,,1,0.25,1000,,1,, +228,rq,sparql,1000,2,6,126,0,4,Raymond,"a lot of URLs, but still valuable data",LGTM,LGTM,,,1,0.25,1000,,1,, +227,sparql,sparql,1000,1,43,138,0,6,Raymond,"a lot of URLs, but still valuable data",LGTM,LGTM,,,1,0.25,1000,,1,, +231,cql,sql,96,0,6,49,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +239,db2,sql,4,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +236,ddl,sql,75,0,2,22,0,4,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +237,pck,sql,9,0,3,4,0,1,Zhihan Zhang,half of the data are not code. Should be excluded.,,,,,0,,,,1,, +232,pkb,sql,26,0,0,1,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +235,pks,sql,16,0,0,1,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +241,plb,sql,1,0,0,0,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +234,pls,sql,17,0,0,1,0,1,Zhihan Zhang,half of the data are not code. Should be excluded.,,,,,0,,,,1,, +238,plsql,sql,7,0,0,1,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +233,prc,sql,23,0,2,9,0,2,Evgenii,"LGTM, a couple of very large and likely autogenerated files, maybe worth filtering",,,,,1,0.25,1000,,1,0.25, +229,sql,sql,1000,0,83,129,0,12,Zhihan Zhang,LGTM,LGTM,"many false positives, many long lines come from long strings",,,1,0.25,remove,,1,, +230,tab,sql,267,2,40,60,2,107,Zhihan Zhang,Most of them are table data instead of code. Should be excluded.,,,,,0,,,,1,, +240,udf,sql,7,0,0,7,0,0,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +125,stan,stan,1000,0,2,3,,,Nour Fahmy,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +242,stan,stan,1000,0,2,3,,,Zhihan Zhang,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +244,fun,standard-ml,958,0,0,463,0,4,Evgenii,"LGTM, small amount of non-standard-ml data",,,,,1,0.25,1000,,1,, +243,sig,standard-ml,1000,1,415,442,1,235,Evgenii,Large amount of non-standard-ml and non-code files (all kind of signatures). 
Should be filtered,,,,,0,,,,1,, +245,sml,standard-ml,1000,1,4,367,1,13,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +247,ado,stata,1000,15,24,20,231,62,Evgenii,"LGTM, significant amount of xml (20-30%)",,Broken linebreaks,,,1,0.25,remove,,1,remove, +246,do,stata,1000,3,56,29,0,8,Evgenii,Includes large amount of Xilinx Modelsim files,,Mostly false positives,,,1,0.25,remove,,1,remove, +251,doh,stata,5,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +249,ihlp,stata,406,0,0,1,0,1,Evgenii,"This is markdown analogue by stata, https://www.stata.com/manuals13/psmcl.pdf probably should be separated?",,,,,,,,,1,, +250,mata,stata,392,0,4,9,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +252,matah,stata,8,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +248,sthlp,stata,1000,0,0,17,0,0,Evgenii,"This is markdown analogue by stata, https://www.stata.com/manuals13/psmcl.pdf probably should be separated?",,,,,,,,,1,, +253,sv,systemverilog,1000,2,3,9,0,9,Evgenii,LGTM,there are some short files with large top comment that trigger filter,LGTM,,"A bit of autogenerated files, can be filtered",1,0.25,1000,,1,0.25, +254,svh,systemverilog,1000,15,0,47,0,18,Evgenii,LGTM,there are some short files with large top comment that trigger filter,LGTM,,,1,0.25,1000,,1,, +255,vh,systemverilog,615,2,10,5,0,30,Evgenii,LGTM,there are some short files with large top comment that trigger filter,small amount of false triggers,,,1,0.25,1000,,1,, +258,adp,tcl,322,0,0,318,2,2,Wenhao Yu,LGTM,0 file remaining,LGTM,,,1,0.25,1000,,1,, +256,tcl,tcl,1000,0,44,501,0,3,Wenhao Yu,"LGTM, but small portions of them are data, not code",0 file remaining,LGTM,,,1,0.25,1000,,1,, +257,tm,tcl,398,4,10,258,13,14,Wenhao Yu,"LGTM, but small portions of them are data, not code","4 file remaining, all FP",LGTM,,,1,0.25,1000,,1,, +259,csh,tcsh,1000,1,6,7,0,2,Wenhao Yu,"LGTM, some examples looks auto-generated, but not too many","1 file remaining, FP",LGTM,,,1,0.25,1000,,1,, +260,tcsh,tcsh,340,0,6,2,0,0,Wenhao Yu,LGTM,0 file remaining,"3 files remaining, 1 FP",,,1,0.25,1000,,1,, +268,aux,tex,438,0,26,0,0,0,Evgenii,"aux are autogenerated usually, suggest dropping",,,,,0,,,,1,, +270,bbx,tex,15,0,0,0,0,1,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +263,bib,tex,1000,0,214,0,0,0,Evgenii,"LGTM, but bib is not latex per se, not sure if we want it",,"False positives, bib can contain long lines (abstracts)",,,1,0.25,remove,,1,, +272,cbx,tex,6,0,0,0,1,0,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +267,dtx,tex,174,0,1,2,3,0,Evgenii,"LGTM, few misclassifications",,can be increased,,,1,0.25,remove,,1,, +266,ins,tex,44,1,2,0,0,10,Evgenii,"Quite a few misclassifications (data, assembly), can be filtered",,,,,1,0.25,remove,,1,, +271,lbx,tex,2,h,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +262,ltx,tex,151,2,2,0,0,3,Evgenii,"Many miscalssifications, some game engine configs",,,,,0,,,,1,, +269,mkii,tex,12,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +273,mkiv,tex,27,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +274,mkvi,tex,3,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,remove,,1,, +264,sty,tex,611,0,6,0,2,0,Evgenii,"LGTM, few misclassifications",,A few false positives,,,1,0.25,remove,,1,, +265,tex,tex,1000,4,88,2,1,32,Evgenii,LGTM,,False positives,,,1,0.25,remove,,1,, +261,toc,tex,479,0,25,0,0,4,Evgenii,"Many misclassification, but in general toc are usually autogenerated and do not contain ""interesting code"", probably should be dropped",,,,,0,,,,1,, +275,thrift,thrift,1000,0,0,32,0,2,denis kocetkov,LGTM,do not use,LGTM,not surte what is wrong with them,iterface definion language which is syntactically c++,1,0.25,1000,,1,, 
+276,ts,typescript,1000,0,9,32,20,3,Arjun,LGTM,LGTM,LGTM,LGTM: 30 files that don't lex are actually XML,,1,0.25,1000,,1,, +277,tsx,typescript,1000,0,15,775,3,22,Arjun,LGTM,LGTM,LGTM,LGTM: all unlexable files are just fine,,1,0.25,1000,,1,, +278,veo,verilog,1000,0,2,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +279,vhd,vhdl,1000,0,8,32,0,15,Evgenii,LGTM. A few autogenerated files can be filtered by top comment,,"Sometimes (but rarely) there are long constants, may want relax the filter",,,1,0.25,1000,,1,0.25, +280,vhdl,vhdl,1000,0,106,335,0,2,Evgenii,"There are many autogenerated files, possibly can be filtered by parsing top comment",,,,,1,0.25,1000,,1,, +284,vhf,vhdl,33,0,4,0,0,0,Evgenii,"Many autogenerated files, can probably be filtered",,,,,0,,,,1,, +282,vhi,vhdl,17,0,5,7,0,0,Evgenii,"LGTM, some autogenerated, small sample",,,,,1,0.25,1000,,1,, +281,vho,vhdl,309,0,0,59,0,0,Evgenii,LGTM.,,,,"Top comment are often long and repetative, maybe worth filtering out (applies to whole VHDL).",1,0.25,1000,,1,, +283,vht,vhdl,75,0,0,2,0,0,Evgenii,"Mostly autogenerated testbenches, can probably be filtered",,,,,0,,,,1,, +285,vhw,vhdl,5,0,0,0,0,0,Evgenii,LGTM,,,,,1,0.25,1000,,1,, +288,bas,visual-basic,1000,0,7,0,0,19,Harm,LGTM,,,,,1,0.25,1000,,1,0.25, +287,frm,visual-basic,1000,1,840,0,1,3,Evgenii,"Many near-duplicates, possibly autogenerated",LGTM,,,,1,,,,1,remove,"ngram_size: 3, num_perm: 256, threshold: 0.7" +292,frx,visual-basic,38,1,14,0,27,2,Raymond,"Remove, mostly repetitive XML",,,,,0,,,,1,, +286,vb,visual-basic,1000,0,6,0,0,0,Manan Dey,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +290,vba,visual-basic,77,0,0,0,0,0,Raymond,LGTM,LGTM,LGTM,,,1,0.25,1000,,1,, +291,vbhtml,visual-basic,57,0,1,0,0,0,Raymond,LGTM,LGTM,LGTM ,,,1,0.25,1000,,1,, +289,vbs,visual-basic,722,0,6,0,0,0,Jason Stillerman,LGTM,LGTM,"only two files with 1000+. One good, one bad",LGTM,,1,0.25,1000,,1,, +293,xsl,xslt,1000,0,20,2,861,0,denis kocetkov,LGTM,do not use,"seems long lines is a mix of code and data, but not sure if it is valid for the language",LGTM,xml based language to transform documents,1,0.25,1000,,0,, +294,xslt,xslt,1000,1,51,7,852,1,denis kocetkov,LGTM,do not use,"seems long lines is a mix of code and data, but not sure if it is valid for the language",LGTM,xml based language to transform documents,1,0.25,1000,,0,, +296,y,yacc,1000,8,0,0,1,115,Leandro,See comment,LGMT,LGTM ,LGTM,There are a files (10%) that contain single digit per line - no code,1,0.25,1000,,1,0.25, +297,yacc,yacc,14,0,0,0,0,0,Leandro,LGTM,LGTM,LGTM ,LGTM,,1,0.25,1000,,1,, +295,yy,yacc,1000,2,30,0,0,16,Leandro,"Remove, only JSON",,,,,0,,,,1,, +298,zig,zig,1000,0,5,123,0,2,Leandro,LGTM,LGTM,LGTM ,"Most were actually ok, one or two seemed autogenerated",,1,0.25,1000,,1,, +,,,,,,,,,,,,,,,,,,,,, +,,,,,,,,,300,,,,,,,,,,,, +,json,json,,,,,,,Raymond,,, ,"Manually added JSON and YAML",,1,0.25,,,1,0.5, +,yaml,yaml,,,,,,,Raymond,,, ,"Manually added JSON and YAML",,1,0.25,1000,,1,0.5, +,yml,yaml,,,,,,,Raymond,,, ,"Manually added JSON and YAML",,1,0.25,1000,,1,0.5, \ No newline at end of file diff --git a/nemo_curator/utils/config_utils.py b/nemo_curator/utils/config_utils.py new file mode 100644 index 00000000..12ff4a19 --- /dev/null +++ b/nemo_curator/utils/config_utils.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import nemo_curator
+from nemo_curator.filters import import_filter
+from nemo_curator.download.doc_builder import (
+    import_downloader,
+    import_iterator,
+    import_extractor,
+)
+from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
+from pydoc import locate
+
+def build_filter(filter_config):
+    # Import the filter
+    filter_class = import_filter(filter_config['name'])
+
+    # Check if constructor params have been provided
+    if ('params' not in filter_config) or (filter_config['params'] is None):
+        filter_config['params'] = {}
+
+    doc_filter = filter_class(**filter_config["params"])
+
+    if filter_config.get("filter_only", False):
+        filter_stage = nemo_curator.Filter(doc_filter.keep_document, filter_field=doc_filter.name)
+    else:
+        score_field = doc_filter._name if filter_config.get("log_score", False) else None
+        filter_stage = nemo_curator.ScoreFilter(doc_filter, filter_config.get("input_field"), score_field=score_field)
+
+    return filter_stage
+
+def build_filter_pipeline(filter_config_file):
+    # Get the filter config file
+    with open(filter_config_file, 'r') as config_file:
+        filter_params = yaml.load(config_file, Loader=yaml.FullLoader)
+
+    filters = []
+    text_field = filter_params.get("input_field")
+    for nc_filter_config in filter_params.get("filters"):
+        if "input_field" not in nc_filter_config or nc_filter_config["input_field"] is None:
+            nc_filter_config["input_field"] = text_field
+        new_filter = build_filter(nc_filter_config)
+        filters.append(new_filter)
+
+    return nemo_curator.Sequential(filters)
+
+def build_downloader(downloader_config_file, default_download_dir=None):
+    # Get the downloader config file
+    with open(downloader_config_file, 'r') as config_file:
+        downloader_params = yaml.load(config_file, Loader=yaml.FullLoader)
+
+    download_class = import_downloader(downloader_params['download_module'])
+    download_params = downloader_params.get('download_params') or {}
+    if default_download_dir and "download_dir" not in download_params:
+        download_params['download_dir'] = default_download_dir
+    expand_outdir_and_mkdir(download_params['download_dir'])
+    downloader = download_class(**download_params)
+
+    iterator_class = import_iterator(downloader_params['iterator_module'])
+    iterator = iterator_class(**downloader_params['iterator_params'])
+
+    extractor_class = import_extractor(downloader_params['extract_module'])
+    extractor = extractor_class(**downloader_params['extract_params'])
+
+    dataset_format = {}
+    for field, field_type in downloader_params["format"].items():
+        dataset_format[field] = locate(field_type)
+
+    return downloader, iterator, extractor, dataset_format
\ No newline at end of file
diff --git a/nemo_curator/utils/constants.py b/nemo_curator/utils/constants.py
new file mode 100644
index 00000000..c374aa1f
--- /dev/null
+++ b/nemo_curator/utils/constants.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import regex + + +end_marks = (".", "?", "!", "\"", "\'") +ellipsis_marks = set( + ["...", "[...]", "…", "(...)", "[…]", "-»", "read more..", "read more"]) +policy_substrings = [ + "terms of use", + "privacy policy", + "cookie policy", + "uses cookies", + "privacy overview", + "use of cookies", + "use cookies", + "privacy & cookies policy", + "privacy and cookies policy", + "This website uses cookies to improve your experience while you " + "navigate through the website. Out of these cookies, the cookies " + "that are categorized as necessary are stored on your browser as they " + "are essential for the working of basic functionalities of the website. " + "We also use third-party cookies that help us analyze and understand how " + "you use this website. These cookies will be stored in your browser only " + "with your consent. You also have the option to opt-out of these " + "cookies. But opting out of some of these cookies may have an effect " + "on your browsing experience.".lower(), + "Necessary cookies are absolutely essential for the website to " + "function properly. This category only includes cookies that " + "ensures basic functionalities and security features of the website. " + "These cookies do not store any personal information.".lower(), + "Any cookies that may not be particularly necessary for the website " + "to function and is used specifically to collect user personal data " + "via analytics, ads, other embedded contents are termed as non-necessary " + "cookies. It is mandatory to procure user consent prior to running these " + "cookies on your website.".lower(), + "This site uses cookies, including for analytics, personalization, and " + "advertising purposes. For more information or to change your " + "cookie settings, click here.".lower(), + "If you continue to browse this site without changing your cookie " + "settings, you agree to this use. AcceptRead More".lower(), +] +white_space_list = ["\t", "\n", "\r", "\b", " "] +common_english_words = set( + ['the', 'be', 'to', 'of', 'and', 'that', 'have', 'with']) +bullet_list = set([ + '•', + '‣', + '⁃', + '⁌', + '⁍', + '∙', + '○', + '●', + '◘', + '◦', + '⦾', + '⦿', +]) + +regex_alpha = regex.compile("[[:alpha:]]") +regex_digit = regex.compile("[[:digit:]]") +regex_alphanum = re.compile('[a-zA-Z0-9\n?!,.]') +regex_url = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|' + '(?:%[0-9a-fA-F][0-9a-fA-F]))+') +regex_paren = re.compile(r"{|}|⟨|⟩|\[|\]|\(|\)") +regex_hash = re.compile('#+') diff --git a/nemo_curator/utils/distributed_utils.py b/nemo_curator/utils/distributed_utils.py new file mode 100644 index 00000000..6a73aa2c --- /dev/null +++ b/nemo_curator/utils/distributed_utils.py @@ -0,0 +1,515 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +import warnings +import cudf +import dask.dataframe as dd +import dask_cudf +import pandas as pd +from dask.distributed import Client, LocalCluster, get_worker +from dask_cuda import LocalCUDACluster +from typing import Union +from pathlib import Path + + +class DotDict: + def __init__(self, d): + self.__dict__['_data'] = d + + def __getattr__(self, name): + if name in self._data: + return self._data[name] + else: + raise AttributeError(f"No attribute '{name}'") + + +class NoWorkerError(Exception): + pass + +def start_dask_gpu_local_cluster(args) -> Client: + """ + This function sets up a Dask cluster across all the + GPUs present on the machine. + + """ + + # Setting conservative defaults + # which should work across most systems + nvlink_only = getattr(args, "nvlink_only", False) + protocol = getattr(args, "protocol", "tcp") + rmm_pool_size = getattr(args, "rmm_pool_size", "1024M") + enable_spilling = getattr(args, "enable_spilling", True) + set_torch_to_use_rmm = getattr(args, "set_torch_to_use_rmm", True) + + extra_kwargs = ( + { + "enable_tcp_over_ucx": True, + "enable_nvlink": True, + "enable_infiniband": False, + "enable_rdmacm": False, + } + if nvlink_only and protocol == "ucx" + else {} + ) + + cluster = LocalCUDACluster( + rmm_pool_size=rmm_pool_size, + protocol=protocol, + rmm_async=True, + **extra_kwargs, + ) + client = Client(cluster) + + if enable_spilling: + _enable_spilling() + client.run(_enable_spilling) + + if set_torch_to_use_rmm: + _set_torch_to_use_rmm() + client.run(_set_torch_to_use_rmm) + print("Torch is using RMM memory pool", flush=True) + return client + + +def start_dask_cpu_local_cluster(args) -> Client: + """ + This function sets up a Dask cluster across all the + CPUs present on the machine. + + """ + # TODO: expand args as needed for CPU clusters + n_workers = getattr(args, "n_workers", None) + if n_workers is None: + # Setting threads_per_worker + # to 1 ensure 1 worker per CPU + # by default + threads_per_worker = getattr(args, "threads_per_worker", 1) + else: + threads_per_worker = None + cluster = LocalCluster(n_workers=n_workers, threads_per_worker=threads_per_worker) + client = Client(cluster) + return client + + +def get_client(args, cluster_type) -> Client: + """ + This function sets up a Dask cluster across n GPUs if not already set up. + It also ensures maximum memory efficiency for the GPU by: + 1. Ensuring the PyTorch memory pool is the same as the RAPIDS memory pool. + 2. Enabling spilling for cuDF. + + Args: + args: An argparse Namespace object. + cluster_type="cpu": The type of cluster to set up. Either "cpu" or "gpu". + Returns: + A Dask client object. 
+ + """ + if cluster_type not in ["cpu", "gpu"]: + raise ValueError(f"Unknown cluster type: {cluster_type}") + + # Helper class allows the user to pass in a dict instead of a parser + if type(args) == dict: + args = DotDict(args) + + if args.scheduler_address: + if args.scheduler_file: + raise ValueError( + "Only one of scheduler_address or scheduler_file can be provided" + ) + else: + return Client(address=args.scheduler_address, timeout="30s") + elif args.scheduler_file: + return Client(scheduler_file=args.scheduler_file, timeout="30s") + else: + if cluster_type == "gpu": + return start_dask_gpu_local_cluster(args) + else: + return start_dask_cpu_local_cluster(args) + + +def _set_torch_to_use_rmm(): + """ + This function sets up the PyTorch memory pool to be the same as the RAPIDS memory pool. + This helps avoid OOM errors when using both PyTorch and RAPIDS on the same GPU. + + See article: + https://medium.com/rapids-ai/pytorch-rapids-rmm-maximize-the-memory-efficiency-of-your-workflows-f475107ba4d4 + + """ + import torch + from rmm.allocators.torch import rmm_torch_allocator + + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + +def _enable_spilling(): + """ + Setting this environment variable enables automatic spilling (and "unspilling") + of buffers from device to host to enable out-of-memory computation, + i.e., computing on objects that occupy more memory than is available on the GPU. + + """ + cudf.set_option("spill", True) + + +def read_single_partition( + files, backend="cudf", filetype="jsonl", add_filename=False +) -> Union[cudf.DataFrame, pd.DataFrame]: + """ + This function reads a file with cuDF, sorts the columns of the DataFrame + and adds a "filename" column. + + Args: + files: The path to the jsonl files to read. + backend: The backend to use for reading the data. Either "cudf" or "pandas". + add_filename: Whether to add a "filename" column to the DataFrame. + Returns: + A cudf DataFrame or a pandas DataFrame. + + """ + if filetype == "jsonl": + read_kwargs = {"lines": True} + if backend == "cudf": + read_f = cudf.read_json + else: + read_kwargs["dtype"] = False + read_f = pd.read_json + elif filetype == "parquet": + read_kwargs = {} + if backend == "cudf": + read_f = cudf.read_parquet + else: + read_f = pd.read_parquet + else: + raise RuntimeError("Could not read data, please check file type") + + if add_filename: + read_files_one_at_a_time = True + else: + if backend == "cudf": + # cuDF supports reading multiple files at once + read_files_one_at_a_time = False + else: + # pandas does not support reading multiple files at once + read_files_one_at_a_time = True + + if read_files_one_at_a_time: + if backend == "cudf": + concat_f = cudf.concat + else: + concat_f = pd.concat + df_ls = [] + for file in files: + df = read_f(file, **read_kwargs) + if add_filename: + df["filename"] = os.path.basename(file) + df_ls.append(df) + df = concat_f(df_ls, ignore_index=True) + else: + df = read_f(files, **read_kwargs) + df = df[sorted(df.columns)] + return df + + +def read_pandas_pickle(file, add_filename=False) -> pd.DataFrame: + """ + This function reads a pickle file with pandas and adds a "filename" column. + + Args: + file: The path to the pickle file to read. + add_filename: Whether to add a "filename" column to the DataFrame. + Returns: + A pandas DataFrame. 
+ + """ + if add_filename: + warnings.warn("add_filename is not supported for pickle files") + return pd.read_pickle(file) + + +def read_data( + input_files, + file_type="pickle", + backend="cudf", + files_per_partition=1, + add_filename=False, +) -> Union[dd.DataFrame, dask_cudf.DataFrame]: + """ + This function can read multiple data formats and returns a Dask-cuDF DataFrame. + + Args: + input_files: The path of the input file(s). + file_type: The type of the input file(s). + backend: The backend to use for reading the data. + files_per_partition: The number of files to read per partition. + add_filename: Whether to add a "filename" column to the DataFrame. + + Returns: + A Dask-cuDF or a Dask-pandas DataFrame. + + """ + if file_type == "pickle": + df = read_pandas_pickle(input_files[0], add_filename=add_filename) + df = dd.from_pandas(df, npartitions=16) + if backend == "cudf": + df = df.to_backend("cudf") + + elif file_type in ["jsonl", "parquet"]: + print(f"Reading {len(input_files)} files", flush=True) + input_files = sorted(input_files) + if files_per_partition > 1: + input_files = [ + input_files[i : i + files_per_partition] + for i in range(0, len(input_files), files_per_partition) + ] + else: + input_files = [[file] for file in input_files] + return dd.from_map( + read_single_partition, + input_files, + filetype=file_type, + backend=backend, + add_filename=add_filename, + enforce_metadata=False, + ) + else: + raise RuntimeError("Could not read data, please check file type") + return df + + +def process_batch( + load_model_function, load_model_kwargs, run_inference_function, run_inference_kwargs +): + """ + This function loads a model on a Dask worker and then runs inference on a batch of data. + + Args: + load_model_function: A user-provided function for loading a classifier. + load_model_kwargs: A dictionary of arguments necessary for `load_model_function`. + run_inference_function: A user-provided function for running inference, which has a "model" argument. + run_inference_kwargs: A dictionary of arguments necessary for `run_inference_function`. + Returns: + Whatever `run_inference_function` returns, such as a list or tensor of predictions. + + """ + model = load_object_on_worker("model", load_model_function, load_model_kwargs) + return run_inference_function(**run_inference_kwargs, model=model) + + +def process_all_batches( + loader_valid, + load_model_function, + load_model_kwargs, + run_inference_function, + run_inference_kwargs, +): + """ + This function iterates over batches of data, loading a model and running inference per batch. + + Args: + loader_valid: An iterable data object, such as a PyTorch DataLoader. + load_model_function: A user-provided function for loading a classifier. + load_model_kwargs: A dictionary of arguments necessary for `load_model_function`. + run_inference_function: A user-provided function for running inference, which has "model" and "batch" arguments. + run_inference_kwargs: A dictionary of arguments necessary for `run_inference_function`. + Returns: + A tensor of predictions for all batches of the data. + + """ + import torch + + return torch.cat( + [ + process_batch( + load_model_function, + load_model_kwargs, + run_inference_function, + dict(run_inference_kwargs, batch=batch), + ) + for batch in loader_valid + ] + ) + + +def single_partition_write_with_filename(df, output_file_dir, output_type="jsonl"): + """ + This function processes a DataFrame and writes it to disk + + Args: + df: A DataFrame. + output_file_dir: The output file path. 
+        output_type="jsonl": The type of output file to write.
+    Returns:
+        If the DataFrame is non-empty, return a Series containing a single element, True.
+        If the DataFrame is empty, return a Series containing a single element, False.
+
+    """
+    assert "filename" in df.columns
+
+    # Track whether this partition actually contains rows; empty partitions
+    # are skipped and reported as False in the returned success series.
+    non_empty_partition = len(df) > 0
+    if not non_empty_partition:
+        warnings.warn("Empty partition found")
+
+    if isinstance(df, pd.DataFrame):
+        success_ser = pd.Series([non_empty_partition])
+    else:
+        success_ser = cudf.Series([non_empty_partition])
+
+    if non_empty_partition:
+        filename = df.filename.iloc[0]
+        num_filenames = len(df.filename.unique())
+        if num_filenames > 1:
+            raise ValueError(
+                f"More than one filename found in partition: {num_filenames}"
+            )
+        filename = Path(filename).stem
+        output_file_path = os.path.join(output_file_dir, filename)
+        if output_type == "jsonl":
+            output_file_path = output_file_path + ".jsonl"
+            if isinstance(df, pd.DataFrame):
+                df.to_json(output_file_path, orient="records", lines=True, force_ascii=False)
+            else:
+                # See open issue here: https://github.com/rapidsai/cudf/issues/15211
+                # df.to_json(
+                #     output_file_path, orient="records", lines=True, engine="cudf", force_ascii=False
+                # )
+                df.to_json(
+                    output_file_path, orient="records", lines=True, force_ascii=False
+                )
+        elif output_type == "parquet":
+            output_file_path = output_file_path + ".parquet"
+            df.to_parquet(output_file_path)
+        else:
+            raise ValueError(f"Unknown output type: {output_type}")
+
+    return success_ser
+
+
+def write_to_disk(df, output_file_dir, write_to_filename=False, output_type="jsonl"):
+    """
+    This function writes a Dask DataFrame to the specified file path.
+    If write_to_filename is True, then it expects the
+    DataFrame to have a "filename" column that specifies where to write the document.
+
+    Args:
+        df: A Dask DataFrame.
+        output_file_dir: The output file path.
+        write_to_filename: Whether to write the filename using the "filename" column.
+        output_type="jsonl": The type of output file to write.
+
+    """
+    if write_to_filename and "filename" not in df.columns:
+        raise ValueError(
+            "write_to_filename is True but no 'filename' column found in df"
+        )
+
+    if write_to_filename:
+        if isinstance(df, dd.DataFrame):
+            output_meta = pd.Series([True], dtype="bool")
+        else:
+            output_meta = cudf.Series([True])
+        os.makedirs(output_file_dir, exist_ok=True)
+        output = df.map_partitions(
+            single_partition_write_with_filename,
+            output_file_dir,
+            output_type=output_type,
+            meta=output_meta,
+            enforce_metadata=False,
+        )
+        output = output.compute()
+    else:
+        if output_type == "jsonl":
+            if isinstance(df, dask_cudf.DataFrame):
+                # See open issue here: https://github.com/rapidsai/cudf/issues/15211
+                # df.to_json(output_file_dir, orient="records", lines=True, engine="cudf", force_ascii=False)
+                df.to_json(output_file_dir, orient="records", lines=True, force_ascii=False)
+            else:
+                df.to_json(output_file_dir, orient="records", lines=True, force_ascii=False)
+        elif output_type == "parquet":
+            df.to_parquet(output_file_dir, write_index=False)
+        else:
+            raise ValueError(f"Unknown output type: {output_type}")
+
+    print(f"Writing to disk complete for {df.npartitions} partitions", flush=True)
+
+
+def load_object_on_worker(attr, load_object_function, load_object_kwargs):
+    """
+    This function checks if a Dask worker has a specified attribute for storing an object.
+    If it does, then fetch the object and return it.
+    If it does not, then load the object, set it as an attribute, and return it.
+ + Args: + attr: A string representing an attribute of a Dask worker. + load_object_function: A user-provided function for how to load the object. + load_object_kwargs: A dictionary of arguments necessary for `load_object_function`. + Returns: + The object of interest according to the `attr`. + + """ + # get_worker will fail during type inference of Dask functions + try: + worker = get_worker() + except ValueError as e: + raise NoWorkerError(str(e)) + + if hasattr(worker, attr): + obj = getattr(worker, attr) + else: + obj = load_object_function(**load_object_kwargs) + setattr(worker, attr, obj) + return obj + + +def offload_object_on_worker(attr): + """ + This function deletes an existing attribute from a Dask worker. + + Args: + attr: The name of the attribute to delete. + Returns: + True. + + """ + worker = get_worker() + if hasattr(worker, attr): + delattr(worker, attr) + return True + + +def get_num_workers(client): + """ + Returns the number of workers in the cluster + """ + if client is None: + return None + worker_list = list(client.scheduler_info()["workers"].keys()) + return len(worker_list) + + +def get_current_client(): + """ + Returns the context-local or latest initailised client. + If no Client instances exist, returns None + """ + try: + return Client.current() + except ValueError: + return None diff --git a/nemo_curator/utils/download_utils.py b/nemo_curator/utils/download_utils.py new file mode 100644 index 00000000..279beede --- /dev/null +++ b/nemo_curator/utils/download_utils.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
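# A minimal usage sketch for the distributed_utils helpers above (illustrative
# only, not part of this patch; the input/output paths are placeholders, and
# the module imports cudf at import time, so the RAPIDS stack must be
# installed even for the CPU path): start a local CPU cluster, read JSONL
# shards with a "filename" column, and write one output file per input shard.
from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
from nemo_curator.utils.file_utils import get_all_files_paths_under

if __name__ == "__main__":
    # Both scheduler fields left as None -> get_client() falls back to a local cluster.
    client = get_client({"scheduler_address": None, "scheduler_file": None}, cluster_type="cpu")
    files = [f for f in get_all_files_paths_under("./input") if f.endswith(".jsonl")]
    df = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
    write_to_disk(df, "./output", write_to_filename=True, output_type="jsonl")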
+ +import os +import requests +import json +import zlib +from urllib.parse import urljoin +from datetime import datetime, timedelta +from collections import OrderedDict +from typing import List, Optional +from bs4 import BeautifulSoup +import subprocess + + +def get_main_warc_paths( + snapshot_index, + start_snapshot, + end_snapshot, + prefix="https://data.commoncrawl.org", +): + beg_year, beg_week = list(map(int, start_snapshot.split('-'))) + end_year, end_week = list(map(int, end_snapshot.split('-'))) + start_date = datetime.fromisocalendar(beg_year, beg_week, 1) + end_date = datetime.fromisocalendar(end_year, end_week, 1) + + if beg_year < 2013 or end_year < 2013: + print("Warning: Only snapshots after 2013 are supported by this script") + + total_prefix = urljoin(prefix, 'crawl-data/CC-MAIN') + + warc_paths = [] + for snapshot in snapshot_index: + date = list(map(int, snapshot['id'].split('-')[2:])) + + if len(date) == 2: + year, week = date + else: + continue + + if year >= 2013: + curr_date = datetime.fromisocalendar(year, week, 1) + if curr_date >= start_date and curr_date <= end_date: + warc_path = f'{total_prefix}-{year}-{week:02d}/warc.paths.gz' + warc_paths.append(warc_path) + + return warc_paths + +def get_news_warc_paths( + start_date, + end_date, + prefix="https://data.commoncrawl.org", +): + beg = datetime.strptime(start_date, "%Y-%m") + end = datetime.strptime(end_date, "%Y-%m") + + # Get current year and month + today = datetime.now() + + if beg.year < 2016 or end.year > today.year: + print("Warning: WARC paths exist only from 2016-8 to " + f"{today.year}-{today.month}") + total_prefix = urljoin(prefix, 'crawl-data/CC-NEWS') + + # Generate all valid YYYY-MM strings in range + dates = OrderedDict() + for day in range((end - beg).days + 1): + new_date = beg + timedelta(day) + dates[(new_date.year, new_date.month)] = None + + dates = list(dates.keys()) + + warc_paths = [] + for year, month in dates: + warc_path = f'{total_prefix}/{year}/{month:02d}/warc.paths.gz' + warc_paths.append(warc_path) + + return warc_paths + +def get_common_crawl_snapshot_index(index_prefix): + index_url = urljoin(index_prefix, "collinfo.json") + index_response = requests.get(index_url) + + return json.loads(index_response.content) + +def get_common_crawl_urls(starting_snapshot: str, ending_snapshot: str, data_domain_prefix="https://data.commoncrawl.org", index_prefix="https://index.commoncrawl.org", news=False) -> List[str]: + """ + Retrieves the URLs for all the compressed WARC files between given Common Crawl snapshots + + Args: + starting_snapshot: The first common crawl snapshot to include. Snapshots must be + specified by YYYY-WeekNumber (e.g., '2020-50' or '2021-04'). For the CC-NEWS dataset, + (specified with news=True flag) this changes to Year-Month (YYYY-MM). + ending_snapshot: The last common crawl snapshot to include. Must be chronologically + after the starting snapshot. + data_domain_prefix: The prefix that will be prepended to each WARC file to create the URL. + index_prefix: The prefix of the URL to the Common Crawl index. + news: If True, gets WARC URLs for the CC-NEWS dataset instead of the CC-MAIN datasets. + Also assumes that the format for the start and end snapshots is 'YYYY-MM' (Year-Month). 
+ """ + if news: + warc_paths = get_news_warc_paths(starting_snapshot, ending_snapshot, prefix=data_domain_prefix) + else: + index = get_common_crawl_snapshot_index(index_prefix) + warc_paths = get_main_warc_paths(index, starting_snapshot, ending_snapshot, prefix=data_domain_prefix) + + common_crawl_urls = [] + for path in warc_paths: + try: + response = requests.get(path.rstrip(), stream=True) + data = zlib.decompress(response.content, zlib.MAX_WBITS | 32) + for warc in data.decode('utf-8').split('\n'): + if warc != '': + warc_url = urljoin(data_domain_prefix, warc) + common_crawl_urls.append(warc_url) + except Exception as e: + print(f"Could not get URLs for snapshot {path}") + print(response.content) + print(e) + + return common_crawl_urls + +def get_wikipedia_urls(language="en", wikidumps_index_prefix="https://dumps.wikimedia.org", dump_date: Optional[str]=None) -> List[str]: + """ + Retrieves all urls pointing to the latest Wikipedia dumps + + Args: + language: Desired language of the Wikipedia dump. + wikidumps_index_prefix: The base url for all wikipedia dumps + dump_date: A string formatted as "YYYYMMDD" for the wikipedia dump to use. + If None, latest dump is used. + """ + wiki_index_url = urljoin(wikidumps_index_prefix, f"{language}wiki") + if not dump_date: + # First get the index + raw_wiki_index = requests.get(wiki_index_url) + wiki_index = raw_wiki_index.content.decode("utf-8") + wiki_index_parsed = BeautifulSoup(wiki_index, "lxml") + + # Get all dumps available in the index + dumps = wiki_index_parsed.find_all("a") + dump_date = dumps[-2].text + else: + # A trailing / is needed for the url + dump_date = dump_date + "/" + + # Get the json dump data + wiki_latest_dump = urljoin(wiki_index_url + '/', dump_date) + wiki_latest_dump_status = urljoin(wiki_latest_dump, "dumpstatus.json") + raw_dump_data = requests.get(wiki_latest_dump_status) + try: + dump_data = json.loads(raw_dump_data.content) + except json.decoder.JSONDecodeError: + raise ValueError(f"No wikipedia dump found for {dump_date[:-1]}") + + # Get all multistream files within the dump data + wikipedia_urls = [] + for ifile in dump_data['jobs']['articlesmultistreamdump']['files']: + if 'xml' in ifile: + url = urljoin(wiki_latest_dump, ifile) + wikipedia_urls.append(url) + + return wikipedia_urls + +def get_arxiv_urls(): + command = "s5cmd --request-payer=requester ls s3://arxiv/src/ | grep '.tar'" + result = subprocess.run(command, capture_output=True, text=True, shell=True) + + if result.returncode != 0: + raise RuntimeError(f"Unable to get arxiv urls: {result.stderr}") + + urls = result.stdout.split()[3::4] + urls.sort() + + return urls \ No newline at end of file diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py new file mode 100644 index 00000000..ecc15780 --- /dev/null +++ b/nemo_curator/utils/file_utils.py @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
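# A minimal usage sketch for the download_utils helpers above (illustrative
# only, not part of this patch; both calls require network access, and the
# snapshot/dump identifiers below are placeholders following the YYYY-WW and
# YYYYMMDD formats described in the docstrings).
from nemo_curator.utils.download_utils import get_common_crawl_urls, get_wikipedia_urls

if __name__ == "__main__":
    # WARC URLs for every CC-MAIN snapshot between two ISO-week snapshots.
    warc_urls = get_common_crawl_urls("2021-04", "2021-10")
    # URLs of the multistream article files for one English Wikipedia dump date.
    wiki_urls = get_wikipedia_urls(language="en", dump_date="20240401")
    print(f"{len(warc_urls)} WARC path files, {len(wiki_urls)} Wikipedia files")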
+ +import os +import pathlib +from functools import reduce +import pandas as pd +import dask.dataframe as dd +from dask import delayed +import dask.bag as db +import numpy as np +import json +from functools import partial + +from nemo_curator.utils.distributed_utils import single_partition_write_with_filename + + +def mkdir(d): + pathlib.Path(d).mkdir(parents=True, exist_ok=True) + + +def expand_outdir_and_mkdir(outdir): + outdir = os.path.abspath(os.path.expanduser(outdir)) + mkdir(outdir) + return outdir + + +def get_all_files_paths_under(root, recurse_subdirectories=True, followlinks=False): + """ + This function returns a list of all the files under a specified directory. + Args: + root: The path to the directory to read. + recurse_subdirecties: Whether to recurse into subdirectories. + Please note that this can be slow for large + number of files. + followlinks: Whether to follow symbolic links. + """ + if recurse_subdirectories: + file_ls = [ + os.path.join(r, f) + for r, subdirs, files in os.walk(root, followlinks=followlinks) + for f in files + ] + else: + file_ls = [entry.path for entry in os.scandir(root)] + + file_ls.sort() + return file_ls + +# Using this for restarting jobs +# can lead to problems when there is an error while +# writing a file we can use the offset counter approach +# in jaccard shuffle as a more robust way to restart jobs +def get_remaining_files(input_file_path, output_file_path, input_file_type): + """ + This function returns a list of the files that still remain to be read. + + Args: + input_file_path: The path of the input files. + output_file_path: The path of the output files. + input_file_type: The type of the input files. + Returns: + A list of files that still remain to be read. + + """ + if input_file_type == "pickle": + return [input_file_path] + completed_files = [ + os.path.basename(entry.path) for entry in os.scandir(output_file_path) + ] + completed_files = set(completed_files) + input_files = [ + entry.path + for entry in os.scandir(input_file_path) + if os.path.basename(entry.path) not in completed_files + ] + input_files.sort() + return input_files + + +def get_batched_files(input_file_path, output_file_path, input_file_type, batch_size=64): + """ + This function returns a batch of files that still remain to be processed. + + Args: + input_file_path: The path of the input files. + output_file_path: The path of the output files. + input_file_type: The type of the input files. + batch_size: The number of files to be processed at once + Returns: + A batch of files that are not in the output directory. 
+ """ + remaining_files = get_remaining_files(input_file_path, output_file_path, input_file_type) + for i in range(0, len(remaining_files), batch_size): + yield remaining_files[i:i + batch_size] + +def write_dataframe_by_meta(df: pd.DataFrame, output_dir, metadata_field, remove_metadata, output_type): + counts = df[metadata_field].value_counts().to_dict() + + for meta_value in counts: + meta_output_dir = expand_outdir_and_mkdir(os.path.join(output_dir, meta_value)) + meta_slice = df[df[metadata_field] == meta_value] + if remove_metadata: + meta_slice = meta_slice.drop(columns=[metadata_field]) + single_partition_write_with_filename(meta_slice, meta_output_dir, output_type=output_type) + + return counts + +def merge_counts(first: dict, second: dict): + for ngram, count in second.items(): + first[ngram] = first.get(ngram, 0) + count + + return first + +def separate_by_metadata(df: dd.DataFrame, output_dir, metadata_field, remove_metadata=False, output_type="jsonl") -> dict: + """ + Saves the dataframe to subfolders named after a metadata + + Args: + df: The dataframe to write. Must have a filename column for the shard. + output_dir: The base directory for which all metadata based subdirs will be created under + metadata_field: The metadata field to split on + remove_metadata: Whether to remove the metadata from the dataframe when saving it + + Returns: + A delayed dictionary mapping each metadata to the count of entries with that metadata value. + """ + delayed_data = df.to_delayed() + delayed_counts = [delayed(write_dataframe_by_meta)(partition, output_dir, metadata_field, remove_metadata, output_type) for partition in delayed_data] + merged_counts = delayed(reduce)(merge_counts, delayed_counts) + + return merged_counts + +def parse_str_of_num_bytes(s, return_str=False): + try: + power = "kmg".find(s[-1].lower()) + 1 + size = float(s[:-1]) * 1024**power + except ValueError: + raise ValueError("Invalid size: {}".format(s)) + if return_str: + return s + else: + return int(size) + +def _save_jsonl(documents, output_path, start_index=0, max_index=10000, prefix=None): + """ Worker function to write out the data to jsonl files """ + + def _output_json(document): + myjson = json.dumps(document, ensure_ascii=False) + return myjson.encode('utf-8') + + def _name(start_index, npad, prefix, i): + tag = str(start_index + i).rjust(npad, '0') + return f"{prefix}{tag}" + + # Create the naming function + npad = int(np.log10(max_index) + 1) + name = partial(_name, start_index, npad, prefix) + + output_glob_string = os.path.join(output_path, "*.jsonl") + + documents.map(_output_json).to_textfiles( + output_glob_string, + name_function=name, + ) + +def reshard_jsonl(input_dir, output_dir, output_file_size="100M", start_index=0, file_prefix=""): + """ + Reshards a directory of jsonl files to have a new (approximate) file size for each shard + + Args: + input_dir: The input directory containing jsonl files + output_dir: The output directory where the resharded jsonl files will be written + output_file_size: Approximate size of output files. 
Must specify with a string and + with the unit K, M or G for kilo, mega or gigabytes + start_index: Starting index for naming the output files + file_prefix: Prefix to use to prepend to output file number + """ + + # Output file size in bytes + blocksize = parse_str_of_num_bytes(output_file_size) + + input_files = list(get_all_files_paths_under(input_dir)) + + # Read in the dask bag + b = db.read_text(input_files, blocksize=blocksize).map(json.loads) + + # Prepare the output + output_dir = expand_outdir_and_mkdir(output_dir) + + # Save to balanced files + _save_jsonl(b, output_dir, start_index=start_index, prefix=file_prefix) \ No newline at end of file diff --git a/nemo_curator/utils/fuzzy_dedup_utils/__init__.py b/nemo_curator/utils/fuzzy_dedup_utils/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/nemo_curator/utils/fuzzy_dedup_utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py b/nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py new file mode 100644 index 00000000..a566139e --- /dev/null +++ b/nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
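# A minimal usage sketch for the file_utils helpers above (illustrative only,
# not part of this patch; the directories are placeholders).
# parse_str_of_num_bytes() reads the K/M/G suffix as binary units, and
# reshard_jsonl() uses it to pick an approximate output shard size.
from nemo_curator.utils.file_utils import parse_str_of_num_bytes, reshard_jsonl

if __name__ == "__main__":
    assert parse_str_of_num_bytes("100M") == 100 * 1024 * 1024
    # Rebalance a directory of JSONL files into roughly 250 MiB shards whose
    # names start with "shard_" followed by a zero-padded index.
    reshard_jsonl("./input", "./resharded", output_file_size="250M", file_prefix="shard_")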
+ +def convert_str_id_to_int(df, id_column="id"): + """ + Converts the legacy id format "dataset_name-0000034" + type of ID into 2 int based ID's + """ + dx = df[id_column].str.rsplit("-", n=1, expand=True) + df["doc_id"] = dx[1].astype("int64").values + df["dataset_id"] = dx[0].hash_values() + df.drop(columns=[id_column], inplace=True) + return df + + +def int_ids_to_str(df, id_column="id"): + """ + Converts int id's generated via `convert_str_id_to_int` + back to a string ID + """ + df[id_column] = df["dataset_id"].astype(str) + "-" + df["doc_id"].astype(str) + df.drop(columns=["dataset_id", "doc_id"], inplace=True) + + if "anchor_0_dataset_id" in df.columns: + df[f"anchor_0_{id_column}"] = ( + df["anchor_0_dataset_id"].astype(str) + + "-" + + df["anchor_0_doc_id"].astype(str) + ) + df.drop(columns=["anchor_0_dataset_id", "anchor_0_doc_id"], inplace=True) + + if "anchor_1_dataset_id" in df.columns: + df[f"anchor_1_{id_column}"] = ( + df["anchor_1_dataset_id"].astype(str) + + "-" + + df["anchor_1_doc_id"].astype(str) + ) + df.drop(columns=["anchor_1_dataset_id", "anchor_1_doc_id"], inplace=True) + return df diff --git a/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py new file mode 100644 index 00000000..842613a0 --- /dev/null +++ b/nemo_curator/utils/fuzzy_dedup_utils/io_utils.py @@ -0,0 +1,180 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from glob import glob + +import cudf +import dask_cudf +import numpy as np +from dask import dataframe as dd + +from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int + + +# TODO: +# Combine this with +# nemo_curator.distributed_utils.read_cudf_jsonl +def read_json_func(files, engine="cudf", include_path_column=False, columns=None): + """ + Reads multiple Json Lines files into a cuDF + dataframe with an additional `path` column denoting the path + of the input file. 
+ """ + if not include_path_column: + if columns: + return cudf.read_json(files, engine="cudf", lines=True)[columns] + else: + return cudf.read_json(files, engine="cudf", lines=True) + + dfs = [] + for file in files: + if columns: + df = cudf.read_json(file, engine=engine, lines=True)[columns] + else: + df = cudf.read_json(file, engine=engine, lines=True) + df["path"] = file + dfs.append(df) + return cudf.concat(dfs, ignore_index=True) + + +def get_text_ddf_from_json_path_with_blocksize( + input_data_paths, num_files, blocksize, id_column, text_column +): + data_paths = [ + entry.path for data_path in input_data_paths for entry in os.scandir(data_path) + ] + data_paths = [f for f in data_paths if f.endswith(".jsonl")] + data_paths.sort() + if num_files != -1: + data_paths = data_paths[:num_files] + meta_df = cudf.DataFrame( + { + text_column: ["x"], + id_column: ["x"], + } + ) + print( + f"Number of files being read for jaccard calculation = {len(data_paths)}", + flush=True, + ) + filepaths_ls = chunk_files(data_paths, blocksize) + text_ddf = dd.from_map( + read_json_func, filepaths_ls, columns=list(meta_df.columns), meta=meta_df + ) + text_ddf = text_ddf.map_partitions( + convert_str_id_to_int, + id_column=id_column, + meta=cudf.DataFrame({text_column: ["a"], "doc_id": [0], "dataset_id": np.uint32(1)}), + ) + return text_ddf + + +def get_bucket_ddf_from_parquet_path(input_bucket_path, num_workers): + # Read parquet-formatted parquet files + ddf_bk = dask_cudf.read_parquet( + input_bucket_path, + blocksize="512MiB", + aggregate_files=True, + ) + # Repartition to ensure we at least have num_workers partitions + npartitions = max(ddf_bk.npartitions, num_workers) + ddf_bk = ddf_bk.repartition(npartitions=npartitions) + print(f"Number of ddf_bk partitions = {ddf_bk.npartitions}", flush=True) + return ddf_bk + + +def aggregated_anchor_docs_with_bk_read(path, blocksize): + from dask.utils import natural_sort_key + from pyarrow.dataset import dataset + + ds = dataset( + sorted(glob(f"{path}/*.parquet"), key=natural_sort_key), + format="parquet", + ) + chunks = chunk_files(ds.get_fragments(), blocksize) + + # Record mapping between file indices and partition indices. + # We need to do this, because our anchor_docs_with_bk data + # should be shuffled on disk. 
+ assert len(chunks) + part_id = np.repeat( + np.arange(len(chunks), dtype="int32"), + np.fromiter(map(len, chunks), dtype="int32"), + ) + file_id = np.arange(len(part_id), dtype="int32") + mapping_df = cudf.DataFrame({"file_id": file_id, "part_id": part_id}) + + meta = cudf.DataFrame.from_arrow(ds.schema.empty_table()) + return dd.from_map(cudf.read_parquet, chunks, meta=meta), mapping_df + + +def get_restart_offsets(output_path): + bucket_offset, text_offset = 0, 0 + fn = f"{output_path}/_restart_offset.txt" + if os.path.exists(fn): + with open(fn, "r") as f: + offsets = f.readline().strip("\n").split(",") + bucket_offset = int(offsets[0]) + text_offset = int(offsets[1]) + return bucket_offset, text_offset + + +def update_restart_offsets(output_path, bucket_offset, text_offset): + with open(f"{output_path}/_restart_offset.txt", "w") as f: + f.write(f"{bucket_offset},{text_offset}\n") + + +def chunk_files(file_list, max_size_mb): + """ + Chunk files into lists of files that are less than max_size_mb + """ + + max_size_bytes = max_size_mb * 1024 * 1024 + chunks = [] + current_chunk = [] + current_size = 0 + + for frag_or_path in file_list: + if isinstance(frag_or_path, str): + file_path = frag_or_path + file_size = get_file_size(file_path) + else: + file_path = frag_or_path.path + file_size = get_frag_size(frag_or_path) + + if current_size + file_size <= max_size_bytes: + current_chunk.append(file_path) + current_size += file_size + else: + # Handle case when the first + # file is larger than max_size_mb + if current_chunk: + chunks.append(current_chunk) + current_chunk = [file_path] + current_size = file_size + + if current_chunk: + chunks.append(current_chunk) + + return chunks + + +def get_frag_size(frag): + # Pyarrow dataset fragment + return sum(rg.total_byte_size for rg in frag.row_groups) + + +def get_file_size(file_path): + return os.path.getsize(file_path) diff --git a/nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py new file mode 100644 index 00000000..0b686711 --- /dev/null +++ b/nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import numpy as np +import numba +from typing import Tuple +from nemo_curator._compat import DASK_SHUFFLE_METHOD_ARG + + +def get_agg_text_bytes_df( + df: dask_cudf.DataFrame, + agg_column: str, + bytes_column: str, + n_partitions: int, + shuffle: bool = False, +) -> Tuple[dask_cudf.DataFrame, int]: + """ + Groupby bucket and calculate total bytes for a bucket. 
+ """ + shuffle_arg = "shuffle_method" if DASK_SHUFFLE_METHOD_ARG else "shuffle" + agg_df = ( + df[[agg_column, bytes_column]] + .groupby([agg_column]) + .agg({bytes_column: "sum"}, split_out=n_partitions, **{shuffle_arg: shuffle}) + ) + agg_df = agg_df.reset_index(drop=False) + # Doing a per partition sort + # seems to cause issues with + # jaccard shuffle (Overflow errors) + # which are caught and then + # retried with text_bytes_aware_merge + agg_df = agg_df.persist() + agg_df = agg_df.sort_values(by=[bytes_column], ascending=False, ignore_index=True) + agg_df = agg_df.persist() + # Added length to force computation + # after persist + agg_df_len = len(agg_df) + + return agg_df, agg_df_len + + +# next-fit-descending bin packing +# https://en.wikipedia.org/wiki/Next-fit-decreasing_bin_packing +@numba.jit(nopython=True) +def build_partition(sizes: np.ndarray, max_size: int) -> np.ndarray: + """ + Given an array of items and a max bin size this method + attempts to return a grouping of items such that no group exceeds + the max bin size using the Next-fit-decreasing bin packing approach. + """ + i: int = 0 + count: int = 0 + current: int = 0 + size: int = 0 + partition = np.empty(sizes.shape, dtype=np.int32) + for i in range(len(sizes)): + size = sizes[i] + if current + size < max_size: + partition[i] = count + current += size + else: + count += 1 + current = size + partition[i] = count + return partition diff --git a/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py b/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py new file mode 100644 index 00000000..020cab33 --- /dev/null +++ b/nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py @@ -0,0 +1,170 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
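# A small worked example for build_partition() above (illustrative only, not
# part of this patch; toy sizes). Items are assigned left to right to the
# current bin until adding the next item would reach max_size, at which point
# a new bin is opened, so the descending sizes below pack into three bins.
import numpy as np
from nemo_curator.utils.fuzzy_dedup_utils.output_map_utils import build_partition

if __name__ == "__main__":
    sizes = np.array([70, 60, 40, 30, 10], dtype=np.int64)
    print(build_partition(sizes, 100))  # -> [0 1 2 2 2]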
+ +import cudf +import dask_cuda +import numpy as np +from dask import config +from dask.dataframe.shuffle import rearrange_by_column +from dask_cuda.explicit_comms.dataframe.shuffle import ( + shuffle as explicit_comms_shuffle, +) +from packaging.version import Version + +from nemo_curator.utils.fuzzy_dedup_utils.output_map_utils import ( + build_partition, + get_agg_text_bytes_df, +) + +USE_EXCOMMS = Version(dask_cuda.__version__) >= Version("23.10") + + +def write_partitioned_file(df, output_path, partition_on, batch_id): + if len(df) == 0: + return cudf.Series([True]) + + cudf.io.parquet.write_to_dataset( + df, + output_path, + partition_cols=[partition_on], + filename=f"batch_{batch_id}.parquet", + ) + return cudf.Series([True]) + + +def rearange_by_column_direct( + df, + col, + npartitions, + ignore_index, + excomms_default=USE_EXCOMMS, +): + # Execute a "direct" shuffle operation without staging + if config.get("explicit-comms", excomms_default): + # Use explicit comms unless the user has + # disabled it with the dask config system, + # or we are using an older version of dask-cuda + return explicit_comms_shuffle( + df, + [col], + npartitions=npartitions, + ignore_index=ignore_index, + ) + else: + return rearrange_by_column( + df, + col=col, + shuffle="tasks", + # Prevent staged shuffling by setting max_branch + # to the number of input partitions + 1 + max_branch=npartitions + 1, + npartitions=npartitions, + ignore_index=ignore_index, + ) + + +def get_shuffle_part_ids_df( + agg_df, + partition_on, + output_col, + size_col, + num_workers=0, +): + sizes = agg_df[size_col].values + max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2) + + # Adjust max_text_bytes_per_part if the number of output + # partitions is small compared to the number of workers. 
+ # Sometimes we just have very few output partitions to + # deal with, and just need a larger batch + npartitions_min = max(1, int(num_workers * 0.8)) + while True: + output_ar = build_partition(sizes.get(), max_text_bytes_per_part) + if output_ar.max() > npartitions_min or max_text_bytes_per_part < 2**24: + break + max_text_bytes_per_part = int(max_text_bytes_per_part // 2.0) + + df = cudf.DataFrame() + df[partition_on] = agg_df[partition_on] + df[output_col] = output_ar + return df + + +def get_shuffle_partition_info( + df, + partition_on, + output_column, + text_column, + bytes_column="_text_bytes", + num_workers=None, +): + df[bytes_column] = df[text_column].map_partitions(lambda s: s.str.byte_count()) + agg_df, _ = get_agg_text_bytes_df( + df, agg_column=partition_on, bytes_column=bytes_column, n_partitions=1 + ) + del df + + agg_df = agg_df.reset_index(drop=True) + shuffle_part_ids = agg_df.map_partitions( + get_shuffle_part_ids_df, + partition_on, + size_col=bytes_column, + num_workers=num_workers, + output_col=output_column, + ).persist() + return shuffle_part_ids + + +def text_bytes_aware_shuffle(df, partition_on, text_column, num_workers=None): + """ + This shuffle takes into account the text bytes of each partition + and tries to make sure that the output partitions do not exceed + the char limit of cuDF + + Args: + df: dask_cudf dataframe + partition_on: column name to partition on + + + Returns: + dask_cudf dataframe with _partitions columns + """ + print("Starting text bytes aware shuffle", flush=True) + output_col = "_partitions" + + df = df.persist() + shuffle_part_ids = get_shuffle_partition_info( + df=df, + partition_on=partition_on, + num_workers=num_workers, + output_column=output_col, + text_column=text_column, + ) + n_output_partitions = shuffle_part_ids[output_col].max().compute() + 1 + n_output_partitions = int(n_output_partitions) + df = df.merge(shuffle_part_ids, on=partition_on, how="inner").persist() + + df = ( + rearange_by_column_direct( + df, + col=output_col, + npartitions=n_output_partitions, + ignore_index=True, + excomms_default=True, + ) + .drop(columns=[output_col]) + .persist() + ) + print(f"Will write {len(df)} rows to disk", flush=True) + return df diff --git a/nemo_curator/utils/gpu_utils.py b/nemo_curator/utils/gpu_utils.py new file mode 100644 index 00000000..c7b1e89b --- /dev/null +++ b/nemo_curator/utils/gpu_utils.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def is_cudf_type(obj): + """ + Check if an object is a cuDF type + """ + types = [ + str(type(obj)), + str(getattr(obj, "_partition_type", "")), + str(getattr(obj, "_meta", "")), + ] + return any("cudf" in obj_type for obj_type in types) + diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py new file mode 100644 index 00000000..c005c0fd --- /dev/null +++ b/nemo_curator/utils/script_utils.py @@ -0,0 +1,190 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import argparse +from itertools import islice + + +def attach_bool_arg(parser, flag_name, default=False, help_str=None): + attr_name = flag_name.replace("-", "_") + parser.add_argument( + "--{}".format(flag_name), + dest=attr_name, + action="store_true", + help=flag_name.replace("-", " ") if help_str is None else help_str, + ) + parser.add_argument( + "--no-{}".format(flag_name), + dest=attr_name, + action="store_false", + help=flag_name.replace("-", " ") if help_str is None else help_str, + ) + parser.set_defaults(**{attr_name: default}) + + +def add_distributed_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """ + Adds default set of arguments that are needed for Dask cluster setup + """ + parser.add_argument( + "--scheduler-address", + type=str, + default=None, + help="Address to the scheduler of a created dask cluster. If not provided" + "a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--scheduler-file", + type=str, + default=None, + help="Path to the scheduler file of a created dask cluster. If not provided" + " a single node LocalCUDACluster will be started.", + ) + parser.add_argument( + "--n-workers", + type=int, + default=os.cpu_count(), + help="The number of workers to run in total on the Dask CPU cluster", + ) + parser.add_argument( + "--threads-per-worker", + type=int, + default=1, + help="The number of threads ot launch per worker on the Dask CPU cluster. Usually best set at 1 due to the GIL.", + ) + parser.add_argument( + "--rmm-pool-size", + type=str, + default=None, + help="Initial pool size to use for the RMM Pool Memory allocator" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-rmm-pool-size", # noqa: E501 + ) + parser.add_argument( + "--protocol", + type=str, + default="tcp", + help="Protcol to use for dask cluster" + "Note: This only applies to the localCUDACluster. If providing an user created " + "cluster refer to" + "https://docs.rapids.ai/api/dask-cuda/stable/api.html#cmdoption-dask-cuda-protocol", # noqa: E501 + ) + parser.add_argument( + "--nvlink-only", + action="store_true", + help="Start a local cluster with only NVLink enabled." + "Only applicable when protocol=ucx and no scheduler file/address is specified", + ) + parser.add_argument( + "--files-per-partition", + type=int, + default=2, + help="Number of jsonl files to combine into single partition", + ) + parser.add_argument( + "--num-files", + type=int, + default=-1, + help="Upper limit on the number of json files to process", + ) + parser.add_argument( + "--device", + type=str, + default="cpu", + help="Device to run the script on. 
Either 'cpu' or 'gpu'.", + ) + + return parser + +def chunk_list(lst, nchnks): + nitem = len(lst) + splits = splitnum(nitem, nchnks) + beg, end = 0, splits[0] + for i in range(nchnks): + if i == nchnks - 1: + yield lst[beg:] + else: + yield lst[beg:end] + beg = end + end += splits[i + 1] + + +def get_ranges(n, nchnks): + splits = splitnum(n, nchnks) + beg, end = 0, splits[0] + for i in range(nchnks): + if i == nchnks - 1: + yield beg, n + else: + yield beg, end + beg = end + end += splits[i + 1] + + +def chunk_list_lean(lst, nchnks): + nitem = len(lst) + splits = splitnum(nitem, nchnks) + # Outer loop over chunks + for i in range(nchnks): + # Slice thie list + yield lst[0 : splits[i]] + # Remove the chunk from the total list + del lst[0 : splits[i]] + + +def chunk_dict(din, nchnks): + nitem = len(din) + splits = splitnum(nitem, nchnks) + beg, end = 0, splits[0] + # Outer loop over chunks + for i in range(nchnks): + it, out = iter(din), {} + # Slice the dictionary + for k in islice(it, beg, end): + out[k] = din[k] + if i == nchnks - 1: + yield out + else: + beg = end + end += splits[i + 1] + yield out + + +def chunk_dict_lean(din, nchnks): + nitem = len(din) + splits = splitnum(nitem, nchnks) + # Outer loop over chunks + for i in range(nchnks): + it = iter(din) + out = {} + # Slice the dictionary + for k in islice(it, splits[i]): + out[k] = din[k] + yield out + # Clear out chunked entries + for k in out.keys(): + del din[k] + + +def splitnum(num, div): + """Splits a number into nearly even parts""" + splits = [] + igr, rem = divmod(num, div) + for i in range(div): + splits.append(igr) + for i in range(rem): + splits[i] += 1 + + return splits diff --git a/nemo_curator/utils/text_utils.py b/nemo_curator/utils/text_utils.py new file mode 100644 index 00000000..4686d113 --- /dev/null +++ b/nemo_curator/utils/text_utils.py @@ -0,0 +1,187 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import ast +import warnings +import tokenize +from itertools import groupby +from io import StringIO +import string + + +def get_word_splitter(language): + language = language.lower() + if language == 'zh': + import jieba + return jieba.cut + else: + return default_splitter + + +def default_splitter(document): + return document.split() + + +def get_paragraphs(document): + # Split the document into paragraphs. + # A paragraph is defined as a sequence of lines + # separated by a double newline. + return document.split('\n\n') + + +def get_sentences(document): + # Split the document into sentences. + # A sentence is defined as a sequence of lines separated + # by a single newline. + return [x for x in document.split('\n') if len(x.strip()) > 0] + + +def get_ngrams(input_list, n): + # Fast function to return n-grams from a list of tokens. 
+ return [item for item in zip(*[input_list[i:] for i in range(n)])] + + +def is_paragraph_indices_in_top_or_bottom_only( + boilerplate_paragraph_indices, + num_paragraphs, +): + + def _is_contiguous(indices): + # Indices are sorted in ascending order. + num_indices = len(indices) - 1 + return all(indices[i] + 1 == indices[i + 1] for i in range(num_indices)) + + # See if the indices are contiguous and exclusively at the top/bottom. + # Indices are sorted in ascending order. + # If num_paragraphs = 11: + # Valid indices example : [0, 1, 9, 10] + # Invalid indices example : [0, 1, 3, 9, 10] + # Invalid indices example : [0, 1, 3, 5, 6, 9, 10] + # Invalid indices example : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + if len(boilerplate_paragraph_indices) == num_paragraphs: + return False + return _is_contiguous(boilerplate_paragraph_indices) and ( + boilerplate_paragraph_indices[0] == 0 or + boilerplate_paragraph_indices[-1] == num_paragraphs - 1) + + +# Node types for processing abstract syntax tree +NODE_TYPES = { + ast.ClassDef: 'Class', + ast.FunctionDef: 'Function/Method', + ast.Module: 'Module' +} + + +def get_comments_and_docstring(source, comments=True, clean_comments=False): + """ + Extract all natural text in source: comments + doctsrings + the extraction fails in case of syntax errors in the file + Args: + source: the code to parse + comments: if True extract comments two + clean_comment: if True remove # from extracted comments + Returns: + a string with concatenated docstrings and comments + """ + + try: + docstrings = '\n'.join(get_docstrings(source)) + except Exception: + docstrings = None + warnings.warn("code couldn't be parsed due to compilation failure, " + "no docstring is extracted") + + if comments: + try: + comments = get_comments(source, clean=clean_comments) + except Exception: + comments = None + warnings.warn("tokenization error, no comment is extracted") + else: + comments = '' + + return docstrings, comments + + +def get_comments(s, clean=False): + "Returns a string including all coments" + coments = [] + g = tokenize.generate_tokens(StringIO(s).readline) + for toknum, tokval, _, _, _ in g: + # print(toknum,tokval) + if toknum == tokenize.COMMENT: + coments.append((toknum, tokval)) + result = tokenize.untokenize(coments) + if clean: + result = result.replace('#', '') + return result + + +def get_docstrings(source, module=''): + """Parse Python source code from file or string and print docstrings.""" + if hasattr(source, 'read'): + filename = getattr(source, 'name', module) + module = os.path.splitext(os.path.basename(filename))[0] + source = source.read() + + docstrings = sorted(parse_docstrings(source), + key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])) + + grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0]))) + results = [] + for _, group in grouped: + for _, name, docstring in group: + name = name if name else module + if docstring: + results.append(docstring) + return results + + +def parse_docstrings(source): + """Parse Python source code and yield a tuple of ast node instance, name, + and docstring for each function/method, class and module.""" + tree = ast.parse(source) + + for node in ast.walk(tree): + if isinstance(node, tuple(NODE_TYPES)): + docstring = ast.get_docstring(node) + + yield (node, getattr(node, 'name', None), docstring) + + +def remove_punctuation(str_in): + return str_in.translate(str_in.maketrans('', '', string.punctuation)) + + +def get_words(text): + word_start_char_positions = [] + prev = 0 + words = [] + + text = 
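+    # For example, get_ngrams(["a", "b", "c", "d"], 2)
+    # returns [("a", "b"), ("b", "c"), ("c", "d")].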
text.lower() + text = remove_punctuation(text) + if len(text) > 0: + for i in range(len(text)): + if text[i] != ' ': + if i == 0 or text[i - 1] == ' ': + word_start_char_positions.append(i) + if i != 0: + words.append(text[prev:i].strip()) + prev = i + words.append(text[prev:i + 1].strip()) + if words[0] == '': + words = words[1:] + return words, word_start_char_positions diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..9f07ebda --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + gpu: marks tests as GPU tests (deselect with '-m \"not gpu\"') + diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..807f2513 --- /dev/null +++ b/setup.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import setup, find_packages +import pathlib + +here = pathlib.Path(__file__).parent.resolve() + +long_description = (here / 'README.md').read_text(encoding='utf-8') + +setup( + name='nemo_curator', + version='0.2.0', + description='Scalable Data Preprocessing Tool for ' + 'Training Large Language Models', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://github.com/NVIDIA/NeMo-Curator', + author='Joseph Jennings, Mostofa Patwary, Sandeep Subramanian, ' + 'Shrimai Prabhumoye, Ayush Dattagupta, Vibhu Jawa, Jiwei Liu, Ryan Wolf', + author_email='jjennings@nvidia.com, mpatwary@nvidia.com, ' + 'rywolf@nvidia.com, sprabhumoye@nvidia.com', + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Programming Language :: Python :: 3', + ], + packages=find_packages(), + python_requires='>=3.7', + install_requires=[ + 'dask[complete]>=2021.7.1', + 'distributed>=2021.7.1', + 'dask-mpi>=2021.11.0', + 'charset_normalizer>=3.1.0', + 'awscli>=1.22.55', + 'fasttext==0.9.2', + 'pycld2==0.41', + 'justext==3.0.0', + 'ftfy==6.1.1', + 'warcio==1.7.4', + 'zstandard==0.18.0', + 'in-place==0.5.0', + 'unidic-lite==1.0.8', + 'jieba==0.42.1', + 'comment_parser', + 'beautifulsoup4', + 'mwparserfromhell @ git+https://github.com/earwig/mwparserfromhell.git@0f89f44', + 'cudf-cu12==23.10.*', + 'dask-cudf-cu12==23.10.*', + 'cugraph-cu12==23.10.*', + 'dask-cuda==23.10.*', + 'spacy>=3.6.0, <4.0.0', + 'presidio-analyzer==2.2.351', + 'presidio-anonymizer==2.2.351', + 'usaddress==0.5.10', + 'nemo_toolkit[nlp]>=1.23.0' + ], + entry_points={ + 'console_scripts': [ + 'get_common_crawl_urls=nemo_curator.scripts.get_common_crawl_urls:console_script', + 'get_wikipedia_urls=nemo_curator.scripts.get_wikipedia_urls:console_script', + 'download_and_extract=nemo_curator.scripts.download_and_extract:console_script', + 'text_cleaning=nemo_curator.scripts.text_cleaning:console_script', + 'add_id=nemo_curator.scripts.add_id:console_script', + 'get_metadata_from_corpus=nemo_curator.get_metadata_from_corpus:console_script', + 'make_data_shards=nemo_curator.scripts.make_data_shards:console_script', + 
'prepare_fasttext_training_data=nemo_curator.scripts.prepare_fasttext_training_data:console_script', + 'train_fasttext=nemo_curator.scripts.train_fasttext:console_script', + 'filter_documents=nemo_curator.scripts.filter_documents:console_script', + 'separate_by_metadata=nemo_curator.scripts.separate_by_metadata:console_script', + 'prepare_task_data=nemo_curator.scripts.prepare_task_data:console_script', + 'find_matching_ngrams=nemo_curator.scripts.find_matching_ngrams:console_script', + 'remove_matching_ngrams=nemo_curator.scripts.remove_matching_ngrams:console_script', + 'gpu_compute_minhashes=nemo_curator.scripts.compute_minhashes:console_script', + 'minhash_buckets=nemo_curator.scripts.minhash_lsh:console_script', + 'jaccard_map_buckets=nemo_curator.scripts.map_buckets:console_script', + 'jaccard_shuffle=nemo_curator.scripts.jaccard_shuffle:console_script', + 'jaccard_compute=nemo_curator.scripts.jaccard_compute:console_script', + 'gpu_connected_component=nemo_curator.scripts.connected_components:console_script', + 'write_deduped_result_with_text=nemo_curator.gpu_deduplication.write_deduped_result_with_text:console_script', + 'verify_all_pairs_jaccard=nemo_curator.gpu_deduplication.verify_all_pairs_jaccard:console_script', + 'gpu_exact_dups=nemo_curator.scripts.find_exact_duplicates:console_script', + 'prepare_fuzzy_ids=nemo_curator.gpu_deduplication.prepare_fuzzy_ids:console_script', + 'create_list_of_duplicate_ids=nemo_curator.gpu_deduplication.create_list_of_duplicate_ids:console_script', + 'remove_duplicates=nemo_curator.gpu_deduplication.remove_duplicates:console_script', + 'deidentify=nemo_curator.scripts.find_pii_and_deidentify:console_script', + 'generate_statistics=nemo_curator.distributed_data_classification.generate_statistics:console_script', + 'domain_classifier_inference=nemo_curator.distributed_data_classification.domain_classifier_inference:console_script', + 'quality_classifier_multiple_models_inference=nemo_curator.distributed_data_classification.quality_classifier_multiple_models_inference:console_script', + 'quality_classifier_inference=nemo_curator.distributed_data_classification.quality_classifier_inference:console_script', + 'verify_results=nemo_curator.distributed_data_classification.verify_results:console_script', + ], + }, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..fe99e99a --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/tests/pii_data/address.txt b/tests/pii_data/address.txt new file mode 100644 index 00000000..6642380b --- /dev/null +++ b/tests/pii_data/address.txt @@ -0,0 +1,50 @@ +My new address is
777 Brockton Avenue, Abington MA 2351
+He currently resides at
30 Memorial Drive, Avon MA 2322
+
250 Hartford Avenue, Bellingham MA 2019
+
700 Oak Street, Brockton MA 2301
+
66-4 Parkhurst Rd, Chelmsford MA 1824
+
591 Memorial Dr, Chicopee MA 1020
+
55 Brooksby Village Way, Danvers MA 1923
+
137 Teaticket Hwy, East Falmouth MA 2536
+
42 Fairhaven Commons Way, Fairhaven MA 2719
+
374 William S Canning Blvd, Fall River MA 2721
+
121 Worcester Rd, Framingham MA 1701
+
677 Timpany Blvd, Gardner MA 1440
+
337 Russell St, Hadley MA 1035
+
295 Plymouth Street, Halifax MA 2338
+
1775 Washington St, Hanover MA 2339
+
280 Washington Street, Hudson MA 1749
+
20 Soojian Dr, Leicester MA 1524
+
11 Jungle Road, Leominster MA 1453
+
301 Massachusetts Ave, Lunenburg MA 1462
+
780 Lynnway, Lynn MA 1905
+
70 Pleasant Valley Street, Methuen MA 1844
+
830 Curran Memorial Hwy, North Adams MA 1247
+
1470 S Washington St, North Attleboro MA 2760
+
506 State Road, North Dartmouth MA 2747
+
742 Main Street, North Oxford MA 1537
+
72 Main St, North Reading MA 1864
+
200 Otis Street, Northborough MA 1532
+
180 North King Street, Northhampton MA 1060
+
555 East Main St, Orange MA 1364
+
555 Hubbard Ave-Suite 12, Pittsfield MA 1201
+
300 Colony Place, Plymouth MA 2360
+
301 Falls Blvd, Quincy MA 2169
+
36 Paramount Drive, Raynham MA 2767
+
450 Highland Ave, Salem MA 1970
+
1180 Fall River Avenue, Seekonk MA 2771
+
1105 Boston Road, Springfield MA 1119
+
100 Charlton Road, Sturbridge MA 1566
+
262 Swansea Mall Dr, Swansea MA 2777
+
333 Main Street, Tewksbury MA 1876
+
550 Providence Hwy, Walpole MA 2081
+
352 Palmer Road, Ware MA 1082
+
3005 Cranberry Hwy Rt 6 28, Wareham MA 2538
+
250 Rt 59, Airmont NY 10901
+
141 Washington Ave Extension, Albany NY 12205
+
13858 Rt 31 W, Albion NY 14411
+
2055 Niagara Falls Blvd, Amherst NY 14228
+
101 Sanford Farm Shpg Center, Amsterdam NY 12010
+
297 Grant Avenue, Auburn NY 13021
+
4133 Veterans Memorial Drive, Batavia NY 14020
+
6265 Brockport Spencerport Rd, Brockport NY 14420
\ No newline at end of file diff --git a/tests/pii_data/birthdates.txt b/tests/pii_data/birthdates.txt new file mode 100644 index 00000000..87c9a8ef --- /dev/null +++ b/tests/pii_data/birthdates.txt @@ -0,0 +1 @@ +I was born on December 5, 1983. \ No newline at end of file diff --git a/tests/pii_data/card_no.txt b/tests/pii_data/card_no.txt new file mode 100644 index 00000000..ae2a039e --- /dev/null +++ b/tests/pii_data/card_no.txt @@ -0,0 +1,3 @@ +4929-3813-3266-4295 +345389698201044 +I have changed my credit card. My new credit card number is 5299-1561-5689-1938. \ No newline at end of file diff --git a/tests/pii_data/emails.txt b/tests/pii_data/emails.txt new file mode 100644 index 00000000..c2ed5e58 --- /dev/null +++ b/tests/pii_data/emails.txt @@ -0,0 +1,3 @@ +Hello, I am John. My email is johndoe@openai.com. Feel free to drop me an email. +testbotaccount@nvidia.com +testbotaccount@gmail.com \ No newline at end of file diff --git a/tests/pii_data/ip_address.txt b/tests/pii_data/ip_address.txt new file mode 100644 index 00000000..33b19ff2 --- /dev/null +++ b/tests/pii_data/ip_address.txt @@ -0,0 +1,2 @@ +inet 10.41.21.72 +Can you ping me on my ip 192.168.0.12? \ No newline at end of file diff --git a/tests/pii_data/multiple.txt b/tests/pii_data/multiple.txt new file mode 100644 index 00000000..41afb963 --- /dev/null +++ b/tests/pii_data/multiple.txt @@ -0,0 +1,2 @@ +Hello, I am David. I was born on December 5, 1983. My email is david@company.com and you can call me on (845) 450 5693 +My SSN is 284-89-3485. I live at
2456 Main Street, Silver Spring MD
\ No newline at end of file diff --git a/tests/pii_data/names.txt b/tests/pii_data/names.txt new file mode 100644 index 00000000..292be921 --- /dev/null +++ b/tests/pii_data/names.txt @@ -0,0 +1,4 @@ +Surprisingly, John, Shaun, and Ron finished their exam tests all at the same time. +Anton Perera +Sameer Rawat +Ilya Sutskever \ No newline at end of file diff --git a/tests/pii_data/phone_numbers.txt b/tests/pii_data/phone_numbers.txt new file mode 100644 index 00000000..b989a26b --- /dev/null +++ b/tests/pii_data/phone_numbers.txt @@ -0,0 +1,5 @@ +You can call me at (814) 360 8593 ++1-212-456-7890 ++12312322334 +2312322334 +(231) 232-2334 \ No newline at end of file diff --git a/tests/pii_data/ssn.txt b/tests/pii_data/ssn.txt new file mode 100644 index 00000000..d636a2e5 --- /dev/null +++ b/tests/pii_data/ssn.txt @@ -0,0 +1,2 @@ +514-14-8905 +My SSN is 284-89-3485. \ No newline at end of file diff --git a/tests/test_add_id.py b/tests/test_add_id.py new file mode 100644 index 00000000..08382f70 --- /dev/null +++ b/tests/test_add_id.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import dask.dataframe as dd +import pytest + +import nemo_curator +from nemo_curator.datasets import DocumentDataset + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + +@pytest.fixture +def single_partition_dataset(): + return list_to_dataset(["First", "Second", "Third", "Fourth", "Fifth"], npartitions=1) + +@pytest.fixture +def two_partition_dataset(): + return list_to_dataset(["First", "Second", "Third", "Fourth", "Fifth"], npartitions=2) + +class TestPrepareTaskData: + def test_basic_id(self, single_partition_dataset): + id_field = "id" + add_id = nemo_curator.AddId(id_field) + id_dataset = add_id(single_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series(["doc_id-0000000000", "doc_id-0000000001", "doc_id-0000000002", "doc_id-0000000003", "doc_id-0000000004"]) + + assert all(expected_ids == actual_ids), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_two_partitions(self, two_partition_dataset): + id_field = "id" + add_id = nemo_curator.AddId(id_field) + id_dataset = add_id(two_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series(["doc_id-0000000000", "doc_id-0000000001", "doc_id-0000000002", "doc_id-0000000003", "doc_id-0000000004"]) + + assert all(expected_ids == actual_ids), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_id_prefix(self, two_partition_dataset): + id_field = "id" + id_prefix = "my_id" + add_id = nemo_curator.AddId(id_field, id_prefix=id_prefix) + id_dataset = add_id(two_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series([f"{id_prefix}-0000000000", f"{id_prefix}-0000000001", 
f"{id_prefix}-0000000002", f"{id_prefix}-0000000003", f"{id_prefix}-0000000004"]) + + assert all(expected_ids == actual_ids), f"Expected: {expected_ids}, got: {actual_ids}" + + def test_start_index(self, two_partition_dataset): + id_field = "id" + start_index = 13 + add_id = nemo_curator.AddId(id_field, start_index=start_index) + id_dataset = add_id(two_partition_dataset) + actual_ids = id_dataset.df[id_field].compute() + expected_ids = pd.Series(["doc_id-0000000013", "doc_id-0000000014", "doc_id-0000000015", "doc_id-0000000016", "doc_id-0000000017"]) + + assert all(expected_ids == actual_ids), f"Expected: {expected_ids}, got: {actual_ids}" \ No newline at end of file diff --git a/tests/test_exact_dedup.py b/tests/test_exact_dedup.py new file mode 100644 index 00000000..827253fc --- /dev/null +++ b/tests/test_exact_dedup.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest +from dask import dataframe as dd +from dask.dataframe.utils import assert_eq +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import ExactDuplicates + + +@pytest.fixture( + params=[False, pytest.param(True, marks=pytest.mark.gpu)], ids=["no gpu", "gpu"] +) +def exact_dedup_data(request): + df = pd.DataFrame( + {"id": [1, 2, 300, 4, -1], "text": ["abc", "aba", "abb", "aba", "abc"]} + ) + df = dd.from_pandas(df, 2) + if request.param: + df = df.to_backend("cudf") + return DocumentDataset(df) + + +class TestExactDuplicates: + def test_unsupported_hash(self): + with pytest.raises(ValueError): + ExactDuplicates(hash_method="sha256") + + @pytest.mark.parametrize("cache_result", [False, True]) + def test_dup(self, exact_dedup_data, cache_result, tmpdir): + exact_dups = ExactDuplicates( + id_field="id", + text_field="text", + hash_method="md5", + cache_dir=tmpdir if cache_result else None, + ) + result = exact_dups(exact_dedup_data) + expected_df = exact_dedup_data.df.compute() + expected_df = expected_df[expected_df.text.duplicated(keep=False)] + assert_eq(result.df.id, expected_df.id, check_index=False) diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 00000000..77fa27bd --- /dev/null +++ b/tests/test_filters.py @@ -0,0 +1,512 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import pandas as pd +from dask import dataframe as dd + +import pytest + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import ScoreFilter, Score, Filter, Sequential +from nemo_curator.filters import DocumentFilter, NonAlphaNumericFilter, SymbolsToWordsFilter, NumbersFilter, UrlsFilter, BulletsFilter, WhiteSpaceFilter, ParenthesesFilter, LongWordFilter, WordCountFilter, BoilerPlateStringFilter, MeanWordLengthFilter, RepeatedLinesFilter, RepeatedParagraphsFilter, RepeatedLinesByCharFilter, RepeatedParagraphsByCharFilter, RepeatingTopNGramsFilter, RepeatingDuplicateNGramsFilter, PunctuationFilter, EllipsisFilter, CommonEnglishWordsFilter, WordsWithoutAlphabetsFilter, PornographicUrlsFilter +from nemo_curator.filters import PythonCommentToCodeFilter, GeneralCommentToCodeFilter, NumberOfLinesOfCodeFilter, TokenizerFertilityFilter, XMLHeaderFilter, AlphaFilter, HTMLBoilerplateFilter, PerExtensionFilter + +class LetterCountFilter(DocumentFilter): + """ + Keeps documents that have at least some number of a given letter + """ + def __init__(self, letter="a", min_count=5): + super().__init__() + self.letter = letter + self.min_count = min_count + + def score_document(self, text): + return text.count(self.letter) + + def keep_document(self, score): + return score >= self.min_count + +class BatchedLengthFilter(DocumentFilter): + """ + Keeps documents of a given length + """ + def __init__(self, min_length=5, max_length=10): + super().__init__() + self.min_length = min_length + self.max_length = max_length + + def score_document(self, df): + return df.str.len() + + def keep_document(self, scores): + min_threshold = self.min_length <= scores + max_threshold = scores <= self.max_length + return min_threshold & max_threshold + +def all_equal(left_dataset, right_dataset): + return all(left_dataset.df.compute() == right_dataset.df.compute()) + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + +@pytest.fixture +def letter_count_data(): + return list_to_dataset(["Two aa", "a a Three a", "Five aaa aa", "aaaSeven aaaa"], col_name="documents") + +class TestFilterModule: + def test_score_filter(self, letter_count_data): + letter_filter = LetterCountFilter() + filter_step = ScoreFilter(letter_filter, text_field="documents") + filtered_data = filter_step(letter_count_data) + + expected_indices = [2, 3] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_score(self, letter_count_data): + letter_filter = LetterCountFilter() + score_field = "a_count" + score_step = Score(letter_filter.score_document, text_field="documents", score_field=score_field) + scored_data = score_step(letter_count_data) + + expected_scores = pd.Series([2, 3, 5, 7]) + scores = scored_data.df[score_field] + assert all(expected_scores == scores.compute()), f"Expected {expected_scores} but got {scores}" + + def test_retain_score_filter(self, letter_count_data): + letter_filter = LetterCountFilter() + score_field = "count_a" + filter_step = ScoreFilter(letter_filter, text_field="documents", score_field=score_field) + filtered_data = filter_step(letter_count_data) + + expected_indices = [2, 3] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + expected_data.df[score_field] = pd.Series([5, 7], 
index=expected_data.df.index) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_filter(self, letter_count_data): + letter_filter = LetterCountFilter() + score_field = "a_count" + score_step = Score(letter_filter.score_document, text_field="documents", score_field=score_field) + scored_data = score_step(letter_count_data) + filter_step = Filter(letter_filter.keep_document, score_field) + filtered_data = filter_step(scored_data) + + expected_indices = [2, 3] + expected_data = letter_count_data.df.loc[expected_indices] + expected_data[score_field] = pd.Series([5, 7], index=expected_data.index) + expected_data = DocumentDataset(expected_data) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_invert(self, letter_count_data): + letter_filter = LetterCountFilter() + filter_step = ScoreFilter(letter_filter, text_field="documents", invert=True) + filtered_data = filter_step(letter_count_data) + + expected_indices = [0, 1] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_sequential_filter(self, letter_count_data): + filters = Sequential([ + ScoreFilter(LetterCountFilter(), text_field="documents"), + ScoreFilter(LetterCountFilter(min_count=6), text_field="documents") + ]) + filtered_data = filters(letter_count_data) + + expected_indices = [3] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_batch_score_filter(self, letter_count_data): + length_filter = BatchedLengthFilter(min_length=8, max_length=11) + filter_step = ScoreFilter(length_filter, text_field="documents", batched=True) + filtered_data = filter_step(letter_count_data) + + expected_indices = [1, 2] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_batch_score(self, letter_count_data): + length_filter = BatchedLengthFilter(min_length=8, max_length=11) + score_field = "lengths" + score_step = Score(length_filter.score_document, text_field="documents", score_field=score_field, batched=True) + scored_data = score_step(letter_count_data) + + expected_scores = pd.Series([6, 11, 11, 13]) + scores = scored_data.df[score_field] + assert all(expected_scores == scores.compute()), f"Expected {expected_scores} but got {scores}" + + def test_batch_filter(self, letter_count_data): + length_filter = BatchedLengthFilter(min_length=8, max_length=11) + score_field = "lengths" + score_step = Score(length_filter.score_document, text_field="documents", score_field=score_field, batched=True) + scored_data = score_step(letter_count_data) + filter_step = Filter(length_filter.keep_document, score_field, batched=True) + filtered_data = filter_step(scored_data) + + expected_indices = [1, 2] + expected_data = letter_count_data.df.loc[expected_indices] + expected_data[score_field] = pd.Series([11, 11], index=expected_data.index) + expected_data = DocumentDataset(expected_data) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_score_filter_type(self, letter_count_data): + letter_filter = LetterCountFilter() + filter_step = ScoreFilter(letter_filter, 
text_field="documents", score_type=int) + filtered_data = filter_step(letter_count_data) + + expected_indices = [2, 3] + expected_data = DocumentDataset(letter_count_data.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_score_type(self, letter_count_data): + letter_filter = LetterCountFilter() + score_field = "a_count" + score_step = Score(letter_filter.score_document, text_field="documents", score_field=score_field, score_type=int) + scored_data = score_step(letter_count_data) + + expected_scores = pd.Series([2, 3, 5, 7]) + scores = scored_data.df[score_field] + assert all(expected_scores == scores.compute()), f"Expected {expected_scores} but got {scores}" + + +class TestHeuristicFilters: + def test_nonalpha(self): + dataset = list_to_dataset(["", "This is a test case.", "%$^%$^%$&^$()))))", "$aaa"]) + filters = ScoreFilter(NonAlphaNumericFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_symbolswords(self): + dataset = list_to_dataset(["mixed bag ... #", "full of words", "... # ... # #", "barely ok 3 4 5 6 7 8 9 #"]) + filters = ScoreFilter(SymbolsToWordsFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_numbers(self): + dataset = list_to_dataset(["purely letters", "34134543", "$!@$@!$!@", "abcdefghi1"]) + filters = ScoreFilter(NumbersFilter(max_number_to_text_ratio=0.1)) + filtered_data = filters(dataset) + + expected_indices = [0, 2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_urls(self): + dataset = list_to_dataset(["https://www.nvidia.com/en-us/", "no urls here!", "$!@$@!$!@", "bunch of other words with url afdsjafidsaofjbwreowihfdsafbdashuoiotauhiofdafdsafd fdasfdafdsafdsafdsafdsafdsafdsa https://www.nvidia.com/en-us/ something else after the url etc more and more", "words with url https://www.nvidia.com/en-us/"]) + filters = ScoreFilter(UrlsFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_bullets(self): + dataset = list_to_dataset(["• not good", "good", "50 \n ⦾ 50", "⁌ this \n⁌ should \n⁌barely \n⁌pass \n⁌5 \n⁌6 \n⁌7 \n⁌8 \n⁌9 \n done!"]) + filters = ScoreFilter(BulletsFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_whitespace(self): + dataset = list_to_dataset(["\t\n\r", "good", "50%\n\n\n", "123\b"]) + filters = ScoreFilter(WhiteSpaceFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_parentheses(self): + dataset = 
list_to_dataset(["()", "(not good)", "this is completely absolutely fine", "123456789("]) + filters = ScoreFilter(ParenthesesFilter()) + filtered_data = filters(dataset) + + expected_indices = [2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_longword(self): + dataset = list_to_dataset(["tiny", "large"]) + filters = ScoreFilter(LongWordFilter(max_word_length=4)) + filtered_data = filters(dataset) + + expected_indices = [0] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_wordcount(self): + dataset = list_to_dataset(["", "one", "two words", "$#@$ %$@$#@ !#@!", "one two three four five"]) + filters = ScoreFilter(WordCountFilter(min_words=2, max_words=4)) + filtered_data = filters(dataset) + + expected_indices = [2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_boilerplate(self): + dataset = list_to_dataset(["nothing\t here", "1\n\n2\n\n3\n\n4\n\n5\n\n6\n\nterms of use\n\n privacy policy\n\n cookie policy\n\nuses cookies", "too much \n\n privacy & cookies policy"]) + filters = ScoreFilter(BoilerPlateStringFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_meanwordlength(self): + dataset = list_to_dataset(["a", "aa", "superlongword short", "evenly balanced", "waytoolongforasingleword"]) + filters = ScoreFilter(MeanWordLengthFilter()) + filtered_data = filters(dataset) + + expected_indices = [2, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatedlines(self): + dataset = list_to_dataset(["totally unique", "half.\nhalf."]) + filters = ScoreFilter(RepeatedLinesFilter()) + filtered_data = filters(dataset) + + expected_indices = [0] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatedparagraphs(self): + dataset = list_to_dataset(["totally unique", "half.\n\nhalf."]) + filters = ScoreFilter(RepeatedParagraphsFilter()) + filtered_data = filters(dataset) + + expected_indices = [0] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatedlineschar(self): + dataset = list_to_dataset(["totally unique", "a.\na.\nvery very very short duplicate.", "half.\nhalf.", "super very incredibly huge long duplicate.\nsuper very incredibly huge long duplicate.\na.\nb.\nc."]) + filters = ScoreFilter(RepeatedLinesByCharFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatedparagraphschar(self): + dataset = list_to_dataset(["totally unique", "a.\n\n a.\n\n very very very short duplicate.", 
"half.\n\nhalf.", "super very incredibly huge long duplicate.\n\nsuper very incredibly huge long duplicate.\n\n a.\n\n b.\n\n c."]) + filters = ScoreFilter(RepeatedParagraphsByCharFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatingtopngrams(self): + dataset = list_to_dataset(["this is a totally fine sentence with no repeating ngrams so we are ok", "a b . a b", "a a a a a a", "totally fine small dupe a b a b"]) + filters = ScoreFilter(RepeatingTopNGramsFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_repeatingduplicatengrams(self): + dataset = list_to_dataset(["a a b b a a b b", "totally fine", "a a a a this should be fine as well"]) + filters = ScoreFilter(RepeatingDuplicateNGramsFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_punctuation(self): + dataset = list_to_dataset(["not good", "good.", "just\n barely\n fine\n ok\n yep."]) + filters = ScoreFilter(PunctuationFilter(max_num_sentences_without_endmark_ratio=0.8)) + filtered_data = filters(dataset) + + expected_indices = [1, 2] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_ellipsis(self): + dataset = list_to_dataset(["not good...", "good.", "just...\n barely...\n fine...\n ok...\n yep."]) + filters = ScoreFilter(EllipsisFilter(max_num_lines_ending_with_ellipsis_ratio=0.8)) + filtered_data = filters(dataset) + + expected_indices = [1, 2] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_commonenglishwords(self): + dataset = list_to_dataset(["uncommon", "the and", "the and and of to"]) + filters = ScoreFilter(CommonEnglishWordsFilter()) + filtered_data = filters(dataset) + + expected_indices = [1, 2] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_wordswithoutalphabets(self): + dataset = list_to_dataset(["totally fine", "good good good good !", "@"]) + filters = ScoreFilter(WordsWithoutAlphabetsFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_pornographicurls(self): + dataset = list_to_dataset(["no url", "fine url https://www.nvidia.com/en-us/", "bad url https://www.pornhub.com/"]) + filters = ScoreFilter(PornographicUrlsFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + +class TestCodeFilters: + def 
test_python_comment_to_code(self): + doc_1 = "# Good code\nprint('hello world')" + doc_2 = "print('bad code')" + doc_3 = "# Too many\n# comments!" + doc_4 = "'''Good comment'''\nprint('hello world')" + dataset = list_to_dataset([doc_1, doc_2, doc_3, doc_4]) + filters = ScoreFilter(PythonCommentToCodeFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_general_commment_to_code(self): + doc_1 = "// Good code\nprintf(\"hello world\\n\")" + doc_2 = "printf(\"bad code\\n\")" + doc_3 = "// Way far too many\n// comments!" + doc_4 = "/*\nGood comment\n*/\nprintf(\"hello world\\n\")" + dataset = list_to_dataset([doc_1, doc_2, doc_3, doc_4]) + filters = ScoreFilter(GeneralCommentToCodeFilter("text/x-c++")) + filtered_data = filters(dataset) + + expected_indices = [0, 3] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_number_lines_code(self): + doc_1 = """print("too short")""" + doc_2 = """print("just") + print("right")""" + doc_3 = """print("way") + print("too") + print("long") + print("!")""" + dataset = list_to_dataset([doc_1, doc_2, doc_3]) + filters = ScoreFilter(NumberOfLinesOfCodeFilter(min_lines=2, max_lines=3)) + filtered_data = filters(dataset) + + expected_indices = [1] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_xml_header(self): + dataset = list_to_dataset(["no header", "", "slightly offset ?$#@!", "mixed <>"]) + filters = ScoreFilter(AlphaFilter()) + filtered_data = filters(dataset) + + expected_indices = [0, 2] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_html_boilerplate(self): + good_doc = """ + + + + Sample Webpage + + +

Welcome to my sample webpage

+

This is a very fun paragraph on my sample webpage.

+ + + """ + boilerplate_heavy_doc = """ + + + + Boilerplate Webpage + + +

Welcome to my boilerplate webpage

+
+
+

hi

+
+
+

hi

+
+
+ + + """ + small_doc = """ + + hello world + """ + dataset = list_to_dataset([good_doc, boilerplate_heavy_doc, small_doc]) + filters = ScoreFilter(HTMLBoilerplateFilter()) + filtered_data = filters(dataset) + + expected_indices = [0] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" + + def test_per_extension_filter(self): + good_cpp = """ + #include + + using namespace std; + + int main() { + cout << "Hello World!" << endl; + return 0; + }; + """ + dataset = list_to_dataset([good_cpp]) + metadata_file = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "nemo_curator", "utils", "code_meta.csv")) + filters = ScoreFilter(PerExtensionFilter("c++", "cpp", metadata_file=metadata_file)) + filtered_data = filters(dataset) + + expected_indices = [0] + expected_data = DocumentDataset(dataset.df.loc[expected_indices]) + assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" \ No newline at end of file diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py new file mode 100644 index 00000000..3c6a3275 --- /dev/null +++ b/tests/test_fuzzy_dedup.py @@ -0,0 +1,178 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
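+
+# The tests below cover the GPU fuzzy-dedup building blocks (MinHash and LSH).
+# A rough sketch of how the two stages chain together; the parameter values
+# here are illustrative only (the tests themselves use smaller fixtures):
+#
+#   minhashes = MinHash(seed=128, num_hashes=260, char_ngrams=3)(dataset)
+#   buckets = LSH(cache_dir=cache_dir, minhash_length=260, num_buckets=20,
+#                 buckets_per_shuffle=1, id_fields="id",
+#                 minhash_field="_minhash_signature")(minhashes)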
+ +import os +from itertools import combinations +from typing import Iterable + +import cudf +import dask_cudf +import numpy as np +import pytest +from dask.dataframe.utils import assert_eq + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules import LSH, MinHash + + +@pytest.fixture +def fuzzy_dedup_data(): + df = cudf.DataFrame( + { + "id": [1, 2, 300, 4, -1], + "text": [ + "A test string", + "A different test string", + "A different object", + "The quick brown fox jumps over the lazy dog", + "The quick black cat jumps over the lazy dog", + ], + } + ) + df = dask_cudf.from_cudf(df, 2) + return DocumentDataset(df) + + +def minhash_overlap(minhash1: np.array, minhash2: np.array): + assert len(minhash1) == len(minhash2) + overlap = sum(minhash1 == minhash2) + return overlap / len(minhash1) + + +def jaccard_index(str1: str, str2: str, char_ngrams): + return ( + cudf.Series([str1]) + .str.jaccard_index(cudf.Series([str2]), width=char_ngrams) + .values_host[0] + ) + + +def generate_all_pairs(item: Iterable): + return combinations(item, 2) + + +@pytest.mark.gpu +class TestMinhashes: + @pytest.mark.parametrize("use_64bit_hash", [False, True]) + @pytest.mark.parametrize("seed,char_ngrams,num_hashes", [(128, 3, 260)]) + def test_identical_minhash( + self, fuzzy_dedup_data, use_64bit_hash, seed, char_ngrams, num_hashes + ): + minhasher1 = MinHash( + seed=seed, + num_hashes=num_hashes, + char_ngrams=char_ngrams, + use_64bit_hash=use_64bit_hash, + ) + minhash_sig1 = minhasher1(fuzzy_dedup_data) + sig_lengths = minhash_sig1.df["_minhash_signature"].compute().list.len() + assert (sig_lengths == num_hashes).all() + + minhasher2 = MinHash( + seed=seed, + num_hashes=num_hashes, + char_ngrams=char_ngrams, + use_64bit_hash=use_64bit_hash, + ) + minhash_sig2 = minhasher2(fuzzy_dedup_data) + assert_eq(minhash_sig1.df, minhash_sig2.df) + + @pytest.mark.parametrize( + "use_64bit_hash,seed,char_ngrams,num_hashes", + [(False, 42, 5, 20), (True, 32768, 10, 18)], + ) + def test_minhash_approximation( + self, fuzzy_dedup_data, use_64bit_hash, seed, char_ngrams, num_hashes + ): + THRESHOLD = 0.15 + + minhasher = MinHash( + seed=seed, + num_hashes=num_hashes, + char_ngrams=char_ngrams, + use_64bit_hash=use_64bit_hash, + ) + minhashes = minhasher(fuzzy_dedup_data) + minhash_signatures = ( + minhashes.df["_minhash_signature"].compute().to_pandas().values + ) + strings = fuzzy_dedup_data.df["text"].compute().to_pandas().values + for (sig1, str1), (sig2, str2) in generate_all_pairs( + tuple(zip(minhash_signatures, strings)) + ): + true_jaccard = jaccard_index(str1, str2, char_ngrams) + minhash_approximation = minhash_overlap(sig1, sig2) + assert abs(true_jaccard - minhash_approximation) < THRESHOLD + + def test_minhash_cache(self, fuzzy_dedup_data, tmpdir): + minhasher = MinHash(cache_dir=tmpdir) + result = minhasher(fuzzy_dedup_data) + assert len(result) == len(fuzzy_dedup_data) + assert "_minhashes.parquet" in os.listdir(tmpdir) + assert len(os.listdir(tmpdir / "_minhashes.parquet")) != 0 + + +@pytest.mark.gpu +class TestLSH: + @pytest.fixture(autouse=True) + def minhash_data(self): + df = cudf.DataFrame( + { + "id": [1, 2, 3, 4, 5], + "dataset_id": [1, 1, 2, 3, 4], + "minhash_sig": [ + [1, 2, 1, 2, 1, 2], + [1, 2, 3, 4, 5, 6], + [3, 2, 1, 4, 5, 6], + [9, 8, 7, 6, 5, 4], + [3, 1, 2, 4, 5, 4], + ], + } + ) + df = dask_cudf.from_cudf(df, 2) + self.dataset = DocumentDataset(df) + + @pytest.mark.parametrize("buckets_per_shuffle", [1, 2, 3]) + def test_lsh(self, tmpdir, buckets_per_shuffle): + lsh = 
LSH( + cache_dir=tmpdir, + minhash_length=6, + num_buckets=3, + buckets_per_shuffle=buckets_per_shuffle, + minhash_field="minhash_sig", + id_fields="id", + ) + buckets = lsh(self.dataset) + buckets_df = buckets.df + docs_list = buckets_df.groupby("_bucket_id").id.collect() + expected_df = cudf.Series([[1, 2], [2, 3], [4, 5]], name="id") + assert_eq(expected_df, docs_list, check_index=False) + + def test_multiple_id_cols(self, tmpdir): + lsh = LSH( + cache_dir=tmpdir, + minhash_length=6, + num_buckets=3, + buckets_per_shuffle=1, + id_fields=["id", "dataset_id"], + minhash_field="minhash_sig", + ) + buckets = lsh(self.dataset) + buckets_df = buckets.df.compute().to_pandas() + buckets_df["new_id"] = list(zip(buckets_df.dataset_id, buckets_df.id)) + docs_list = buckets_df.groupby("_bucket_id").new_id.apply(list) + expected_df = cudf.Series( + [[(1, 1), (1, 2)], [(1, 2), (2, 3)], [(3, 4), (4, 5)]], name="new_id" + ) + assert_eq(expected_df, docs_list, check_index=False) diff --git a/tests/test_pii_accuracy.py b/tests/test_pii_accuracy.py new file mode 100644 index 00000000..7b3d836e --- /dev/null +++ b/tests/test_pii_accuracy.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import re +from pathlib import Path + +import pytest + +from nemo_curator.pii.algorithm import PiiDeidentifier + +LOGGER = logging.getLogger(__name__) + + +def load_test_cases(filename): + filepath = Path("tests/pii_data") / filename + with open(filepath) as fp: + data = fp.readlines() + + raw_data = [ + (re.sub(r"<[^>]*>([^<]*)]*>", r"\1", line)).strip() for line in data + ] + masked_data = [ + (re.sub(r"(<[^>]*>([^<]*)]*>)", lambda x: "*" * len(x.group(2)), line)).strip() for line in data + ] + + return list(zip(raw_data, masked_data)) + + +def compare_outputs(output1, output2): + output1 = re.sub(r"\*[\* ]*", "*****", output1) + output2 = re.sub(r"\*[\* ]*", "*****", output2) + return output1 == output2 + + +def generate_single_category_test(category, filename): + deidentifier = PiiDeidentifier("en", [category], "mask") + test_data = load_test_cases(filename) + + for input, target in test_data: + output = deidentifier.deidentify_text(input) + print("============================") + print("Input: ", input) + print("Output: ", output) + print("Expected Output: ", target) + print("Matches:", "No" if output != target else "Yes") + assert output == target + + +class TestPIIAccuracy: + def test_email(self): + generate_single_category_test("EMAIL_ADDRESS", "emails.txt") + + def test_ip_address(self): + generate_single_category_test("IP_ADDRESS", "ip_address.txt") + + def test_address(self): + generate_single_category_test("ADDRESS", "address.txt") + + def test_ssn(self): + generate_single_category_test("US_SSN", "ssn.txt") + + def test_birthdates(self): + generate_single_category_test("DATE_TIME", "birthdates.txt") + + def test_card_no(self): + generate_single_category_test("CREDIT_CARD", "card_no.txt") + + def test_names(self): + 
generate_single_category_test("PERSON", "names.txt") + + def test_phone_numbers(self): + generate_single_category_test("PHONE_NUMBER", "phone_numbers.txt") + + def test_multiple(self): + deidentifier = PiiDeidentifier("en", anonymize_action='mask') + test_data = load_test_cases("multiple.txt") + + for input, target in test_data: + output = deidentifier.deidentify_text(input) + print("============================") + print("Input: ", input) + print("Output: ", output) + print("Expected Output: ", target) + match = compare_outputs(output, target) + output1 = re.sub(r"\*[\* ]*", "*****", output) + output2 = re.sub(r"\*[\* ]*", "*****", target) + print(output1) + print(output2) + print("match value: ", match) + print("Matches:", "No" if not match else "Yes") + assert match == True + + def test_batch_accuracy(self): + deidentifier = PiiDeidentifier("en", anonymize_action='mask') + test_data = load_test_cases("multiple.txt") + inputs = [data[0] for data in test_data] + targets = [data[1] for data in test_data] + outputs = deidentifier.deidentify_text_batch(inputs) + print("Inputs: ", inputs) + print("Output: ", outputs) + print("Expected Outputs: ", targets) + + match = all(compare_outputs(x, y) for x, y in zip(outputs, targets)) + print("Matches:", "No" if not match else "Yes") + assert match == True \ No newline at end of file diff --git a/tests/test_task_decontamination.py b/tests/test_task_decontamination.py new file mode 100644 index 00000000..2c4f4b7f --- /dev/null +++ b/tests/test_task_decontamination.py @@ -0,0 +1,218 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import pandas as pd +import dask.dataframe as dd +from collections import defaultdict + +import nemo_curator +from nemo_curator.datasets import DocumentDataset +from nemo_curator.tasks import DownstreamTask + +class SimpleTask(DownstreamTask): + def __init__(self, min_ngram_size=8, max_ngram_size=13): + super().__init__() + self._task_name = 'simple' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = ["This is a simple example task document with enough words.", + "Two plus two equals four. Two times two equals four as well. However, two divided by two is one.", + "Cars are (sometimes) red. Bicycles can (occasionally) be blue. 
Trees often have green leaves.", + "This is a short one.", + "This is a short one."] + + def generate_ngrams(self): + for line in self._dataset: + self._update_ngrams(line, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + +class TinyTask(DownstreamTask): + def __init__(self, min_ngram_size=0, max_ngram_size=4): + super().__init__() + self._task_name = 'tiny' + self._min_ngram_size = min_ngram_size + self._max_ngram_size = max_ngram_size + self._dataset = ["Super tiny task with one document."] + + def generate_ngrams(self): + for line in self._dataset: + self._update_ngrams(line, self._min_ngram_size, self._max_ngram_size) + + return self.ngrams + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + +@pytest.fixture +def contaminated_dataset(): + return list_to_dataset(["This document is fine", + "Before contamination. This is a simple example task document with enough words. After contamination.", + "This document is not good. Two plus two equals four. Two times two equals four as well. However, one minus one is zero.", + "Long contamination. Birds are (sometimes) red. Bicycles can (occasionally) be blue. Trees often have green leaves. After contamination.", + "Small contamination in a very short document 1. SupeR tiNy task with one document!", + "Small contamination in a very short document 2. Super tiNy task with one document?"], + col_name="text") + +class TestPrepareTaskData: + def test_single_task(self): + decontaminator = nemo_curator.TaskDecontamination(SimpleTask()) + actual_count = decontaminator.prepare_task_ngram_count().compute() + expected_ngrams = ["this is a simple example task document with enough words", + "two plus two equals four two times two equals four as well however", + "plus two equals four two times two equals four as well however two", + "two equals four two times two equals four as well however two divided", + "equals four two times two equals four as well however two divided by", + "four two times two equals four as well however two divided by two", + "two times two equals four as well however two divided by two is", + "times two equals four as well however two divided by two is one", + "cars are sometimes red bicycles can occasionally be blue trees often have green", + "are sometimes red bicycles can occasionally be blue trees often have green leaves", + ] + expected_count = {ngram: 0 for ngram in expected_ngrams} + + assert len(expected_count) == len(actual_count), f"Expected #{len(expected_count)}, got #{len(actual_count)}" + assert expected_count == actual_count, f"Expected: {expected_count}, got: {actual_count}" + + def test_multiple_tasks(self): + decontaminator = nemo_curator.TaskDecontamination([SimpleTask(), TinyTask()]) + actual_count = decontaminator.prepare_task_ngram_count().compute() + expected_ngrams = ["this is a simple example task document with enough words", + "two plus two equals four two times two equals four as well however", + "plus two equals four two times two equals four as well however two", + "two equals four two times two equals four as well however two divided", + "equals four two times two equals four as well however two divided by", + "four two times two equals four as well however two divided by two", + "two times two equals four as well however two divided by two is", + "times two equals four as well however two divided by two is one", + "cars are sometimes red 
bicycles can occasionally be blue trees often have green", + "are sometimes red bicycles can occasionally be blue trees often have green leaves", + "super tiny task with", + "tiny task with one", + "task with one document", + ] + expected_count = {ngram: 0 for ngram in expected_ngrams} + + assert len(expected_count) == len(actual_count), f"Expected #{len(expected_count)}, got #{len(actual_count)}" + assert expected_count == actual_count, f"Expected: {expected_count}, got: {actual_count}" + +class TestFindMatchingNgrams: + def test_single_task(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination(SimpleTask()) + task_ngrams = decontaminator.prepare_task_ngram_count() + actual_result = decontaminator.find_matching_ngrams(task_ngrams, contaminated_dataset).compute() + actual_ngrams, actual_freq = actual_result['matched-ngrams'], actual_result['ngrams-freq'] + + expected_ngrams = defaultdict(int) + expected_ngrams["this is a simple example task document with enough words"] = 1 + expected_ngrams["two plus two equals four two times two equals four as well however"] = 1 + expected_ngrams["are sometimes red bicycles can occasionally be blue trees often have green leaves"] = 1 + + expected_freq = [(10, 1), (13, 9)] + + assert expected_freq == actual_freq, f"Expected #{expected_freq}, got #{actual_freq}" + assert expected_ngrams == actual_ngrams, f"Expected: {expected_ngrams}, got: {actual_ngrams}" + + def test_multiple_tasks(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination([SimpleTask(), TinyTask()]) + task_ngrams = decontaminator.prepare_task_ngram_count() + actual_result = decontaminator.find_matching_ngrams(task_ngrams, contaminated_dataset).compute() + actual_ngrams, actual_freq = actual_result['matched-ngrams'], actual_result['ngrams-freq'] + + expected_ngrams = defaultdict(int) + expected_ngrams["this is a simple example task document with enough words"] = 1 + expected_ngrams["two plus two equals four two times two equals four as well however"] = 1 + expected_ngrams["are sometimes red bicycles can occasionally be blue trees often have green leaves"] = 1 + expected_ngrams["super tiny task with"] = 2 + + expected_freq = [(4, 3), (10, 1), (13, 9)] + + assert expected_freq == actual_freq, f"Expected #{expected_freq}, got #{actual_freq}" + assert expected_ngrams == actual_ngrams, f"Expected: {expected_ngrams}, got: {actual_ngrams}" + +class TestRemoveMatchingNgrams: + def test_single_task(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination(SimpleTask(), min_document_length=1, remove_char_each_side=1) + task_ngrams = decontaminator.prepare_task_ngram_count() + ngram_data = decontaminator.find_matching_ngrams(task_ngrams, contaminated_dataset) + matched_ngrams, ngram_freq = ngram_data['matched-ngrams'], ngram_data['ngrams-freq'] + filtered_dataset = decontaminator.remove_matching_ngrams(matched_ngrams, ngram_freq, contaminated_dataset) + + actual_data = sorted(filtered_dataset.df.compute()['text'].to_list()) + expected_data = ["This document is fine", + "Before contamination.", + " After contamination.", + "This document is not good.", + "Long contamination.", + " After contamination.", + "Small contamination in a very short document 1. SupeR tiNy task with one document!", + "Small contamination in a very short document 2. 
Super tiNy task with one document?"] + expected_data.sort() + assert expected_data == actual_data, f"Expected #{expected_data}, got #{actual_data}" + + def test_multiple_tasks(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination([SimpleTask(), TinyTask()], min_document_length=1, remove_char_each_side=1) + task_ngrams = decontaminator.prepare_task_ngram_count() + ngram_data = decontaminator.find_matching_ngrams(task_ngrams, contaminated_dataset) + matched_ngrams, ngram_freq = ngram_data['matched-ngrams'], ngram_data['ngrams-freq'] + filtered_dataset = decontaminator.remove_matching_ngrams(matched_ngrams, ngram_freq, contaminated_dataset) + + actual_data = sorted(filtered_dataset.df.compute()['text'].to_list()) + expected_data = ["This document is fine", + "Before contamination.", + " After contamination.", + "This document is not good.", + "Long contamination.", + " After contamination.", + "Small contamination in a very short document 1.", + "Small contamination in a very short document 2."] + expected_data.sort() + assert expected_data == actual_data, f"Expected #{expected_data}, got #{actual_data}" + +class TestFullPipeline: + def test_single_task(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination(SimpleTask(), min_document_length=1, remove_char_each_side=1) + filtered_dataset = decontaminator(contaminated_dataset) + + actual_data = sorted(filtered_dataset.df.compute()['text'].to_list()) + expected_data = ["This document is fine", + "Before contamination.", + " After contamination.", + "This document is not good.", + "Long contamination.", + " After contamination.", + "Small contamination in a very short document 1. SupeR tiNy task with one document!", + "Small contamination in a very short document 2. Super tiNy task with one document?"] + expected_data.sort() + assert expected_data == actual_data, f"Expected #{expected_data}, got #{actual_data}" + + def test_multiple_tasks(self, contaminated_dataset): + decontaminator = nemo_curator.TaskDecontamination([SimpleTask(), TinyTask()], min_document_length=1, remove_char_each_side=1) + filtered_dataset = decontaminator(contaminated_dataset) + + actual_data = sorted(filtered_dataset.df.compute()['text'].to_list()) + expected_data = ["This document is fine", + "Before contamination.", + " After contamination.", + "This document is not good.", + "Long contamination.", + " After contamination.", + "Small contamination in a very short document 1.", + "Small contamination in a very short document 2."] + expected_data.sort() + assert expected_data == actual_data, f"Expected #{expected_data}, got #{actual_data}" \ No newline at end of file diff --git a/tests/test_unicode_reformatter.py b/tests/test_unicode_reformatter.py new file mode 100644 index 00000000..56f022e4 --- /dev/null +++ b/tests/test_unicode_reformatter.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
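+
+# ftfy's default fixing also uncurls "smart" quotes, which is why the curly
+# apostrophes in the inputs below are expected to come back as plain ASCII ones.
+# A minimal standalone sketch of that behaviour (an assumption: the `ftfy`
+# package, which UnicodeReformatter appears to build on, is installed):
+#
+#   import ftfy
+#   ftfy.fix_text("The Mona Lisa doesn’t have eyebrows.")
+#   # -> "The Mona Lisa doesn't have eyebrows."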
+ +import dask.dataframe as dd +import pandas as pd + +import nemo_curator +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers import UnicodeReformatter + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + +class TestUnicodeReformatter: + def test_reformatting(self): + # Examples taken from ftfy documentation: + # https://ftfy.readthedocs.io/en/latest/ + dataset = list_to_dataset([ + "✔ No problems", + "The Mona Lisa doesn’t have eyebrows.", + "l’humanité", + "à perturber la réflexion", + "Clean document already." + ]) + expected_results = [ + "✔ No problems", + "The Mona Lisa doesn't have eyebrows.", + "l'humanité", + "à perturber la réflexion", + "Clean document already." + ] + expected_results.sort() + + modifier = nemo_curator.Modify(UnicodeReformatter()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert expected_results == actual_results, f"Expected: {expected_results}, but got: {actual_results}" \ No newline at end of file diff --git a/tutorials/tinystories/docbuilder.py b/tutorials/tinystories/docbuilder.py new file mode 100644 index 00000000..859fb35a --- /dev/null +++ b/tutorials/tinystories/docbuilder.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Set, Tuple + +import requests + +from nemo_curator.download.doc_builder import ( + DocumentDownloader, + DocumentExtractor, + DocumentIterator, +) + + +class TinyStoriesDownloader(DocumentDownloader): + def __init__(self, download_dir: str): + super().__init__() + + if not os.path.isdir(download_dir): + os.makedirs(download_dir) + + self._download_dir = download_dir + print("Download directory: ", self._download_dir) + + def download(self, url: str) -> str: + filename = os.path.basename(url) + output_file = os.path.join(self._download_dir, filename) + + if os.path.exists(output_file): + print(f"File '{output_file}' already exists, skipping download.") + return output_file + + print(f"Downloading TinyStories dataset from '{url}'...") + response = requests.get(url) + + with open(output_file, "wb") as file: + file.write(response.content) + + return output_file + + +class TinyStoriesIterator(DocumentIterator): + # The token that separates stories in the TinyStories dataset. 
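+    # In the raw TinyStories text file, stories are separated by this token,
+    # roughly as follows (illustrative sketch, not verbatim dataset content):
+    #
+    #   Once upon a time, there was a little girl...
+    #   ...and they all lived happily ever after.
+    #   <|endoftext|>
+    #   One day, a boy named Tim...
+    #
+    # `iterate` below collects the lines of each block, joins them with spaces,
+    # and yields one (metadata, story_text) pair per story.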
+ SEPARATOR_TOKEN = "<|endoftext|>" + + def __init__(self): + super().__init__() + self._counter = -1 + + def iterate(self, file_path): + self._counter = -1 + file_name = os.path.basename(file_path) + + with open(file_path, "r") as file: + example = [] + + def split_meta(example): + if example: + self._counter += 1 + content = " ".join(example) + meta = { + "filename": file_name, + "id": f"{file_name}-{self._counter}", + } + + return meta, content + + for line in file: + if line.strip() == TinyStoriesIterator.SEPARATOR_TOKEN: + if example: + yield split_meta(example) + example = [] + else: + example.append(line.strip()) + + if example: + yield split_meta(example) + + +class TinyStoriesExtractor(DocumentExtractor): + def extract(self, content: str) -> Tuple[Set, str]: + # No metadata for the text, just the content. + return {}, content diff --git a/tutorials/tinystories/filters.py b/tutorials/tinystories/filters.py new file mode 100644 index 00000000..1b5a6c3d --- /dev/null +++ b/tutorials/tinystories/filters.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.filters import DocumentFilter + + +class IncompleteStoryFilter(DocumentFilter): + """ + If the document doesn't end with a terminating punctuation mark, then discard. + """ + + def __init__(self): + super().__init__() + # Accepted story terminators. + self._story_terminators = {".", "!", "?", '"', "”"} + + def score_document(self, text: str) -> bool: + """ + Determines if a document's score is valid based on the last character of the text. + + Args: + text (str): The document text. + + Returns: + bool: True if the document's score is valid, False otherwise. + """ + ret = text.strip()[-1] in self._story_terminators + return ret + + def keep_document(self, score) -> bool: + return score diff --git a/tutorials/tinystories/helpers.py b/tutorials/tinystories/helpers.py new file mode 100644 index 00000000..ac486a56 --- /dev/null +++ b/tutorials/tinystories/helpers.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +from docbuilder import ( + TinyStoriesExtractor, + TinyStoriesIterator, +) + + +def write_jsonl(input_filename: str, output_dir: str, dump_every_n: int = 10000): + """ + Convert a file to JSONL format and write it to the specified output directory. + + Args: + input_filename (str): The path to the input file. 
+ output_dir (str): The path to the output directory. + dump_every_n (int, optional): The number of records to dump to file at once. + """ + if os.path.isdir(output_dir): + if len(os.listdir(output_dir)) > 0: + print( + f"Output directory '{output_dir}' is not empty. Skipping conversion to JSONL." + ) + return + else: + os.makedirs(output_dir) + + print(f"Converting '{input_filename}' to JSONL format (output: '{output_dir}')...") + + basename = os.path.basename(input_filename) + iterator = TinyStoriesIterator() + extractor = TinyStoriesExtractor() + to_dump = [] + dump_ctr = 0 + + def dump_to_file(to_dump, dump_ctr): + """Helper function to facilitate dumping to file.""" + output_filename = f"{basename}-{dump_ctr}.jsonl" + with open(os.path.join(output_dir, output_filename), "w") as output_file: + output_file.writelines(to_dump) + # Empty out the list and increment the counter. + return [], dump_ctr + 1 + + for item in iterator.iterate(input_filename): + record_meta, content = item + extracted = extractor.extract(content) + + if extracted is None: + continue + + text_meta, text = extracted + + if text is None: + continue + + line = { + "text": text, + **text_meta, + **record_meta, + } + json_out = json.dumps(line, ensure_ascii=False) + to_dump.append(json_out + "\n") + + # Should we dump what we have so far? + if len(to_dump) == dump_every_n: + to_dump, dump_ctr = dump_to_file(to_dump, dump_ctr) + + # Dump the remaining records. + if to_dump: + dump_to_file(to_dump, dump_ctr) diff --git a/tutorials/tinystories/main.py b/tutorials/tinystories/main.py new file mode 100644 index 00000000..cd0c1096 --- /dev/null +++ b/tutorials/tinystories/main.py @@ -0,0 +1,232 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil +from typing import Any + +from docbuilder import TinyStoriesDownloader +from filters import IncompleteStoryFilter +from helpers import write_jsonl +from modifiers import QuotationUnifier + +from nemo_curator import ScoreFilter, Sequential +from nemo_curator.datasets import DocumentDataset +from nemo_curator.filters import ( + RepeatingTopNGramsFilter, + WordCountFilter, +) +from nemo_curator.modifiers.pii_modifier import PiiModifierBatched +from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter +from nemo_curator.modules import ExactDuplicates +from nemo_curator.modules.modify import Modify +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.script_utils import add_distributed_args + +SCRIPT_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(SCRIPT_DIR_PATH, "data") +JSONL_ROOT_DIR = os.path.join(DATA_DIR, "jsonl") +# The TinyStories dataset is split into two files, one for training and one for validation. +# For the purposes of this tutorial, we will use the smaller validation file to demonstrate the curation pipeline. 
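+# After download, `write_jsonl` in helpers.py converts the raw text into JSONL
+# records shaped roughly like this (story text shortened for illustration):
+#
+#   {"text": "Once upon a time ...", "filename": "TinyStories-valid.txt", "id": "TinyStories-valid.txt-0"}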
+TINY_STORIES_URL = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories-valid.txt" + + +def download_and_convert_to_jsonl() -> str: + """ + Downloads the TinyStories dataset and converts it to JSONL format. + + Returns: + str: The directory path where the JSONL files are saved. + """ + + # Download the TinyStories dataset. + downloader = TinyStoriesDownloader(DATA_DIR) + tinystories_val_fp = downloader.download(TINY_STORIES_URL) + + # Convert to JSONL files. + jsonl_dir = os.path.join(JSONL_ROOT_DIR, "val") + write_jsonl(tinystories_val_fp, jsonl_dir) + + return jsonl_dir + + +def clean_and_unify(dataset: DocumentDataset) -> DocumentDataset: + """ + Cleans and unifies the given dataset using a set of predefined cleaners. + + Args: + dataset (DocumentDataset): The dataset to be cleaned and unified. + + Returns: + DocumentDataset: The cleaned and unified dataset. + """ + cleaners = Sequential( + [ + # Unify all the quotation marks + Modify(QuotationUnifier()), + # Unify all unicode + Modify(UnicodeReformatter()), + ] + ) + return cleaners(dataset) + + +def filter_dataset(dataset: DocumentDataset) -> DocumentDataset: + """ + Filters the given dataset based on various criteria. + + Args: + dataset (DocumentDataset): The dataset to be filtered. + + Returns: + DocumentDataset: The filtered dataset. + """ + filters = Sequential( + [ + ScoreFilter( + WordCountFilter(min_words=80), + text_field="text", + score_field="word_count", + ), + ScoreFilter(IncompleteStoryFilter(), text_field="text"), + ScoreFilter( + RepeatingTopNGramsFilter(n=2, max_repeating_ngram_ratio=0.2), + text_field="text", + ), + ScoreFilter( + RepeatingTopNGramsFilter(n=3, max_repeating_ngram_ratio=0.18), + text_field="text", + ), + ScoreFilter( + RepeatingTopNGramsFilter(n=4, max_repeating_ngram_ratio=0.16), + text_field="text", + ), + ] + ) + filtered_dataset = filters(dataset) + return filtered_dataset + + +def redact_pii(dataset: DocumentDataset) -> DocumentDataset: + """ + Redacts personally identifiable information (PII) from a given dataset. + + Args: + dataset (DocumentDataset): The dataset containing documents with PII. + + Returns: + DocumentDataset: The redacted dataset with PII replaced by a generic value. + """ + redactor = Modify( + PiiModifierBatched( + supported_entities=["PERSON"], + anonymize_action="replace", + device="cpu", + ), + batched=True, + ) + return redactor(dataset) + + +def dedupe(dataset: DocumentDataset) -> DocumentDataset: + """ + Remove exact duplicates from the given DocumentDataset. + + Args: + dataset (DocumentDataset): The dataset containing documents. + + Returns: + DocumentDataset: The deduplicated dataset. + """ + deduplicator = ExactDuplicates(id_field="id", text_field="text", hash_method="md5") + # Find the duplicates + duplicates = deduplicator(dataset) + docs_to_remove = duplicates.df.map_partitions( + lambda x: x[x._hashes.duplicated(keep="first")] + ) + # Remove the duplicates using their IDs. + duplicate_ids = list(docs_to_remove.compute().id) + dataset_df = dataset.df + deduped = dataset_df[~dataset_df.id.isin(duplicate_ids)] + return DocumentDataset(deduped) + + +def run_curation_pipeline(args: Any, jsonl_dir: str) -> None: + """ + Run the curation pipeline on the TinyStories dataset. + + Args: + args (Any): Command-line arguments. + jsonl_dir (str): Directory path where the JSONL files are stored. + """ + # Initialize the Dask cluster. 
+ client = get_client(args, args.device) + print(f"Running curation pipeline on '{jsonl_dir}'...") + files = [ + fp + for fp in get_all_files_paths_under(jsonl_dir, recurse_subdirectories=False) + if fp.endswith(".jsonl") + ] + print("Reading the data...") + orig_dataset = DocumentDataset.read_json(files, add_filename=True) + dataset = orig_dataset + + curation_steps = Sequential( + [ + clean_and_unify, + filter_dataset, + dedupe, + redact_pii, + ] + ) + dataset = curation_steps(dataset) + print("Executing the pipeline...") + dataset = dataset.persist() + + print(f"Original dataset length: {len(orig_dataset.df)}") + print(f"After dataprep: {len(dataset.df)}") + print("Writing the results to disk...") + + # Overwrite existing files in the curated directory. + out_path = os.path.join(jsonl_dir, "curated") + + if os.path.isdir(out_path): + shutil.rmtree(out_path) + + os.makedirs(out_path) + dataset.to_json(out_path, write_to_filename=True) + client.close() + + +def main(): + parser = argparse.ArgumentParser() + parser = add_distributed_args(parser) + args = parser.parse_args() + # Limit the total number of workers to ensure we don't run out of memory. + args.n_workers = min(args.n_workers, 8) + + # Prepare the download and JSONL directories. + if not os.path.isdir(DATA_DIR): + os.makedirs(DATA_DIR) + if not os.path.isdir(JSONL_ROOT_DIR): + os.makedirs(JSONL_ROOT_DIR) + + jsonl_val_dir = download_and_convert_to_jsonl() + run_curation_pipeline(args, jsonl_val_dir) + + +if __name__ == "__main__": + main() diff --git a/tutorials/tinystories/modifiers.py b/tutorials/tinystories/modifiers.py new file mode 100644 index 00000000..cd06ef38 --- /dev/null +++ b/tutorials/tinystories/modifiers.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + + +class QuotationUnifier(DocumentModifier): + """ + A simple modifier that unifies the quotation marks in the documents. + """ + + def modify_document(self, text: str) -> str: + """ + Modifies the given text by replacing left and right single quotes with normal single quotes, + and replacing left and right double quotes with normal double quotes. + + Args: + text (str): The text to be modified. + + Returns: + str: The modified text. + """ + text = text.replace("‘", "'").replace("’", "'") + text = text.replace("“", '"').replace("”", '"') + return text
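+
+
+# A minimal usage sketch, mirroring how clean_and_unify in main.py applies this
+# modifier (assumes `dataset` is a DocumentDataset with a "text" column):
+#
+#   from nemo_curator.modules.modify import Modify
+#
+#   unify = Modify(QuotationUnifier())
+#   unified_dataset = unify(dataset)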