Improve Common Crawl download #82

Merged (9 commits) on Jun 28, 2024
6 changes: 5 additions & 1 deletion docs/user-guide/download.rst
@@ -20,6 +20,10 @@ In addition, it provides a flexible interface to extend the utility to other datasets
Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
and extracting the relevant text with jusText to output :code:`.jsonl` files.

NeMo Curator currently does not provide out-of-the-box support for web-crawling or web-scraping.
It provides utilities for downloading and extracting data from the preexisting online sources given above.
Users who need web-crawling or web-scraping can implement those functions themselves and scale them automatically with the framework described below.

-----------------------------------------
Usage
-----------------------------------------
@@ -39,7 +43,7 @@ By "extraction", we typically mean the process of converting a data format from
common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")

* ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
* ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
* ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
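
Putting these parameters together, here is a minimal sketch of the same call in a script. The import path follows the rest of this guide, the output folder is a placeholder, and the ``url_limit`` keyword (used in the function body changed later in this pull request) is assumed to accept a small cap for quick trials:

from nemo_curator.download import download_common_crawl

# Download WARC files for snapshots 2020-50 through 2021-04, filter languages
# with pyCLD2, extract text with jusText, and return a DocumentDataset.
common_crawl = download_common_crawl(
    "/extracted/output/folder",
    "2020-50",
    "2021-04",
    output_type="jsonl",
    url_limit=5,  # assumption: limits how many WARC files are processed
)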

8 changes: 8 additions & 0 deletions nemo_curator/download/commoncrawl.py
@@ -313,6 +313,14 @@ def download_common_crawl(
common_crawl_urls = get_common_crawl_urls(
starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
)

if len(common_crawl_urls) == 0:
raise ValueError(
f"No Common Crawl download urls found between {start_snapshot} and {end_snapshot}. "
"Ensure that a valid Common Crawl snapshot (https://data.commoncrawl.org/) is "
"within the range provided."
)

if url_limit:
common_crawl_urls = common_crawl_urls[:url_limit]
output_paths = list(
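
From the caller's side, a sketch of how this new guard surfaces. The snapshot range below is assumed, for illustration only, to contain no published crawl, so the URL list comes back empty and the new ValueError fires before anything is downloaded:

from nemo_curator.download import download_common_crawl

try:
    # Assumed-invalid range: no Common Crawl snapshot between weeks 05 and 09 of 2021
    download_common_crawl("/tmp/cc_output", "2021-05", "2021-09", output_type="jsonl")
except ValueError as err:
    # "No Common Crawl download urls found between 2021-05 and 2021-09. ..."
    print(err)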
7 changes: 6 additions & 1 deletion nemo_curator/download/doc_builder.py
@@ -182,8 +182,13 @@ def download_and_extract(
Returns:
A DocumentDataset of the downloaded data
"""
if len(urls) == 0:
raise ValueError("No urls were provided to download")

if len(urls) != len(output_paths):
raise ValueError("Different number of urls and output_paths")
raise ValueError(
f"Different number of urls and output_paths. {len(urls)} urls vs {len(output_paths)} output_paths"
)

output_format = dict(sorted(output_format.items()))
df = dd.from_map(
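
The stricter message can be seen directly; the sketch below mirrors the test_url_path_mismatch case added at the end of this pull request. The placeholder URLs are never fetched because the length check runs before any download:

from nemo_curator.download import download_and_extract
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCDownloader,
    CommonCrawlWARCExtractor,
    CommonCrawlWARCIterator,
)

# Three URLs but only one output path trips the new check immediately.
download_and_extract(
    ["one", "two", "three"],
    ["one"],
    CommonCrawlWARCDownloader(download_dir="."),
    CommonCrawlWARCIterator(),
    CommonCrawlWARCExtractor(),
    {"text": str},
)
# ValueError: Different number of urls and output_paths. 3 urls vs 1 output_paths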
10 changes: 10 additions & 0 deletions nemo_curator/utils/download_utils.py
@@ -36,6 +36,11 @@ def get_main_warc_paths(
start_date = datetime.fromisocalendar(beg_year, beg_week, 1)
end_date = datetime.fromisocalendar(end_year, end_week, 1)

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
)

if beg_year < 2013 or end_year < 2013:
print("Warning: Only snapshots after 2013 are supported by this script")

@@ -70,6 +75,11 @@ def get_news_warc_paths(
# Get current year and month
today = datetime.now()

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_date}' is after end snapshot '{end_date}'"
)

if beg.year < 2016 or end.year > today.year:
print(
"Warning: WARC paths exist only from 2016-8 to "
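
Both ordering checks surface through the public helper get_common_crawl_urls; a short sketch mirroring the new ordering tests below:

from nemo_curator.download.commoncrawl import get_common_crawl_urls

# The start snapshot is later than the end snapshot, so the path helpers raise.
try:
    get_common_crawl_urls("2021-10", "2021-04")
except ValueError as err:
    print(err)  # Start snapshot '2021-10' is after end snapshot '2021-04'

# The same check applies to the news crawl.
try:
    get_common_crawl_urls("2021-10", "2021-04", news=True)
except ValueError as err:
    print(err)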
98 changes: 98 additions & 0 deletions tests/test_download.py
@@ -1,3 +1,16 @@
from pathlib import Path

import pytest

from nemo_curator.download import download_and_extract
from nemo_curator.download.commoncrawl import (
CommonCrawlWARCDownloader,
CommonCrawlWARCExtractor,
CommonCrawlWARCIterator,
get_common_crawl_urls,
)


class TestDownload:
def test_imports(self):
from nemo_curator.download import (
@@ -7,3 +20,88 @@ def test_imports(self):
)

assert True

def test_common_crawl_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_incorrect_snapshot_order(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

def test_common_crawl_news_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
)
assert len(urls) == 3838

def test_incorrect_snapshot_order_news(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

def test_uneven_common_crawl_range(self):
start_snapshot = "2021-03"
end_snapshot = "2021-11"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_no_urls(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
[],
[],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)

def test_url_path_mismatch(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
["one", "two", "three"],
["one"],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)