Improve Common Crawl download (#82)
* Add error for invalid snapshot order

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add error for cc news snapshot order

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Improve error messaging for no urls

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Reformat error

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update documentation

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update links

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update download tests

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add output format to test

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

---------

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
ryantwolf authored Jun 28, 2024
1 parent e28fcec commit 3d57926
Showing 5 changed files with 127 additions and 2 deletions.
6 changes: 5 additions & 1 deletion docs/user-guide/download.rst
@@ -20,6 +20,10 @@ In addition, it provides a flexible interface to extend the utility to other dat
Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
and extracting the relevant text with jusText to output :code:`.jsonl` files.

NeMo Curator currently does not provide out-of-the-box support for web-crawling or web-scraping.
Instead, it provides utilities for downloading and extracting data from the preexisting online sources given above.
Users who need crawling or scraping can implement those functions themselves and automatically scale them with the framework described below.

-----------------------------------------
Usage
-----------------------------------------
@@ -39,7 +43,7 @@ By "extraction", we typically mean the process of converting a data format from
common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")
* ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
* ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
* ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
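
As a minimal sketch of how the new validation surfaces to a caller, assuming ``download_common_crawl`` is imported from ``nemo_curator.download`` as elsewhere in this guide; the output folder is hypothetical, and the snapshot range is deliberately reversed to trigger the error added in this commit:

```python
from nemo_curator.download import download_common_crawl

try:
    # Deliberately reversed range: the start snapshot "2021-10" is after the
    # end snapshot "2021-04", which the download utilities now reject with a
    # ValueError instead of quietly downloading nothing.
    common_crawl = download_common_crawl(
        "/extracted/output/folder",  # hypothetical output directory
        "2021-10",
        "2021-04",
        output_type="jsonl",
    )
except ValueError as err:
    print(f"Invalid snapshot range: {err}")
```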

8 changes: 8 additions & 0 deletions nemo_curator/download/commoncrawl.py
@@ -313,6 +313,14 @@ def download_common_crawl(
common_crawl_urls = get_common_crawl_urls(
starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
)

if len(common_crawl_urls) == 0:
raise ValueError(
f"No Common Crawl download urls found between {start_snapshot} and {end_snapshot}. "
"Ensure that a valid Common Crawl snapshot (https://data.commoncrawl.org/) is "
"within the range provided."
)

if url_limit:
common_crawl_urls = common_crawl_urls[:url_limit]
output_paths = list(
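The ``url_limit`` branch above caps how many WARC URLs are processed. A hedged sketch of using it to smoke-test a snapshot range before committing to a full download, assuming ``url_limit`` is exposed as a keyword argument of ``download_common_crawl`` (paths are hypothetical):

```python
from nemo_curator.download import download_common_crawl

# Hypothetical smoke test: cap the run at the first two WARC URLs so the new
# range validation and URL listing are exercised without a full download.
sample = download_common_crawl(
    "/tmp/cc_sample",  # hypothetical scratch directory
    "2021-04",
    "2021-10",
    output_type="jsonl",
    url_limit=2,
)
```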
7 changes: 6 additions & 1 deletion nemo_curator/download/doc_builder.py
@@ -182,8 +182,13 @@ def download_and_extract(
Returns:
A DocumentDataset of the downloaded data
"""
if len(urls) == 0:
raise ValueError("No urls were provided to download")

if len(urls) != len(output_paths):
raise ValueError("Different number of urls and output_paths")
raise ValueError(
f"Different number of urls and output_paths. {len(urls)} urls vs {len(output_paths)} output_paths"
)

output_format = dict(sorted(output_format.items()))
df = dd.from_map(
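The two new checks reject empty URL lists and mismatched list lengths. One way a caller might build ``output_paths`` so the length check always passes, sketched under the assumption that the per-file naming roughly mirrors what ``download_common_crawl`` does internally (the download directory is hypothetical):

```python
import os

from nemo_curator.download.commoncrawl import get_common_crawl_urls

download_dir = "/raw/common_crawl"  # hypothetical local directory
urls = get_common_crawl_urls("2021-04", "2021-10")

# One output path per URL, so len(urls) == len(output_paths) by construction
# and the new length check in download_and_extract passes.
output_paths = [
    os.path.join(download_dir, os.path.basename(url) + ".jsonl") for url in urls
]
```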
10 changes: 10 additions & 0 deletions nemo_curator/utils/download_utils.py
@@ -36,6 +36,11 @@ def get_main_warc_paths(
start_date = datetime.fromisocalendar(beg_year, beg_week, 1)
end_date = datetime.fromisocalendar(end_year, end_week, 1)

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
)

if beg_year < 2013 or end_year < 2013:
print("Warning: Only snapshots after 2013 are supported by this script")

@@ -70,6 +75,11 @@ def get_news_warc_paths(
# Get current year and month
today = datetime.now()

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_date}' is after end snapshot '{end_date}'"
)

if beg.year < 2016 or end.year > today.year:
print(
"Warning: WARC paths exist only from 2016-8 to "
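For reference, a standalone sketch of the ordering check that both hunks add; ``snapshot_to_date`` is a hypothetical helper introduced only for illustration, while the library performs the equivalent parse inline with ``datetime.fromisocalendar``:

```python
from datetime import datetime


def snapshot_to_date(snapshot: str) -> datetime:
    # Hypothetical helper: map a "YYYY-WW" snapshot id to the Monday of that ISO week.
    year, week = map(int, snapshot.split("-"))
    return datetime.fromisocalendar(year, week, 1)


start_snapshot, end_snapshot = "2021-10", "2021-04"  # deliberately reversed
if snapshot_to_date(start_snapshot) > snapshot_to_date(end_snapshot):
    raise ValueError(
        f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
    )
```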
98 changes: 98 additions & 0 deletions tests/test_download.py
@@ -1,3 +1,16 @@
from pathlib import Path

import pytest

from nemo_curator.download import download_and_extract
from nemo_curator.download.commoncrawl import (
CommonCrawlWARCDownloader,
CommonCrawlWARCExtractor,
CommonCrawlWARCIterator,
get_common_crawl_urls,
)


class TestDownload:
def test_imports(self):
from nemo_curator.download import (
@@ -7,3 +20,88 @@ def test_imports(self):
)

assert True

def test_common_crawl_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_incorrect_snapshot_order(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

def test_common_crawl_news_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
)
assert len(urls) == 3838

def test_incorrect_snapshot_order_news(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

def test_uneven_common_crawl_range(self):
start_snapshot = "2021-03"
end_snapshot = "2021-11"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_no_urls(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
[],
[],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)

def test_url_path_mismatch(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
["one", "two", "three"],
["one"],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)
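
Assuming the repository layout shown above, the new cases can be selected by name with pytest; note that the URL-listing tests fetch the Common Crawl index over the network, so they need outbound connectivity:

```python
import pytest

# Hypothetical invocation from the repository root. The -k filter selects the
# new error-handling cases by test name.
pytest.main(["tests/test_download.py", "-k", "snapshot_order or no_urls or url_path_mismatch"])
```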
