diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index e2142de7..c6404264 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -20,6 +20,10 @@ In addition, it provides a flexible interface to extend the utility to other dat
 
 Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2, and extracting the relevant text with jusText to output :code:`.jsonl` files.
 
+NeMo Curator currently does not provide out-of-the-box support for web crawling or web scraping.
+It provides utilities for downloading and extracting data from the preexisting online sources given above.
+Users who need crawling or scraping can implement those functions themselves and automatically scale them with the framework described below.
+
 -----------------------------------------
 Usage
 -----------------------------------------
@@ -39,7 +43,7 @@ By "extraction", we typically mean the process of converting a data format from
     common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")
 
   * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
-  * ``"2020-50"`` is the first common crawl snapshot that will be included in the download.
+  * ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
   * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
   * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
 
diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index 15736ee9..6373f967 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -313,6 +313,14 @@ def download_common_crawl(
     common_crawl_urls = get_common_crawl_urls(
         starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
     )
+
+    if len(common_crawl_urls) == 0:
+        raise ValueError(
+            f"No Common Crawl download urls found between {start_snapshot} and {end_snapshot}. "
+            "Ensure that a valid Common Crawl snapshot (https://data.commoncrawl.org/) is "
+            "within the range provided."
+        )
+
     if url_limit:
         common_crawl_urls = common_crawl_urls[:url_limit]
     output_paths = list(
diff --git a/nemo_curator/download/doc_builder.py b/nemo_curator/download/doc_builder.py
index 34331875..122d3ea3 100644
--- a/nemo_curator/download/doc_builder.py
+++ b/nemo_curator/download/doc_builder.py
@@ -182,8 +182,13 @@
     Returns:
       A DocumentDataset of the downloaded data
     """
+    if len(urls) == 0:
+        raise ValueError("No urls were provided to download")
+
     if len(urls) != len(output_paths):
-        raise ValueError("Different number of urls and output_paths")
+        raise ValueError(
+            f"Different number of urls and output_paths. {len(urls)} urls vs {len(output_paths)} output_paths"
+        )
 
     output_format = dict(sorted(output_format.items()))
     df = dd.from_map(
diff --git a/nemo_curator/utils/download_utils.py b/nemo_curator/utils/download_utils.py
index 7c33c1ec..76cbc113 100644
--- a/nemo_curator/utils/download_utils.py
+++ b/nemo_curator/utils/download_utils.py
@@ -36,6 +36,11 @@ def get_main_warc_paths(
     start_date = datetime.fromisocalendar(beg_year, beg_week, 1)
     end_date = datetime.fromisocalendar(end_year, end_week, 1)
 
+    if start_date > end_date:
+        raise ValueError(
+            f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
+        )
+
     if beg_year < 2013 or end_year < 2013:
         print("Warning: Only snapshots after 2013 are supported by this script")
 
@@ -70,6 +75,11 @@ def get_news_warc_paths(
 
     # Get current year and month
     today = datetime.now()
+    if start_date > end_date:
+        raise ValueError(
+            f"Start snapshot '{start_date}' is after end snapshot '{end_date}'"
+        )
+
     if beg.year < 2016 or end.year > today.year:
         print(
             "Warning: WARC paths exist only from 2016-8 to "
diff --git a/tests/test_download.py b/tests/test_download.py
index eea9bff5..2d725893 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -1,3 +1,16 @@
+from pathlib import Path
+
+import pytest
+
+from nemo_curator.download import download_and_extract
+from nemo_curator.download.commoncrawl import (
+    CommonCrawlWARCDownloader,
+    CommonCrawlWARCExtractor,
+    CommonCrawlWARCIterator,
+    get_common_crawl_urls,
+)
+
+
 class TestDownload:
     def test_imports(self):
         from nemo_curator.download import (
@@ -7,3 +20,88 @@ def test_imports(self):
         )
 
         assert True
+
+    def test_common_crawl_urls(self):
+        start_snapshot = "2021-04"
+        end_snapshot = "2021-10"
+        urls = get_common_crawl_urls(start_snapshot, end_snapshot)
+
+        assert (
+            urls[0]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
+        )
+        assert (
+            urls[-1]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
+        )
+        assert len(urls) == 143840
+
+    def test_incorrect_snapshot_order(self):
+        with pytest.raises(ValueError):
+            end_snapshot = "2021-04"
+            start_snapshot = "2021-10"
+            urls = get_common_crawl_urls(start_snapshot, end_snapshot)
+
+    def test_common_crawl_news_urls(self):
+        start_snapshot = "2021-04"
+        end_snapshot = "2021-10"
+        urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
+
+        assert (
+            urls[0]
+            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
+        )
+        assert (
+            urls[-1]
+            == "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
+        )
+        assert len(urls) == 3838
+
+    def test_incorrect_snapshot_order_news(self):
+        with pytest.raises(ValueError):
+            end_snapshot = "2021-04"
+            start_snapshot = "2021-10"
+            urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
+
+    def test_uneven_common_crawl_range(self):
+        start_snapshot = "2021-03"
+        end_snapshot = "2021-11"
+        urls = get_common_crawl_urls(start_snapshot, end_snapshot)
+
+        assert (
+            urls[0]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
+        )
+        assert (
+            urls[-1]
+            == "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
+        )
+        assert len(urls) == 143840
+
+    def test_no_urls(self):
+        with pytest.raises(ValueError):
+            output_format = {
+                "text": str,
+            }
+            download_and_extract(
+                [],
+                [],
+                CommonCrawlWARCDownloader(download_dir="."),
+                CommonCrawlWARCIterator(),
+                CommonCrawlWARCExtractor(),
+                output_format,
+            )
+
+    def test_url_path_mismatch(self):
+        with pytest.raises(ValueError):
+            output_format = {
+                "text": str,
+            }
+            download_and_extract(
+                ["one", "two", "three"],
+                ["one"],
+                CommonCrawlWARCDownloader(download_dir="."),
+                CommonCrawlWARCIterator(),
+                CommonCrawlWARCExtractor(),
+                output_format,
+            )
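
A minimal sketch of how the new range validation surfaces to callers of ``download_common_crawl``, following the usage documented in download.rst above. The output directory and the reversed snapshot range below are hypothetical placeholders, not values taken from this patch.

    import pytest

    from nemo_curator.download import download_common_crawl

    # Hypothetical call: the start snapshot "2021-10" comes after the end snapshot
    # "2021-04", so the new check in get_main_warc_paths raises ValueError before
    # any WARC files are downloaded.
    with pytest.raises(ValueError):
        download_common_crawl("/tmp/common_crawl", "2021-10", "2021-04", output_type="jsonl")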