Improve Common Crawl download (#82)
* Add error for invalid snapshot order

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add error for cc news snapshot order

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Improve error messaging for no urls

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Reformat error

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update documentation

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update links

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Update download tests

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

* Add output format to test

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>

---------

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
ryantwolf authored Jun 28, 2024
1 parent e28fcec commit 3d57926
Showing 5 changed files with 127 additions and 2 deletions.
6 changes: 5 additions & 1 deletion docs/user-guide/download.rst
@@ -20,6 +20,10 @@ In addition, it provides a flexible interface to extend the utility to other dat
Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
and extracting the relevant text with jusText to output :code:`.jsonl` files.

NeMo Curator currently does not provide out-of-the-box support for web-crawling or web-scraping.
Instead, it provides utilities for downloading and extracting data from the preexisting online sources given above.
Users who need crawling or scraping can implement those functions themselves and automatically scale them with the framework described below.

-----------------------------------------
Usage
-----------------------------------------
@@ -39,7 +43,7 @@ By "extraction", we typically mean the process of converting a data format from
common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")
* ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
* ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
* ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
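
As a minimal sketch of how the new validation surfaces to a caller, assuming ``download_common_crawl`` is imported from ``nemo_curator.download`` as elsewhere in this guide; the output folder is hypothetical, and the snapshot range is deliberately reversed to trigger the error added in this commit:

```python
from nemo_curator.download import download_common_crawl

try:
    # Deliberately reversed range: the start snapshot "2021-10" is after the
    # end snapshot "2021-04", which the download utilities now reject with a
    # ValueError instead of quietly downloading nothing.
    common_crawl = download_common_crawl(
        "/extracted/output/folder",  # hypothetical output directory
        "2021-10",
        "2021-04",
        output_type="jsonl",
    )
except ValueError as err:
    print(f"Invalid snapshot range: {err}")
```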

8 changes: 8 additions & 0 deletions nemo_curator/download/commoncrawl.py
@@ -313,6 +313,14 @@ def download_common_crawl(
common_crawl_urls = get_common_crawl_urls(
starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
)

if len(common_crawl_urls) == 0:
raise ValueError(
f"No Common Crawl download urls found between {start_snapshot} and {end_snapshot}. "
"Ensure that a valid Common Crawl snapshot (https://data.commoncrawl.org/) is "
"within the range provided."
)

if url_limit:
common_crawl_urls = common_crawl_urls[:url_limit]
output_paths = list(
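The ``url_limit`` branch above caps how many WARC URLs are processed. A hedged sketch of using it to smoke-test a snapshot range before committing to a full download, assuming ``url_limit`` is exposed as a keyword argument of ``download_common_crawl`` (paths are hypothetical):

```python
from nemo_curator.download import download_common_crawl

# Hypothetical smoke test: cap the run at the first two WARC URLs so the new
# range validation and URL listing are exercised without a full download.
sample = download_common_crawl(
    "/tmp/cc_sample",  # hypothetical scratch directory
    "2021-04",
    "2021-10",
    output_type="jsonl",
    url_limit=2,
)
```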
7 changes: 6 additions & 1 deletion nemo_curator/download/doc_builder.py
@@ -182,8 +182,13 @@ def download_and_extract(
Returns:
A DocumentDataset of the downloaded data
"""
if len(urls) == 0:
raise ValueError("No urls were provided to download")

if len(urls) != len(output_paths):
raise ValueError("Different number of urls and output_paths")
raise ValueError(
f"Different number of urls and output_paths. {len(urls)} urls vs {len(output_paths)} output_paths"
)

output_format = dict(sorted(output_format.items()))
df = dd.from_map(
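The two new checks reject empty URL lists and mismatched list lengths. One way a caller might build ``output_paths`` so the length check always passes, sketched under the assumption that the per-file naming roughly mirrors what ``download_common_crawl`` does internally (the download directory is hypothetical):

```python
import os

from nemo_curator.download.commoncrawl import get_common_crawl_urls

download_dir = "/raw/common_crawl"  # hypothetical local directory
urls = get_common_crawl_urls("2021-04", "2021-10")

# One output path per URL, so len(urls) == len(output_paths) by construction
# and the new length check in download_and_extract passes.
output_paths = [
    os.path.join(download_dir, os.path.basename(url) + ".jsonl") for url in urls
]
```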
10 changes: 10 additions & 0 deletions nemo_curator/utils/download_utils.py
@@ -36,6 +36,11 @@ def get_main_warc_paths(
start_date = datetime.fromisocalendar(beg_year, beg_week, 1)
end_date = datetime.fromisocalendar(end_year, end_week, 1)

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
)

if beg_year < 2013 or end_year < 2013:
print("Warning: Only snapshots after 2013 are supported by this script")

@@ -70,6 +75,11 @@ def get_news_warc_paths(
# Get current year and month
today = datetime.now()

if start_date > end_date:
raise ValueError(
f"Start snapshot '{start_date}' is after end snapshot '{end_date}'"
)

if beg.year < 2016 or end.year > today.year:
print(
"Warning: WARC paths exist only from 2016-8 to "
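For reference, a standalone sketch of the ordering check that both hunks add; ``snapshot_to_date`` is a hypothetical helper introduced only for illustration, while the library performs the equivalent parse inline with ``datetime.fromisocalendar``:

```python
from datetime import datetime


def snapshot_to_date(snapshot: str) -> datetime:
    # Hypothetical helper: map a "YYYY-WW" snapshot id to the Monday of that ISO week.
    year, week = map(int, snapshot.split("-"))
    return datetime.fromisocalendar(year, week, 1)


start_snapshot, end_snapshot = "2021-10", "2021-04"  # deliberately reversed
if snapshot_to_date(start_snapshot) > snapshot_to_date(end_snapshot):
    raise ValueError(
        f"Start snapshot '{start_snapshot}' is after end snapshot '{end_snapshot}'"
    )
```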
98 changes: 98 additions & 0 deletions tests/test_download.py
@@ -1,3 +1,16 @@
from pathlib import Path

import pytest

from nemo_curator.download import download_and_extract
from nemo_curator.download.commoncrawl import (
CommonCrawlWARCDownloader,
CommonCrawlWARCExtractor,
CommonCrawlWARCIterator,
get_common_crawl_urls,
)


class TestDownload:
def test_imports(self):
from nemo_curator.download import (
@@ -7,3 +20,88 @@ def test_imports(self):
)

assert True

def test_common_crawl_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_incorrect_snapshot_order(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

def test_common_crawl_news_urls(self):
start_snapshot = "2021-04"
end_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/04/CC-NEWS-20210401004522-01022.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-NEWS/2021/10/CC-NEWS-20211031225258-00089.warc.gz"
)
assert len(urls) == 3838

def test_incorrect_snapshot_order_news(self):
with pytest.raises(ValueError):
end_snapshot = "2021-04"
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

def test_uneven_common_crawl_range(self):
start_snapshot = "2021-03"
end_snapshot = "2021-11"
urls = get_common_crawl_urls(start_snapshot, end_snapshot)

assert (
urls[0]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-10/segments/1614178347293.1/warc/CC-MAIN-20210224165708-20210224195708-00000.warc.gz"
)
assert (
urls[-1]
== "https://data.commoncrawl.org/crawl-data/CC-MAIN-2021-04/segments/1610704847953.98/warc/CC-MAIN-20210128134124-20210128164124-00799.warc.gz"
)
assert len(urls) == 143840

def test_no_urls(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
[],
[],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)

def test_url_path_mismatch(self):
with pytest.raises(ValueError):
output_format = {
"text": str,
}
download_and_extract(
["one", "two", "three"],
["one"],
CommonCrawlWARCDownloader(download_dir="."),
CommonCrawlWARCIterator(),
CommonCrawlWARCExtractor(),
output_format,
)
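
Assuming the repository layout shown above, the new cases can be selected by name with pytest; note that the URL-listing tests fetch the Common Crawl index over the network, so they need outbound connectivity:

```python
import pytest

# Hypothetical invocation from the repository root. The -k filter selects the
# new error-handling cases by test name.
pytest.main(["tests/test_download.py", "-k", "snapshot_order or no_urls or url_path_mismatch"])
```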
