From 166f2e60db8123982457b1d8903a30b33e34e76c Mon Sep 17 00:00:00 2001 From: rpauszek Date: Mon, 16 Oct 2023 17:43:23 +0200 Subject: [PATCH] file_download: update to new Zenodo API --- changelog.md | 1 + lumicks/pylake/file_download.py | 16 ++++++++++------ lumicks/pylake/tests/test_file_download.py | 12 ++++++------ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/changelog.md b/changelog.md index 4dc396306..c9b402cfe 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,7 @@ #### Bug fixes +* Fixed `lk.download_from_doi()` to align with the new Zenodo REST API. * Fixed a bug where the minimum length field of an exported `KymoTrackGroup` was formatted as an integer resulting in rounding errors when storing the tracks. Note that an incorrect minimum length can lead to biases when performing dwell time analysis. These values are now properly formatted as floating point numbers. The entry in the header was also changed to "minimum observable duration (seconds)" for additional clarity. This bug was introduced in version `1.2.0`. * Fixed a bug that prevented resaving a `KymoTrackGroup` loaded from an older version of Pylake. * Fixed a bug that inadvertently made us rely on `cachetools>=5.x`. Older versions of `cachetools` did not pass the instance to the key function resulting in a `TypeError: key() missing 1 required positional argument: '_'` error when accessing cached properties or methods. 
diff --git a/lumicks/pylake/file_download.py b/lumicks/pylake/file_download.py index 7c2f069b8..4acd4d9e9 100644 --- a/lumicks/pylake/file_download.py +++ b/lumicks/pylake/file_download.py @@ -67,9 +67,9 @@ def download_file(url, target_path, download_path, show_progress=True, block_siz file.write(data) -def verify_hash(file_name, algorithm, reference_hash, chunk_size=65536): +def verify_hash(file_name, reference_hash, chunk_size=65536): """Verify the hash of a file""" - m = hashlib.new(algorithm) + m = hashlib.new("md5") with open(file_name, "rb") as f: b = f.read(chunk_size) while len(b) > 0: @@ -119,7 +119,12 @@ def download_from_doi(doi, target_path="", force_download=False, show_progress=T file_names = [] for file in record_metadata["files"]: - file_name, url = file["key"], file["links"]["self"] + file_name = file["filename"] + url = ( + f"https://zenodo.org/api/records/{record_metadata['record_id']}" + f"/files/{file_name}/content" + ) + full_path = os.path.join(target_path, file_name) # If the file doesn't exist, we can't skip it @@ -127,8 +132,7 @@ def download_from_doi(doi, target_path="", force_download=False, show_progress=T # If a file with the requested filename exists but does not match the data from Zenodo, # throw an error. - hash_algorithm, checksum = file["checksum"].split(":") - if not download and not verify_hash(full_path, hash_algorithm, checksum): + if not download and not verify_hash(full_path, file["checksum"]): if not force_download: raise RuntimeError( f"File {file_name} does not match file from Zenodo. Set force_download=True " @@ -139,7 +143,7 @@ def download_from_doi(doi, target_path="", force_download=False, show_progress=T # Only download what we don't have yet. if download or force_download: download_file(url, target_path, file_name, show_progress) - if not verify_hash(full_path, hash_algorithm, checksum): + if not verify_hash(full_path, file["checksum"]): raise RuntimeError("Download failed. 
Invalid checksum after download.") file_names.append(full_path) diff --git a/lumicks/pylake/tests/test_file_download.py b/lumicks/pylake/tests/test_file_download.py index 99edab530..fad51b15b 100644 --- a/lumicks/pylake/tests/test_file_download.py +++ b/lumicks/pylake/tests/test_file_download.py @@ -10,7 +10,7 @@ @pytest.mark.preflight def test_grab_record(): - assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/record/4247279" + assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/records/4247279" with pytest.raises(RuntimeError, match="DOI could not be resolved"): assert get_url_from_doi("10.55281/zenodo.4247279") @@ -21,8 +21,8 @@ def test_download_record_metadata(): record = download_record_metadata("4280789") # Older version of Pylake # Verify that the fields we rely on stay the same - assert record["files"][0]["checksum"] == "md5:1a401193ab22f0983f87855e2581075b" - assert record["files"][0]["key"] == "lumicks/pylake-v0.7.1.zip" + assert record["files"][0]["checksum"] == "1a401193ab22f0983f87855e2581075b" + assert record["files"][0]["filename"] == "lumicks/pylake-v0.7.1.zip" assert record["files"][0]["links"]["self"].startswith("https://zenodo.org/") # Link may change @@ -39,14 +39,14 @@ def test_download_from_doi(tmpdir_factory): files = download_from_doi("10.5281/zenodo.4247279", tmpdir, show_progress=False) # Validate checksum - assert verify_hash(files[0], *record["files"][0]["checksum"].split(":")) + assert verify_hash(files[0], record["files"][0]["checksum"]) # Add a random character such that the checksum fails with open(files[0], "ab") as f: f.write(b"\x21") # Validate that the hash is no longer correct - assert not verify_hash(files[0], *record["files"][0]["checksum"].split(":")) + assert not verify_hash(files[0], record["files"][0]["checksum"]) with pytest.raises( RuntimeError, @@ -58,4 +58,4 @@ def test_download_from_doi(tmpdir_factory): download_from_doi("10.5281/zenodo.4247279", tmpdir, force_download=True, 
show_progress=False) # Validate checksum after forced re-download (should be OK again) - assert verify_hash(files[0], *record["files"][0]["checksum"].split(":")) + assert verify_hash(files[0], record["files"][0]["checksum"])