Skip to content

Commit

Permalink
file_download: update to new Zenodo API
Browse files Browse the repository at this point in the history
  • Loading branch information
rpauszek committed Oct 16, 2023
1 parent 8ed0967 commit 166f2e6
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 12 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#### Bug fixes

* Fixed `lk.download_from_doi()` to align with the new Zenodo REST API.
* Fixed a bug where the minimum length field of an exported `KymoTrackGroup` was formatted as an integer resulting in rounding errors when storing the tracks. Note that an incorrect minimum length can lead to biases when performing dwell time analysis. These values are now properly formatted as floating point numbers. The entry in the header was also changed to "minimum observable duration (seconds)" for additional clarity. This bug was introduced in version `1.2.0`.
* Fixed a bug that prevented resaving a `KymoTrackGroup` loaded from an older version of Pylake.
* Fixed a bug that inadvertently made us rely on `cachetools>=5.x`. Older versions of `cachetools` did not pass the instance to the key function resulting in a `TypeError: key() missing 1 required positional argument: '_'` error when accessing cached properties or methods.
Expand Down
16 changes: 10 additions & 6 deletions lumicks/pylake/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def download_file(url, target_path, download_path, show_progress=True, block_siz
file.write(data)


def verify_hash(file_name, algorithm, reference_hash, chunk_size=65536):
def verify_hash(file_name, reference_hash, chunk_size=65536):
"""Verify the hash of a file"""
m = hashlib.new(algorithm)
m = hashlib.new("md5")
with open(file_name, "rb") as f:
b = f.read(chunk_size)
while len(b) > 0:
Expand Down Expand Up @@ -119,16 +119,20 @@ def download_from_doi(doi, target_path="", force_download=False, show_progress=T

file_names = []
for file in record_metadata["files"]:
file_name, url = file["key"], file["links"]["self"]
file_name = file["filename"]
url = (
f"https://zenodo.org/api/records/{record_metadata['record_id']}"
f"/files/{file_name}/content"
)

full_path = os.path.join(target_path, file_name)

# If the file doesn't exist, we can't skip it
download = not os.path.exists(full_path)

# If a file with the requested filename exists but does not match the data from Zenodo,
# throw an error.
hash_algorithm, checksum = file["checksum"].split(":")
if not download and not verify_hash(full_path, hash_algorithm, checksum):
if not download and not verify_hash(full_path, file["checksum"]):
if not force_download:
raise RuntimeError(
f"File {file_name} does not match file from Zenodo. Set force_download=True "
Expand All @@ -139,7 +143,7 @@ def download_from_doi(doi, target_path="", force_download=False, show_progress=T
# Only download what we don't have yet.
if download or force_download:
download_file(url, target_path, file_name, show_progress)
if not verify_hash(full_path, hash_algorithm, checksum):
if not verify_hash(full_path, file["checksum"]):
raise RuntimeError("Download failed. Invalid checksum after download.")

file_names.append(full_path)
Expand Down
12 changes: 6 additions & 6 deletions lumicks/pylake/tests/test_file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

@pytest.mark.preflight
def test_grab_record():
assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/record/4247279"
assert get_url_from_doi("10.5281/zenodo.4247279") == "https://zenodo.org/records/4247279"

with pytest.raises(RuntimeError, match="DOI could not be resolved"):
assert get_url_from_doi("10.55281/zenodo.4247279")
Expand All @@ -21,8 +21,8 @@ def test_download_record_metadata():
record = download_record_metadata("4280789") # Older version of Pylake

# Verify that the fields we rely on stay the same
assert record["files"][0]["checksum"] == "md5:1a401193ab22f0983f87855e2581075b"
assert record["files"][0]["key"] == "lumicks/pylake-v0.7.1.zip"
assert record["files"][0]["checksum"] == "1a401193ab22f0983f87855e2581075b"
assert record["files"][0]["filename"] == "lumicks/pylake-v0.7.1.zip"
assert record["files"][0]["links"]["self"].startswith("https://zenodo.org/") # Link may change


Expand All @@ -39,14 +39,14 @@ def test_download_from_doi(tmpdir_factory):
files = download_from_doi("10.5281/zenodo.4247279", tmpdir, show_progress=False)

# Validate checksum
assert verify_hash(files[0], *record["files"][0]["checksum"].split(":"))
assert verify_hash(files[0], record["files"][0]["checksum"])

# Add a random character such that the checksum fails
with open(files[0], "ab") as f:
f.write(b"\x21")

# Validate that the hash is no longer correct
assert not verify_hash(files[0], *record["files"][0]["checksum"].split(":"))
assert not verify_hash(files[0], record["files"][0]["checksum"])

with pytest.raises(
RuntimeError,
Expand All @@ -58,4 +58,4 @@ def test_download_from_doi(tmpdir_factory):
download_from_doi("10.5281/zenodo.4247279", tmpdir, force_download=True, show_progress=False)

# Validate checksum after forced re-download (should be OK again)
assert verify_hash(files[0], *record["files"][0]["checksum"].split(":"))
assert verify_hash(files[0], record["files"][0]["checksum"])

0 comments on commit 166f2e6

Please sign in to comment.