diff --git a/CHANGELOG.md b/CHANGELOG.md index 62fa0e7a75f..4b7fc6be7af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added a `FileLock` class to `common.file_utils`. This is just like the `FileLock` from the `filelock` library, except that + it adds an optional flag `read_only_ok: bool`, which when set to `True` changes the behavior so that a warning will be emitted + instead of an exception when lacking write permissions on an existing file lock. + This makes it possible to use the `FileLock` class on a read-only file system. - Added a new learning rate scheduler: `CombinedLearningRateScheduler`. This can be used to combine different LR schedulers, using one after the other. - Moving `ModelCard` and `TaskCard` abstractions into the main repository. @@ -21,6 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed typo with `LabelField` string representation: removed trailing apostrophe. +- `Vocabulary.from_files` and `cached_path` will issue a warning, instead of failing, when a lock on an existing resource + can't be acquired because the file system is read-only. ## [v1.3.0](https://github.com/allenai/allennlp/releases/tag/v1.3.0) - 2020-12-15 diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index 70bd6fa2c0e..ae3e2182a36 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -33,11 +33,13 @@ import tarfile import shutil import time +import warnings import boto3 import botocore from botocore.exceptions import ClientError, EndpointConnectionError -from filelock import FileLock +from filelock import FileLock as _FileLock +from overrides import overrides import requests from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError @@ -63,6 +65,46 @@ ) +class FileLock(_FileLock): + """ + This is just a subclass of the `FileLock` class from the `filelock` library, except that + it adds an additional argument to the `__init__` method: `read_only_ok`. + + By default this flag is `False`, which an exception will be thrown when a lock + can't be acquired due to lack of write permissions. + But if this flag is set to `True`, a warning will be emitted instead of an error when + the lock already exists but the lock can't be acquired because write access is blocked. + """ + + def __init__( + self, lock_file: Union[str, PathLike], timeout=-1, read_only_ok: bool = False + ) -> None: + super().__init__(str(lock_file), timeout=timeout) + self._read_only_ok = read_only_ok + + @overrides + def acquire(self, timeout=None, poll_interval=0.05): + try: + super().acquire(timeout=timeout, poll_intervall=poll_interval) + except OSError as err: + # OSError could be a lot of different things, but what we're looking + # for in particular are permission errors, such as: + # - errno 1 - EPERM - "Operation not permitted" + # - errno 13 - EACCES - "Permission denied" + # - errno 30 - EROFS - "Read-only file system" + if err.errno not in (1, 13, 30): + raise + + if os.path.isfile(self._lock_file) and self._read_only_ok: + warnings.warn( + f"Lacking permissions required to obtain lock '{self._lock_file}'. " + "Race conditions are possible if other processes are writing to the same resource.", + UserWarning, + ) + else: + raise + + def _resource_to_filename(resource: str, etag: str = None) -> str: """ Convert a `resource` into a hashed filename in a repeatable way. @@ -583,7 +625,7 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: # on the call to `lock.acquire()` until the process currently holding the lock # releases it. logger.debug("waiting to acquire lock on %s", cache_path) - with FileLock(cache_path + ".lock"): + with FileLock(cache_path + ".lock", read_only_ok=True): if os.path.exists(cache_path): logger.info("cache of %s is up-to-date", url) else: diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py index 1cadcc15db9..89661b5bd40 100644 --- a/allennlp/data/vocabulary.py +++ b/allennlp/data/vocabulary.py @@ -11,10 +11,8 @@ from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union, TYPE_CHECKING -from filelock import FileLock - from allennlp.common import Registrable -from allennlp.common.file_utils import cached_path +from allennlp.common.file_utils import cached_path, FileLock from allennlp.common.checks import ConfigurationError from allennlp.common.tqdm import Tqdm from allennlp.common.util import namespace_match @@ -338,7 +336,7 @@ def from_files( # We use a lock file to avoid race conditions where multiple processes # might be reading/writing from/to the same vocab files at once. - with FileLock(os.path.join(directory, ".lock")): + with FileLock(os.path.join(directory, ".lock"), read_only_ok=True): with codecs.open( os.path.join(directory, NAMESPACE_PADDING_FILE), "r", "utf-8" ) as namespace_file: diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index 39c2b855c47..550b30e111e 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -5,12 +5,14 @@ import time import shutil +from filelock import Timeout import pytest import responses from requests.exceptions import ConnectionError from allennlp.common import file_utils from allennlp.common.file_utils import ( + FileLock, _resource_to_filename, filename_to_url, get_from_cache, @@ -59,6 +61,42 @@ def head_callback(_): responses.add_callback(responses.HEAD, url, callback=head_callback) +class TestFileLock(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + # Set up a regular lock and a read-only lock. + open(self.TEST_DIR / "lock", "a").close() + open(self.TEST_DIR / "read_only_lock", "a").close() + os.chmod(self.TEST_DIR / "read_only_lock", 0o555) + + # Also set up a read-only directory. + os.mkdir(self.TEST_DIR / "read_only_dir", 0o555) + + def test_locking(self): + with FileLock(self.TEST_DIR / "lock"): + # Trying to acquire the lock again should fail. + with pytest.raises(Timeout): + with FileLock(self.TEST_DIR / "lock", timeout=0.1): + pass + + # Trying to acquire a lock when lacking write permissions on the file should fail. + with pytest.raises(PermissionError): + with FileLock(self.TEST_DIR / "read_only_lock"): + pass + + # But this should only issue a warning if we set the `read_only_ok` flag to `True`. + with pytest.warns(UserWarning, match="Lacking permissions"): + with FileLock(self.TEST_DIR / "read_only_lock", read_only_ok=True): + pass + + # However this should always fail when we lack write permissions and the file lock + # doesn't exist yet. + with pytest.raises(PermissionError): + with FileLock(self.TEST_DIR / "read_only_dir" / "lock", read_only_ok=True): + pass + + class TestFileUtils(AllenNlpTestCase): def setup_method(self): super().setup_method()