From c32d11e60f04a0798292c438186199ccdec0db61 Mon Sep 17 00:00:00 2001 From: Shantanu <12621235+hauntsaninja@users.noreply.github.com> Date: Sun, 13 Oct 2024 17:50:15 -0700 Subject: [PATCH] Significantly speed up file handling error paths (#17920) This can have a huge overall impact on mypy performance when search paths are long --- mypy/build.py | 17 +++++++------- mypy/fscache.py | 59 +++++++++++++++++++++-------------------------- mypy/fswatcher.py | 13 +++++------ 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 733f0685792e..964da5aac8b0 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -736,8 +736,8 @@ def maybe_swap_for_shadow_path(self, path: str) -> str: shadow_file = self.shadow_equivalence_map.get(path) return shadow_file if shadow_file else path - def get_stat(self, path: str) -> os.stat_result: - return self.fscache.stat(self.maybe_swap_for_shadow_path(path)) + def get_stat(self, path: str) -> os.stat_result | None: + return self.fscache.stat_or_none(self.maybe_swap_for_shadow_path(path)) def getmtime(self, path: str) -> int: """Return a file's mtime; but 0 in bazel mode. @@ -1394,9 +1394,9 @@ def validate_meta( if bazel: # Normalize path under bazel to make sure it isn't absolute path = normpath(path, manager.options) - try: - st = manager.get_stat(path) - except OSError: + + st = manager.get_stat(path) + if st is None: return None if not stat.S_ISDIR(st.st_mode) and not stat.S_ISREG(st.st_mode): manager.log(f"Metadata abandoned for {id}: file or directory {path} does not exist") @@ -1572,10 +1572,9 @@ def write_cache( plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False)) # Obtain and set up metadata - try: - st = manager.get_stat(path) - except OSError as err: - manager.log(f"Cannot get stat for {path}: {err}") + st = manager.get_stat(path) + if st is None: + manager.log(f"Cannot get stat for {path}") # Remove apparently-invalid cache files. # (This is purely an optimization.) for filename in [data_json, meta_json]: diff --git a/mypy/fscache.py b/mypy/fscache.py index 15679ad03e85..8251f4bd9488 100644 --- a/mypy/fscache.py +++ b/mypy/fscache.py @@ -51,8 +51,8 @@ def set_package_root(self, package_root: list[str]) -> None: def flush(self) -> None: """Start another transaction and empty all caches.""" - self.stat_cache: dict[str, os.stat_result] = {} - self.stat_error_cache: dict[str, OSError] = {} + self.stat_or_none_cache: dict[str, os.stat_result | None] = {} + self.listdir_cache: dict[str, list[str]] = {} self.listdir_error_cache: dict[str, OSError] = {} self.isfile_case_cache: dict[str, bool] = {} @@ -62,24 +62,21 @@ def flush(self) -> None: self.hash_cache: dict[str, str] = {} self.fake_package_cache: set[str] = set() - def stat(self, path: str) -> os.stat_result: - if path in self.stat_cache: - return self.stat_cache[path] - if path in self.stat_error_cache: - raise copy_os_error(self.stat_error_cache[path]) + def stat_or_none(self, path: str) -> os.stat_result | None: + if path in self.stat_or_none_cache: + return self.stat_or_none_cache[path] + + st = None try: st = os.stat(path) - except OSError as err: + except OSError: if self.init_under_package_root(path): try: - return self._fake_init(path) + st = self._fake_init(path) except OSError: pass - # Take a copy to get rid of associated traceback and frame objects. - # Just assigning to __traceback__ doesn't free them. - self.stat_error_cache[path] = copy_os_error(err) - raise err - self.stat_cache[path] = st + + self.stat_or_none_cache[path] = st return st def init_under_package_root(self, path: str) -> bool: @@ -112,9 +109,9 @@ def init_under_package_root(self, path: str) -> bool: if not os.path.basename(dirname).isidentifier(): # Can't put an __init__.py in a place that's not an identifier return False - try: - st = self.stat(dirname) - except OSError: + + st = self.stat_or_none(dirname) + if st is None: return False else: if not stat.S_ISDIR(st.st_mode): @@ -145,7 +142,7 @@ def _fake_init(self, path: str) -> os.stat_result: assert basename == "__init__.py", path assert not os.path.exists(path), path # Not cached! dirname = os.path.normpath(dirname) - st = self.stat(dirname) # May raise OSError + st = os.stat(dirname) # May raise OSError # Get stat result as a list so we can modify it. seq: list[float] = list(st) seq[stat.ST_MODE] = stat.S_IFREG | 0o444 @@ -153,7 +150,6 @@ def _fake_init(self, path: str) -> os.stat_result: seq[stat.ST_NLINK] = 1 seq[stat.ST_SIZE] = 0 st = os.stat_result(seq) - self.stat_cache[path] = st # Make listdir() and read() also pretend this file exists. self.fake_package_cache.add(dirname) return st @@ -181,9 +177,8 @@ def listdir(self, path: str) -> list[str]: return results def isfile(self, path: str) -> bool: - try: - st = self.stat(path) - except OSError: + st = self.stat_or_none(path) + if st is None: return False return stat.S_ISREG(st.st_mode) @@ -248,18 +243,14 @@ def exists_case(self, path: str, prefix: str) -> bool: return res def isdir(self, path: str) -> bool: - try: - st = self.stat(path) - except OSError: + st = self.stat_or_none(path) + if st is None: return False return stat.S_ISDIR(st.st_mode) def exists(self, path: str) -> bool: - try: - self.stat(path) - except FileNotFoundError: - return False - return True + st = self.stat_or_none(path) + return st is not None def read(self, path: str) -> bytes: if path in self.read_cache: @@ -269,7 +260,7 @@ def read(self, path: str) -> bytes: # Need to stat first so that the contents of file are from no # earlier instant than the mtime reported by self.stat(). - self.stat(path) + self.stat_or_none(path) dirname, basename = os.path.split(path) dirname = os.path.normpath(dirname) @@ -294,8 +285,10 @@ def hash_digest(self, path: str) -> str: return self.hash_cache[path] def samefile(self, f1: str, f2: str) -> bool: - s1 = self.stat(f1) - s2 = self.stat(f2) + s1 = self.stat_or_none(f1) + s2 = self.stat_or_none(f2) + if s1 is None or s2 is None: + return False return os.path.samestat(s1, s2) diff --git a/mypy/fswatcher.py b/mypy/fswatcher.py index a574a36a0cc5..97a62ca9f9f7 100644 --- a/mypy/fswatcher.py +++ b/mypy/fswatcher.py @@ -2,6 +2,7 @@ from __future__ import annotations +import os from typing import AbstractSet, Iterable, NamedTuple from mypy.fscache import FileSystemCache @@ -56,8 +57,7 @@ def remove_watched_paths(self, paths: Iterable[str]) -> None: del self._file_data[path] self._paths -= set(paths) - def _update(self, path: str) -> None: - st = self.fs.stat(path) + def _update(self, path: str, st: os.stat_result) -> None: hash_digest = self.fs.hash_digest(path) self._file_data[path] = FileData(st.st_mtime, st.st_size, hash_digest) @@ -65,9 +65,8 @@ def _find_changed(self, paths: Iterable[str]) -> AbstractSet[str]: changed = set() for path in paths: old = self._file_data[path] - try: - st = self.fs.stat(path) - except FileNotFoundError: + st = self.fs.stat_or_none(path) + if st is None: if old is not None: # File was deleted. changed.add(path) @@ -76,13 +75,13 @@ def _find_changed(self, paths: Iterable[str]) -> AbstractSet[str]: if old is None: # File is new. changed.add(path) - self._update(path) + self._update(path, st) # Round mtimes down, to match the mtimes we write to meta files elif st.st_size != old.st_size or int(st.st_mtime) != int(old.st_mtime): # Only look for changes if size or mtime has changed as an # optimization, since calculating hash is expensive. new_hash = self.fs.hash_digest(path) - self._update(path) + self._update(path, st) if st.st_size != old.st_size or new_hash != old.hash: # Changed file. changed.add(path)