
Speed up non recursive list_dir for google cloud storage (#321)

* Speed up non recursive list_dir for google cloud storage (#318)

* use correct env vars for gs, azure bucket perf tests

* speedup non recursive _list_dir for gs

* linting

* make gs mock compatible with delimiter

* linting

* Code review changes

* revert dirs first

---------

Co-authored-by: axel.vlaminck <axel.vlaminck@gmail.com>
pjbull and vlaminckaxel authored Feb 16, 2023
1 parent 088fe49 commit 874044a
Showing 3 changed files with 54 additions and 37 deletions.
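For context on the change below: the non-recursive speedup comes from letting Google Cloud Storage group results server-side. Passing delimiter="/" to list_blobs returns only the blobs directly under a prefix and exposes the immediate "subdirectory" prefixes on the iterator's .prefixes attribute, instead of listing every descendant object and filtering client-side. A minimal sketch of that pattern with the google-cloud-storage client (the bucket and prefix names here are made up):

from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-example-bucket")  # hypothetical bucket

# delimiter="/" tells GCS to roll everything past the next "/" into a prefix
iterator = bucket.list_blobs(prefix="raw_data/", delimiter="/")

# blobs must be consumed before .prefixes is populated
# (https://github.com/googleapis/python-storage/issues/863)
files = [blob.name for blob in iterator]
subdirectories = sorted(iterator.prefixes)

print(files)           # objects directly under raw_data/
print(subdirectories)  # immediate prefixes such as raw_data/images/

The recursive branch still lists every object under the prefix, since a recursive walk needs all of them; only the non-recursive case benefits from the delimiter.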
52 changes: 28 additions & 24 deletions cloudpathlib/gs/gsclient.py
@@ -173,31 +173,35 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:
         prefix = cloud_path.blob
         if prefix and not prefix.endswith("/"):
             prefix += "/"
 
-        yielded_dirs = set()
-
-        # NOTE: Not recursive may be slower than necessary since it just filters
-        # the recursive implementation
-        for o in bucket.list_blobs(prefix=prefix):
-            # get directory from this path
-            for parent in PurePosixPath(o.name[len(prefix) :]).parents:
-                # if we haven't surfaced this directory already
-                if parent not in yielded_dirs and str(parent) != ".":
-                    # skip if not recursive and this is beyond our depth
-                    if not recursive and "/" in str(parent):
-                        continue
-
-                    yield (
-                        self.CloudPath(f"gs://{cloud_path.bucket}/{prefix}{parent}"),
-                        True,  # is a directory
-                    )
-                    yielded_dirs.add(parent)
-
-            # skip file if not recursive and this is beyond our depth
-            if not recursive and "/" in o.name[len(prefix) :]:
-                continue
-
-            yield (self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}"), False)  # is a file
+        if recursive:
+            yielded_dirs = set()
+            for o in bucket.list_blobs(prefix=prefix):
+                # get directory from this path
+                for parent in PurePosixPath(o.name[len(prefix) :]).parents:
+                    # if we haven't surfaced this directory already
+                    if parent not in yielded_dirs and str(parent) != ".":
+                        yield (
+                            self.CloudPath(f"gs://{cloud_path.bucket}/{prefix}{parent}"),
+                            True,  # is a directory
+                        )
+                        yielded_dirs.add(parent)
+                yield (self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}"), False)  # is a file
+        else:
+            iterator = bucket.list_blobs(delimiter="/", prefix=prefix)
+
+            # files must be iterated first for `.prefixes` to be populated:
+            # see: https://github.com/googleapis/python-storage/issues/863
+            for file in iterator:
+                yield (
+                    self.CloudPath(f"gs://{cloud_path.bucket}/{file.name}"),
+                    False,  # is a file
+                )
+            for directory in iterator.prefixes:
+                yield (
+                    self.CloudPath(f"gs://{cloud_path.bucket}/{directory}"),
+                    True,  # is a directory
+                )
 
     def _move_file(self, src: GSPath, dst: GSPath, remove_src: bool = True) -> GSPath:
         # just a touch, so "REPLACE" metadata
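A hedged usage sketch of what the delimiter branch above means at the cloudpathlib level (bucket and directory names are hypothetical): a non-recursive listing such as iterdir() now issues a single grouped listing request rather than enumerating every descendant blob and filtering by depth.

from cloudpathlib import GSPath

folder = GSPath("gs://my-example-bucket/raw_data/")  # hypothetical path

# iterdir() lists non-recursively, so it goes through the delimiter="/" branch
for entry in folder.iterdir():
    print(entry, entry.is_dir())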
35 changes: 24 additions & 11 deletions tests/mock_clients/mock_gs.py
@@ -113,30 +113,43 @@ def get_blob(self, blob):
         else:
             return None
 
-    def list_blobs(self, max_results=None, prefix=None):
+    def list_blobs(self, max_results=None, prefix=None, delimiter=None):
         path = self.name if prefix is None else self.name / prefix
-        items = [
-            MockBlob(self.name, f.relative_to(self.name), client=self.client)
-            for f in path.glob("**/*")
-            if f.is_file() and not f.name.startswith(".")
-        ]
+        pattern = "**/*" if delimiter is None else "*"
+        blobs, prefixes = [], []
+        for item in path.glob(pattern):
+            if not item.name.startswith("."):
+                if item.is_file():
+                    blobs.append(
+                        MockBlob(self.name, item.relative_to(self.name), client=self.client)
+                    )
+                else:
+                    prefixes.append(str(item.relative_to(self.name).as_posix()))
 
         # bucket name for passing tests
         if self.bucket_name == DEFAULT_GS_BUCKET_NAME:
-            return iter(MockHTTPIterator(items, max_results))
+            return MockHTTPIterator(blobs, prefixes, max_results)
         else:
             raise NotFound(
                 f"Bucket {self.name} not expected as mock bucket; only '{DEFAULT_GS_BUCKET_NAME}' exists."
             )
 
 
 class MockHTTPIterator:
-    def __init__(self, items, max_results):
-        self.items = items
+    def __init__(self, blobs, sub_directories, max_results):
+        self.blobs = blobs
+        self.sub_directories = sub_directories
         self.max_results = max_results
 
     def __iter__(self):
         if self.max_results is None:
-            return iter(self.items)
+            return iter(self.blobs)
         else:
-            return iter(self.items[: self.max_results])
+            return iter(self.blobs[: self.max_results])
 
     def __next__(self):
         yield from iter(self)
+
+    @property
+    def prefixes(self):
+        return self.sub_directories
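A quick, hypothetical illustration of the contract the mock iterator now mirrors: iterating yields the file blobs (truncated to max_results when set), and .prefixes exposes the directory-like prefixes. The import path assumes the test layout above, and plain strings stand in for MockBlob objects since the iterator never inspects its items.

from tests.mock_clients.mock_gs import MockHTTPIterator

iterator = MockHTTPIterator(
    blobs=["raw_data/a.txt", "raw_data/b.txt"],  # stand-ins for MockBlob objects
    sub_directories=["raw_data/images/"],
    max_results=None,
)

assert list(iterator) == ["raw_data/a.txt", "raw_data/b.txt"]
assert iterator.prefixes == ["raw_data/images/"]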
4 changes: 2 additions & 2 deletions tests/performance/cli.py
@@ -41,8 +41,8 @@ def _configure(
logger.info("Bucket not set explicitly, loading from environment variable.")
bucket = {
"s3": os.environ.get("LIVE_S3_BUCKET"),
"gs": os.environ.get("LIVE_AZURE_CONTAINER"),
"azure": os.environ.get("LIVE_GS_BUCKET"),
"gs": os.environ.get("LIVE_GS_BUCKET"),
"azure": os.environ.get("LIVE_AZURE_CONTAINER"),
}.get(backend.value)

logger.info(f"Bucket: {bucket}")
