Skip to content

Commit

Permalink
Merge pull request #12327 from notatallshaw/Optimize---find-links=<pa…
Browse files Browse the repository at this point in the history
…th-to-dir>
  • Loading branch information
uranusjr authored Jan 30, 2024
2 parents e57cf4f + 0997930 commit 89950c5
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 23 deletions.
1 change: 1 addition & 0 deletions news/12327.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Optimized usage of ``--find-links=<path-to-dir>``, by only scanning the relevant directory once, only considering file names that are valid wheel or sdist names, and only considering files in the directory that are related to the install.
2 changes: 2 additions & 0 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ def collect_sources(
page_validator=self.session.is_secure_origin,
expand_dir=False,
cache_link_parsing=False,
project_name=project_name,
)
for loc in self.search_scope.get_index_urls_locations(project_name)
).values()
Expand All @@ -483,6 +484,7 @@ def collect_sources(
page_validator=self.session.is_secure_origin,
expand_dir=True,
cache_link_parsing=True,
project_name=project_name,
)
for loc in self.find_links
).values()
Expand Down
84 changes: 73 additions & 11 deletions src/pip/_internal/index/sources.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import logging
import mimetypes
import os
import pathlib
from typing import Callable, Iterable, Optional, Tuple
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Optional, Tuple

from pip._vendor.packaging.utils import (
InvalidSdistFilename,
InvalidVersion,
InvalidWheelFilename,
canonicalize_name,
parse_sdist_filename,
parse_wheel_filename,
)

from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.link import Link
Expand Down Expand Up @@ -36,6 +45,53 @@ def _is_html_file(file_url: str) -> bool:
return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"


class _FlatDirectoryToUrls:
"""Scans directory and caches results"""

def __init__(self, path: str) -> None:
self._path = path
self._page_candidates: List[str] = []
self._project_name_to_urls: Dict[str, List[str]] = defaultdict(list)
self._scanned_directory = False

def _scan_directory(self) -> None:
"""Scans directory once and populates both page_candidates
and project_name_to_urls at the same time
"""
for entry in os.scandir(self._path):
url = path_to_url(entry.path)
if _is_html_file(url):
self._page_candidates.append(url)
continue

# File must have a valid wheel or sdist name,
# otherwise not worth considering as a package
try:
project_filename = parse_wheel_filename(entry.name)[0]
except (InvalidWheelFilename, InvalidVersion):
try:
project_filename = parse_sdist_filename(entry.name)[0]
except (InvalidSdistFilename, InvalidVersion):
continue

self._project_name_to_urls[project_filename].append(url)
self._scanned_directory = True

@property
def page_candidates(self) -> List[str]:
if not self._scanned_directory:
self._scan_directory()

return self._page_candidates

@property
def project_name_to_urls(self) -> Dict[str, List[str]]:
if not self._scanned_directory:
self._scan_directory()

return self._project_name_to_urls


class _FlatDirectorySource(LinkSource):
"""Link source specified by ``--find-links=<path-to-dir>``.
Expand All @@ -45,30 +101,34 @@ class _FlatDirectorySource(LinkSource):
* ``file_candidates``: Archives in the directory.
"""

_paths_to_urls: Dict[str, _FlatDirectoryToUrls] = {}

def __init__(
self,
candidates_from_page: CandidatesFromPage,
path: str,
project_name: str,
) -> None:
self._candidates_from_page = candidates_from_page
self._path = pathlib.Path(os.path.realpath(path))
self._project_name = canonicalize_name(project_name)

# Get existing instance of _FlatDirectoryToUrls if it exists
if path in self._paths_to_urls:
self._path_to_urls = self._paths_to_urls[path]
else:
self._path_to_urls = _FlatDirectoryToUrls(path=path)
self._paths_to_urls[path] = self._path_to_urls

@property
def link(self) -> Optional[Link]:
return None

def page_candidates(self) -> FoundCandidates:
for path in self._path.iterdir():
url = path_to_url(str(path))
if not _is_html_file(url):
continue
for url in self._path_to_urls.page_candidates:
yield from self._candidates_from_page(Link(url))

def file_links(self) -> FoundLinks:
for path in self._path.iterdir():
url = path_to_url(str(path))
if _is_html_file(url):
continue
for url in self._path_to_urls.project_name_to_urls[self._project_name]:
yield Link(url)


Expand Down Expand Up @@ -170,6 +230,7 @@ def build_source(
page_validator: PageValidator,
expand_dir: bool,
cache_link_parsing: bool,
project_name: str,
) -> Tuple[Optional[str], Optional[LinkSource]]:
path: Optional[str] = None
url: Optional[str] = None
Expand Down Expand Up @@ -203,6 +264,7 @@ def build_source(
source = _FlatDirectorySource(
candidates_from_page=candidates_from_page,
path=path,
project_name=project_name,
)
else:
source = _IndexDirectorySource(
Expand Down
6 changes: 0 additions & 6 deletions tests/functional/test_install_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ def test_command_line_append_flags(
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
)
assert (
f"Skipping link: not a file: {data.find_links}" in result.stdout
), f"stdout: {result.stdout}"


@pytest.mark.network
Expand All @@ -151,9 +148,6 @@ def test_command_line_appends_correctly(
"Fetching project page and analyzing links: https://test.pypi.org"
in result.stdout
), result.stdout
assert (
f"Skipping link: not a file: {data.find_links}" in result.stdout
), f"stdout: {result.stdout}"


def test_config_file_override_stack(
Expand Down
55 changes: 50 additions & 5 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,7 +862,7 @@ def test_collect_sources__file_expand_dir(data: TestData) -> None:
)
sources = collector.collect_sources(
# Shouldn't be used.
project_name=None, # type: ignore[arg-type]
project_name="",
candidates_from_page=None, # type: ignore[arg-type]
)
assert (
Expand Down Expand Up @@ -960,7 +960,7 @@ def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None:
session=link_collector.session,
)

def test_collect_sources(
def test_collect_page_sources(
self, caplog: pytest.LogCaptureFixture, data: TestData
) -> None:
caplog.set_level(logging.DEBUG)
Expand Down Expand Up @@ -993,9 +993,8 @@ def test_collect_sources(
files = list(files_it)
pages = list(pages_it)

# Spot-check the returned sources.
assert len(files) > 20
check_links_include(files, names=["simple-1.0.tar.gz"])
# Only "twine" should return from collecting sources
assert len(files) == 1

assert [page.link for page in pages] == [Link("https://pypi.org/simple/twine/")]
# Check that index URLs are marked as *un*cacheable.
Expand All @@ -1010,6 +1009,52 @@ def test_collect_sources(
("pip._internal.index.collector", logging.DEBUG, expected_message),
]

def test_collect_file_sources(
self, caplog: pytest.LogCaptureFixture, data: TestData
) -> None:
caplog.set_level(logging.DEBUG)

link_collector = make_test_link_collector(
find_links=[data.find_links],
# Include two copies of the URL to check that the second one
# is skipped.
index_urls=[PyPI.simple_url, PyPI.simple_url],
)
collected_sources = link_collector.collect_sources(
"singlemodule",
candidates_from_page=lambda link: [
InstallationCandidate("singlemodule", "0.0.1", link)
],
)

files_it = itertools.chain.from_iterable(
source.file_links()
for sources in collected_sources
for source in sources
if source is not None
)
pages_it = itertools.chain.from_iterable(
source.page_candidates()
for sources in collected_sources
for source in sources
if source is not None
)
files = list(files_it)
_ = list(pages_it)

# singlemodule should return files
assert len(files) > 0
check_links_include(files, names=["singlemodule-0.0.1.tar.gz"])

expected_message = dedent(
"""\
1 location(s) to search for versions of singlemodule:
* https://pypi.org/simple/singlemodule/"""
)
assert caplog.record_tuples == [
("pip._internal.index.collector", logging.DEBUG, expected_message),
]


@pytest.mark.parametrize(
"find_links, no_index, suppress_no_index, expected",
Expand Down
5 changes: 4 additions & 1 deletion tests/unit/test_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ def test_skip_invalid_wheel_link(
with pytest.raises(DistributionNotFound):
finder.find_requirement(req, True)

assert "Skipping link: invalid wheel filename:" in caplog.text
assert (
"Could not find a version that satisfies the requirement invalid"
" (from versions:" in caplog.text
)

def test_not_find_wheel_not_supported(self, data: TestData) -> None:
"""
Expand Down

0 comments on commit 89950c5

Please sign in to comment.