From 2a032a0ca183c4ed1b46270154419fb1774cee0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:48:55 +0200 Subject: [PATCH] perf: create link objects lazily --- src/poetry/repositories/link_sources/base.py | 38 +++++++---- src/poetry/repositories/link_sources/html.py | 14 +++- src/poetry/repositories/link_sources/json.py | 72 +++++++++++--------- tests/repositories/link_sources/test_base.py | 8 ++- 4 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/poetry/repositories/link_sources/base.py b/src/poetry/repositories/link_sources/base.py index 982467b8667..dfe344c51b0 100644 --- a/src/poetry/repositories/link_sources/base.py +++ b/src/poetry/repositories/link_sources/base.py @@ -11,6 +11,7 @@ from packaging.utils import canonicalize_name from poetry.core.constraints.version import Version from poetry.core.packages.package import Package +from poetry.core.packages.utils.utils import splitext from poetry.core.version.exceptions import InvalidVersionError from poetry.utils.patterns import sdist_file_re @@ -19,12 +20,17 @@ if TYPE_CHECKING: from collections import defaultdict + from collections.abc import Callable from collections.abc import Iterator from packaging.utils import NormalizedName from poetry.core.packages.utils.link import Link - LinkCache = defaultdict[NormalizedName, defaultdict[Version, list[Link]]] + # The cache stores factories that build a Link on demand, so that Links are + # only constructed for the (few) versions actually retrieved rather than for + # every file listed by the repository. + LinkFactory = Callable[[], Link] + LinkCache = defaultdict[NormalizedName, defaultdict[Version, list[LinkFactory]]] logger = logging.getLogger(__name__) @@ -76,29 +82,30 @@ def packages(self) -> Iterator[Package]: @property def links(self) -> Iterator[Link]: for links_per_version in self._link_cache.values(): - for links in links_per_version.values(): - yield from links + for link_factories in links_per_version.values(): + for make_link in link_factories: + yield make_link() @classmethod def _link_package_name_and_version( - cls, link: Link + cls, filename: str ) -> tuple[NormalizedName, Version] | None: - """Extract just the (normalized name, version) for a link. + """Extract just the (normalized name, version) from a filename. - This is the hot path used when building the link cache: it avoids the - cost of constructing a full `Package` (which initializes a large - number of attributes) when only the name and version are needed as - cache keys. + This is the hot path used when building the link cache: it works on the + filename alone so that the cache can be populated without constructing a + `Link` (let alone a full `Package`) for every file. The `Link` for a + given file is only built when its version is actually retrieved. """ name: str | None = None version_string: str | None = None - m = wheel_file_re.match(link.filename) or sdist_file_re.match(link.filename) + m = wheel_file_re.match(filename) or sdist_file_re.match(filename) if m: name = m.group("name") version_string = m.group("ver") else: - info, _ext = link.splitext() + info, _ext = splitext(filename, is_filename=True) match = cls.VERSION_REGEX.match(info) if match: name = match.group(1) @@ -111,8 +118,8 @@ def _link_package_name_and_version( version = Version.parse(version_string) except InvalidVersionError: logger.debug( - "Skipping url (%s) due to invalid version (%s)", - link.url, + "Skipping file (%s) due to invalid version (%s)", + filename, version_string, ) return None @@ -121,7 +128,7 @@ def _link_package_name_and_version( @classmethod def link_package_data(cls, link: Link) -> Package | None: - name_and_version = cls._link_package_name_and_version(link) + name_and_version = cls._link_package_name_and_version(link.filename) if name_and_version is None: return None @@ -131,7 +138,8 @@ def link_package_data(cls, link: Link) -> Package | None: def links_for_version( self, name: NormalizedName, version: Version ) -> Iterator[Link]: - yield from self._link_cache[name][version] + for make_link in self._link_cache[name][version]: + yield make_link() def clean_link(self, url: str) -> str: """Makes sure a link is fully encoded. That is, if a ' ' shows up in diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index e177dd57b12..1a111bbe284 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -15,6 +15,12 @@ if TYPE_CHECKING: from poetry.repositories.link_sources.base import LinkCache + from poetry.repositories.link_sources.base import LinkFactory + + +def _const_factory(link: Link) -> LinkFactory: + """Wrap an already-built link in a factory for the link cache.""" + return lambda: link class HTMLPage(LinkSource): @@ -60,10 +66,14 @@ def _link_cache(self) -> LinkCache: if link.ext not in self.SUPPORTED_FORMATS: continue - name_and_version = self._link_package_name_and_version(link) + # The HTML API has no separate filename field, so the filename + # (needed to parse name and version) has to be derived from the + # URL, which means the Link is built eagerly here. The cache + # stores factories, so it is wrapped in one that just returns it. + name_and_version = self._link_package_name_and_version(link.filename) if name_and_version: name, version = name_and_version - links[name][version].append(link) + links[name][version].append(_const_factory(link)) return links diff --git a/src/poetry/repositories/link_sources/json.py b/src/poetry/repositories/link_sources/json.py index 7c362176219..d5bfd7937ed 100644 --- a/src/poetry/repositories/link_sources/json.py +++ b/src/poetry/repositories/link_sources/json.py @@ -2,10 +2,12 @@ from collections import defaultdict from functools import cached_property +from functools import partial from typing import TYPE_CHECKING from typing import Any from poetry.core.packages.utils.link import Link +from poetry.core.packages.utils.utils import splitext from poetry.repositories.link_sources.base import LinkSource from poetry.repositories.link_sources.base import SimpleRepositoryRootPage @@ -25,49 +27,51 @@ def __init__(self, url: str, content: dict[str, Any]) -> None: @cached_property def _link_cache(self) -> LinkCache: + # Only the filename is needed to enumerate the available versions, so we + # defer building the Link (and cleaning its URL) to _make_link, which is + # only called when the version's links are actually retrieved. For large + # projects this avoids constructing tens of thousands of Link objects + # that are never used during resolution. links: LinkCache = defaultdict(lambda: defaultdict(list)) for file in self.content["files"]: - url = self.clean_link(make_absolute_url(file["url"], self._url)) - requires_python = file.get("requires-python") - hashes = file.get("hashes", {}) - yanked = file.get("yanked", False) - size = file.get("size") - upload_time = file.get("upload-time") - - # see https://peps.python.org/pep-0714/#clients - # and https://peps.python.org/pep-0691/#project-detail - metadata: dict[str, str] | bool = False - for metadata_key in ("core-metadata", "dist-info-metadata"): - if metadata_key in file: - metadata_value = file[metadata_key] - if metadata_value and isinstance(metadata_value, dict): - metadata = metadata_value - else: - metadata = bool(metadata_value) - break - - # use filename for performance (and strictly speaking also for correctness) - link = Link( - url, - filename=file["filename"], - requires_python=requires_python, - hashes=hashes, - yanked=yanked, - metadata=metadata, - size=size, - upload_time=upload_time, - ) - - if link.ext not in self.SUPPORTED_FORMATS: + filename = file["filename"] + if splitext(filename, is_filename=True)[1] not in self.SUPPORTED_FORMATS: continue - name_and_version = self._link_package_name_and_version(link) + name_and_version = self._link_package_name_and_version(filename) if name_and_version: name, version = name_and_version - links[name][version].append(link) + links[name][version].append(partial(self._make_link, file)) return links + def _make_link(self, file: dict[str, Any]) -> Link: + url = self.clean_link(make_absolute_url(file["url"], self._url)) + + # see https://peps.python.org/pep-0714/#clients + # and https://peps.python.org/pep-0691/#project-detail + metadata: dict[str, str] | bool = False + for metadata_key in ("core-metadata", "dist-info-metadata"): + if metadata_key in file: + metadata_value = file[metadata_key] + if metadata_value and isinstance(metadata_value, dict): + metadata = metadata_value + else: + metadata = bool(metadata_value) + break + + # use filename for performance (and strictly speaking also for correctness) + return Link( + url, + filename=file["filename"], + requires_python=file.get("requires-python"), + hashes=file.get("hashes", {}), + yanked=file.get("yanked", False), + metadata=metadata, + size=file.get("size"), + upload_time=file.get("upload-time"), + ) + class SimpleRepositoryJsonRootPage(SimpleRepositoryRootPage): """ diff --git a/tests/repositories/link_sources/test_base.py b/tests/repositories/link_sources/test_base.py index 089b6fdfd05..125226ba84a 100644 --- a/tests/repositories/link_sources/test_base.py +++ b/tests/repositories/link_sources/test_base.py @@ -49,10 +49,12 @@ def link_source(mocker: MockerFixture) -> LinkSource: list, { Version.parse("0.1.0"): [ - Link(f"{url}/demo-0.1.0.tar.gz"), - Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"), + lambda: Link(f"{url}/demo-0.1.0.tar.gz"), + lambda: Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"), + ], + Version.parse("0.1.1"): [ + lambda: Link(f"{url}/demo-0.1.1.tar.gz") ], - Version.parse("0.1.1"): [Link(f"{url}/demo-0.1.1.tar.gz")], }, ), },