-
Notifications
You must be signed in to change notification settings - Fork 3.3k
metadata-timeout-propagation-fix #47529
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -124,6 +124,11 @@ def build_options(kwargs: dict[str, Any]) -> dict[str, Any]: | |
| options[Constants.Kwargs.READ_TIMEOUT] = kwargs[Constants.Kwargs.READ_TIMEOUT] | ||
| if Constants.Kwargs.TIMEOUT in kwargs: | ||
| options[Constants.Kwargs.TIMEOUT] = kwargs[Constants.Kwargs.TIMEOUT] | ||
| # Copy (not pop) so connection_timeout stays in kwargs for the page fetch | ||
| # and is also placed in options, where the container read, partition-key | ||
| # ranges, and query plan calls read it. | ||
| if Constants.Kwargs.CONNECTION_TIMEOUT in kwargs: | ||
| options[Constants.Kwargs.CONNECTION_TIMEOUT] = kwargs[Constants.Kwargs.CONNECTION_TIMEOUT] | ||
|
|
||
|
|
||
| options[Constants.OperationStartTime] = time.time() | ||
|
|
@@ -1082,6 +1087,71 @@ def _build_properties_cache(properties: dict[str, Any], container_link: str) -> | |
| "partitionKey": properties.get("partitionKey", None), "container_link": container_link | ||
| } | ||
|
|
||
| # The three per-call timeout keys a caller can set on one request. The deadline | ||
| # tuple below adds OperationStartTime to these; the carry helpers iterate that | ||
| # 4-key tuple, not this one. | ||
| _PER_CALL_TIMEOUT_OPTION_KEYS: Tuple[str, ...] = ( | ||
| Constants.Kwargs.READ_TIMEOUT, | ||
| Constants.Kwargs.CONNECTION_TIMEOUT, | ||
| Constants.Kwargs.TIMEOUT, | ||
| ) | ||
|
|
||
| # timeout and OperationStartTime must travel together: the deadline is checked as | ||
| # now - OperationStartTime, which defaults to now when missing, so a metadata call | ||
| # without it would measure from its own start, not the operation's. | ||
| _PER_CALL_DEADLINE_OPTION_KEYS: Tuple[str, ...] = _PER_CALL_TIMEOUT_OPTION_KEYS + ( | ||
| Constants.OperationStartTime, | ||
| ) | ||
|
|
||
|
|
||
| def _carry_per_call_timeout_options(source: Optional[Mapping[str, Any]], destination: dict[str, Any]) -> None: | ||
| """Copy the per-call timeouts and the operation start time from source into destination. | ||
|
|
||
| Copies read_timeout, connection_timeout, timeout, and OperationStartTime. Only | ||
| keys present in source are copied, so a timeout the caller did not set stays | ||
| absent and the request uses the client default instead of None. A None or empty | ||
| source is a no-op. | ||
|
|
||
| :param source: The request options to read the timeouts from (may be None or empty). | ||
| :type source: ~collections.abc.Mapping[str, typing.Any] or None | ||
| :param destination: The options dict to copy the timeouts into. | ||
| :type destination: dict[str, typing.Any] | ||
| :return: None | ||
| :rtype: None | ||
| """ | ||
| if not source: | ||
| return | ||
| for key in _PER_CALL_DEADLINE_OPTION_KEYS: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟢 Suggestion — Defensive consistency: The two helpers handle
This is fine today because every terminal lift into request kwargs goes through the The latent risk is that a future code path might use the result of Suggested fix: make |
||
| if key in source: | ||
| destination[key] = source[key] | ||
|
|
||
|
|
||
| def _copy_per_call_timeouts_to_kwargs( | ||
| options: Optional[Mapping[str, Any]], | ||
| kwargs: dict[str, Any] | ||
| ) -> None: | ||
| """Copy the per-call timeouts and the operation start time from options into kwargs. | ||
|
|
||
| Moves read_timeout, connection_timeout, timeout, and OperationStartTime from | ||
| the request options into the kwargs the request layer reads. A value is copied | ||
| only when it is set (not None), so an unset timeout falls back to the client | ||
| default instead of None; setdefault keeps any value already in kwargs. | ||
|
|
||
| :param options: The request options to read the timeouts from (may be None or empty). | ||
| :type options: ~collections.abc.Mapping[str, typing.Any] or None | ||
| :param kwargs: The kwargs dict to copy the timeouts into; mutated in place. | ||
| :type kwargs: dict[str, typing.Any] | ||
| :return: None | ||
| :rtype: None | ||
| """ | ||
| if not options: | ||
| return | ||
| for key in _PER_CALL_DEADLINE_OPTION_KEYS: | ||
| value = options.get(key) | ||
| if value is not None: | ||
| kwargs.setdefault(key, value) | ||
|
|
||
|
|
||
| def format_pk_range_options(query_options: Mapping[str, Any]) -> dict[str, Any]: | ||
| """Formats the partition key range options to be used internally from the query ones. | ||
| :param dict query_options: The query options being used. | ||
|
|
@@ -1094,4 +1164,7 @@ def format_pk_range_options(query_options: Mapping[str, Any]) -> dict[str, Any]: | |
| pk_range_options[Constants.ContainerRID] = query_options[Constants.ContainerRID] | ||
| if "excludedLocations" in query_options: | ||
| pk_range_options["excludedLocations"] = query_options["excludedLocations"] | ||
| # Keep the per-call timeouts so the partition-key ranges fetch uses them | ||
| # instead of the client default. | ||
| _carry_per_call_timeout_options(query_options, pk_range_options) | ||
| return pk_range_options | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -116,15 +116,17 @@ class _Constants: | |
| class Kwargs: | ||
| """Keyword arguments used in the azure-cosmos package""" | ||
|
|
||
| # Whether to retry write operations if they fail. Used either at client level or request level. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💬 Observation — Scope: unrelated style change bundled with the bugfix This hunk does two things: it adds the new - RETRY_WRITE: Literal["retry_write"] = "retry_write"
- """Whether to retry write operations if they fail. Used either at client level or request level."""
+ # Whether to retry write operations if they fail. Used either at client level or request level.
+ RETRY_WRITE: Literal["retry_write"] = "retry_write"
The style conversion is unrelated to timeout propagation and has a small cost: Suggested: either revert the conversion for the two existing keys and add only |
||
| RETRY_WRITE: Literal["retry_write"] = "retry_write" | ||
| """Whether to retry write operations if they fail. Used either at client level or request level.""" | ||
| EXCLUDED_LOCATIONS: Literal["excludedLocations"] = "excludedLocations" | ||
| # Availability strategy config. Used either at client level or request level. | ||
| AVAILABILITY_STRATEGY: Literal["availabilityStrategy"] = "availabilityStrategy" | ||
| """Availability strategy config. Used either at client level or request level""" | ||
| # Socket read timeout in seconds. Used either at client level or request level. | ||
| READ_TIMEOUT: Literal["read_timeout"] = "read_timeout" | ||
| """Socket read timeout in seconds. Used either at client level or request level.""" | ||
| # Absolute timeout in seconds for the combined HTTP request and response processing. | ||
| TIMEOUT: Literal["timeout"] = "timeout" | ||
| """Absolute timeout in seconds for the combined HTTP request and response processing.""" | ||
| # Socket connect (handshake) timeout in seconds. Used either at client level or request level. | ||
| CONNECTION_TIMEOUT: Literal["connection_timeout"] = "connection_timeout" | ||
|
|
||
| class UserAgentFeatureFlags(IntEnum): | ||
| """ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3244,18 +3244,11 @@ def __QueryFeed( # pylint: disable=too-many-locals, too-many-statements, too-ma | |
| """ | ||
| if options is None: | ||
| options = {} | ||
| read_timeout = options.get("read_timeout") | ||
| if read_timeout is not None: | ||
| # we currently have a gap where kwargs are not getting passed correctly down the pipeline. In order to make | ||
| # absolute time out work, we are passing read_timeout via kwargs as a temporary fix | ||
| kwargs.setdefault("read_timeout", read_timeout) | ||
|
|
||
| operation_start_time = options.get(Constants.OperationStartTime) | ||
| if operation_start_time is not None: | ||
| kwargs.setdefault(Constants.OperationStartTime, operation_start_time) | ||
| timeout = options.get("timeout") | ||
| if timeout is not None: | ||
| kwargs.setdefault("timeout", timeout) | ||
| # Copy the per-call timeouts and the operation start time out of options into | ||
| # kwargs, where _Request reads them. A value is copied only when set, so | ||
| # an unset timeout falls back to the client/policy default instead of | ||
| # None; setdefault keeps any explicit kwarg the caller already placed. | ||
| base._copy_per_call_timeouts_to_kwargs(options, kwargs) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Recommendation — Coverage: change-feed The PR description promises "a per-call timeout bounds the whole query — metadata calls included", and the new helper here lifts the per-call timeouts out of But a few lines below, the change-feed branch builds a fresh change_feed_state: Optional[ChangeFeedState] = options.get("changeFeedState")
if change_feed_state is not None:
feed_options = {}
if 'excludedLocations' in options:
feed_options['excludedLocations'] = options['excludedLocations']
change_feed_state.populate_request_headers(self._routing_map_provider, headers, feed_options)
A customer calling The async sibling at Suggested fix: add If this is intentionally out of scope, please file a tracking issue and call it out in the CHANGELOG so customers know the change-feed surface isn't covered yet. |
||
|
|
||
| # Execution context injects this via request options; keep kwargs fallback | ||
| # for compatibility with call paths that still thread internal values there. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟢 Suggestion — Maintainability:
_PER_CALL_TIMEOUT_OPTION_KEYSis vestigialThis 3-key tuple has no production consumer. Both helpers (
_carry_per_call_timeout_options,_copy_per_call_timeouts_to_kwargs) iterate_PER_CALL_DEADLINE_OPTION_KEYS(the 4-key tuple at line 1102). The 3-key tuple is referenced only intests/test_timeout_propagation_unit.py:122to assert its own shape.The two named constants look like a hierarchy ("just the timeouts" vs "timeouts + anchor"), but only the 4-key one is real. A new contributor reasonably reaching for "the canonical set of per-call timeout option keys" would import this tuple and write a carry loop that silently drops
OperationStartTime— defeating the operation-deadline semantics this PR is establishing.Suggested fix: either inline the three keys directly into
_PER_CALL_DEADLINE_OPTION_KEYSand drop the 3-key tuple + its test, or, if you want to keep the distinction, add a comment that this tuple is for "name-only" use and not for iteration. The current shape is one rename away from a regression that wouldn't be caught by the existing tests (which exercise the 4-key path).