Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 60 additions & 5 deletions src/requests/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import annotations

import os.path
import re
import socket # noqa: F401
import typing
import warnings
Expand Down Expand Up @@ -81,6 +82,45 @@ def SOCKSProxyManager(*args: Any, **kwargs: Any) -> None:
DEFAULT_RETRIES = 0
DEFAULT_POOL_TIMEOUT = None

# Anchored to the authority section of the URL (between "://" and the first
# "/", "?", or "#") so that brackets in the path or query string cannot
# produce false positives.
#
# Inside the brackets two forms are detected:
# - RFC 6874 encoded %25: the delimiter is %25 followed by one or more
# ZoneID characters. Per RFC 6874 the ZoneID unreserved chars are
# [A-Za-z0-9_.\-~] plus percent-encoded octets (%[0-9A-Fa-f]{2}), so
# names like "Ethernet%203" (space encoded as %20) or names containing
# tildes are matched correctly.
# - Literal %: a negative lookahead (?![0-9A-Fa-f]{2}) rejects valid
# percent-encoded bytes whose first hex digit happens to be a letter
# (e.g. %AB, %aF, %CD). After that guard, one alphanumeric character
# is required (covering both named interfaces like eth0 and numeric
# zone indices like 1 or 3), followed by zero or more identifier chars.
_IPV6_ZONE_ID_RE = re.compile(
r"://[^/?#]*\[[^\]]*"
r"(?:%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+"
r"|%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*)\]"
)


def _has_ipv6_zone_id(url: str) -> bool:
"""
Detect if URL contains IPv6 zone identifier (scope ID).

IPv6 zone IDs use % character within brackets, e.g.:
http://[fe80::1%eth0]:8080/

This is used to determine whether to use urllib3's parse_url()
(which handles zone IDs correctly) or urlparse() for backward
compatibility.

:param url: URL string to check
:return: True if URL contains IPv6 zone ID
:rtype: bool
"""
return bool(_IPV6_ZONE_ID_RE.search(url))


def _urllib3_request_context(
request: PreparedRequest,
Expand All @@ -90,9 +130,21 @@ def _urllib3_request_context(
) -> tuple[dict[str, Any], dict[str, Any]]:
host_params: dict[str, Any] = {}
pool_kwargs: dict[str, Any] = {}
parsed_request_url = urlparse(request.url)
scheme = parsed_request_url.scheme.lower()
port = parsed_request_url.port

# Use urllib3's parse_url for IPv6 zone IDs, urlparse otherwise
if _has_ipv6_zone_id(request.url):
parsed_request_url = parse_url(request.url)
scheme = parsed_request_url.scheme.lower()
port = parsed_request_url.port
# parse_url uses .host and includes brackets for IPv6, strip them
hostname = parsed_request_url.host
if hostname and hostname.startswith("[") and hostname.endswith("]"):
hostname = hostname[1:-1]
else:
parsed_request_url = urlparse(request.url)
scheme = parsed_request_url.scheme.lower()
port = parsed_request_url.port
hostname = parsed_request_url.hostname # urlparse uses .hostname

cert_reqs = "CERT_REQUIRED"
if verify is False:
Expand All @@ -113,7 +165,7 @@ def _urllib3_request_context(
pool_kwargs["cert_file"] = client_cert
host_params = {
"scheme": scheme,
"host": parsed_request_url.hostname,
"host": hostname,
"port": port,
}
return host_params, pool_kwargs
Expand Down Expand Up @@ -581,7 +633,10 @@ def request_url(
assert _is_prepared(request)

proxy = select_proxy(request.url, proxies)
scheme = urlparse(request.url).scheme
if _has_ipv6_zone_id(request.url):
scheme = parse_url(request.url).scheme
else:
scheme = urlparse(request.url).scheme

is_proxied_http_request = proxy and scheme != "https"
using_socks_proxy = False
Expand Down
44 changes: 44 additions & 0 deletions src/requests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# Implicit import within threads may cause LookupError when standard library is in a ZIP,
# such as in Embedded Python. See https://github.com/psf/requests/issues/3578.
import encodings.idna # noqa: F401
import re
from collections.abc import Callable, Generator, Iterable, Iterator, Mapping
from io import UnsupportedOperation
from typing import (
Expand Down Expand Up @@ -104,6 +105,14 @@
CONTENT_CHUNK_SIZE: int = 10 * 1024
ITER_CHUNK_SIZE: int = 512

# Regex patterns for IPv6 zone ID handling in prepare_url.
#
# Extracts the bracket content from the authority section of the URL.
# Group 1 is the text between "[" and "]" (brackets excluded). The run between
# "://" and "[" may not contain "/", "?" or "#".
# NOTE(review): the pattern is not anchored to the start of the string, so
# with .search() the leftmost "://" that is followed by such a bracket wins.
_AUTHORITY_BRACKET_RE = re.compile(r"://[^/?#]*\[([^\]]*)\]")
# Matches an RFC 6874 zone ID delimiter (%25) followed by zone ID characters:
# one or more of [A-Za-z0-9_.\-~] or percent-encoded octets (%XX).
_RFC6874_ZONE_ID_RE = re.compile(r"%25(?:[a-zA-Z0-9_.\-~]|%[0-9A-Fa-f]{2})+")
# Matches a raw % zone ID delimiter (not a valid percent-encoded byte): the
# negative lookahead skips any "%" followed by two hex digits; the zone name
# must then start with a letter or digit and may continue with [A-Za-z0-9_.\-].
_RAW_ZONE_ID_RE = re.compile(r"%(?![0-9A-Fa-f]{2})[0-9A-Za-z][A-Za-z0-9_.\-]*")


class RequestEncodingMixin:
url: str | None
Expand Down Expand Up @@ -512,6 +521,41 @@ def prepare_url(
except LocationParseError as e:
raise InvalidURL(*e.args)

# Mitigation for RFC 6874: parse_url incorrectly decodes zone ID delimiter (%25 -> %)
# We reconstruct the host with the correct, fully-encoded delimiter to prevent
# downstream errors (like ipaddress validation or incorrect connection arguments).
#
# Matching on the parse_url-decoded host is ambiguous because parse_url decodes
# %25 -> % and then the resulting %XX may look like a valid percent-encoding
# (e.g. %2550 becomes %50 which resembles percent-encoded 'P'). Instead we
# extract the bracket content from the ORIGINAL url (before any decoding) and
# match there. Two input forms are handled:
#
# 1. RFC 6874 encoded form (%25 delimiter): the original bracket contains %25
# followed by one or more ZoneID unreserved chars ([A-Za-z0-9_.\-~]) or
# pct-encoded octets (%XX). Examples: [fe80::1%25eth0], [fe80::1%255],
# [fe80::1%25_foo]. The matched segment is placed verbatim into host.
#
# 2. Raw % delimiter (legacy/non-standard): a literal % that is NOT a valid
# %XX percent-encoding, followed by a letter or digit and then zero or more
# identifier chars ([A-Za-z0-9_.-]).
# Examples: [fe80::1%eth0], [fe80::1%wlan0]. Re-encoded as %25<zone_name>.
#
# This avoids false-positive re-encoding of legitimate %XX sequences (e.g. %20,
# %AB) that should never be treated as zone ID delimiters.
if host and host.startswith("[") and host.endswith("]"):
original_bracket = _AUTHORITY_BRACKET_RE.search(url)
if original_bracket:
original_inner = original_bracket.group(1)
rfc_match = _RFC6874_ZONE_ID_RE.search(original_inner)
if rfc_match:
ip_part = original_inner[: rfc_match.start()]
host = f"[{ip_part}{rfc_match.group()}]"
else:
raw_match = _RAW_ZONE_ID_RE.search(original_inner)
if raw_match:
pos = raw_match.start()
host = f"[{original_inner[:pos]}%25{original_inner[pos + 1 :]}]"

if not scheme:
raise MissingSchema(
f"Invalid URL {url!r}: No scheme supplied. "
Expand Down
Loading