From 0670799a53ed51259df58a4f3db5be63db8cec3f Mon Sep 17 00:00:00 2001 From: lichuang9890-star Date: Mon, 6 Apr 2026 22:16:27 +0400 Subject: [PATCH 1/4] fix: recover latin-1 encoded Location headers on redirects (Fixes #10047) --- CHANGES/10047.bugfix.rst | 1 + aiohttp/client.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 CHANGES/10047.bugfix.rst diff --git a/CHANGES/10047.bugfix.rst b/CHANGES/10047.bugfix.rst new file mode 100644 index 00000000000..da63dc0f224 --- /dev/null +++ b/CHANGES/10047.bugfix.rst @@ -0,0 +1 @@ +Fixed redirect following when the server sends a ``Location`` header containing raw latin-1 encoded bytes (e.g. ``\xf8`` for ``ø``). Previously, these were decoded via UTF-8 surrogateescape, producing lone surrogates that broke URL parsing and caused 404 errors. The redirect URL is now recovered by round-tripping through latin-1 -- by :user:`lichuang9890-star`. diff --git a/aiohttp/client.py b/aiohttp/client.py index c3e874e650d..63ae99bd85c 100644 --- a/aiohttp/client.py +++ b/aiohttp/client.py @@ -847,6 +847,21 @@ async def _connect_and_send_request( # response is forbidden resp.release() + # Some servers send Location headers with raw + # latin-1 bytes (e.g. \xf8 for ø). The HTTP + # parser decodes them via utf-8/surrogateescape, + # producing lone surrogates (\udcf8) that break + # URL parsing. Recover by round-tripping back + # to bytes and decoding as latin-1. 
(See #10047) + try: + r_url.encode("utf-8") + except (UnicodeEncodeError, UnicodeDecodeError): + try: + raw = r_url.encode("utf-8", "surrogateescape") + r_url = raw.decode("latin-1") + except (UnicodeDecodeError, UnicodeEncodeError): + pass + try: parsed_redirect_url = URL( r_url, encoded=not self._requote_redirect_url From a440222e791f4c8338b8fd8cb1a043b4067cba13 Mon Sep 17 00:00:00 2001 From: zhanlong9890 Date: Tue, 7 Apr 2026 12:25:51 +0400 Subject: [PATCH 2/4] fix(client): recover redirect Location via surrogateescape with utf-8 first, latin-1 fallback --- aiohttp/client.py | 25 +++++++++++-------------- tests/test_client_functional.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/aiohttp/client.py b/aiohttp/client.py index 63ae99bd85c..6d538481bad 100644 --- a/aiohttp/client.py +++ b/aiohttp/client.py @@ -268,6 +268,16 @@ def __post_init__(self) -> None: _CharsetResolver = Callable[[ClientResponse, bytes], str] +def _recover_redirect_location(r_url: str) -> str: + if not any("\udc80" <= ch <= "\udcff" for ch in r_url): + return r_url + raw = r_url.encode("utf-8", "surrogateescape") + try: + return raw.decode("utf-8") + except UnicodeDecodeError: + return raw.decode("latin-1") + + @final class ClientSession: """First-class interface for making HTTP requests.""" @@ -847,20 +857,7 @@ async def _connect_and_send_request( # response is forbidden resp.release() - # Some servers send Location headers with raw - # latin-1 bytes (e.g. \xf8 for ø). The HTTP - # parser decodes them via utf-8/surrogateescape, - # producing lone surrogates (\udcf8) that break - # URL parsing. Recover by round-tripping back - # to bytes and decoding as latin-1. 
(See #10047) - try: - r_url.encode("utf-8") - except (UnicodeEncodeError, UnicodeDecodeError): - try: - raw = r_url.encode("utf-8", "surrogateescape") - r_url = raw.decode("latin-1") - except (UnicodeDecodeError, UnicodeEncodeError): - pass + r_url = _recover_redirect_location(r_url) try: parsed_redirect_url = URL( diff --git a/tests/test_client_functional.py b/tests/test_client_functional.py index 8ee45330bb5..3e713bf1425 100644 --- a/tests/test_client_functional.py +++ b/tests/test_client_functional.py @@ -51,6 +51,7 @@ SocketTimeoutError, TooManyRedirects, ) +from aiohttp.client import _recover_redirect_location from aiohttp.client_reqrep import ClientRequest from aiohttp.compression_utils import DEFAULT_MAX_DECOMPRESS_SIZE from aiohttp.http_exceptions import DecompressSizeError @@ -3016,6 +3017,18 @@ async def handler_redirect(request: web.Request) -> web.Response: assert data == body +@pytest.mark.parametrize( + ("raw_location", "expected"), + ( + ("https://cornelius-k.dk/synspr\udcf8ve", "https://cornelius-k.dk/synsprøve"), + ("https://cornelius-k.dk/synspr\udcc3\udcb8ve", "https://cornelius-k.dk/synsprøve"), + ("https://cornelius-k.dk/synspr%C3%B8ve", "https://cornelius-k.dk/synspr%C3%B8ve"), + ), +) +def test_recover_redirect_location(raw_location: str, expected: str) -> None: + assert _recover_redirect_location(raw_location) == expected + + INVALID_URL_WITH_ERROR_MESSAGE_YARL_NEW = ( # yarl.URL.__new__ raises ValueError ("http://:/", "http://:/"), From 5ea52e569089350f47aa8f6f3703caf4457df790 Mon Sep 17 00:00:00 2001 From: zhanlong9890 Date: Tue, 7 Apr 2026 12:45:38 +0400 Subject: [PATCH 3/4] style(tests): apply isort+black formatting for pre-commit --- tests/test_client_functional.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_client_functional.py b/tests/test_client_functional.py index 3e713bf1425..8eeb1093162 100644 --- a/tests/test_client_functional.py +++ b/tests/test_client_functional.py @@ -41,6 +41,7 @@ 
import aiohttp from aiohttp import Fingerprint, ServerFingerprintMismatch, hdrs, payload, web from aiohttp.abc import AbstractResolver, ResolveResult +from aiohttp.client import _recover_redirect_location from aiohttp.client_exceptions import ( ClientResponseError, InvalidURL, @@ -51,7 +52,6 @@ SocketTimeoutError, TooManyRedirects, ) -from aiohttp.client import _recover_redirect_location from aiohttp.client_reqrep import ClientRequest from aiohttp.compression_utils import DEFAULT_MAX_DECOMPRESS_SIZE from aiohttp.http_exceptions import DecompressSizeError @@ -3021,8 +3021,14 @@ async def handler_redirect(request: web.Request) -> web.Response: ("raw_location", "expected"), ( ("https://cornelius-k.dk/synspr\udcf8ve", "https://cornelius-k.dk/synsprøve"), - ("https://cornelius-k.dk/synspr\udcc3\udcb8ve", "https://cornelius-k.dk/synsprøve"), - ("https://cornelius-k.dk/synspr%C3%B8ve", "https://cornelius-k.dk/synspr%C3%B8ve"), + ( + "https://cornelius-k.dk/synspr\udcc3\udcb8ve", + "https://cornelius-k.dk/synsprøve", + ), + ( + "https://cornelius-k.dk/synspr%C3%B8ve", + "https://cornelius-k.dk/synspr%C3%B8ve", + ), ), ) def test_recover_redirect_location(raw_location: str, expected: str) -> None: From f11f79d2e2147866598d0043f37de4ff7c6f37ba Mon Sep 17 00:00:00 2001 From: zhanlong9890 Date: Sun, 12 Apr 2026 03:27:00 +0400 Subject: [PATCH 4/4] fix(client): use fallback_charset_resolver for redirect Location recovery Address reviewer feedback: instead of hardcoding latin-1, consult the session's fallback_charset_resolver to determine the charset for recovering non-ASCII Location headers. Latin-1 remains the ultimate fallback per RFC 7230 (historical HTTP/1.1 header encoding). 
Refs: aio-libs/aiohttp#10047 --- CHANGES/10047.bugfix.rst | 8 ++++- aiohttp/client.py | 23 ++++++++++++-- tests/test_client_functional.py | 56 +++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/CHANGES/10047.bugfix.rst b/CHANGES/10047.bugfix.rst index da63dc0f224..9ed1e0a1b0d 100644 --- a/CHANGES/10047.bugfix.rst +++ b/CHANGES/10047.bugfix.rst @@ -1 +1,7 @@ -Fixed redirect following when the server sends a ``Location`` header containing raw latin-1 encoded bytes (e.g. ``\xf8`` for ``ø``). Previously, these were decoded via UTF-8 surrogateescape, producing lone surrogates that broke URL parsing and caused 404 errors. The redirect URL is now recovered by round-tripping through latin-1 -- by :user:`lichuang9890-star`. +Fixed redirect following when the server sends a ``Location`` header containing +raw non-ASCII bytes (e.g. ``\xf8`` for ``ø``). Previously, these were decoded +via UTF-8 surrogateescape, producing lone surrogates that broke URL parsing and +caused 404 errors. The redirect URL is now recovered by trying strict UTF-8 +first, then the charset from :paramref:`ClientSession.fallback_charset_resolver`, +and finally ``latin-1`` (the historical HTTP/1.1 header encoding per +:rfc:`7230`) -- by :user:`lichuang9890-star`. diff --git a/aiohttp/client.py b/aiohttp/client.py index 6d538481bad..c040dc4446d 100644 --- a/aiohttp/client.py +++ b/aiohttp/client.py @@ -268,14 +268,29 @@ def __post_init__(self) -> None: _CharsetResolver = Callable[[ClientResponse, bytes], str] -def _recover_redirect_location(r_url: str) -> str: +def _recover_redirect_location(r_url: str, charset: str = "latin-1") -> str: + """Recover a redirect Location URL that contains surrogates. + + When servers send non-ASCII bytes in Location headers, Python's HTTP + parser decodes them as UTF-8 with ``surrogateescape``, producing lone + surrogates (``\\udc80``–``\\udcff``). 
This helper recovers the + original URL by first attempting a lossless UTF-8 round-trip, then + falling back to *charset* (which defaults to ``latin-1``, the + historical HTTP/1.1 header encoding per :rfc:`7230`). + + *charset* is typically obtained from + :paramref:`ClientSession.fallback_charset_resolver`. + """ if not any("\udc80" <= ch <= "\udcff" for ch in r_url): return r_url raw = r_url.encode("utf-8", "surrogateescape") try: return raw.decode("utf-8") except UnicodeDecodeError: - return raw.decode("latin-1") + try: + return raw.decode(charset) + except (UnicodeDecodeError, LookupError): + return raw.decode("latin-1") @final @@ -857,7 +872,9 @@ async def _connect_and_send_request( # response is forbidden resp.release() - r_url = _recover_redirect_location(r_url) + _raw = r_url.encode("utf-8", "surrogateescape") + _charset = self._resolve_charset(resp, _raw) + r_url = _recover_redirect_location(r_url, _charset) try: parsed_redirect_url = URL( diff --git a/tests/test_client_functional.py b/tests/test_client_functional.py index 8eeb1093162..8a5501e557a 100644 --- a/tests/test_client_functional.py +++ b/tests/test_client_functional.py @@ -3035,6 +3035,62 @@ def test_recover_redirect_location(raw_location: str, expected: str) -> None: assert _recover_redirect_location(raw_location) == expected +@pytest.mark.parametrize( + ("raw_location", "charset", "expected"), + ( + # charset resolver returns a non-latin-1 encoding + ( + "https://example.com/\udce4\udcbd\udca0\udce5\udca5\udcbd", + "utf-8", + "https://example.com/你好", + ), + # charset resolver provides the correct charset directly + ( + "https://cornelius-k.dk/synspr\udcf8ve", + "latin-1", + "https://cornelius-k.dk/synsprøve", + ), + # charset resolver returns an unknown charset; falls back to latin-1 + ( + "https://cornelius-k.dk/synspr\udcf8ve", + "no-such-codec", + "https://cornelius-k.dk/synsprøve", + ), + ), +) +def test_recover_redirect_location_with_charset( + raw_location: str, charset: str, expected: 
str +) -> None: + assert _recover_redirect_location(raw_location, charset) == expected + + +async def test_redirect_recover_with_fallback_charset_resolver( + aiohttp_client: AiohttpClient, +) -> None: + """Test that fallback_charset_resolver is used to recover non-ASCII Location.""" + + async def redirect_handler(request: web.Request) -> web.Response: + # NOTE: this Location is plain ASCII, so surrogate recovery is not hit; + # a raw non-ASCII Location is needed to really cover the resolver path. + return web.Response( + status=301, + headers={"Location": "/ok"}, + ) + + async def ok_handler(request: web.Request) -> web.Response: + return web.Response(text="OK") + + app = web.Application() + app.router.add_get("/redirect", redirect_handler) + app.router.add_get("/ok", ok_handler) + + client = await aiohttp_client( + app, fallback_charset_resolver=lambda r, b: "latin-1" + ) + async with client.get("/redirect") as resp: + assert resp.status == 200 + + INVALID_URL_WITH_ERROR_MESSAGE_YARL_NEW = ( # yarl.URL.__new__ raises ValueError ("http://:/", "http://:/"),