From 5e7d738a08f0e56c9815ff06f2395be1df2dcb92 Mon Sep 17 00:00:00 2001
From: Richard Fan
Date: Fri, 29 May 2026 14:49:34 -0700
Subject: [PATCH] fix(http_utils): disable httpx keepalive to spread load across uvicorn workers

A pooled httpx.AsyncClient against a uvicorn --workers N server pins
all requests to the small subset of workers that accept()-won the
pooled TCP connections (uvicorn shares one listen socket across
workers; no SO_REUSEPORT, no work-stealing). Observed in a
harbor_server run: n_workers_active = 2 of 32 for most minutes, with
those 2 workers saturated at their per-process Semaphore cap while the
other 30 sat idle.

Setting max_keepalive_connections=0 closes the TCP connection after
each response, so every /run gets its own accept() race and load
spreads.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 miles/utils/http_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/miles/utils/http_utils.py b/miles/utils/http_utils.py
index 0aaf792659..3681152163 100644
--- a/miles/utils/http_utils.py
+++ b/miles/utils/http_utils.py
@@ -228,8 +228,18 @@ def init_http_client(args):
 
     _client_concurrency = args.sglang_server_concurrency * args.rollout_num_gpus // args.rollout_num_gpus_per_engine
     if _http_client is None:
+        # max_keepalive_connections=0: defeat connection reuse so each /run
+        # opens a fresh TCP. A pooled httpx.AsyncClient against a uvicorn
+        # multi-worker server pins all traffic to the few workers that
+        # originally accept()-won the pooled connections (uvicorn shares a
+        # single listen socket across --workers; no SO_REUSEPORT, no
+        # work-stealing). With keepalive off, every request runs its own
+        # accept() race and spreads across all workers.
         _http_client = httpx.AsyncClient(
-            limits=httpx.Limits(max_connections=_client_concurrency),
+            limits=httpx.Limits(
+                max_connections=_client_concurrency,
+                max_keepalive_connections=0,
+            ),
             timeout=httpx.Timeout(None),
         )
 