From 1f9c816082218c3b2d8ddb05637162cb2ad55cf1 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 12:24:59 +0300 Subject: [PATCH 01/26] feat(opal-server): gated /internal git-fetcher cache stats endpoint Add an off-by-default diagnostics endpoint so tests can observe the in-memory GitPolicyFetcher cache sizes (repo_locks/repos/repos_last_fetched) and process RSS that the upcoming memory-leak fix eliminates. - debug_stats.py: read-only git_fetcher_cache_stats() helper + a register_internal_stats_route() registrar that mounts GET /internal/git-fetcher-cache-stats only when enabled. - config.py: new OPAL_DEBUG_INTERNAL_STATS flag, default False. - server.py: register the route, gated by the flag, beside /healthcheck. No production behavior change when the flag is off (the default). Also ignore .claude/ so private planning artifacts are never committed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 3 ++ packages/opal-server/opal_server/config.py | 9 ++++ .../opal-server/opal_server/debug_stats.py | 41 +++++++++++++++++++ packages/opal-server/opal_server/server.py | 5 +++ .../tests/debug_stats_endpoint_test.py | 23 +++++++++++ .../opal_server/tests/debug_stats_test.py | 21 ++++++++++ 6 files changed, 102 insertions(+) create mode 100644 packages/opal-server/opal_server/debug_stats.py create mode 100644 packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py create mode 100644 packages/opal-server/opal_server/tests/debug_stats_test.py diff --git a/.gitignore b/.gitignore index 77faef34a..c3224ec1c 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,6 @@ dmypy.json *.iml .DS_Store + +# Private Claude Code working artifacts (plans/specs) — never commit +.claude/ diff --git a/packages/opal-server/opal_server/config.py b/packages/opal-server/opal_server/config.py index 9faac9be4..aaf3d6f75 100644 --- a/packages/opal-server/opal_server/config.py +++ b/packages/opal-server/opal_server/config.py @@ -358,6 +358,15 @@ class OpalServerConfig(Confi): description="Set if OPAL server should enable tracing with datadog APM", ) + DEBUG_INTERNAL_STATS = confi.bool( + "DEBUG_INTERNAL_STATS", + False, + description=( + "Expose GET /internal/git-fetcher-cache-stats with in-memory cache " + "sizes and process RSS. For diagnostics/tests only; keep off in production." + ), + ) + SCOPES = confi.bool("SCOPES", default=False, description="Enable scopes") SCOPES_REPO_CLONES_SHARDS = confi.int( diff --git a/packages/opal-server/opal_server/debug_stats.py b/packages/opal-server/opal_server/debug_stats.py new file mode 100644 index 000000000..20ea6b341 --- /dev/null +++ b/packages/opal-server/opal_server/debug_stats.py @@ -0,0 +1,41 @@ +"""Read-only introspection of the git-fetcher in-memory caches. + +Used only by the off-by-default /internal stats endpoint so tests can +observe the cache growth that the memory-leak fix (PR2) eliminates. +""" +from pathlib import Path +from typing import Dict + +from fastapi import FastAPI +from opal_server.git_fetcher import GitPolicyFetcher + + +def _read_rss_kb() -> int: + """Resident set size of this process in kilobytes (Linux), else 0.""" + try: + for line in Path("/proc/self/status").read_text().splitlines(): + if line.startswith("VmRSS:"): + return int(line.split()[1]) + except (OSError, ValueError, IndexError): + return 0 + return 0 + + +def git_fetcher_cache_stats() -> Dict[str, int]: + """Sizes of the three process-global GitPolicyFetcher caches + RSS.""" + return { + "repo_locks": len(GitPolicyFetcher.repo_locks), + "repos": len(GitPolicyFetcher.repos), + "repos_last_fetched": len(GitPolicyFetcher.repos_last_fetched), + "rss_kb": _read_rss_kb(), + } + + +def register_internal_stats_route(app: FastAPI, enabled: bool) -> None: + """Mount GET /internal/git-fetcher-cache-stats only when enabled.""" + if not enabled: + return + + @app.get("/internal/git-fetcher-cache-stats", include_in_schema=False) + def _git_fetcher_cache_stats() -> Dict[str, int]: + return git_fetcher_cache_stats() diff --git a/packages/opal-server/opal_server/server.py b/packages/opal-server/opal_server/server.py index a846e5a80..2b88b3db8 100644 --- a/packages/opal-server/opal_server/server.py +++ b/packages/opal-server/opal_server/server.py @@ -26,6 +26,7 @@ ) from opal_server.config import opal_server_config from opal_server.data.api import init_data_updates_router +from opal_server.debug_stats import register_internal_stats_route from opal_server.data.data_update_publisher import DataUpdatePublisher from opal_server.loadlimiting import init_loadlimit_router from opal_server.policy.bundles.api import router as bundles_router @@ -293,6 +294,10 @@ async def redoc_html(req: Request) -> HTMLResponse: def healthcheck(): return {"status": "ok"} + register_internal_stats_route( + app, enabled=opal_server_config.DEBUG_INTERNAL_STATS + ) + return app def _configure_lifecycle_callbacks(self, app: FastAPI): diff --git a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py new file mode 100644 index 000000000..8ba0f678a --- /dev/null +++ b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py @@ -0,0 +1,23 @@ +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from opal_server.debug_stats import register_internal_stats_route + + +def _app_with_flag(enabled: bool) -> FastAPI: + app = FastAPI() + register_internal_stats_route(app, enabled=enabled) + return app + + +def test_endpoint_absent_when_disabled(): + client = TestClient(_app_with_flag(False)) + assert client.get("/internal/git-fetcher-cache-stats").status_code == 404 + + +def test_endpoint_present_when_enabled(): + client = TestClient(_app_with_flag(True)) + resp = client.get("/internal/git-fetcher-cache-stats") + assert resp.status_code == 200 + body = resp.json() + assert set(body) == {"repo_locks", "repos", "repos_last_fetched", "rss_kb"} diff --git a/packages/opal-server/opal_server/tests/debug_stats_test.py b/packages/opal-server/opal_server/tests/debug_stats_test.py new file mode 100644 index 000000000..b2d7935db --- /dev/null +++ b/packages/opal-server/opal_server/tests/debug_stats_test.py @@ -0,0 +1,21 @@ +from opal_server.config import opal_server_config +from opal_server.debug_stats import git_fetcher_cache_stats +from opal_server.git_fetcher import GitPolicyFetcher + + +def test_stats_report_dict_sizes(monkeypatch): + monkeypatch.setattr(GitPolicyFetcher, "repo_locks", {"a": object()}) + monkeypatch.setattr(GitPolicyFetcher, "repos", {"p1": object(), "p2": object()}) + monkeypatch.setattr(GitPolicyFetcher, "repos_last_fetched", {}) + + stats = git_fetcher_cache_stats() + + assert stats["repo_locks"] == 1 + assert stats["repos"] == 2 + assert stats["repos_last_fetched"] == 0 + assert isinstance(stats["rss_kb"], int) + assert stats["rss_kb"] >= 0 + + +def test_internal_stats_flag_defaults_off(): + assert opal_server_config.DEBUG_INTERNAL_STATS is False From c176ce9dbbf89bd92b70c3aad5ee486259868a4a Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 12:24:59 +0300 Subject: [PATCH 02/26] test(git-leak): add OPAL git leak/resilience test bed A self-contained docker-compose stack (opal-server x2 workers + Redis + Postgres broadcaster + Gitea) plus a pytest harness that reproduces, as tests that fail on master, the git-fetcher memory leak, the offline-repo hang, the slow serial boot, and the broadcaster-disconnect gap. These become the regression gates for the follow-up fixes. - seed/: idempotent Gitea seeding sidecar (N policy repos) + Dockerfile. - docker-compose.yml: 4-service stack, opal-server built from the repo's own docker/Dockerfile (server target), scopes on, Postgres broadcaster. - helpers.py / conftest.py: HTTP + infra helpers and stack fixtures. - test_leak.py / test_resilience.py / test_boot.py: the flagship tests. - README.md: how to run and expected fail-on-master behavior. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/README.md | 33 +++++++ app-tests/git-leak/conftest.py | 44 +++++++++ app-tests/git-leak/docker-compose.yml | 103 ++++++++++++++++++++ app-tests/git-leak/helpers.py | 94 +++++++++++++++++++ app-tests/git-leak/seed/Dockerfile | 6 ++ app-tests/git-leak/seed/seed_gitea.py | 130 ++++++++++++++++++++++++++ app-tests/git-leak/test_boot.py | 37 ++++++++ app-tests/git-leak/test_leak.py | 56 +++++++++++ app-tests/git-leak/test_resilience.py | 54 +++++++++++ 9 files changed, 557 insertions(+) create mode 100644 app-tests/git-leak/README.md create mode 100644 app-tests/git-leak/conftest.py create mode 100644 app-tests/git-leak/docker-compose.yml create mode 100644 app-tests/git-leak/helpers.py create mode 100644 app-tests/git-leak/seed/Dockerfile create mode 100644 app-tests/git-leak/seed/seed_gitea.py create mode 100644 app-tests/git-leak/test_boot.py create mode 100644 app-tests/git-leak/test_leak.py create mode 100644 app-tests/git-leak/test_resilience.py diff --git a/app-tests/git-leak/README.md b/app-tests/git-leak/README.md new file mode 100644 index 000000000..33a291f86 --- /dev/null +++ b/app-tests/git-leak/README.md @@ -0,0 +1,33 @@ +# OPAL git-leak / resilience test bed + +Reproduces (as failing tests on `master`) the four issues fixed by PR2–PR5: +memory leak, offline-repo hang, slow serial boot, broadcaster no-reconnect. + +## Stack +- `opal_server` (2 workers, scopes on, Postgres broadcaster, built from `docker/Dockerfile`) +- `redis`, `postgres`, `gitea` (+ one-shot `gitea-admin` and `seed` sidecars) + +## Run +```bash +cd app-tests/git-leak +python -m pytest -v --boot-scopes=50 # full set +python -m pytest test_leak.py -v --boot-scopes=20 # just the leak gates +``` +Useful flags: `--boot-scopes=N` (any N), `--keep-stack` (skip teardown), +env `BOOT_TARGET_SECONDS=120` (tighten the boot gate). + +## Expected on master +All five flagship tests FAIL (except the boot test, which only fails when +`BOOT_TARGET_SECONDS` is set low). They become the regression gates for the fixes. + +## Requires +Docker + docker compose v2, plus host Python with `pytest pytest-timeout requests GitPython`. + +## Notes +- Auth is disabled in the stack: `OPAL_AUTH_PUBLIC_KEY` is left unset so the JWT + verifier is disabled and the harness can call scope routes without minting JWTs. + Local test bed only; never a production setting. +- The server runs 2 uvicorn workers with a Postgres broadcaster, mirroring a + realistic multi-worker deployment. The `GitPolicyFetcher` caches read by the + `/internal/git-fetcher-cache-stats` endpoint are per-process, so the harness + polls with generous timeouts to let the leader worker converge. diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py new file mode 100644 index 000000000..8c538aa06 --- /dev/null +++ b/app-tests/git-leak/conftest.py @@ -0,0 +1,44 @@ +import os + +import pytest + +from helpers import OpalServerClient, compose + + +def pytest_addoption(parser): + parser.addoption( + "--boot-scopes", + action="store", + default="50", + help="number of repos to seed/boot (default 50)", + ) + parser.addoption( + "--keep-stack", + action="store_true", + default=False, + help="do not tear the compose stack down after the run", + ) + + +@pytest.fixture(scope="session") +def repo_count(request) -> int: + return int(request.config.getoption("--boot-scopes")) + + +@pytest.fixture(scope="session") +def stack(request, repo_count): + os.environ["REPO_COUNT"] = str(repo_count) + # build + start infra; seed runs to completion then exits + compose("up", "-d", "--build") + # block until seeding sidecar has finished creating repos + compose("wait", "seed") + client = OpalServerClient() + client.wait_healthy() + yield client + if not request.config.getoption("--keep-stack"): + compose("down", "-v") + + +@pytest.fixture() +def opal(stack) -> OpalServerClient: + return stack diff --git a/app-tests/git-leak/docker-compose.yml b/app-tests/git-leak/docker-compose.yml new file mode 100644 index 000000000..8f9f474f9 --- /dev/null +++ b/app-tests/git-leak/docker-compose.yml @@ -0,0 +1,103 @@ +name: opal-git-leak-test + +services: + redis: + image: redis:7-alpine + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 2s + timeout: 3s + retries: 30 + + postgres: + image: postgres:16-alpine + environment: + POSTGRES_USER: opal + POSTGRES_PASSWORD: opal + POSTGRES_DB: opal + # not published to the host: only opal_server reaches it over the compose + # network, and bounce_postgres() uses `docker compose stop/start`. Publishing + # 5432 would collide with any Postgres already running on the host. + healthcheck: + test: ["CMD-SHELL", "pg_isready -U opal"] + interval: 2s + timeout: 3s + retries: 30 + + gitea: + image: gitea/gitea:1.21 + environment: + GITEA__security__INSTALL_LOCK: "true" + GITEA__server__ROOT_URL: "http://gitea:3000/" + GITEA__database__DB_TYPE: "sqlite3" + # not published to the host: only the seed sidecar and opal_server reach it + # over the compose network (via http://gitea:3000). Avoids host port clashes. + volumes: + - gitea-data:/data + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/v1/version || exit 1"] + interval: 3s + timeout: 5s + retries: 40 + + gitea-admin: + # creates the admin user once gitea is healthy + image: gitea/gitea:1.21 + depends_on: + gitea: + condition: service_healthy + user: git + entrypoint: ["/bin/sh", "-c"] + command: + - > + gitea admin user create --username opaladmin --password opaladmin + --email admin@example.com --admin --must-change-password=false + --config /data/gitea/conf/app.ini || true + volumes: + - gitea-data:/data + restart: "no" + + seed: + build: ./seed + depends_on: + gitea: + condition: service_healthy + gitea-admin: + condition: service_completed_successfully + environment: + GITEA_URL: "http://gitea:3000" + GITEA_ADMIN_USER: "opaladmin" + GITEA_ADMIN_PASSWORD: "opaladmin" + REPO_COUNT: "${REPO_COUNT:-50}" + volumes: + - seed-output:/seed-output + restart: "no" + + opal_server: + build: + context: ../.. + dockerfile: docker/Dockerfile + target: server + environment: + UVICORN_NUM_WORKERS: "2" + OPAL_SCOPES: "1" + OPAL_REDIS_URL: "redis://redis:6379" + OPAL_BROADCAST_URI: "postgres://opal:opal@postgres:5432/opal" + OPAL_BASE_DIR: "/opal" + OPAL_POLICY_REFRESH_INTERVAL: "0" + OPAL_DEBUG_INTERNAL_STATS: "1" + # OPAL_AUTH_PUBLIC_KEY is intentionally left unset: with no public key the + # JWT verifier is disabled, so the harness can call scope routes without + # minting JWTs. Local test bed only; never a production setting. + OPAL_LOG_FORMAT_INCLUDE_PID: "true" + ports: + - "7002:7002" + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + +volumes: + gitea-data: + seed-output: diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py new file mode 100644 index 000000000..93871f194 --- /dev/null +++ b/app-tests/git-leak/helpers.py @@ -0,0 +1,94 @@ +"""HTTP + infra helpers for the git-leak test bed.""" +import subprocess +import time +from pathlib import Path +from typing import Dict, List + +import requests + +OPAL_URL = "http://localhost:7002" +GITEA_INTERNAL_URL = "http://gitea:3000" +GITEA_USER = "opaladmin" + +# the compose project lives next to this file; compose() runs from here +_COMPOSE_DIR = str(Path(__file__).resolve().parent) + + +class OpalServerClient: + def __init__(self, base_url: str = OPAL_URL): + self.base_url = base_url.rstrip("/") + + def wait_healthy(self, timeout: int = 180) -> None: + deadline = time.time() + timeout + last = None + while time.time() < deadline: + try: + if requests.get(f"{self.base_url}/healthcheck", timeout=5).status_code == 200: + return + except requests.RequestException as exc: + last = exc + time.sleep(2) + raise RuntimeError(f"opal-server not healthy in {timeout}s (last: {last})") + + def stats(self) -> Dict[str, int]: + resp = requests.get( + f"{self.base_url}/internal/git-fetcher-cache-stats", timeout=10 + ) + resp.raise_for_status() + return resp.json() + + def put_scope(self, scope_id: str, repo_url: str, branch: str = "main") -> None: + body = { + "scope_id": scope_id, + "policy": { + "source_type": "git", + "url": repo_url, + "auth": {"auth_type": "none"}, + "branch": branch, + "directories": ["."], + "extensions": [".rego", ".json"], + "manifest": ".manifest", + "poll_updates": False, + }, + "data": {"entries": []}, + } + # the scope router mounts at prefix="/scopes" with @router.put("") + resp = requests.put(f"{self.base_url}/scopes", json=body, timeout=30) + resp.raise_for_status() + + def delete_scope(self, scope_id: str) -> None: + resp = requests.delete(f"{self.base_url}/scopes/{scope_id}", timeout=30) + if resp.status_code not in (200, 204, 404): + resp.raise_for_status() + + def refresh_all(self) -> None: + # publishes a refresh on the webhook topic; leader pulls all scopes + resp = requests.post(f"{self.base_url}/scopes/refresh", timeout=30) + if resp.status_code == 404: + return # endpoint name differs across versions; caller falls back to poll + resp.raise_for_status() + + +def gitea_repo_url(name: str) -> str: + # url reachable from inside the opal_server container + return f"{GITEA_INTERNAL_URL}/{GITEA_USER}/{name}.git" + + +def compose(*args: str) -> subprocess.CompletedProcess: + return subprocess.run( + ["docker", "compose", *args], + cwd=_COMPOSE_DIR, + capture_output=True, + text=True, + check=True, + ) + + +def bounce_postgres(down_seconds: int = 5) -> None: + compose("stop", "postgres") + time.sleep(down_seconds) + compose("start", "postgres") + + +def list_seeded_repos(count: int) -> List[str]: + return [f"policy-repo-{i:04d}" for i in range(count)] diff --git a/app-tests/git-leak/seed/Dockerfile b/app-tests/git-leak/seed/Dockerfile new file mode 100644 index 000000000..ee9d776d2 --- /dev/null +++ b/app-tests/git-leak/seed/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.11-slim +RUN apt-get update && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* +RUN pip install --no-cache-dir requests GitPython +COPY seed_gitea.py /seed_gitea.py +ENTRYPOINT ["python", "/seed_gitea.py"] diff --git a/app-tests/git-leak/seed/seed_gitea.py b/app-tests/git-leak/seed/seed_gitea.py new file mode 100644 index 000000000..fb41dbc2a --- /dev/null +++ b/app-tests/git-leak/seed/seed_gitea.py @@ -0,0 +1,130 @@ +"""Seed a Gitea instance with N policy repos for the OPAL git-leak test bed. + +Idempotent: re-running creates only the missing repos. Each repo gets a +single commit containing a minimal OPA policy tree. + +Env: + GITEA_URL e.g. http://gitea:3000 + GITEA_ADMIN_USER admin username (created out-of-band by compose) + GITEA_ADMIN_PASSWORD admin password + REPO_COUNT how many repos to ensure exist (default 50) +""" +import os +import sys +import time +from pathlib import Path + +import requests +from git import Actor, Repo + +POLICY_REGO = """package example + +default allow = false + +allow { + input.user == "admin" +} +""" + +DATA_JSON = '{"roles": {"admin": ["read", "write"]}}\n' + + +def _wait_for_gitea(base_url: str, timeout: int = 120) -> None: + deadline = time.time() + timeout + while time.time() < deadline: + try: + if requests.get(f"{base_url}/api/v1/version", timeout=5).status_code == 200: + return + except requests.RequestException: + pass + time.sleep(2) + raise RuntimeError(f"Gitea not reachable at {base_url} within {timeout}s") + + +def _ensure_token(base_url: str, user: str, password: str) -> str: + name = "seed-token" + resp = requests.post( + f"{base_url}/api/v1/users/{user}/tokens", + auth=(user, password), + json={"name": name, "scopes": ["write:repository", "write:user"]}, + timeout=10, + ) + if resp.status_code == 201: + return resp.json()["sha1"] + # token already exists -> delete then recreate (Gitea won't reveal an existing secret) + requests.delete( + f"{base_url}/api/v1/users/{user}/tokens/{name}", auth=(user, password), timeout=10 + ) + resp = requests.post( + f"{base_url}/api/v1/users/{user}/tokens", + auth=(user, password), + json={"name": name, "scopes": ["write:repository", "write:user"]}, + timeout=10, + ) + resp.raise_for_status() + return resp.json()["sha1"] + + +def _ensure_repo(base_url: str, token: str, user: str, name: str) -> None: + headers = {"Authorization": f"token {token}"} + exists = requests.get( + f"{base_url}/api/v1/repos/{user}/{name}", headers=headers, timeout=10 + ) + if exists.status_code == 200: + return + created = requests.post( + f"{base_url}/api/v1/user/repos", + headers=headers, + json={"name": name, "private": False, "auto_init": False}, + timeout=10, + ) + created.raise_for_status() + + +def _push_policy(base_url: str, token: str, user: str, name: str, workdir: Path) -> None: + repo_dir = workdir / name + repo_dir.mkdir(parents=True, exist_ok=True) + (repo_dir / "example.rego").write_text(POLICY_REGO) + (repo_dir / "data.json").write_text(DATA_JSON) + + repo = Repo.init(repo_dir, initial_branch="main") + repo.index.add(["example.rego", "data.json"]) + author = Actor("seed", "seed@example.com") + repo.index.commit("seed policy", author=author, committer=author) + + push_url = base_url.replace("http://", f"http://{user}:{token}@") + f"/{user}/{name}.git" + origin = repo.create_remote("origin", push_url) + origin.push(refspec="main:main") + + +def main() -> int: + base_url = os.environ["GITEA_URL"].rstrip("/") + user = os.environ["GITEA_ADMIN_USER"] + password = os.environ["GITEA_ADMIN_PASSWORD"] + count = int(os.environ.get("REPO_COUNT", "50")) + + _wait_for_gitea(base_url) + token = _ensure_token(base_url, user, password) + + workdir = Path("/tmp/seed-work") + for i in range(count): + name = f"policy-repo-{i:04d}" + _ensure_repo(base_url, token, user, name) + # only push if the repo is empty (freshly created) + head = requests.get( + f"{base_url}/api/v1/repos/{user}/{name}/branches/main", + headers={"Authorization": f"token {token}"}, + timeout=10, + ) + if head.status_code != 200: + _push_policy(base_url, token, user, name, workdir) + print(f"seeded {name}", flush=True) + + # write the token where the test harness can read it + Path("/seed-output/token").write_text(token) + print(f"DONE: ensured {count} repos", flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/app-tests/git-leak/test_boot.py b/app-tests/git-leak/test_boot.py new file mode 100644 index 000000000..b9a80b5d7 --- /dev/null +++ b/app-tests/git-leak/test_boot.py @@ -0,0 +1,37 @@ +import os +import time + +import pytest + +from helpers import compose, gitea_repo_url, list_seeded_repos + + +@pytest.mark.timeout(2400) +def test_boot_loads_all_scopes(opal, repo_count): + """Measure how long a fresh boot takes to load all scope repos. + + On master this is serial and slow (the ~20-min problem at scale). + PR4 tightens BOOT_TARGET_SECONDS to assert the parallel speedup. + """ + n = repo_count + repos = list_seeded_repos(n) + for i, name in enumerate(repos): + opal.put_scope(f"boot-{i}", gitea_repo_url(name)) + + # restart only the server so it re-runs sync_scopes on boot + compose("restart", "opal_server") + opal.wait_healthy(timeout=600) + + start = time.time() + deadline = start + 2000 + while time.time() < deadline: + if opal.stats()["repos"] >= n: + break + time.sleep(2) + elapsed = time.time() - start + + # PR1 records the baseline (loose). PR4 will set BOOT_TARGET_SECONDS low. + BOOT_TARGET_SECONDS = int(os.environ.get("BOOT_TARGET_SECONDS", "2000")) + print(f"boot loaded {n} scopes in {elapsed:.1f}s (target {BOOT_TARGET_SECONDS}s)") + assert opal.stats()["repos"] >= n, "not all scopes loaded after boot" + assert elapsed < BOOT_TARGET_SECONDS, f"boot too slow: {elapsed:.1f}s" diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py new file mode 100644 index 000000000..76db3729c --- /dev/null +++ b/app-tests/git-leak/test_leak.py @@ -0,0 +1,56 @@ +import time + +import pytest + +from helpers import gitea_repo_url, list_seeded_repos + + +def _wait_until(predicate, timeout=30, interval=0.5): + deadline = time.time() + timeout + while time.time() < deadline: + if predicate(): + return True + time.sleep(interval) + return False + + +@pytest.mark.timeout(900) +def test_churn_releases_caches(opal, repo_count): + """Create then delete many scopes; the three caches must return to empty. + + FAILS on master: delete_scope never purges GitPolicyFetcher caches. + """ + n = min(repo_count, 100) + repos = list_seeded_repos(n) + for i, name in enumerate(repos): + opal.put_scope(f"churn-{i}", gitea_repo_url(name)) + _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + + for i in range(n): + opal.delete_scope(f"churn-{i}") + + released = _wait_until(lambda: opal.stats()["repos"] == 0, timeout=30) + stats = opal.stats() + assert released, f"repos cache did not drain: {stats}" + assert stats["repos_last_fetched"] == 0, stats + + +@pytest.mark.timeout(900) +def test_repeat_sync_does_not_grow(opal, repo_count): + """Re-syncing the same scopes must not grow the caches unboundedly. + + FAILS on master: repos cache only ever grows. + """ + n = min(repo_count, 50) + repos = list_seeded_repos(n) + for i, name in enumerate(repos): + opal.put_scope(f"stable-{i}", gitea_repo_url(name)) + _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + + baseline = opal.stats()["repos"] + for _ in range(10): + opal.refresh_all() + time.sleep(2) + + grown = opal.stats()["repos"] + assert grown <= baseline, f"repos cache grew on repeat sync: {baseline} -> {grown}" diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py new file mode 100644 index 000000000..64598d741 --- /dev/null +++ b/app-tests/git-leak/test_resilience.py @@ -0,0 +1,54 @@ +import time + +import pytest + +from helpers import bounce_postgres, gitea_repo_url, list_seeded_repos + + +@pytest.mark.timeout(300) +def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): + """An unreachable repo must not stop healthy scopes from serving. + + FAILS on master: the scopes path has no fetch timeout, so a hung + clone occupies the shared executor and the server stalls. + """ + # a routable-but-dead address: TEST-NET-1 (RFC 5737), no git server there + opal.put_scope("offline", "http://192.0.2.1/dead/repo.git", branch="main") + + healthy = list_seeded_repos(1)[0] + opal.put_scope("healthy", gitea_repo_url(healthy)) + + # the healthy scope's repo must appear in the cache within a bounded time, + # even though the offline scope is hanging + deadline = time.time() + 60 + served = False + while time.time() < deadline: + if opal.stats()["repos"] >= 1: + served = True + break + time.sleep(2) + assert served, "healthy scope never loaded while an offline repo was hanging" + + +@pytest.mark.timeout(300) +def test_server_recovers_after_postgres_bounce(opal): + """A transient Postgres outage must not leave the server permanently down. + + FAILS on master: broadcaster disconnect SIGTERMs the worker with no + in-process reconnect; after a bounce the /internal stats endpoint does + not come back within the window without an external supervisor restart. + """ + assert opal.stats() # healthy before + bounce_postgres(down_seconds=5) + + deadline = time.time() + 60 + recovered = False + while time.time() < deadline: + try: + opal.wait_healthy(timeout=5) + opal.stats() + recovered = True + break + except Exception: + time.sleep(2) + assert recovered, "server did not recover within 60s of a postgres bounce" From bd49676013111ac92fe2612d81e642d8d6509d9c Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 12:37:24 +0300 Subject: [PATCH 03/26] test(git-leak): add GiteaAdmin and make_repo_unreachable helpers Complete the helpers.py surface promised in the plan's file-structure table. Both are now functional and used, not dead code: - make_repo_unreachable(name): returns a git URL on a routable-but-dead TEST-NET-1 host (RFC 5737). test_offline_repo now uses it instead of an inlined literal. - GiteaAdmin: host-side Gitea admin client (list_repos / repo_exists / create_repo / delete_repo), exposed as the `gitea_admin` pytest fixture for tests that need to inspect or stage repos beyond the seed sidecar. Gitea is published on host port 13000 (uncommon, to avoid the usual :3000 clash) so GiteaAdmin can reach it; opal_server and the seed sidecar still use the internal http://gitea:3000. README updated with the helper and port notes. Verified live: GiteaAdmin lists the seeded repos and round-trips create/exists/delete against Gitea over the published port. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/README.md | 10 +++ app-tests/git-leak/conftest.py | 8 ++- app-tests/git-leak/docker-compose.yml | 7 ++- app-tests/git-leak/helpers.py | 88 +++++++++++++++++++++++++++ app-tests/git-leak/test_resilience.py | 11 +++- 5 files changed, 118 insertions(+), 6 deletions(-) diff --git a/app-tests/git-leak/README.md b/app-tests/git-leak/README.md index 33a291f86..bf4060b33 100644 --- a/app-tests/git-leak/README.md +++ b/app-tests/git-leak/README.md @@ -7,6 +7,16 @@ memory leak, offline-repo hang, slow serial boot, broadcaster no-reconnect. - `opal_server` (2 workers, scopes on, Postgres broadcaster, built from `docker/Dockerfile`) - `redis`, `postgres`, `gitea` (+ one-shot `gitea-admin` and `seed` sidecars) +Only `opal_server` (`:7002`) and `gitea` (`:13000` on the host) are published; +Postgres is internal to the compose network. + +## Helpers (`helpers.py`) +- `OpalServerClient` — drive opal over HTTP (`stats`, `put_scope`, `delete_scope`, `refresh_all`). +- `GiteaAdmin` — host-side Gitea admin client (`list_repos`, `repo_exists`, + `create_repo`, `delete_repo`); also exposed as the `gitea_admin` pytest fixture. +- `make_repo_unreachable(name)` — git URL on a routable-but-dead host (TEST-NET-1) for the offline-repo test. +- `bounce_postgres(down_seconds)` — stop/start Postgres to simulate a broadcaster outage. + ## Run ```bash cd app-tests/git-leak diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index 8c538aa06..ba7052079 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -2,7 +2,7 @@ import pytest -from helpers import OpalServerClient, compose +from helpers import GiteaAdmin, OpalServerClient, compose def pytest_addoption(parser): @@ -42,3 +42,9 @@ def stack(request, repo_count): @pytest.fixture() def opal(stack) -> OpalServerClient: return stack + + +@pytest.fixture(scope="session") +def gitea_admin(stack) -> GiteaAdmin: + """Host-side Gitea admin client (depends on `stack` so Gitea is up).""" + return GiteaAdmin() diff --git a/app-tests/git-leak/docker-compose.yml b/app-tests/git-leak/docker-compose.yml index 8f9f474f9..3704e5c83 100644 --- a/app-tests/git-leak/docker-compose.yml +++ b/app-tests/git-leak/docker-compose.yml @@ -30,8 +30,11 @@ services: GITEA__security__INSTALL_LOCK: "true" GITEA__server__ROOT_URL: "http://gitea:3000/" GITEA__database__DB_TYPE: "sqlite3" - # not published to the host: only the seed sidecar and opal_server reach it - # over the compose network (via http://gitea:3000). Avoids host port clashes. + # published on 13000 (not 3000) for the host-side GiteaAdmin helper; the + # uncommon port avoids the usual :3000 clash. opal_server and the seed + # sidecar still reach it over the compose network via http://gitea:3000. + ports: + - "13000:3000" volumes: - gitea-data:/data healthcheck: diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 93871f194..936a7fc61 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -7,8 +7,16 @@ import requests OPAL_URL = "http://localhost:7002" +# reachable from inside the opal_server container (compose network) GITEA_INTERNAL_URL = "http://gitea:3000" +# reachable from the host-side test harness (published port, see docker-compose.yml) +GITEA_HOST_URL = "http://localhost:13000" GITEA_USER = "opaladmin" +GITEA_PASSWORD = "opaladmin" + +# TEST-NET-1 (RFC 5737): routable but runs no git server, so a clone hangs +# instead of failing fast — used to simulate an offline/unreachable repo. +UNREACHABLE_HOST = "192.0.2.1" # the compose project lives next to this file; compose() runs from here _COMPOSE_DIR = str(Path(__file__).resolve().parent) @@ -69,11 +77,91 @@ def refresh_all(self) -> None: resp.raise_for_status() +class GiteaAdmin: + """Host-side admin client for the test bed's Gitea. + + The ``seed`` sidecar does the bulk repo creation from inside the compose + network; this class lets a test inspect or mutate Gitea repos directly + from the host (e.g. assert seeding happened, or add/remove a single repo + for a specific scenario). It authenticates with the admin user that the + ``gitea-admin`` sidecar created, over the published host port. + """ + + def __init__( + self, + base_url: str = GITEA_HOST_URL, + user: str = GITEA_USER, + password: str = GITEA_PASSWORD, + ): + self.base_url = base_url.rstrip("/") + self._user = user + self._auth = (user, password) + + def repo_exists(self, name: str) -> bool: + resp = requests.get( + f"{self.base_url}/api/v1/repos/{self._user}/{name}", + auth=self._auth, + timeout=10, + ) + return resp.status_code == 200 + + def list_repos(self) -> List[str]: + names: List[str] = [] + page = 1 + while True: + resp = requests.get( + f"{self.base_url}/api/v1/users/{self._user}/repos", + params={"page": page, "limit": 50}, + auth=self._auth, + timeout=10, + ) + resp.raise_for_status() + batch = resp.json() + if not batch: + break + names.extend(r["name"] for r in batch) + page += 1 + return names + + def create_repo(self, name: str) -> None: + if self.repo_exists(name): + return + resp = requests.post( + f"{self.base_url}/api/v1/user/repos", + json={"name": name, "private": False, "auto_init": True}, + auth=self._auth, + timeout=10, + ) + resp.raise_for_status() + + def delete_repo(self, name: str) -> None: + resp = requests.delete( + f"{self.base_url}/api/v1/repos/{self._user}/{name}", + auth=self._auth, + timeout=10, + ) + if resp.status_code not in (204, 404): + resp.raise_for_status() + + def gitea_repo_url(name: str) -> str: # url reachable from inside the opal_server container return f"{GITEA_INTERNAL_URL}/{GITEA_USER}/{name}.git" +def make_repo_unreachable(name: str) -> str: + """Return a git URL for ``name`` pointing at a routable-but-dead host. + + Simulates an offline/unreachable policy repo: the address is in + TEST-NET-1 (RFC 5737), which is routable but runs no git server, so a + clone hangs rather than failing fast — exercising the missing fetch + timeout on the scopes path (the bug PR3 fixes). The URL keeps the same + ``/{user}/{name}.git`` shape as a real Gitea repo so the scope looks + ordinary apart from the unreachable host. + """ + return f"http://{UNREACHABLE_HOST}/{GITEA_USER}/{name}.git" + + def compose(*args: str) -> subprocess.CompletedProcess: return subprocess.run( ["docker", "compose", *args], diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 64598d741..7d9427903 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -2,7 +2,12 @@ import pytest -from helpers import bounce_postgres, gitea_repo_url, list_seeded_repos +from helpers import ( + bounce_postgres, + gitea_repo_url, + list_seeded_repos, + make_repo_unreachable, +) @pytest.mark.timeout(300) @@ -12,8 +17,8 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): FAILS on master: the scopes path has no fetch timeout, so a hung clone occupies the shared executor and the server stalls. """ - # a routable-but-dead address: TEST-NET-1 (RFC 5737), no git server there - opal.put_scope("offline", "http://192.0.2.1/dead/repo.git", branch="main") + # a routable-but-dead address (TEST-NET-1, RFC 5737): the clone hangs + opal.put_scope("offline", make_repo_unreachable("dead-repo"), branch="main") healthy = list_seeded_repos(1)[0] opal.put_scope("healthy", gitea_repo_url(healthy)) From afeb9698ae65c3d4e373cc8468b75ae87149f68b Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 12:55:54 +0300 Subject: [PATCH 04/26] test(git-leak): correct postgres-bounce framing (passes on master) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verified test_server_recovers_after_postgres_bounce against the stack: it PASSES on master (~14-19s). On a broadcaster drop the affected worker triggers a graceful shutdown, gunicorn respawns it, and the sibling worker keeps serving HTTP, so the surface recovers within the window — recovery happens via gunicorn's in-container worker supervision, not an external supervisor and not an in-process reconnect. Reframe #5 as a recovery guard (not a known-broken case) in the docstring and README; the prior "FAILS on master / needs external supervisor" wording was wrong. PER-15065's in-process reconnect would avoid the worker churn but recovery already holds. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/README.md | 9 +++++++-- app-tests/git-leak/test_resilience.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/app-tests/git-leak/README.md b/app-tests/git-leak/README.md index bf4060b33..778971705 100644 --- a/app-tests/git-leak/README.md +++ b/app-tests/git-leak/README.md @@ -27,8 +27,13 @@ Useful flags: `--boot-scopes=N` (any N), `--keep-stack` (skip teardown), env `BOOT_TARGET_SECONDS=120` (tighten the boot gate). ## Expected on master -All five flagship tests FAIL (except the boot test, which only fails when -`BOOT_TARGET_SECONDS` is set low). They become the regression gates for the fixes. +The leak tests (#1, #2) and the offline-repo test (#4) FAIL on master — they +target unfixed bugs and become the regression gates for PR2/PR3. The boot test +(#3) passes but only fails when `BOOT_TARGET_SECONDS` is set low (PR4's gate). +The Postgres-bounce test (#5) PASSES on master: it is a recovery guard — when +the broadcaster drops, the worker is respawned by gunicorn while the sibling +worker keeps serving, so the HTTP surface recovers. It guards that property +against regression rather than reproducing a current failure. ## Requires Docker + docker compose v2, plus host Python with `pytest pytest-timeout requests GitPython`. diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 7d9427903..8455451eb 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -37,11 +37,14 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): @pytest.mark.timeout(300) def test_server_recovers_after_postgres_bounce(opal): - """A transient Postgres outage must not leave the server permanently down. - - FAILS on master: broadcaster disconnect SIGTERMs the worker with no - in-process reconnect; after a bounce the /internal stats endpoint does - not come back within the window without an external supervisor restart. + """A transient Postgres (broadcaster) outage must not leave the server down. + + Recovery guard, not a known-broken case. On current master this PASSES: + when the broadcast channel drops, the affected worker triggers a graceful + shutdown and gunicorn respawns it, while the sibling worker keeps serving + HTTP — so the surface recovers within the window. It guards against a + regression of that property (PER-15065's in-process reconnect would make + recovery cleaner by avoiding the worker churn, but recovery already holds). """ assert opal.stats() # healthy before bounce_postgres(down_seconds=5) From 92353f6277521da05e1f0e5eb782314e4e424fb0 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 13:03:05 +0300 Subject: [PATCH 05/26] style(git-leak): apply black/isort/docformatter (pre-commit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run the repo's pinned pre-commit formatters (black 23.1.0, isort 5.12.0, docformatter 1.7.5) over the PR1 files to satisfy the pre-commit CI check. Formatting only — no behavior change. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/conftest.py | 1 - app-tests/git-leak/helpers.py | 5 ++++- app-tests/git-leak/seed/seed_gitea.py | 12 +++++++++--- app-tests/git-leak/test_boot.py | 1 - app-tests/git-leak/test_leak.py | 1 - app-tests/git-leak/test_resilience.py | 19 ++++++++++--------- packages/opal-server/opal_server/server.py | 2 +- .../tests/debug_stats_endpoint_test.py | 1 - 8 files changed, 24 insertions(+), 18 deletions(-) diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index ba7052079..7501f99d0 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -1,7 +1,6 @@ import os import pytest - from helpers import GiteaAdmin, OpalServerClient, compose diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 936a7fc61..1a3bd4fcc 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -31,7 +31,10 @@ def wait_healthy(self, timeout: int = 180) -> None: last = None while time.time() < deadline: try: - if requests.get(f"{self.base_url}/healthcheck", timeout=5).status_code == 200: + if ( + requests.get(f"{self.base_url}/healthcheck", timeout=5).status_code + == 200 + ): return except requests.RequestException as exc: last = exc diff --git a/app-tests/git-leak/seed/seed_gitea.py b/app-tests/git-leak/seed/seed_gitea.py index fb41dbc2a..ddebe5667 100644 --- a/app-tests/git-leak/seed/seed_gitea.py +++ b/app-tests/git-leak/seed/seed_gitea.py @@ -53,7 +53,9 @@ def _ensure_token(base_url: str, user: str, password: str) -> str: return resp.json()["sha1"] # token already exists -> delete then recreate (Gitea won't reveal an existing secret) requests.delete( - f"{base_url}/api/v1/users/{user}/tokens/{name}", auth=(user, password), timeout=10 + f"{base_url}/api/v1/users/{user}/tokens/{name}", + auth=(user, password), + timeout=10, ) resp = requests.post( f"{base_url}/api/v1/users/{user}/tokens", @@ -81,7 +83,9 @@ def _ensure_repo(base_url: str, token: str, user: str, name: str) -> None: created.raise_for_status() -def _push_policy(base_url: str, token: str, user: str, name: str, workdir: Path) -> None: +def _push_policy( + base_url: str, token: str, user: str, name: str, workdir: Path +) -> None: repo_dir = workdir / name repo_dir.mkdir(parents=True, exist_ok=True) (repo_dir / "example.rego").write_text(POLICY_REGO) @@ -92,7 +96,9 @@ def _push_policy(base_url: str, token: str, user: str, name: str, workdir: Path) author = Actor("seed", "seed@example.com") repo.index.commit("seed policy", author=author, committer=author) - push_url = base_url.replace("http://", f"http://{user}:{token}@") + f"/{user}/{name}.git" + push_url = ( + base_url.replace("http://", f"http://{user}:{token}@") + f"/{user}/{name}.git" + ) origin = repo.create_remote("origin", push_url) origin.push(refspec="main:main") diff --git a/app-tests/git-leak/test_boot.py b/app-tests/git-leak/test_boot.py index b9a80b5d7..928fd6f4e 100644 --- a/app-tests/git-leak/test_boot.py +++ b/app-tests/git-leak/test_boot.py @@ -2,7 +2,6 @@ import time import pytest - from helpers import compose, gitea_repo_url, list_seeded_repos diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 76db3729c..30b9a81a0 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -1,7 +1,6 @@ import time import pytest - from helpers import gitea_repo_url, list_seeded_repos diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 8455451eb..b8fb73c89 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -1,7 +1,6 @@ import time import pytest - from helpers import ( bounce_postgres, gitea_repo_url, @@ -37,14 +36,16 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): @pytest.mark.timeout(300) def test_server_recovers_after_postgres_bounce(opal): - """A transient Postgres (broadcaster) outage must not leave the server down. - - Recovery guard, not a known-broken case. On current master this PASSES: - when the broadcast channel drops, the affected worker triggers a graceful - shutdown and gunicorn respawns it, while the sibling worker keeps serving - HTTP — so the surface recovers within the window. It guards against a - regression of that property (PER-15065's in-process reconnect would make - recovery cleaner by avoiding the worker churn, but recovery already holds). + """A transient Postgres (broadcaster) outage must not leave the server + down. + + Recovery guard, not a known-broken case. On current master this + PASSES: when the broadcast channel drops, the affected worker + triggers a graceful shutdown and gunicorn respawns it, while the + sibling worker keeps serving HTTP — so the surface recovers within + the window. It guards against a regression of that property + (PER-15065's in-process reconnect would make recovery cleaner by + avoiding the worker churn, but recovery already holds). """ assert opal.stats() # healthy before bounce_postgres(down_seconds=5) diff --git a/packages/opal-server/opal_server/server.py b/packages/opal-server/opal_server/server.py index 2b88b3db8..1be796d6d 100644 --- a/packages/opal-server/opal_server/server.py +++ b/packages/opal-server/opal_server/server.py @@ -26,8 +26,8 @@ ) from opal_server.config import opal_server_config from opal_server.data.api import init_data_updates_router -from opal_server.debug_stats import register_internal_stats_route from opal_server.data.data_update_publisher import DataUpdatePublisher +from opal_server.debug_stats import register_internal_stats_route from opal_server.loadlimiting import init_loadlimit_router from opal_server.policy.bundles.api import router as bundles_router from opal_server.policy.watcher.factory import setup_watcher_task diff --git a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py index 8ba0f678a..00a9c8e1b 100644 --- a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py +++ b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py @@ -1,6 +1,5 @@ from fastapi import FastAPI from fastapi.testclient import TestClient - from opal_server.debug_stats import register_internal_stats_route From db54ae627824aaea11f72b3ff829dcaddc6fa96d Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 13:13:22 +0300 Subject: [PATCH 06/26] test: scope root pytest collection to packages/ (exclude git-leak bed) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI `build` job runs `pytest` from the repo root with no path, which recursed into app-tests/git-leak/ and ran the flagship tests — these are designed to FAIL on master (they are the regression gates for PR2-PR5), so they broke the build job. Set `testpaths = packages` so the rootdir run collects only the unit tests under packages/ (matching master's effective behavior, since app-tests/ had no pytest files before). testpaths only applies when pytest is invoked from the rootdir with no args, so `cd app-tests/git-leak && pytest` still collects and runs the test bed. Verified both: root run -> packages only; subdir run -> all 5 flagship tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- pytest.ini | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pytest.ini b/pytest.ini index 16c88ba91..c98ec88b9 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,10 @@ # Handling DeprecationWarning 'asyncio_mode' default value [pytest] asyncio_mode = strict +# Scope the default (rootdir) collection to the unit tests under packages/. +# The heavyweight, docker-compose-driven test bed under app-tests/git-leak/ is +# designed to FAIL on master (it is the regression gate for later PRs) and must +# not be collected by the repo's CI `pytest` run. testpaths only applies when +# pytest is invoked from the rootdir with no args, so running it explicitly +# (e.g. `cd app-tests/git-leak && pytest`) still works. +testpaths = packages From 6f9a0894a1aa937a91a0cbe8f12b9b666306b680 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 13:24:22 +0300 Subject: [PATCH 07/26] test(git-leak): address Copilot review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - stats(): sample the /internal endpoint several times and merge per key with max(). The caches are per-process on a multi-worker server, so a single read can hit an empty (non-leader) worker — max-merge avoids both false negatives (missing a populated leader) and false positives (an `== 0` drain assertion passing only because it hit an empty worker). - test_leak: assert the initial-load `_wait_until` succeeded before deleting / before taking baseline, so the tests can't pass vacuously when load never completed. - refresh_all(): correct the misleading comment — a 404 is a no-op, there is no client-side fallback. - conftest: skip the suite cleanly if docker is unavailable (defense in depth; it's already excluded from the default pytest run via testpaths). Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/conftest.py | 7 +++++++ app-tests/git-leak/helpers.py | 35 +++++++++++++++++++++++++-------- app-tests/git-leak/test_leak.py | 6 ++++-- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index 7501f99d0..9701aff87 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -1,4 +1,5 @@ import os +import shutil import pytest from helpers import GiteaAdmin, OpalServerClient, compose @@ -26,6 +27,12 @@ def repo_count(request) -> int: @pytest.fixture(scope="session") def stack(request, repo_count): + # Defense-in-depth: this docker-compose suite is already excluded from the + # repo's default `pytest` run via `testpaths = packages` in pytest.ini, so + # the unit-test CI matrix never collects it. If it is ever collected in an + # environment without docker, skip cleanly instead of erroring. + if shutil.which("docker") is None: + pytest.skip("docker (compose) is required for the git-leak test bed") os.environ["REPO_COUNT"] = str(repo_count) # build + start infra; seed runs to completion then exits compose("up", "-d", "--build") diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 1a3bd4fcc..8d0acc38a 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -41,12 +41,28 @@ def wait_healthy(self, timeout: int = 180) -> None: time.sleep(2) raise RuntimeError(f"opal-server not healthy in {timeout}s (last: {last})") - def stats(self) -> Dict[str, int]: - resp = requests.get( - f"{self.base_url}/internal/git-fetcher-cache-stats", timeout=10 - ) - resp.raise_for_status() - return resp.json() + def stats(self, samples: int = 5, interval: float = 0.1) -> Dict[str, int]: + """Read the git-fetcher cache stats, merged across several reads. + + The server runs multiple workers and the ``GitPolicyFetcher`` caches + are per-process, so a single read may land on a worker with empty + caches (e.g. a non-leader that never fetched). Sample a few times and + take the ``max`` per key, so the harness observes whichever worker + still holds the most entries — this prevents both false negatives + (missing a populated leader) and false positives (an ``== 0`` drain + assertion passing only because it hit an empty worker). + """ + merged: Dict[str, int] = {} + for i in range(max(1, samples)): + resp = requests.get( + f"{self.base_url}/internal/git-fetcher-cache-stats", timeout=10 + ) + resp.raise_for_status() + for key, value in resp.json().items(): + merged[key] = max(merged.get(key, 0), value) + if i < samples - 1: + time.sleep(interval) + return merged def put_scope(self, scope_id: str, repo_url: str, branch: str = "main") -> None: body = { @@ -73,10 +89,13 @@ def delete_scope(self, scope_id: str) -> None: resp.raise_for_status() def refresh_all(self) -> None: - # publishes a refresh on the webhook topic; leader pulls all scopes + # Best-effort: POST /scopes/refresh publishes on the webhook topic so + # every leader re-syncs all scopes. If this OPAL build doesn't expose + # the route (404), treat it as a no-op — there is no client-side + # fallback; callers rely on their own stats polling either way. resp = requests.post(f"{self.base_url}/scopes/refresh", timeout=30) if resp.status_code == 404: - return # endpoint name differs across versions; caller falls back to poll + return resp.raise_for_status() diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 30b9a81a0..d2ef0f21b 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -23,7 +23,8 @@ def test_churn_releases_caches(opal, repo_count): repos = list_seeded_repos(n) for i, name in enumerate(repos): opal.put_scope(f"churn-{i}", gitea_repo_url(name)) - _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + loaded = _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + assert loaded, f"initial load never reached {n} repos: {opal.stats()}" for i in range(n): opal.delete_scope(f"churn-{i}") @@ -44,7 +45,8 @@ def test_repeat_sync_does_not_grow(opal, repo_count): repos = list_seeded_repos(n) for i, name in enumerate(repos): opal.put_scope(f"stable-{i}", gitea_repo_url(name)) - _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + loaded = _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + assert loaded, f"initial sync never reached {n} repos: {opal.stats()}" baseline = opal.stats()["repos"] for _ in range(10): From 1b23ac00223f3b827d51546e0be6c196c27a8edf Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 13:47:25 +0300 Subject: [PATCH 08/26] test(git-leak): isolate scopes per test and fix false repeat-sync gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review findings on the regression gates: - Cross-test contamination: clone paths are keyed by repo URL (source_id = sha256(url)+branch-shard), not scope_id, so scopes from different tests that point at the same seeded repo share one GitPolicyFetcher cache entry. With a session-scoped stack and no teardown, a leftover boot-*/stable-* scope kept those entries alive and would make test_churn's `repos == 0` drain assertion fail on fixed code. OpalServerClient now tracks created scopes and the opal fixture deletes them on teardown (best-effort drain wait, swallows errors so master — where delete never purges — doesn't fail the passing test). - False gate: test_repeat_sync_does_not_grow re-syncs identical scopes, which a path-keyed cache can't grow even on master, so it could never be the leak gate it claimed. Reframed as an honest idempotency guard (passes on master) that points at test_churn_releases_caches as the real leak gate; README's "Expected on master" reclassifies it alongside the postgres-bounce guard. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/README.md | 21 ++++++++++++++------- app-tests/git-leak/conftest.py | 9 ++++++++- app-tests/git-leak/helpers.py | 31 +++++++++++++++++++++++++++++++ app-tests/git-leak/test_leak.py | 11 +++++++++-- 4 files changed, 62 insertions(+), 10 deletions(-) diff --git a/app-tests/git-leak/README.md b/app-tests/git-leak/README.md index 778971705..a8abea9f6 100644 --- a/app-tests/git-leak/README.md +++ b/app-tests/git-leak/README.md @@ -27,13 +27,20 @@ Useful flags: `--boot-scopes=N` (any N), `--keep-stack` (skip teardown), env `BOOT_TARGET_SECONDS=120` (tighten the boot gate). ## Expected on master -The leak tests (#1, #2) and the offline-repo test (#4) FAIL on master — they -target unfixed bugs and become the regression gates for PR2/PR3. The boot test -(#3) passes but only fails when `BOOT_TARGET_SECONDS` is set low (PR4's gate). -The Postgres-bounce test (#5) PASSES on master: it is a recovery guard — when -the broadcaster drops, the worker is respawned by gunicorn while the sibling -worker keeps serving, so the HTTP surface recovers. It guards that property -against regression rather than reproducing a current failure. +The churn leak test (`test_churn_releases_caches`) and the offline-repo test +FAIL on master — they target unfixed bugs and become the regression gates for +PR2/PR3. The boot test passes but only fails when `BOOT_TARGET_SECONDS` is set +low (PR4's gate). + +Two tests are guards that PASS on master rather than reproducing a current +failure: +- `test_repeat_sync_does_not_grow` — clone paths are keyed by the repo URL, so + re-syncing identical scopes reuses cache entries and does not grow them. It + guards against a regression that would make repeat sync allocate per-sync. + (The unbounded-growth-then-no-purge leak is the *churn* test's job.) +- `test_server_recovers_after_postgres_bounce` — when the broadcaster drops, the + affected worker is respawned by gunicorn while the sibling keeps serving, so + the HTTP surface recovers. Guards that property against regression. ## Requires Docker + docker compose v2, plus host Python with `pytest pytest-timeout requests GitPython`. diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index 9701aff87..c3c01bdbb 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -47,7 +47,14 @@ def stack(request, repo_count): @pytest.fixture() def opal(stack) -> OpalServerClient: - return stack + # The compose stack is session-scoped (one server for the whole run), but + # scopes must not leak between tests: clone paths are keyed by repo URL, so + # a scope left behind by one test shares a cache entry with any later test + # using the same seeded repo and would pollute its drain assertions. Delete + # whatever this test created on teardown to keep each test isolated. + stack._created_scopes.clear() + yield stack + stack.cleanup_created_scopes() @pytest.fixture(scope="session") diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 8d0acc38a..4fefaf81d 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -25,6 +25,12 @@ class OpalServerClient: def __init__(self, base_url: str = OPAL_URL): self.base_url = base_url.rstrip("/") + # scope_ids created via put_scope, so a per-test fixture can delete them + # on teardown. Clone paths are keyed by repo URL (not scope_id), so a + # scope left behind by one test shares a GitPolicyFetcher cache entry + # with any other test pointing at the same seeded repo — without cleanup + # that leftover keeps the entry alive and pollutes a drain assertion. + self._created_scopes: set = set() def wait_healthy(self, timeout: int = 180) -> None: deadline = time.time() + timeout @@ -82,11 +88,36 @@ def put_scope(self, scope_id: str, repo_url: str, branch: str = "main") -> None: # the scope router mounts at prefix="/scopes" with @router.put("") resp = requests.put(f"{self.base_url}/scopes", json=body, timeout=30) resp.raise_for_status() + self._created_scopes.add(scope_id) def delete_scope(self, scope_id: str) -> None: resp = requests.delete(f"{self.base_url}/scopes/{scope_id}", timeout=30) if resp.status_code not in (200, 204, 404): resp.raise_for_status() + self._created_scopes.discard(scope_id) + + def cleanup_created_scopes(self, drain_timeout: int = 15) -> None: + """Delete every scope created on this client, then best-effort wait for + the caches to drain — so the next test starts from a clean slate. + + Best-effort by design: on master, delete never purges the caches (the + leak this suite gates), so the drain wait will simply time out. A + teardown failure must never fail the test that just ran, hence the broad + excepts and the bounded wait. + """ + for scope_id in list(self._created_scopes): + try: + self.delete_scope(scope_id) + except Exception: + self._created_scopes.discard(scope_id) + deadline = time.time() + drain_timeout + while time.time() < deadline: + try: + if self.stats(samples=1)["repos"] == 0: + return + except Exception: + return + time.sleep(1) def refresh_all(self) -> None: # Best-effort: POST /scopes/refresh publishes on the webhook topic so diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index d2ef0f21b..8cfee6363 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -37,9 +37,16 @@ def test_churn_releases_caches(opal, repo_count): @pytest.mark.timeout(900) def test_repeat_sync_does_not_grow(opal, repo_count): - """Re-syncing the same scopes must not grow the caches unboundedly. + """Re-syncing the *same* scopes must not grow the caches. - FAILS on master: repos cache only ever grows. + Idempotency guard, not a known-broken case. On current master this + PASSES: a clone path is keyed by the repo URL (``source_id`` = + sha256(url)+branch-shard), so re-syncing identical scopes reuses the + existing ``repos`` / ``repos_last_fetched`` entries instead of + allocating new ones. It guards against a regression that would make + repeat sync allocate per-sync. The unbounded *growth + no purge on + delete* leak is covered by ``test_churn_releases_caches`` above, which + uses distinct scopes and DOES fail on master. """ n = min(repo_count, 50) repos = list_seeded_repos(n) From 245f80ba6c06a7e52449dd35cc9dfa482a297378 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 14:19:46 +0300 Subject: [PATCH 09/26] feat(opal-server): add scope git fetch timeout + dedicated pool config Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/plans/docs/05-config-reference.md | 21 +++++++++++++++++++ packages/opal-server/opal_server/config.py | 14 +++++++++++++ .../opal_server/tests/git_executor_test.py | 7 +++++++ 3 files changed, 42 insertions(+) create mode 100644 .claude/plans/docs/05-config-reference.md create mode 100644 packages/opal-server/opal_server/tests/git_executor_test.py diff --git a/.claude/plans/docs/05-config-reference.md b/.claude/plans/docs/05-config-reference.md new file mode 100644 index 000000000..19caea4c4 --- /dev/null +++ b/.claude/plans/docs/05-config-reference.md @@ -0,0 +1,21 @@ +# 05 — OPAL Config Reference (private) + +Private, internal supplement to the public operator docs under `documentation/docs/`. Tracks +`OPAL_*` env vars added or clarified by the OPAL Server Git Fixes work, with the declaring +`file:line` so contributors can jump straight to the `Confi` declaration. Each key maps to an +`OPAL_` env var (the `OPAL_` prefix is added once by the component's `Confi(prefix="OPAL_")` +instantiation — the bare name is what appears in the table). + +## 4. opal-server keys + +| Env var | Type | Default | Purpose | Declared at | +|---|---|---|---|---| +| `OPAL_SCOPES_GIT_FETCH_TIMEOUT` | float (seconds) | `120.0` | Hard timeout for a single scope git clone/fetch. On timeout the operation is abandoned and the scope is marked failed, so one unreachable repo can never block boot or other scopes. `0` = no timeout. | `packages/opal-server/opal_server/config.py:150-156` | +| `OPAL_SCOPES_GIT_MAX_WORKERS` | int | `10` | Size of the dedicated `ThreadPoolExecutor` for scope git operations. Isolating git work keeps a hung fetch from starving bundle serving and other server work that uses the default executor. | `packages/opal-server/opal_server/config.py:157-163` | + +> **Caveat (timeout is soft, not a hard kill).** `OPAL_SCOPES_GIT_FETCH_TIMEOUT` is enforced via +> `asyncio.wait_for`, which cancels the *await* — unblocking the event loop and the awaiting +> coroutine — but the underlying pygit2 call keeps running on its pool thread until the OS network +> timeout. The dedicated pool (`OPAL_SCOPES_GIT_MAX_WORKERS`) bounds and isolates those lingering +> threads so they cannot affect bundle serving or other scopes. Hard-kill via subprocess is out of +> scope. See spec §6. diff --git a/packages/opal-server/opal_server/config.py b/packages/opal-server/opal_server/config.py index 9faac9be4..82ffc305c 100644 --- a/packages/opal-server/opal_server/config.py +++ b/packages/opal-server/opal_server/config.py @@ -147,6 +147,20 @@ class OpalServerConfig(Confi): 0, description="The timeout for cloning the policy repository (0 means wait forever)", ) + SCOPES_GIT_FETCH_TIMEOUT = confi.float( + "SCOPES_GIT_FETCH_TIMEOUT", + 120.0, + description="Hard timeout in seconds for a single scope git clone/fetch. " + "On timeout the operation is abandoned and the scope is marked failed so " + "one unreachable repo can never block boot or other scopes (0 = no timeout).", + ) + SCOPES_GIT_MAX_WORKERS = confi.int( + "SCOPES_GIT_MAX_WORKERS", + 10, + description="Size of the dedicated thread pool for scope git operations. " + "Isolating git work keeps a hung fetch from starving bundle serving and " + "other server work that uses the default executor.", + ) LEADER_LOCK_FILE_PATH = confi.str( "LEADER_LOCK_FILE_PATH", "/tmp/opal_server_leader.lock", diff --git a/packages/opal-server/opal_server/tests/git_executor_test.py b/packages/opal-server/opal_server/tests/git_executor_test.py new file mode 100644 index 000000000..cfc250f75 --- /dev/null +++ b/packages/opal-server/opal_server/tests/git_executor_test.py @@ -0,0 +1,7 @@ +from opal_server.config import OpalServerConfig + + +def test_git_resilience_config_defaults(): + clean = OpalServerConfig(prefix="OPAL_") + assert clean.SCOPES_GIT_FETCH_TIMEOUT == 120.0 + assert clean.SCOPES_GIT_MAX_WORKERS == 10 From 6abbc2a74bfaf54f6b32ad51e0d096892997420b Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 14:21:19 +0300 Subject: [PATCH 10/26] feat(opal-server): dedicated git thread pool with hard per-op timeout Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 36 +++++++++++++++++++ .../opal_server/tests/git_executor_test.py | 23 ++++++++++++ 2 files changed, 59 insertions(+) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index e52083c98..9d3f7d70f 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -4,6 +4,8 @@ import hashlib import shutil import time +from concurrent.futures import ThreadPoolExecutor +from functools import partial from pathlib import Path from typing import Optional, cast @@ -34,6 +36,40 @@ ) +_git_executor: Optional[ThreadPoolExecutor] = None + + +def _get_git_executor() -> ThreadPoolExecutor: + """Lazily build the dedicated pool for scope git operations. + + Isolated from the default executor so a hung clone/fetch can never starve + bundle serving or other server work. + """ + global _git_executor + if _git_executor is None: + _git_executor = ThreadPoolExecutor( + max_workers=opal_server_config.SCOPES_GIT_MAX_WORKERS, + thread_name_prefix="opal-git", + ) + return _git_executor + + +async def run_in_git_executor(func, *args, timeout: float, **kwargs): + """Run a blocking git call on the dedicated pool with a hard timeout. + + Raises ``TimeoutError`` (via ``asyncio.wait_for``) when the call exceeds + ``timeout`` seconds. ``timeout <= 0`` means no limit. NOTE: the timeout + unblocks the event loop and the awaiting coroutine, but the underlying + pygit2 call keeps running on its pool thread until the OS network timeout; + the dedicated pool keeps that lingering thread isolated. + """ + loop = asyncio.get_event_loop() + fut = loop.run_in_executor(_get_git_executor(), partial(func, *args, **kwargs)) + if timeout and timeout > 0: + return await asyncio.wait_for(fut, timeout=timeout) + return await fut + + class PolicyFetcherCallbacks: async def on_update(self, old_head: Optional[str], head: str): pass diff --git a/packages/opal-server/opal_server/tests/git_executor_test.py b/packages/opal-server/opal_server/tests/git_executor_test.py index cfc250f75..9a36c624f 100644 --- a/packages/opal-server/opal_server/tests/git_executor_test.py +++ b/packages/opal-server/opal_server/tests/git_executor_test.py @@ -1,7 +1,30 @@ +import time + +import pytest + from opal_server.config import OpalServerConfig +from opal_server.git_fetcher import run_in_git_executor def test_git_resilience_config_defaults(): clean = OpalServerConfig(prefix="OPAL_") assert clean.SCOPES_GIT_FETCH_TIMEOUT == 120.0 assert clean.SCOPES_GIT_MAX_WORKERS == 10 + + +@pytest.mark.asyncio +async def test_run_in_git_executor_returns_value(): + result = await run_in_git_executor(lambda: 21 * 2, timeout=5) + assert result == 42 + + +@pytest.mark.asyncio +async def test_run_in_git_executor_times_out(): + with pytest.raises(TimeoutError): + await run_in_git_executor(lambda: time.sleep(2), timeout=0.1) + + +@pytest.mark.asyncio +async def test_zero_timeout_means_no_limit(): + result = await run_in_git_executor(lambda: "ok", timeout=0) + assert result == "ok" From ed86a9c286d8e865df12c30c108a100d37875761 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Tue, 23 Jun 2026 14:24:10 +0300 Subject: [PATCH 11/26] fix(opal-server): bound scope clone/fetch with timeout, skip on hang Wire scope clone/fetch through run_in_git_executor with SCOPES_GIT_FETCH_TIMEOUT, and broaden the _clone except to catch asyncio.TimeoutError so a hung clone is logged and the scope skipped instead of crashing the caller. Drop the now-unused run_sync import. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 15 ++++++++----- .../opal_server/tests/fetch_timeout_test.py | 22 +++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 packages/opal-server/opal_server/tests/fetch_timeout_test.py diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index 9d3f7d70f..c8dee329a 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -13,7 +13,6 @@ import pygit2 from ddtrace import tracer from git import Repo -from opal_common.async_utils import run_sync from opal_common.git_utils.bundle_maker import BundleMaker from opal_common.logger import logger from opal_common.schemas.policy import PolicyBundle @@ -226,9 +225,10 @@ async def fetch_and_notify_on_changes( GitPolicyFetcher.repos_last_fetched[ self._source_id ] = datetime.datetime.now() - await run_sync( + await run_in_git_executor( repo.remotes[self._remote].fetch, callbacks=self._auth_callbacks, + timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, ) logger.debug(f"Fetch completed: {self._source.url}") @@ -258,14 +258,19 @@ async def _clone(self): path=self._repo_path, ) try: - repo: Repository = await run_sync( + repo: Repository = await run_in_git_executor( clone_repository, self._source.url, str(self._repo_path), callbacks=self._auth_callbacks, + timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, + ) + except (pygit2.GitError, asyncio.TimeoutError) as exc: + logger.error( + "Could not clone repo at {url}: {err}", + url=self._source.url, + err=repr(exc), ) - except pygit2.GitError: - logger.exception(f"Could not clone repo at {self._source.url}") else: logger.info(f"Clone completed: {self._source.url}") await self._notify_on_changes(repo) diff --git a/packages/opal-server/opal_server/tests/fetch_timeout_test.py b/packages/opal-server/opal_server/tests/fetch_timeout_test.py new file mode 100644 index 000000000..58376ff44 --- /dev/null +++ b/packages/opal-server/opal_server/tests/fetch_timeout_test.py @@ -0,0 +1,22 @@ +import time + +import pytest + +from opal_server.config import opal_server_config +from opal_server.git_fetcher import run_in_git_executor + + +@pytest.mark.asyncio +async def test_hanging_git_op_raises_timeout(monkeypatch): + """A clone/fetch that hangs must surface TimeoutError, not block forever.""" + monkeypatch.setattr(opal_server_config, "SCOPES_GIT_FETCH_TIMEOUT", 0.2) + + def _hang(): + time.sleep(5) + + start = time.time() + with pytest.raises(TimeoutError): + await run_in_git_executor( + _hang, timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT + ) + assert time.time() - start < 2, "wait_for did not unblock promptly" From 6046a103f229c07616a9e053fb5de0a0267e93f7 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 16:03:00 +0300 Subject: [PATCH 12/26] test(git-leak): make the regression gates trustworthy (address PR review) Addresses the CHANGES_REQUESTED review on PR #922. Root cause behind most findings: a fresh scope's first sync takes the _clone() branch, which only fills GitPolicyFetcher.repo_locks; repos/repos_last_fetched are filled on a *second* sync. So the load gates on `repos` hung, and the 2-worker per-process caches made `== 0` drain assertions unsafe. Blocking: - Single worker (UVICORN_NUM_WORKERS=1): deterministic per-process cache reads; removes the false `== 0` drain class. - Load gate (CRITICAL + Zivxx HIGH): _load_scopes gates on repo_locks then refresh_all() to force the second sync, so repos/repos_last_fetched are actually populated before any drain/purge assertion. - compose() surfaces captured stdout/stderr on failure. - Seed completeness asserted in conftest; seed script isolates per-repo failures and exits non-zero with a count. Secondary: - test_repeat_sync asserts an RSS bound (count can't grow for any impl); churn asserts all three caches drain + a loose RSS backstop. - blackhole socat sidecar replaces TEST-NET-1 (deterministic hang); offline test saturates the fetch executor with 40 hung clones and recovers via OpalServerClient.hard_reset() (stop -> redis FLUSHALL -> start). - Per-test clean slate deletes all server scopes (fixes orphan-scope leak). - Postgres-bounce proves broadcast recovery (PUT post-bounce, assert sync) and uses `up -d --wait`. - Remove dead 404 branch in refresh_all; boot clock starts at restart; gitea-admin `|| true` -> "already exists"-only guard; README reworded. opal-server: - /internal stats route now takes the JWTAuthenticator dependency (protected when JWT on, no-op in the test bed); unit test asserts enforcement. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/README.md | 62 +++++--- app-tests/git-leak/conftest.py | 27 +++- app-tests/git-leak/docker-compose.yml | 26 +++- app-tests/git-leak/helpers.py | 141 ++++++++++++------ app-tests/git-leak/seed/seed_gitea.py | 36 +++-- app-tests/git-leak/test_boot.py | 12 +- app-tests/git-leak/test_leak.py | 99 ++++++++---- app-tests/git-leak/test_resilience.py | 105 +++++++++---- .../opal-server/opal_server/debug_stats.py | 25 +++- packages/opal-server/opal_server/server.py | 4 +- .../tests/debug_stats_endpoint_test.py | 19 ++- 11 files changed, 407 insertions(+), 149 deletions(-) diff --git a/app-tests/git-leak/README.md b/app-tests/git-leak/README.md index a8abea9f6..aabbb4657 100644 --- a/app-tests/git-leak/README.md +++ b/app-tests/git-leak/README.md @@ -1,21 +1,31 @@ # OPAL git-leak / resilience test bed -Reproduces (as failing tests on `master`) the four issues fixed by PR2–PR5: -memory leak, offline-repo hang, slow serial boot, broadcaster no-reconnect. +Reproduces (as failing tests) the four issues fixed by PR2–PR5: memory leak, +offline-repo hang, slow serial boot, broadcaster no-reconnect. + +Every assertion is driven through `GET /internal/git-fetcher-cache-stats`, which +**this PR (PR1) adds** — it does not exist on `master`. So the suite runs against +*this branch*: the leak/offline tests fail here *until PR2/PR3 land*, then go +green. Run against true `master` they would all error at setup on the missing +endpoint, not "fail for the targeted bug." ## Stack -- `opal_server` (2 workers, scopes on, Postgres broadcaster, built from `docker/Dockerfile`) +- `opal_server` (single worker, scopes on, Postgres broadcaster, built from `docker/Dockerfile`) - `redis`, `postgres`, `gitea` (+ one-shot `gitea-admin` and `seed` sidecars) +- `blackhole` (alpine/socat: accepts TCP then never answers — the offline repo) Only `opal_server` (`:7002`) and `gitea` (`:13000` on the host) are published; -Postgres is internal to the compose network. +Postgres and `blackhole` are internal to the compose network. ## Helpers (`helpers.py`) -- `OpalServerClient` — drive opal over HTTP (`stats`, `put_scope`, `delete_scope`, `refresh_all`). +- `OpalServerClient` — drive opal over HTTP (`stats`, `put_scope`, `delete_scope`, + `refresh_all`, `get_scope_policy`, `list_scope_ids`, `delete_all_scopes`). - `GiteaAdmin` — host-side Gitea admin client (`list_repos`, `repo_exists`, `create_repo`, `delete_repo`); also exposed as the `gitea_admin` pytest fixture. -- `make_repo_unreachable(name)` — git URL on a routable-but-dead host (TEST-NET-1) for the offline-repo test. -- `bounce_postgres(down_seconds)` — stop/start Postgres to simulate a broadcaster outage. +- `make_repo_unreachable(name)` — git URL on the `blackhole` sidecar (completes + the TCP handshake, never answers) so the clone hangs for the offline-repo test. +- `bounce_postgres(down_seconds)` — stop Postgres, then `up -d --wait` it back to + simulate a broadcaster outage and await readiness before the recovery poll. ## Run ```bash @@ -26,21 +36,21 @@ python -m pytest test_leak.py -v --boot-scopes=20 # just the leak gates Useful flags: `--boot-scopes=N` (any N), `--keep-stack` (skip teardown), env `BOOT_TARGET_SECONDS=120` (tighten the boot gate). -## Expected on master +## Expected behavior The churn leak test (`test_churn_releases_caches`) and the offline-repo test -FAIL on master — they target unfixed bugs and become the regression gates for -PR2/PR3. The boot test passes but only fails when `BOOT_TARGET_SECONDS` is set -low (PR4's gate). +FAIL on this branch *without the PR2/PR3 fix* — they target unfixed bugs and +become the regression gates for PR2/PR3, flipping green when those land. The +boot test passes but fails when `BOOT_TARGET_SECONDS` is set low (PR4's gate). -Two tests are guards that PASS on master rather than reproducing a current -failure: +Two tests are guards that PASS rather than reproducing a current failure: - `test_repeat_sync_does_not_grow` — clone paths are keyed by the repo URL, so - re-syncing identical scopes reuses cache entries and does not grow them. It - guards against a regression that would make repeat sync allocate per-sync. - (The unbounded-growth-then-no-purge leak is the *churn* test's job.) + re-syncing identical scopes reuses cache entries and the cache *counts* can't + grow for any implementation; the load-bearing assertion is therefore on RSS, + guarding against a regression that leaks per-sync allocations. - `test_server_recovers_after_postgres_bounce` — when the broadcaster drops, the - affected worker is respawned by gunicorn while the sibling keeps serving, so - the HTTP surface recovers. Guards that property against regression. + worker is respawned by gunicorn and the broadcaster reconnects once Postgres + is back; the test PUTs a fresh scope post-bounce and asserts it syncs, proving + the broadcast path (not just HTTP) recovered. ## Requires Docker + docker compose v2, plus host Python with `pytest pytest-timeout requests GitPython`. @@ -48,8 +58,14 @@ Docker + docker compose v2, plus host Python with `pytest pytest-timeout request ## Notes - Auth is disabled in the stack: `OPAL_AUTH_PUBLIC_KEY` is left unset so the JWT verifier is disabled and the harness can call scope routes without minting JWTs. - Local test bed only; never a production setting. -- The server runs 2 uvicorn workers with a Postgres broadcaster, mirroring a - realistic multi-worker deployment. The `GitPolicyFetcher` caches read by the - `/internal/git-fetcher-cache-stats` endpoint are per-process, so the harness - polls with generous timeouts to let the leader worker converge. + Local test bed only; never a production setting. (The `/internal` endpoint is + registered with the same `JWTAuthenticator` dependency as the other routes, so + it is protected when JWT verification is enabled and open only here.) +- The server runs a **single** uvicorn worker. The `GitPolicyFetcher` caches read + by `/internal/git-fetcher-cache-stats` are per-process, so a multi-worker stack + would make a round-robin read miss the worker that fetched and let a `== 0` + drain assertion pass falsely. One worker makes every cache read deterministic; + the leak/boot/offline bugs all reproduce single-worker. +- First-sync of a fresh scope takes the clone path, which fills only `repo_locks`; + `repos` / `repos_last_fetched` are filled by the discover/fetch path on a second + sync, so the load helpers issue a `refresh_all()` before asserting on `repos`. diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index c3c01bdbb..1660d596b 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -2,7 +2,7 @@ import shutil import pytest -from helpers import GiteaAdmin, OpalServerClient, compose +from helpers import GiteaAdmin, OpalServerClient, compose, list_seeded_repos def pytest_addoption(parser): @@ -36,8 +36,20 @@ def stack(request, repo_count): os.environ["REPO_COUNT"] = str(repo_count) # build + start infra; seed runs to completion then exits compose("up", "-d", "--build") - # block until seeding sidecar has finished creating repos + # block until seeding sidecar has finished creating repos. compose() raises + # (with output) if the seed container exited non-zero, so a hard seed + # failure surfaces here rather than as a confusing later test failure. compose("wait", "seed") + # Verify the seed actually produced all N repos before any test runs: a + # partial seed would otherwise look like a server bug when the load gate + # can't reach N. Fail loudly with the gap. + expected = set(list_seeded_repos(repo_count)) + present = set(GiteaAdmin().list_repos()) + missing = expected - present + assert not missing, ( + f"seed incomplete: {len(missing)}/{repo_count} repos missing " + f"(e.g. {sorted(missing)[:5]})" + ) client = OpalServerClient() client.wait_healthy() yield client @@ -50,11 +62,14 @@ def opal(stack) -> OpalServerClient: # The compose stack is session-scoped (one server for the whole run), but # scopes must not leak between tests: clone paths are keyed by repo URL, so # a scope left behind by one test shares a cache entry with any later test - # using the same seeded repo and would pollute its drain assertions. Delete - # whatever this test created on teardown to keep each test isolated. - stack._created_scopes.clear() + # using the same seeded repo and would pollute its drain assertions. + # + # Delete every scope the *server* currently knows (not just this client's + # tracked set) at setup, so a scope orphaned by a prior failed test can't + # contaminate this one; then again on teardown. + stack.delete_all_scopes() yield stack - stack.cleanup_created_scopes() + stack.delete_all_scopes() @pytest.fixture(scope="session") diff --git a/app-tests/git-leak/docker-compose.yml b/app-tests/git-leak/docker-compose.yml index 3704e5c83..bd48c26dc 100644 --- a/app-tests/git-leak/docker-compose.yml +++ b/app-tests/git-leak/docker-compose.yml @@ -51,15 +51,30 @@ services: condition: service_healthy user: git entrypoint: ["/bin/sh", "-c"] + # Tolerate only the idempotent "already exists" case; any other failure + # must abort so `seed` (which depends on this completing) doesn't run + # against a Gitea with no admin user and fail with a confusing 401. command: - > - gitea admin user create --username opaladmin --password opaladmin + out=$$(gitea admin user create --username opaladmin --password opaladmin --email admin@example.com --admin --must-change-password=false - --config /data/gitea/conf/app.ini || true + --config /data/gitea/conf/app.ini 2>&1); rc=$$?; + echo "$$out"; + if [ $$rc -ne 0 ] && ! echo "$$out" | grep -qi "already exist"; then + exit $$rc; + fi volumes: - gitea-data:/data restart: "no" + blackhole: + # Accepts the TCP handshake then never answers — a clone connects and + # blocks reading the git smart-HTTP response, holding the fetch executor. + # Deterministic, unlike a TEST-NET-1 address which many networks reject + # fast with ICMP-unreachable (so the clone would fail fast, not hang). + image: alpine/socat:latest + command: ["TCP-LISTEN:80,fork,reuseaddr", "SYSTEM:sleep 3600"] + seed: build: ./seed depends_on: @@ -82,7 +97,12 @@ services: dockerfile: docker/Dockerfile target: server environment: - UVICORN_NUM_WORKERS: "2" + # Single worker on purpose: the GitPolicyFetcher caches read by + # /internal/git-fetcher-cache-stats are per-process, so with >1 worker a + # round-robin read can miss the worker that fetched and make a `== 0` + # drain assertion pass falsely. One worker makes every cache read + # deterministic. The leak/boot/offline bugs all reproduce single-worker. + UVICORN_NUM_WORKERS: "1" OPAL_SCOPES: "1" OPAL_REDIS_URL: "redis://redis:6379" OPAL_BROADCAST_URI: "postgres://opal:opal@postgres:5432/opal" diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 4fefaf81d..3cb66a36a 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -14,9 +14,12 @@ GITEA_USER = "opaladmin" GITEA_PASSWORD = "opaladmin" -# TEST-NET-1 (RFC 5737): routable but runs no git server, so a clone hangs -# instead of failing fast — used to simulate an offline/unreachable repo. -UNREACHABLE_HOST = "192.0.2.1" +# the `blackhole` compose service (alpine/socat) accepts the TCP handshake then +# never answers, so a clone connects and blocks reading the response — a +# deterministic hang. Reachable from the opal_server container on the compose +# network. (A TEST-NET-1 address was rejected too fast on many networks, so the +# clone failed fast instead of hanging and the offline scenario wasn't exercised.) +UNREACHABLE_HOST = "blackhole" # the compose project lives next to this file; compose() runs from here _COMPOSE_DIR = str(Path(__file__).resolve().parent) @@ -47,16 +50,15 @@ def wait_healthy(self, timeout: int = 180) -> None: time.sleep(2) raise RuntimeError(f"opal-server not healthy in {timeout}s (last: {last})") - def stats(self, samples: int = 5, interval: float = 0.1) -> Dict[str, int]: - """Read the git-fetcher cache stats, merged across several reads. + def stats(self, samples: int = 3, interval: float = 0.1) -> Dict[str, int]: + """Read the git-fetcher cache stats, merged across a few reads. - The server runs multiple workers and the ``GitPolicyFetcher`` caches - are per-process, so a single read may land on a worker with empty - caches (e.g. a non-leader that never fetched). Sample a few times and - take the ``max`` per key, so the harness observes whichever worker - still holds the most entries — this prevents both false negatives - (missing a populated leader) and false positives (an ``== 0`` drain - assertion passing only because it hit an empty worker). + The stack runs a single uvicorn worker (see docker-compose.yml), so the + per-process ``GitPolicyFetcher`` caches are read deterministically — a + read can't miss the worker that fetched. Sampling a few times and taking + the ``max`` per key only smooths over a read that races an in-flight + mutation; it is not relied on to paper over multi-worker nondeterminism + (which the single-worker setup removes outright). """ merged: Dict[str, int] = {} for i in range(max(1, samples)): @@ -96,37 +98,75 @@ def delete_scope(self, scope_id: str) -> None: resp.raise_for_status() self._created_scopes.discard(scope_id) - def cleanup_created_scopes(self, drain_timeout: int = 15) -> None: - """Delete every scope created on this client, then best-effort wait for - the caches to drain — so the next test starts from a clean slate. - - Best-effort by design: on master, delete never purges the caches (the - leak this suite gates), so the drain wait will simply time out. A - teardown failure must never fail the test that just ran, hence the broad - excepts and the bounded wait. + def list_scope_ids(self) -> List[str]: + """All scope ids the server currently knows (GET /scopes).""" + resp = requests.get(f"{self.base_url}/scopes", timeout=30) + resp.raise_for_status() + return [s["scope_id"] for s in resp.json()] + + def hard_reset(self, timeout: int = 600) -> None: + """Recover the server from a saturated fetch executor by wiping state. + + When a test leaves many clones hung (the offline-repo test saturates the + executor on purpose), per-scope DELETEs would queue *behind* those hung + threads, and a plain restart would have ``preload_scopes`` re-clone the + offline scopes and saturate again. Instead: stop the server (killing the + hung threads), flush the Redis scope store so nothing is re-cloned, then + start clean. Used in that test's teardown so the session-scoped stack is + usable by every later test. """ - for scope_id in list(self._created_scopes): - try: - self.delete_scope(scope_id) - except Exception: - self._created_scopes.discard(scope_id) + compose("stop", "opal_server") + compose("exec", "-T", "redis", "redis-cli", "FLUSHALL") + compose("start", "opal_server") + self._created_scopes.clear() + self.wait_healthy(timeout=timeout) + + def delete_all_scopes(self, drain_timeout: int = 20) -> None: + """Delete every scope the *server* knows (not just this client's), then + best-effort wait for the caches to drain — a clean slate independent of + what any prior, possibly-failed, test left behind. + + Best-effort drain by design: on master, delete never purges the caches + (the leak this suite gates), so the wait simply times out. This runs in + fixture setup/teardown, so a failure here must not mask the test, hence + the broad excepts and bounded wait. + """ + try: + for scope_id in self.list_scope_ids(): + try: + self.delete_scope(scope_id) + except Exception: + self._created_scopes.discard(scope_id) + except Exception: + pass + self._created_scopes.clear() deadline = time.time() + drain_timeout while time.time() < deadline: try: - if self.stats(samples=1)["repos"] == 0: + if self.stats()["repo_locks"] == 0: return except Exception: return time.sleep(1) + def get_scope_policy(self, scope_id: str) -> requests.Response: + """Fetch a scope's policy bundle (GET /scopes/{id}/policy). + + A 200 here proves the scope's repo was cloned and is being served — + the signal that a healthy scope still works while another scope's + clone is hanging. + """ + return requests.get( + f"{self.base_url}/scopes/{scope_id}/policy", timeout=30 + ) + def refresh_all(self) -> None: - # Best-effort: POST /scopes/refresh publishes on the webhook topic so - # every leader re-syncs all scopes. If this OPAL build doesn't expose - # the route (404), treat it as a no-op — there is no client-side - # fallback; callers rely on their own stats polling either way. + # POST /scopes/refresh publishes on the webhook topic so the leader + # re-syncs all scopes. The second sync takes the discover/fetch path + # (not the first-sync clone path), which is what populates the `repos` + # and `repos_last_fetched` caches. A missing route is a real error and + # is surfaced via raise_for_status. resp = requests.post(f"{self.base_url}/scopes/refresh", timeout=30) - if resp.status_code == 404: - return resp.raise_for_status() @@ -203,32 +243,47 @@ def gitea_repo_url(name: str) -> str: def make_repo_unreachable(name: str) -> str: - """Return a git URL for ``name`` pointing at a routable-but-dead host. - - Simulates an offline/unreachable policy repo: the address is in - TEST-NET-1 (RFC 5737), which is routable but runs no git server, so a - clone hangs rather than failing fast — exercising the missing fetch - timeout on the scopes path (the bug PR3 fixes). The URL keeps the same - ``/{user}/{name}.git`` shape as a real Gitea repo so the scope looks - ordinary apart from the unreachable host. + """Return a git URL for ``name`` pointing at the ``blackhole`` sidecar. + + Simulates an offline/unreachable policy repo: ``blackhole`` (alpine/socat) + accepts the TCP handshake then never answers, so the clone connects and + blocks reading the git smart-HTTP response — a deterministic hang that + exercises the missing fetch timeout on the scopes path (the bug PR3 fixes). + The URL keeps the same ``/{user}/{name}.git`` shape as a real Gitea repo so + the scope looks ordinary apart from the unreachable host. """ return f"http://{UNREACHABLE_HOST}/{GITEA_USER}/{name}.git" def compose(*args: str) -> subprocess.CompletedProcess: - return subprocess.run( + """Run `docker compose `; on failure, surface the captured output. + + `capture_output=True` keeps compose noise out of passing tests, but a raw + CalledProcessError shows only the exit code — so on failure we re-raise + with the captured stdout/stderr embedded, otherwise a broken build/seed/ + restart is opaque to debug. + """ + proc = subprocess.run( ["docker", "compose", *args], cwd=_COMPOSE_DIR, capture_output=True, text=True, - check=True, ) + if proc.returncode != 0: + raise RuntimeError( + f"`docker compose {' '.join(args)}` failed (exit {proc.returncode})\n" + f"--- stdout ---\n{proc.stdout}\n--- stderr ---\n{proc.stderr}" + ) + return proc def bounce_postgres(down_seconds: int = 5) -> None: compose("stop", "postgres") time.sleep(down_seconds) - compose("start", "postgres") + # `up -d --wait` blocks until Postgres passes its healthcheck again (plain + # `compose start` has no --wait), so a recovery poll that follows isn't + # racing an unready broadcaster. --no-recreate keeps the same container. + compose("up", "-d", "--wait", "--no-recreate", "postgres") def list_seeded_repos(count: int) -> List[str]: diff --git a/app-tests/git-leak/seed/seed_gitea.py b/app-tests/git-leak/seed/seed_gitea.py index ddebe5667..0b8bf2e33 100644 --- a/app-tests/git-leak/seed/seed_gitea.py +++ b/app-tests/git-leak/seed/seed_gitea.py @@ -113,18 +113,36 @@ def main() -> int: token = _ensure_token(base_url, user, password) workdir = Path("/tmp/seed-work") + failures = [] for i in range(count): name = f"policy-repo-{i:04d}" - _ensure_repo(base_url, token, user, name) - # only push if the repo is empty (freshly created) - head = requests.get( - f"{base_url}/api/v1/repos/{user}/{name}/branches/main", - headers={"Authorization": f"token {token}"}, - timeout=10, + # Isolate per-repo failures: one bad push must not abort the loop and + # leave an indeterminate subset seeded. Collect failures and exit + # non-zero with a count so the harness sees a real seed error (and + # `docker compose wait seed` surfaces it) instead of a later, confusing + # load-gate timeout. + try: + _ensure_repo(base_url, token, user, name) + # only push if the repo is empty (freshly created) + head = requests.get( + f"{base_url}/api/v1/repos/{user}/{name}/branches/main", + headers={"Authorization": f"token {token}"}, + timeout=10, + ) + if head.status_code != 200: + _push_policy(base_url, token, user, name, workdir) + print(f"seeded {name}", flush=True) + except Exception as exc: # noqa: BLE001 - report, don't abort the loop + failures.append((name, repr(exc))) + print(f"FAILED {name}: {exc!r}", flush=True) + + if failures: + print( + f"ERROR: seeded {count - len(failures)}/{count} repos; " + f"{len(failures)} failed (e.g. {failures[:3]})", + flush=True, ) - if head.status_code != 200: - _push_policy(base_url, token, user, name, workdir) - print(f"seeded {name}", flush=True) + return 1 # write the token where the test harness can read it Path("/seed-output/token").write_text(token) diff --git a/app-tests/git-leak/test_boot.py b/app-tests/git-leak/test_boot.py index 928fd6f4e..ab9f1eb4b 100644 --- a/app-tests/git-leak/test_boot.py +++ b/app-tests/git-leak/test_boot.py @@ -17,14 +17,18 @@ def test_boot_loads_all_scopes(opal, repo_count): for i, name in enumerate(repos): opal.put_scope(f"boot-{i}", gitea_repo_url(name)) - # restart only the server so it re-runs sync_scopes on boot + # Start the clock at the restart, not after wait_healthy: preload_scopes() + # runs in gunicorn's `when_ready` (before workers accept traffic), so by the + # time /healthcheck answers, boot-sync may already be partly done — starting + # the clock later would undercount it. Measuring from the restart captures + # the whole boot-sync window. + start = time.time() compose("restart", "opal_server") opal.wait_healthy(timeout=600) - start = time.time() deadline = start + 2000 while time.time() < deadline: - if opal.stats()["repos"] >= n: + if opal.stats()["repo_locks"] >= n: break time.sleep(2) elapsed = time.time() - start @@ -32,5 +36,5 @@ def test_boot_loads_all_scopes(opal, repo_count): # PR1 records the baseline (loose). PR4 will set BOOT_TARGET_SECONDS low. BOOT_TARGET_SECONDS = int(os.environ.get("BOOT_TARGET_SECONDS", "2000")) print(f"boot loaded {n} scopes in {elapsed:.1f}s (target {BOOT_TARGET_SECONDS}s)") - assert opal.stats()["repos"] >= n, "not all scopes loaded after boot" + assert opal.stats()["repo_locks"] >= n, "not all scopes loaded after boot" assert elapsed < BOOT_TARGET_SECONDS, f"boot too slow: {elapsed:.1f}s" diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 8cfee6363..11b6da82d 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -13,52 +13,101 @@ def _wait_until(predicate, timeout=30, interval=0.5): return False +def _load_scopes(opal, prefix, names): + """PUT a scope per repo, then force a second sync so all three caches fill. + + The first sync of a fresh scope takes the clone path, which populates only + ``repo_locks`` — ``repos`` and ``repos_last_fetched`` are filled solely by + the discover/fetch path on a *subsequent* sync. ``refresh_all()`` triggers + that second sync, so after this returns all three caches reflect the N + scopes and the drain assertions test a cache the sync path actually fills. + + Returns the per-key load count reached (max over a wait). + """ + n = len(names) + for i, name in enumerate(names): + opal.put_scope(f"{prefix}-{i}", gitea_repo_url(name)) + # repo_locks is populated on the first sync (the clone path), so it is the + # deterministic signal that every scope was at least picked up. + locked = _wait_until(lambda: opal.stats()["repo_locks"] >= n, timeout=600) + assert locked, f"initial load never locked {n} repos: {opal.stats()}" + # force the discover/fetch path so `repos` / `repos_last_fetched` fill too + opal.refresh_all() + fetched = _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) + assert fetched, f"refresh never populated {n} repos: {opal.stats()}" + return opal.stats() + + @pytest.mark.timeout(900) def test_churn_releases_caches(opal, repo_count): """Create then delete many scopes; the three caches must return to empty. - FAILS on master: delete_scope never purges GitPolicyFetcher caches. + FAILS on this branch (without PR2): delete_scope never purges the + GitPolicyFetcher caches, so they stay populated after every scope is gone. + Becomes green once PR2 lands. """ n = min(repo_count, 100) repos = list_seeded_repos(n) - for i, name in enumerate(repos): - opal.put_scope(f"churn-{i}", gitea_repo_url(name)) - loaded = _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) - assert loaded, f"initial load never reached {n} repos: {opal.stats()}" + loaded = _load_scopes(opal, "churn", repos) + assert loaded["repo_locks"] >= n and loaded["repos"] >= n, loaded + rss_loaded = loaded["rss_kb"] for i in range(n): opal.delete_scope(f"churn-{i}") - released = _wait_until(lambda: opal.stats()["repos"] == 0, timeout=30) + # all three caches must drain to empty once every scope is deleted + released = _wait_until( + lambda: opal.stats()["repo_locks"] == 0 + and opal.stats()["repos"] == 0 + and opal.stats()["repos_last_fetched"] == 0, + timeout=60, + ) stats = opal.stats() - assert released, f"repos cache did not drain: {stats}" - assert stats["repos_last_fetched"] == 0, stats + assert released, f"caches did not drain after deleting all scopes: {stats}" + + # The cache drain above is the gate. RSS is only a loose backstop here: + # freeing the caches need not return memory to the OS (glibc/Python keep + # arenas), so this guards against a *gross* leak — RSS ballooning well past + # the loaded peak — without false-failing on allocator slack once PR2 lands. + rss_budget = rss_loaded + max(100_000, rss_loaded // 2) + assert stats["rss_kb"] <= rss_budget, ( + f"RSS ballooned across churn: {rss_loaded} -> {stats['rss_kb']} kb " + f"(budget {rss_budget})" + ) @pytest.mark.timeout(900) def test_repeat_sync_does_not_grow(opal, repo_count): - """Re-syncing the *same* scopes must not grow the caches. - - Idempotency guard, not a known-broken case. On current master this - PASSES: a clone path is keyed by the repo URL (``source_id`` = - sha256(url)+branch-shard), so re-syncing identical scopes reuses the - existing ``repos`` / ``repos_last_fetched`` entries instead of - allocating new ones. It guards against a regression that would make - repeat sync allocate per-sync. The unbounded *growth + no purge on - delete* leak is covered by ``test_churn_releases_caches`` above, which - uses distinct scopes and DOES fail on master. + """Re-syncing the *same* scopes must not grow memory unboundedly. + + Idempotency guard, not a known-broken case — PASSES on this branch. Clone + paths are keyed by the repo URL (``source_id`` = sha256(url)+branch-shard), + so re-syncing identical scopes reuses the existing ``repos`` / + ``repos_last_fetched`` entries rather than allocating new ones; the cache + *counts* therefore can't grow for any implementation, which is why the + load-bearing assertion here is on **RSS**, not on ``len(repos)``. It guards + against a regression where each repeat sync leaks per-sync allocations. The + unbounded growth-then-no-purge-on-delete leak is covered by + ``test_churn_releases_caches`` above, which uses distinct scopes. """ n = min(repo_count, 50) repos = list_seeded_repos(n) - for i, name in enumerate(repos): - opal.put_scope(f"stable-{i}", gitea_repo_url(name)) - loaded = _wait_until(lambda: opal.stats()["repos"] >= n, timeout=600) - assert loaded, f"initial sync never reached {n} repos: {opal.stats()}" + loaded = _load_scopes(opal, "stable", repos) + baseline_repos = loaded["repos"] + baseline_rss = loaded["rss_kb"] - baseline = opal.stats()["repos"] for _ in range(10): opal.refresh_all() time.sleep(2) - grown = opal.stats()["repos"] - assert grown <= baseline, f"repos cache grew on repeat sync: {baseline} -> {grown}" + grown = opal.stats() + assert grown["repos"] <= baseline_repos, ( + f"repos cache count grew on repeat sync: {baseline_repos} -> {grown['repos']}" + ) + # allow generous headroom for allocator slack; fail only on a real per-sync + # leak (10 refreshes of N scopes ballooning RSS). + rss_budget = baseline_rss + max(50_000, baseline_rss // 5) + assert grown["rss_kb"] <= rss_budget, ( + f"RSS grew unboundedly on repeat sync: " + f"{baseline_rss} -> {grown['rss_kb']} kb (budget {rss_budget})" + ) diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index b8fb73c89..11a25f90e 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -9,48 +9,83 @@ ) -@pytest.mark.timeout(300) +# Enough hanging clones to exhaust opal's default fetch executor +# (run_sync -> run_in_executor(None, ...), a ThreadPoolExecutor of +# min(32, cpu+4) workers). One hang wouldn't starve a multi-thread pool, so we +# saturate it with many; after PR3's fetch timeout these give up and free their +# threads, letting the healthy scope through. +OFFLINE_REPOS = 40 + + +@pytest.mark.timeout(420) def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): - """An unreachable repo must not stop healthy scopes from serving. + """Unreachable repos must not stop a healthy scope from serving. - FAILS on master: the scopes path has no fetch timeout, so a hung - clone occupies the shared executor and the server stalls. + FAILS on this branch (without PR3): the scopes path has no fetch timeout, + so the hung clones of the offline repos occupy the shared fetch executor + and the healthy scope's bundle never becomes available. """ - # a routable-but-dead address (TEST-NET-1, RFC 5737): the clone hangs - opal.put_scope("offline", make_repo_unreachable("dead-repo"), branch="main") + # the `blackhole` sidecar accepts the TCP handshake then never answers, so + # each of these clones hangs (holding a fetch-executor thread) rather than + # failing fast; enough of them saturate the pool. + for i in range(OFFLINE_REPOS): + opal.put_scope(f"offline-{i}", make_repo_unreachable(f"dead-{i}"), branch="main") healthy = list_seeded_repos(1)[0] opal.put_scope("healthy", gitea_repo_url(healthy)) - # the healthy scope's repo must appear in the cache within a bounded time, - # even though the offline scope is hanging - deadline = time.time() + 60 - served = False - while time.time() < deadline: - if opal.stats()["repos"] >= 1: - served = True - break - time.sleep(2) - assert served, "healthy scope never loaded while an offline repo was hanging" + try: + # The healthy scope must become *servable* within a bounded time even + # while the offline scopes hang. A 200 from its policy bundle proves the + # clone completed and the scope is served — a stronger signal than a + # cache count, and exactly what the offline hang starves on master. + deadline = time.time() + 90 + served = False + last = None + while time.time() < deadline: + try: + resp = opal.get_scope_policy("healthy") + last = resp.status_code + if resp.status_code == 200: + served = True + break + except Exception as exc: # request itself may stall when starved + last = repr(exc) + time.sleep(2) + assert served, ( + f"healthy scope never served while {OFFLINE_REPOS} offline repos " + f"were hanging (last policy response: {last})" + ) + finally: + # The offline clones hang for the blackhole's full duration, occupying + # executor threads. On the session-scoped stack that would starve every + # later test, and per-scope DELETEs would queue behind the hung threads. + # hard_reset stops the server (killing the hung threads), flushes the + # Redis scope store so preload doesn't re-clone them, and restarts clean. + opal.hard_reset() @pytest.mark.timeout(300) -def test_server_recovers_after_postgres_bounce(opal): - """A transient Postgres (broadcaster) outage must not leave the server - down. - - Recovery guard, not a known-broken case. On current master this - PASSES: when the broadcast channel drops, the affected worker - triggers a graceful shutdown and gunicorn respawns it, while the - sibling worker keeps serving HTTP — so the surface recovers within - the window. It guards against a regression of that property - (PER-15065's in-process reconnect would make recovery cleaner by - avoiding the worker churn, but recovery already holds). +def test_server_recovers_after_postgres_bounce(opal, repo_count): + """A transient Postgres (broadcaster) outage must not break propagation. + + Recovery guard, not a known-broken case — PASSES on this branch: when the + broadcast channel drops, the affected worker triggers a graceful shutdown + and gunicorn respawns it, and once Postgres is back the broadcaster + reconnects. We prove recovery of the *broadcast path*, not just HTTP + liveness: after the bounce we PUT a fresh scope and assert it actually + syncs (its repo lands in the cache), which only happens if the leader + received the sync notification over the recovered broadcaster. + (PER-15065's in-process reconnect would make recovery cleaner by avoiding + the worker churn, but recovery already holds.) """ assert opal.stats() # healthy before + baseline_locks = opal.stats()["repo_locks"] + bounce_postgres(down_seconds=5) - deadline = time.time() + 60 + # wait for the HTTP surface to come back first + deadline = time.time() + 90 recovered = False while time.time() < deadline: try: @@ -60,4 +95,16 @@ def test_server_recovers_after_postgres_bounce(opal): break except Exception: time.sleep(2) - assert recovered, "server did not recover within 60s of a postgres bounce" + assert recovered, "server did not recover HTTP within 90s of a postgres bounce" + + # prove the broadcast path itself recovered: a freshly PUT scope must sync + healthy = list_seeded_repos(1)[0] + opal.put_scope("post-bounce", gitea_repo_url(healthy)) + synced = False + deadline = time.time() + 120 + while time.time() < deadline: + if opal.stats()["repo_locks"] > baseline_locks: + synced = True + break + time.sleep(2) + assert synced, "scope PUT after the bounce never synced; broadcaster did not recover" diff --git a/packages/opal-server/opal_server/debug_stats.py b/packages/opal-server/opal_server/debug_stats.py index 20ea6b341..af77ff4ea 100644 --- a/packages/opal-server/opal_server/debug_stats.py +++ b/packages/opal-server/opal_server/debug_stats.py @@ -4,9 +4,9 @@ observe the cache growth that the memory-leak fix (PR2) eliminates. """ from pathlib import Path -from typing import Dict +from typing import Dict, List, Optional -from fastapi import FastAPI +from fastapi import FastAPI, params from opal_server.git_fetcher import GitPolicyFetcher @@ -31,11 +31,26 @@ def git_fetcher_cache_stats() -> Dict[str, int]: } -def register_internal_stats_route(app: FastAPI, enabled: bool) -> None: - """Mount GET /internal/git-fetcher-cache-stats only when enabled.""" +def register_internal_stats_route( + app: FastAPI, + enabled: bool, + dependencies: Optional[List[params.Depends]] = None, +) -> None: + """Mount GET /internal/git-fetcher-cache-stats only when enabled. + + ``dependencies`` are applied to the route (e.g. the server's + ``JWTAuthenticator``) so the endpoint is protected when JWT verification + is enabled. When verification is disabled — as in the test bed, which + leaves ``OPAL_AUTH_PUBLIC_KEY`` unset — the authenticator is a no-op and + the route stays reachable without a token. + """ if not enabled: return - @app.get("/internal/git-fetcher-cache-stats", include_in_schema=False) + @app.get( + "/internal/git-fetcher-cache-stats", + include_in_schema=False, + dependencies=dependencies or [], + ) def _git_fetcher_cache_stats() -> Dict[str, int]: return git_fetcher_cache_stats() diff --git a/packages/opal-server/opal_server/server.py b/packages/opal-server/opal_server/server.py index 1be796d6d..9702e372f 100644 --- a/packages/opal-server/opal_server/server.py +++ b/packages/opal-server/opal_server/server.py @@ -295,7 +295,9 @@ def healthcheck(): return {"status": "ok"} register_internal_stats_route( - app, enabled=opal_server_config.DEBUG_INTERNAL_STATS + app, + enabled=opal_server_config.DEBUG_INTERNAL_STATS, + dependencies=[Depends(authenticator)], ) return app diff --git a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py index 00a9c8e1b..3e36cd330 100644 --- a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py +++ b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI +from fastapi import Depends, FastAPI, HTTPException from fastapi.testclient import TestClient from opal_server.debug_stats import register_internal_stats_route @@ -20,3 +20,20 @@ def test_endpoint_present_when_enabled(): assert resp.status_code == 200 body = resp.json() assert set(body) == {"repo_locks", "repos", "repos_last_fetched", "rss_kb"} + + +def test_endpoint_applies_passed_dependencies(): + """A route dependency (e.g. the server's authenticator) is enforced. + + Mirrors how server.py wires the real JWTAuthenticator: when verification is + enabled the dependency rejects unauthenticated reads; when disabled it is a + no-op (covered by the test above, which passes no dependency). + """ + + def _deny(): + raise HTTPException(status_code=401, detail="unauthorized") + + app = FastAPI() + register_internal_stats_route(app, enabled=True, dependencies=[Depends(_deny)]) + resp = TestClient(app).get("/internal/git-fetcher-cache-stats") + assert resp.status_code == 401 From f810db8faf7f8ab953f179c956cd51f17852ff47 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 16:15:05 +0300 Subject: [PATCH 13/26] style(git-leak): apply black/isort/docformatter (pre-commit) Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/helpers.py | 18 ++++++++---------- app-tests/git-leak/test_leak.py | 10 +++++----- app-tests/git-leak/test_resilience.py | 16 ++++++++++------ .../tests/debug_stats_endpoint_test.py | 7 ++++--- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 3cb66a36a..3dc549421 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -152,13 +152,11 @@ def delete_all_scopes(self, drain_timeout: int = 20) -> None: def get_scope_policy(self, scope_id: str) -> requests.Response: """Fetch a scope's policy bundle (GET /scopes/{id}/policy). - A 200 here proves the scope's repo was cloned and is being served — - the signal that a healthy scope still works while another scope's - clone is hanging. + A 200 here proves the scope's repo was cloned and is being + served — the signal that a healthy scope still works while + another scope's clone is hanging. """ - return requests.get( - f"{self.base_url}/scopes/{scope_id}/policy", timeout=30 - ) + return requests.get(f"{self.base_url}/scopes/{scope_id}/policy", timeout=30) def refresh_all(self) -> None: # POST /scopes/refresh publishes on the webhook topic so the leader @@ -258,10 +256,10 @@ def make_repo_unreachable(name: str) -> str: def compose(*args: str) -> subprocess.CompletedProcess: """Run `docker compose `; on failure, surface the captured output. - `capture_output=True` keeps compose noise out of passing tests, but a raw - CalledProcessError shows only the exit code — so on failure we re-raise - with the captured stdout/stderr embedded, otherwise a broken build/seed/ - restart is opaque to debug. + `capture_output=True` keeps compose noise out of passing tests, but + a raw CalledProcessError shows only the exit code — so on failure we + re-raise with the captured stdout/stderr embedded, otherwise a + broken build/seed/ restart is opaque to debug. """ proc = subprocess.run( ["docker", "compose", *args], diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 11b6da82d..7ded9eded 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -43,8 +43,8 @@ def test_churn_releases_caches(opal, repo_count): """Create then delete many scopes; the three caches must return to empty. FAILS on this branch (without PR2): delete_scope never purges the - GitPolicyFetcher caches, so they stay populated after every scope is gone. - Becomes green once PR2 lands. + GitPolicyFetcher caches, so they stay populated after every scope is + gone. Becomes green once PR2 lands. """ n = min(repo_count, 100) repos = list_seeded_repos(n) @@ -101,9 +101,9 @@ def test_repeat_sync_does_not_grow(opal, repo_count): time.sleep(2) grown = opal.stats() - assert grown["repos"] <= baseline_repos, ( - f"repos cache count grew on repeat sync: {baseline_repos} -> {grown['repos']}" - ) + assert ( + grown["repos"] <= baseline_repos + ), f"repos cache count grew on repeat sync: {baseline_repos} -> {grown['repos']}" # allow generous headroom for allocator slack; fail only on a real per-sync # leak (10 refreshes of N scopes ballooning RSS). rss_budget = baseline_rss + max(50_000, baseline_rss // 5) diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 11a25f90e..76515dbe0 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -8,7 +8,6 @@ make_repo_unreachable, ) - # Enough hanging clones to exhaust opal's default fetch executor # (run_sync -> run_in_executor(None, ...), a ThreadPoolExecutor of # min(32, cpu+4) workers). One hang wouldn't starve a multi-thread pool, so we @@ -21,15 +20,18 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): """Unreachable repos must not stop a healthy scope from serving. - FAILS on this branch (without PR3): the scopes path has no fetch timeout, - so the hung clones of the offline repos occupy the shared fetch executor - and the healthy scope's bundle never becomes available. + FAILS on this branch (without PR3): the scopes path has no fetch + timeout, so the hung clones of the offline repos occupy the shared + fetch executor and the healthy scope's bundle never becomes + available. """ # the `blackhole` sidecar accepts the TCP handshake then never answers, so # each of these clones hangs (holding a fetch-executor thread) rather than # failing fast; enough of them saturate the pool. for i in range(OFFLINE_REPOS): - opal.put_scope(f"offline-{i}", make_repo_unreachable(f"dead-{i}"), branch="main") + opal.put_scope( + f"offline-{i}", make_repo_unreachable(f"dead-{i}"), branch="main" + ) healthy = list_seeded_repos(1)[0] opal.put_scope("healthy", gitea_repo_url(healthy)) @@ -107,4 +109,6 @@ def test_server_recovers_after_postgres_bounce(opal, repo_count): synced = True break time.sleep(2) - assert synced, "scope PUT after the bounce never synced; broadcaster did not recover" + assert ( + synced + ), "scope PUT after the bounce never synced; broadcaster did not recover" diff --git a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py index 3e36cd330..4abd1abb1 100644 --- a/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py +++ b/packages/opal-server/opal_server/tests/debug_stats_endpoint_test.py @@ -25,9 +25,10 @@ def test_endpoint_present_when_enabled(): def test_endpoint_applies_passed_dependencies(): """A route dependency (e.g. the server's authenticator) is enforced. - Mirrors how server.py wires the real JWTAuthenticator: when verification is - enabled the dependency rejects unauthenticated reads; when disabled it is a - no-op (covered by the test above, which passes no dependency). + Mirrors how server.py wires the real JWTAuthenticator: when + verification is enabled the dependency rejects unauthenticated + reads; when disabled it is a no-op (covered by the test above, which + passes no dependency). """ def _deny(): From 82ff33a7300adfe857c115d64ccea5ec135ee2be Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 19:57:23 +0300 Subject: [PATCH 14/26] test(git-leak): tighten stat polling and pin test-bed images (PR review) - snapshot a single /internal stats read per poll in the churn-drain and boot gates (consistent multi-key observation; fewer HTTP round-trips) - document why a 200 from the healthy scope can't be a masked default bundle, and why the stats route is intentionally a sync def - pin alpine/socat and the seed image's pip deps for reproducibility Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/docker-compose.yml | 2 +- app-tests/git-leak/seed/Dockerfile | 2 +- app-tests/git-leak/test_boot.py | 6 ++++-- app-tests/git-leak/test_leak.py | 15 ++++++++------- app-tests/git-leak/test_resilience.py | 5 +++++ packages/opal-server/opal_server/debug_stats.py | 5 +++++ 6 files changed, 24 insertions(+), 11 deletions(-) diff --git a/app-tests/git-leak/docker-compose.yml b/app-tests/git-leak/docker-compose.yml index bd48c26dc..a863faed4 100644 --- a/app-tests/git-leak/docker-compose.yml +++ b/app-tests/git-leak/docker-compose.yml @@ -72,7 +72,7 @@ services: # blocks reading the git smart-HTTP response, holding the fetch executor. # Deterministic, unlike a TEST-NET-1 address which many networks reject # fast with ICMP-unreachable (so the clone would fail fast, not hang). - image: alpine/socat:latest + image: alpine/socat:1.8.0.3 command: ["TCP-LISTEN:80,fork,reuseaddr", "SYSTEM:sleep 3600"] seed: diff --git a/app-tests/git-leak/seed/Dockerfile b/app-tests/git-leak/seed/Dockerfile index ee9d776d2..540118cd2 100644 --- a/app-tests/git-leak/seed/Dockerfile +++ b/app-tests/git-leak/seed/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.11-slim RUN apt-get update && apt-get install -y --no-install-recommends git \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir requests GitPython +RUN pip install --no-cache-dir requests==2.32.3 GitPython==3.1.50 COPY seed_gitea.py /seed_gitea.py ENTRYPOINT ["python", "/seed_gitea.py"] diff --git a/app-tests/git-leak/test_boot.py b/app-tests/git-leak/test_boot.py index ab9f1eb4b..b28637ec1 100644 --- a/app-tests/git-leak/test_boot.py +++ b/app-tests/git-leak/test_boot.py @@ -27,8 +27,10 @@ def test_boot_loads_all_scopes(opal, repo_count): opal.wait_healthy(timeout=600) deadline = start + 2000 + loaded = 0 while time.time() < deadline: - if opal.stats()["repo_locks"] >= n: + loaded = opal.stats()["repo_locks"] + if loaded >= n: break time.sleep(2) elapsed = time.time() - start @@ -36,5 +38,5 @@ def test_boot_loads_all_scopes(opal, repo_count): # PR1 records the baseline (loose). PR4 will set BOOT_TARGET_SECONDS low. BOOT_TARGET_SECONDS = int(os.environ.get("BOOT_TARGET_SECONDS", "2000")) print(f"boot loaded {n} scopes in {elapsed:.1f}s (target {BOOT_TARGET_SECONDS}s)") - assert opal.stats()["repo_locks"] >= n, "not all scopes loaded after boot" + assert loaded >= n, "not all scopes loaded after boot" assert elapsed < BOOT_TARGET_SECONDS, f"boot too slow: {elapsed:.1f}s" diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 7ded9eded..8ef2258f7 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -55,13 +55,14 @@ def test_churn_releases_caches(opal, repo_count): for i in range(n): opal.delete_scope(f"churn-{i}") - # all three caches must drain to empty once every scope is deleted - released = _wait_until( - lambda: opal.stats()["repo_locks"] == 0 - and opal.stats()["repos"] == 0 - and opal.stats()["repos_last_fetched"] == 0, - timeout=60, - ) + # all three caches must drain to empty once every scope is deleted. Read a + # single stats snapshot per poll so the three keys reflect the same + # observation (and to avoid 3x the HTTP round-trips per iteration). + def _all_caches_empty() -> bool: + s = opal.stats() + return s["repo_locks"] == 0 and s["repos"] == 0 and s["repos_last_fetched"] == 0 + + released = _wait_until(_all_caches_empty, timeout=60) stats = opal.stats() assert released, f"caches did not drain after deleting all scopes: {stats}" diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 76515dbe0..b18829e12 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -41,6 +41,11 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): # while the offline scopes hang. A 200 from its policy bundle proves the # clone completed and the scope is served — a stronger signal than a # cache count, and exactly what the offline hang starves on master. + # + # A 200 here can't be a *masked* default bundle: GET /{scope}/policy + # falls back to the "default" scope on a bad/missing repo, but this bed + # never creates a "default" scope, so that fallback raises instead of + # returning 200. The only way to get 200 is the healthy clone completing. deadline = time.time() + 90 served = False last = None diff --git a/packages/opal-server/opal_server/debug_stats.py b/packages/opal-server/opal_server/debug_stats.py index af77ff4ea..5439ce950 100644 --- a/packages/opal-server/opal_server/debug_stats.py +++ b/packages/opal-server/opal_server/debug_stats.py @@ -47,6 +47,11 @@ def register_internal_stats_route( if not enabled: return + # Deliberately a sync def: Starlette runs it in its own threadpool, which is + # independent of the default loop executor opal uses for git fetches + # (run_sync -> run_in_executor(None, ...)). So this endpoint keeps answering + # even when hung clones saturate the fetch executor — which is exactly the + # condition the offline-repo test observes through it. @app.get( "/internal/git-fetcher-cache-stats", include_in_schema=False, From c043e88a10e2d1cb0180e2fbcc2a20c2bb97436c Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 20:09:56 +0300 Subject: [PATCH 15/26] fix(opal-server): normalize asyncio.TimeoutError to builtin + format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Python < 3.11 asyncio.TimeoutError is a distinct class from the builtin TimeoutError, so run_in_git_executor's wait_for timeout was not caught by `pytest.raises(TimeoutError)` — failing build (3.9)/(3.10). Normalize to the builtin TimeoutError so the documented contract holds on every supported Python, and update the _clone catch site to match. Also apply black/isort/docformatter formatting to satisfy pre-commit. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opal-server/opal_server/git_fetcher.py | 15 ++++++++++----- .../opal_server/tests/fetch_timeout_test.py | 4 ++-- .../opal_server/tests/git_executor_test.py | 1 - 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index c8dee329a..c549e04dd 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -34,15 +34,14 @@ reference_is_valid_name, ) - _git_executor: Optional[ThreadPoolExecutor] = None def _get_git_executor() -> ThreadPoolExecutor: """Lazily build the dedicated pool for scope git operations. - Isolated from the default executor so a hung clone/fetch can never starve - bundle serving or other server work. + Isolated from the default executor so a hung clone/fetch can never + starve bundle serving or other server work. """ global _git_executor if _git_executor is None: @@ -65,7 +64,13 @@ async def run_in_git_executor(func, *args, timeout: float, **kwargs): loop = asyncio.get_event_loop() fut = loop.run_in_executor(_get_git_executor(), partial(func, *args, **kwargs)) if timeout and timeout > 0: - return await asyncio.wait_for(fut, timeout=timeout) + try: + return await asyncio.wait_for(fut, timeout=timeout) + except asyncio.TimeoutError as exc: + # On Python < 3.11 ``asyncio.TimeoutError`` is a distinct class from + # the builtin ``TimeoutError``; normalize so callers (and the + # documented contract) can rely on the builtin everywhere. + raise TimeoutError() from exc return await fut @@ -265,7 +270,7 @@ async def _clone(self): callbacks=self._auth_callbacks, timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, ) - except (pygit2.GitError, asyncio.TimeoutError) as exc: + except (pygit2.GitError, TimeoutError) as exc: logger.error( "Could not clone repo at {url}: {err}", url=self._source.url, diff --git a/packages/opal-server/opal_server/tests/fetch_timeout_test.py b/packages/opal-server/opal_server/tests/fetch_timeout_test.py index 58376ff44..b658cf45a 100644 --- a/packages/opal-server/opal_server/tests/fetch_timeout_test.py +++ b/packages/opal-server/opal_server/tests/fetch_timeout_test.py @@ -1,14 +1,14 @@ import time import pytest - from opal_server.config import opal_server_config from opal_server.git_fetcher import run_in_git_executor @pytest.mark.asyncio async def test_hanging_git_op_raises_timeout(monkeypatch): - """A clone/fetch that hangs must surface TimeoutError, not block forever.""" + """A clone/fetch that hangs must surface TimeoutError, not block + forever.""" monkeypatch.setattr(opal_server_config, "SCOPES_GIT_FETCH_TIMEOUT", 0.2) def _hang(): diff --git a/packages/opal-server/opal_server/tests/git_executor_test.py b/packages/opal-server/opal_server/tests/git_executor_test.py index 9a36c624f..8dfaf93c7 100644 --- a/packages/opal-server/opal_server/tests/git_executor_test.py +++ b/packages/opal-server/opal_server/tests/git_executor_test.py @@ -1,7 +1,6 @@ import time import pytest - from opal_server.config import OpalServerConfig from opal_server.git_fetcher import run_in_git_executor From 5dd53a2b290de7636531f4d996d75a3d9d354db9 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 20:13:14 +0300 Subject: [PATCH 16/26] fix(opal-server): address Copilot review on git resilience - run_in_git_executor: use asyncio.get_running_loop() instead of the deprecated get_event_loop() inside an async function - fetch_and_notify_on_changes: set repos_last_fetched only after a successful fetch so a timeout/error does not wrongly suppress a later force_fetch via _was_fetched_after - fetch_timeout_test: measure elapsed time with time.monotonic() Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/opal-server/opal_server/git_fetcher.py | 12 ++++++++---- .../opal_server/tests/fetch_timeout_test.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index c549e04dd..3202ced81 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -61,7 +61,7 @@ async def run_in_git_executor(func, *args, timeout: float, **kwargs): pygit2 call keeps running on its pool thread until the OS network timeout; the dedicated pool keeps that lingering thread isolated. """ - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() fut = loop.run_in_executor(_get_git_executor(), partial(func, *args, **kwargs)) if timeout and timeout > 0: try: @@ -227,14 +227,18 @@ async def fetch_and_notify_on_changes( logger.debug( f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({self._source.url})" ) - GitPolicyFetcher.repos_last_fetched[ - self._source_id - ] = datetime.datetime.now() await run_in_git_executor( repo.remotes[self._remote].fetch, callbacks=self._auth_callbacks, timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, ) + # Only mark as fetched after the fetch actually + # succeeds. On timeout/error this stays stale so a + # later force_fetch is not wrongly suppressed by + # _was_fetched_after. + GitPolicyFetcher.repos_last_fetched[ + self._source_id + ] = datetime.datetime.now() logger.debug(f"Fetch completed: {self._source.url}") # New commits might be present because of a previous fetch made by another scope diff --git a/packages/opal-server/opal_server/tests/fetch_timeout_test.py b/packages/opal-server/opal_server/tests/fetch_timeout_test.py index b658cf45a..f2c4b8f9d 100644 --- a/packages/opal-server/opal_server/tests/fetch_timeout_test.py +++ b/packages/opal-server/opal_server/tests/fetch_timeout_test.py @@ -14,9 +14,9 @@ async def test_hanging_git_op_raises_timeout(monkeypatch): def _hang(): time.sleep(5) - start = time.time() + start = time.monotonic() with pytest.raises(TimeoutError): await run_in_git_executor( _hang, timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT ) - assert time.time() - start < 2, "wait_for did not unblock promptly" + assert time.monotonic() - start < 2, "wait_for did not unblock promptly" From 382cd8364b1f9d996e9ddefdc4424f84cc629c1d Mon Sep 17 00:00:00 2001 From: David Shoen Date: Wed, 24 Jun 2026 20:21:44 +0300 Subject: [PATCH 17/26] fix(opal-server): clean fetch-timeout logging, trim test thread sleeps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fetch path let TimeoutError propagate to sync_scope's catch-all, which logged a full traceback at ERROR level for the expected unreachable-repo case — inconsistent with the clone path's quiet logger.error. Catch TimeoutError at the fetch site and log without a traceback, then skip (repos_last_fetched stays stale so the next cycle retries). Also shorten the hanging-thread sleeps in the timeout tests so the lingering pool thread doesn't delay process teardown. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 26 +++++++++++++------ .../opal_server/tests/fetch_timeout_test.py | 4 ++- .../opal_server/tests/git_executor_test.py | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index 3202ced81..173e506b2 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -227,15 +227,25 @@ async def fetch_and_notify_on_changes( logger.debug( f"Fetching remote (force_fetch={force_fetch}): {self._remote} ({self._source.url})" ) - await run_in_git_executor( - repo.remotes[self._remote].fetch, - callbacks=self._auth_callbacks, - timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, - ) + try: + await run_in_git_executor( + repo.remotes[self._remote].fetch, + callbacks=self._auth_callbacks, + timeout=opal_server_config.SCOPES_GIT_FETCH_TIMEOUT, + ) + except TimeoutError as exc: + # Expected when a repo is unreachable: log cleanly + # (no traceback) and skip, matching the clone path. + # repos_last_fetched stays stale so the next cycle + # retries and force_fetch is not wrongly suppressed. + logger.error( + "Timed out fetching {url}, skipping: {err}", + url=self._source.url, + err=repr(exc), + ) + return # Only mark as fetched after the fetch actually - # succeeds. On timeout/error this stays stale so a - # later force_fetch is not wrongly suppressed by - # _was_fetched_after. + # succeeds. GitPolicyFetcher.repos_last_fetched[ self._source_id ] = datetime.datetime.now() diff --git a/packages/opal-server/opal_server/tests/fetch_timeout_test.py b/packages/opal-server/opal_server/tests/fetch_timeout_test.py index f2c4b8f9d..b8bde71e1 100644 --- a/packages/opal-server/opal_server/tests/fetch_timeout_test.py +++ b/packages/opal-server/opal_server/tests/fetch_timeout_test.py @@ -12,7 +12,9 @@ async def test_hanging_git_op_raises_timeout(monkeypatch): monkeypatch.setattr(opal_server_config, "SCOPES_GIT_FETCH_TIMEOUT", 0.2) def _hang(): - time.sleep(5) + # Short enough that the lingering pool thread doesn't delay teardown, + # but well above the 0.2s timeout under test. + time.sleep(1) start = time.monotonic() with pytest.raises(TimeoutError): diff --git a/packages/opal-server/opal_server/tests/git_executor_test.py b/packages/opal-server/opal_server/tests/git_executor_test.py index 8dfaf93c7..e1c534c80 100644 --- a/packages/opal-server/opal_server/tests/git_executor_test.py +++ b/packages/opal-server/opal_server/tests/git_executor_test.py @@ -20,7 +20,7 @@ async def test_run_in_git_executor_returns_value(): @pytest.mark.asyncio async def test_run_in_git_executor_times_out(): with pytest.raises(TimeoutError): - await run_in_git_executor(lambda: time.sleep(2), timeout=0.1) + await run_in_git_executor(lambda: time.sleep(1), timeout=0.1) @pytest.mark.asyncio From 75ad43a4a90ac23843d0e0e68f2775d3d717584c Mon Sep 17 00:00:00 2001 From: David Shoen Date: Sun, 28 Jun 2026 18:45:58 +0300 Subject: [PATCH 18/26] test(git-leak): isolate offline-hang healthy probe to a never-cloned repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ziv (PR review, round 2) caught that the offline-resilience gate can false-PASS in the full-suite run. The "healthy" scope pointed at policy-repo-0000 (list_seeded_repos(1)[0]), but on-disk clones are keyed by URL-hash and survive compose restart/stop/start (opal_server mounts no volume at /opal; only `down -v` wipes them). test_boot/test_leak run first (alphabetical) and already clone every seeded repo, so the healthy scope hit the existing clone via _discover_repository, skipped _clone(), and served 200 without ever touching the saturated fetch executor — the gate that must FAIL on this branch (no PR3 timeout) passed. Fix: seed a reserved repo (policy-repo-healthy-probe) outside the numeric policy-repo-NNNN range that no boot/leak test enumerates, and point the healthy probe at it. A never-cloned repo forces a genuine fresh clone through the starved pool, so the gate fails correctly. The seed- completeness check in conftest now covers the reserved repo too. Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/conftest.py | 12 ++++++++++-- app-tests/git-leak/helpers.py | 14 ++++++++++++++ app-tests/git-leak/seed/seed_gitea.py | 19 +++++++++++++++---- app-tests/git-leak/test_resilience.py | 11 +++++++++-- 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/app-tests/git-leak/conftest.py b/app-tests/git-leak/conftest.py index 1660d596b..446376e3f 100644 --- a/app-tests/git-leak/conftest.py +++ b/app-tests/git-leak/conftest.py @@ -2,7 +2,13 @@ import shutil import pytest -from helpers import GiteaAdmin, OpalServerClient, compose, list_seeded_repos +from helpers import ( + HEALTHY_PROBE_REPO, + GiteaAdmin, + OpalServerClient, + compose, + list_seeded_repos, +) def pytest_addoption(parser): @@ -43,7 +49,9 @@ def stack(request, repo_count): # Verify the seed actually produced all N repos before any test runs: a # partial seed would otherwise look like a server bug when the load gate # can't reach N. Fail loudly with the gap. - expected = set(list_seeded_repos(repo_count)) + # include the reserved probe repo the resilience test relies on, so a + # partial seed of it is caught here too rather than as a later test failure + expected = set(list_seeded_repos(repo_count)) | {HEALTHY_PROBE_REPO} present = set(GiteaAdmin().list_repos()) missing = expected - present assert not missing, ( diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index 3dc549421..f819d0d01 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -286,3 +286,17 @@ def bounce_postgres(down_seconds: int = 5) -> None: def list_seeded_repos(count: int) -> List[str]: return [f"policy-repo-{i:04d}" for i in range(count)] + + +# A reserved repo seeded *outside* the numeric ``policy-repo-NNNN`` range that +# ``list_seeded_repos`` enumerates, so no boot/leak test ever clones it. The +# resilience offline-hang test uses it as its "healthy" probe: clones live at +# ``base_dir/`` keyed by URL-hash and survive ``compose +# restart/stop/start`` (opal_server mounts no volume at ``/opal``; only +# ``down -v`` wipes them), so pointing the probe at any shared seeded repo would +# let the healthy scope reuse an on-disk clone and serve 200 *without* touching +# the saturated fetch executor — false-passing a gate that must FAIL on this +# branch. A dedicated never-cloned repo forces a genuine fresh clone through the +# starved executor. Keep this name in sync with ``RESERVED_REPOS`` in +# ``seed/seed_gitea.py``. +HEALTHY_PROBE_REPO = "policy-repo-healthy-probe" diff --git a/app-tests/git-leak/seed/seed_gitea.py b/app-tests/git-leak/seed/seed_gitea.py index 0b8bf2e33..7e1de1a4c 100644 --- a/app-tests/git-leak/seed/seed_gitea.py +++ b/app-tests/git-leak/seed/seed_gitea.py @@ -28,6 +28,14 @@ DATA_JSON = '{"roles": {"admin": ["read", "write"]}}\n' +# Reserved repos seeded in addition to the numeric ``policy-repo-NNNN`` set. +# These sit outside the range the boot/leak tests enumerate (so no test ever +# clones them) and back the resilience offline-hang test's "healthy" probe, +# which must force a fresh clone through the saturated executor rather than +# reuse a surviving on-disk clone. Keep in sync with ``HEALTHY_PROBE_REPO`` in +# ``helpers.py``. +RESERVED_REPOS = ("policy-repo-healthy-probe",) + def _wait_for_gitea(base_url: str, timeout: int = 120) -> None: deadline = time.time() + timeout @@ -114,8 +122,10 @@ def main() -> int: workdir = Path("/tmp/seed-work") failures = [] - for i in range(count): - name = f"policy-repo-{i:04d}" + # the numeric set the boot/leak tests enumerate, plus the reserved repos + # (e.g. the resilience offline-hang test's never-cloned healthy probe) + names = [f"policy-repo-{i:04d}" for i in range(count)] + list(RESERVED_REPOS) + for name in names: # Isolate per-repo failures: one bad push must not abort the loop and # leave an indeterminate subset seeded. Collect failures and exit # non-zero with a count so the harness sees a real seed error (and @@ -136,9 +146,10 @@ def main() -> int: failures.append((name, repr(exc))) print(f"FAILED {name}: {exc!r}", flush=True) + total = len(names) if failures: print( - f"ERROR: seeded {count - len(failures)}/{count} repos; " + f"ERROR: seeded {total - len(failures)}/{total} repos; " f"{len(failures)} failed (e.g. {failures[:3]})", flush=True, ) @@ -146,7 +157,7 @@ def main() -> int: # write the token where the test harness can read it Path("/seed-output/token").write_text(token) - print(f"DONE: ensured {count} repos", flush=True) + print(f"DONE: ensured {total} repos", flush=True) return 0 diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index b18829e12..60f28338f 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -2,6 +2,7 @@ import pytest from helpers import ( + HEALTHY_PROBE_REPO, bounce_postgres, gitea_repo_url, list_seeded_repos, @@ -33,8 +34,14 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): f"offline-{i}", make_repo_unreachable(f"dead-{i}"), branch="main" ) - healthy = list_seeded_repos(1)[0] - opal.put_scope("healthy", gitea_repo_url(healthy)) + # Point the healthy scope at a repo *no other test clones* (HEALTHY_PROBE_REPO + # is seeded outside the numeric range the boot/leak tests enumerate). Clones + # survive compose restart/stop/start, so reusing a shared seeded repo here + # would let this scope hit the existing on-disk clone via _discover_repository, + # skip _clone(), and serve 200 without ever touching the saturated executor — + # false-passing this gate. A never-cloned repo forces a real clone through the + # starved pool, so the gate fails correctly on this branch (no PR3 timeout). + opal.put_scope("healthy", gitea_repo_url(HEALTHY_PROBE_REPO)) try: # The healthy scope must become *servable* within a bounded time even From 15f3cfe2495735936652400edd6ea8fe3fe8bb77 Mon Sep 17 00:00:00 2001 From: David Shoen Date: Sun, 28 Jun 2026 19:05:33 +0300 Subject: [PATCH 19/26] test(git-leak): harden harness teardown and tighten assertions (PR review) Follow-up to PR review of the git-leak/resilience test bed: - hard_reset: always restart opal_server + wait_healthy via a finally, so a failed redis FLUSHALL can't leave the server stopped (which would fail every later session-scoped test and, running in a test finally, mask the result). - delete_all_scopes drain: a transient /internal read error no longer counts as a successful drain (was `except: return`); keep polling to the deadline so a not-yet-drained cache can't leak into the next test once PR2 lands. - use stats(samples=1) for the zero-waiting drain/empty polls (the peak-merge only matters for load assertions; this also drops 3x HTTP per poll). - resilience: narrow broad `except Exception` to requests.RequestException (and RuntimeError for wait_healthy timeout) so harness bugs surface instead of masquerading as "never served"/"never recovered". - resilience: collapse `assert opal.stats()` + redundant re-read into one read. - debug_stats_test: assert rss_kb > 0 on Linux (was `>= 0`, which passed for the wrong reason where /proc is absent and RSS reads fall back to 0). Co-Authored-By: Claude Opus 4.8 (1M context) --- app-tests/git-leak/helpers.py | 22 ++++++++++++++----- app-tests/git-leak/test_leak.py | 2 +- app-tests/git-leak/test_resilience.py | 11 ++++++---- .../opal_server/tests/debug_stats_test.py | 9 +++++++- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/app-tests/git-leak/helpers.py b/app-tests/git-leak/helpers.py index f819d0d01..452d16b99 100644 --- a/app-tests/git-leak/helpers.py +++ b/app-tests/git-leak/helpers.py @@ -116,10 +116,15 @@ def hard_reset(self, timeout: int = 600) -> None: usable by every later test. """ compose("stop", "opal_server") - compose("exec", "-T", "redis", "redis-cli", "FLUSHALL") - compose("start", "opal_server") - self._created_scopes.clear() - self.wait_healthy(timeout=timeout) + try: + compose("exec", "-T", "redis", "redis-cli", "FLUSHALL") + finally: + # Always bring the server back up, even if the flush failed: leaving + # it stopped would fail every later session-scoped test, and since + # this runs in a test's `finally` it would also mask the real result. + compose("start", "opal_server") + self._created_scopes.clear() + self.wait_healthy(timeout=timeout) def delete_all_scopes(self, drain_timeout: int = 20) -> None: """Delete every scope the *server* knows (not just this client's), then @@ -143,10 +148,15 @@ def delete_all_scopes(self, drain_timeout: int = 20) -> None: deadline = time.time() + drain_timeout while time.time() < deadline: try: - if self.stats()["repo_locks"] == 0: + # Single snapshot: we're waiting for zero, so the peak-merge + # (max over samples) would only delay observing the drain. + if self.stats(samples=1)["repo_locks"] == 0: return except Exception: - return + # A transient stats-read failure is not proof of a drain — keep + # polling until the deadline rather than returning early, which + # would let a not-yet-drained cache leak into the next test. + pass time.sleep(1) def get_scope_policy(self, scope_id: str) -> requests.Response: diff --git a/app-tests/git-leak/test_leak.py b/app-tests/git-leak/test_leak.py index 8ef2258f7..09fe9034d 100644 --- a/app-tests/git-leak/test_leak.py +++ b/app-tests/git-leak/test_leak.py @@ -59,7 +59,7 @@ def test_churn_releases_caches(opal, repo_count): # single stats snapshot per poll so the three keys reflect the same # observation (and to avoid 3x the HTTP round-trips per iteration). def _all_caches_empty() -> bool: - s = opal.stats() + s = opal.stats(samples=1) return s["repo_locks"] == 0 and s["repos"] == 0 and s["repos_last_fetched"] == 0 released = _wait_until(_all_caches_empty, timeout=60) diff --git a/app-tests/git-leak/test_resilience.py b/app-tests/git-leak/test_resilience.py index 60f28338f..04702448b 100644 --- a/app-tests/git-leak/test_resilience.py +++ b/app-tests/git-leak/test_resilience.py @@ -1,6 +1,7 @@ import time import pytest +import requests from helpers import ( HEALTHY_PROBE_REPO, bounce_postgres, @@ -63,7 +64,7 @@ def test_offline_repo_does_not_block_healthy_scopes(opal, repo_count): if resp.status_code == 200: served = True break - except Exception as exc: # request itself may stall when starved + except requests.RequestException as exc: # may stall when starved last = repr(exc) time.sleep(2) assert served, ( @@ -93,8 +94,9 @@ def test_server_recovers_after_postgres_bounce(opal, repo_count): (PER-15065's in-process reconnect would make recovery cleaner by avoiding the worker churn, but recovery already holds.) """ - assert opal.stats() # healthy before - baseline_locks = opal.stats()["repo_locks"] + baseline = opal.stats() # healthy before + assert baseline + baseline_locks = baseline["repo_locks"] bounce_postgres(down_seconds=5) @@ -107,7 +109,8 @@ def test_server_recovers_after_postgres_bounce(opal, repo_count): opal.stats() recovered = True break - except Exception: + except (requests.RequestException, RuntimeError): + # RequestException: HTTP not back yet; RuntimeError: wait_healthy timed out time.sleep(2) assert recovered, "server did not recover HTTP within 90s of a postgres bounce" diff --git a/packages/opal-server/opal_server/tests/debug_stats_test.py b/packages/opal-server/opal_server/tests/debug_stats_test.py index b2d7935db..c67fb3530 100644 --- a/packages/opal-server/opal_server/tests/debug_stats_test.py +++ b/packages/opal-server/opal_server/tests/debug_stats_test.py @@ -1,3 +1,5 @@ +import sys + from opal_server.config import opal_server_config from opal_server.debug_stats import git_fetcher_cache_stats from opal_server.git_fetcher import GitPolicyFetcher @@ -14,7 +16,12 @@ def test_stats_report_dict_sizes(monkeypatch): assert stats["repos"] == 2 assert stats["repos_last_fetched"] == 0 assert isinstance(stats["rss_kb"], int) - assert stats["rss_kb"] >= 0 + # On Linux /proc/self/status exists, so RSS reading must actually work; on + # other platforms _read_rss_kb falls back to 0 and the wiring is untestable. + if sys.platform.startswith("linux"): + assert stats["rss_kb"] > 0 + else: + assert stats["rss_kb"] >= 0 def test_internal_stats_flag_defaults_off(): From ac29c2e9c424ed3e6f8a0d166b90f04a09ceb049 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 12:50:30 +0300 Subject: [PATCH 20/26] feat(opal-server): add GitPolicyFetcher.forget_repo to release cached repo handles Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 20 ++++++++++++ .../opal_server/tests/forget_repo_test.py | 31 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 packages/opal-server/opal_server/tests/forget_repo_test.py diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index e52083c98..8daee2a8e 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -366,6 +366,26 @@ def base_dir(base_dir: Path) -> Path: def repo_clone_path(base_dir: Path, source: GitPolicyScopeSource) -> Path: return GitPolicyFetcher.base_dir(base_dir) / GitPolicyFetcher.source_id(source) + @staticmethod + def forget_repo(path: str) -> None: + """Drop the cached repository for a clone path and release its handles. + + The cached ``pygit2.Repository`` keeps OS file descriptors and mmapped + pack indexes open; without this, a deleted scope's repo pins memory and + inodes for the lifetime of the process even after the clone is removed. + ``Repository.free()`` is called only when available (it is not part of + every pygit2 release); otherwise the dropped reference is reclaimed by GC. + """ + repo = GitPolicyFetcher.repos.pop(path, None) + if repo is None: + return + free = getattr(repo, "free", None) + if callable(free): + try: + free() + except Exception: + logger.debug("pygit2 Repository.free() failed; relying on GC") + class GitCallback(RemoteCallbacks): def __init__(self, source: GitPolicyScopeSource): diff --git a/packages/opal-server/opal_server/tests/forget_repo_test.py b/packages/opal-server/opal_server/tests/forget_repo_test.py new file mode 100644 index 000000000..b6e223cf1 --- /dev/null +++ b/packages/opal-server/opal_server/tests/forget_repo_test.py @@ -0,0 +1,31 @@ +from opal_server.git_fetcher import GitPolicyFetcher + + +class _FakeRepo: + def __init__(self): + self.freed = False + + def free(self): + self.freed = True + + +def test_forget_repo_pops_and_frees(monkeypatch): + fake = _FakeRepo() + monkeypatch.setattr(GitPolicyFetcher, "repos", {"/clones/x": fake}) + + GitPolicyFetcher.forget_repo("/clones/x") + + assert "/clones/x" not in GitPolicyFetcher.repos + assert fake.freed is True + + +def test_forget_repo_unknown_path_is_noop(monkeypatch): + monkeypatch.setattr(GitPolicyFetcher, "repos", {}) + GitPolicyFetcher.forget_repo("/clones/missing") # must not raise + assert GitPolicyFetcher.repos == {} + + +def test_forget_repo_without_free_method(monkeypatch): + monkeypatch.setattr(GitPolicyFetcher, "repos", {"/clones/y": object()}) + GitPolicyFetcher.forget_repo("/clones/y") # object() has no .free(); must not raise + assert "/clones/y" not in GitPolicyFetcher.repos From b792f239a43ee27e7eef049ff49cefaf27fa8658 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 12:52:04 +0300 Subject: [PATCH 21/26] fix(opal-server): purge git fetcher caches on scope delete Capture the deleted scope's source before scanning siblings (fixing the loop-variable shadowing bug where the clone path was computed from the last-iterated scope). Drop the cached pygit2.Repository handle via forget_repo and pop repos_last_fetched when no sibling shares the source. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/scopes/service.py | 40 ++++-- .../tests/delete_scope_cache_purge_test.py | 132 ++++++++++++++++++ 2 files changed, 157 insertions(+), 15 deletions(-) create mode 100644 packages/opal-server/opal_server/tests/delete_scope_cache_purge_test.py diff --git a/packages/opal-server/opal_server/scopes/service.py b/packages/opal-server/opal_server/scopes/service.py index d3df4972f..64cf3b8a5 100644 --- a/packages/opal-server/opal_server/scopes/service.py +++ b/packages/opal-server/opal_server/scopes/service.py @@ -162,24 +162,34 @@ async def delete_scope(self, scope_id: str): with tracer.trace("scopes_service.delete_scope", resource=scope_id): logger.info(f"Delete scope: {scope_id}") scope = await self._scopes.get(scope_id) - url = scope.policy.url - - scopes = await self._scopes.all() - remove_repo_clone = True - - for scope in scopes: - if scope.scope_id != scope_id and scope.policy.url == url: - logger.info( - f"found another scope with same remote url ({scope.scope_id}), skipping clone deletion" - ) - remove_repo_clone = False - break + deleted_source = cast(GitPolicyScopeSource, scope.policy) + deleted_source_id = GitPolicyFetcher.source_id(deleted_source) + scope_dir = GitPolicyFetcher.repo_clone_path(self._base_dir, deleted_source) + + # Clone dir, the `repos` handle cache, and `repos_last_fetched` are + # all keyed by source_id (= the clone path). A sibling only shares + # storage when it resolves to the same source_id; same url with a + # different branch can shard to a different source_id (and a + # different clone dir) when SCOPES_REPO_CLONES_SHARDS > 1, so gate on + # source_id, not url — otherwise the deleted scope's clone + pygit2 + # handle leak. + other_scopes = [ + s for s in await self._scopes.all() if s.scope_id != scope_id + ] + source_id_shared = any( + isinstance(s.policy, GitPolicyScopeSource) + and GitPolicyFetcher.source_id(s.policy) == deleted_source_id + for s in other_scopes + ) - if remove_repo_clone: - scope_dir = GitPolicyFetcher.repo_clone_path( - self._base_dir, cast(GitPolicyScopeSource, scope.policy) + if source_id_shared: + logger.info( + "Another scope shares the same clone (source id), skipping clone deletion" ) + else: shutil.rmtree(scope_dir, ignore_errors=True) + GitPolicyFetcher.forget_repo(str(scope_dir)) + GitPolicyFetcher.repos_last_fetched.pop(deleted_source_id, None) await self._scopes.delete(scope_id) diff --git a/packages/opal-server/opal_server/tests/delete_scope_cache_purge_test.py b/packages/opal-server/opal_server/tests/delete_scope_cache_purge_test.py new file mode 100644 index 000000000..4a4e0a5e7 --- /dev/null +++ b/packages/opal-server/opal_server/tests/delete_scope_cache_purge_test.py @@ -0,0 +1,132 @@ +import pytest +from opal_common.schemas.policy_source import GitPolicyScopeSource, NoAuthData +from opal_common.schemas.scopes import Scope +from opal_server.git_fetcher import GitPolicyFetcher +from opal_server.scopes.scope_repository import ScopeNotFoundError +from opal_server.scopes.service import ScopesService + + +class FakeScopeRepository: + def __init__(self, scopes): + self._scopes = {s.scope_id: s for s in scopes} + + async def get(self, scope_id): + if scope_id not in self._scopes: + raise ScopeNotFoundError(scope_id) + return self._scopes[scope_id] + + async def all(self): + return list(self._scopes.values()) + + async def delete(self, scope_id): + self._scopes.pop(scope_id, None) + + +def _scope(scope_id, url, branch="main"): + return Scope( + scope_id=scope_id, + policy=GitPolicyScopeSource( + source_type="git", + url=url, + branch=branch, + auth=NoAuthData(auth_type="none"), + ), + data={"entries": []}, + ) + + +@pytest.fixture(autouse=True) +def clear_caches(): + GitPolicyFetcher.repos.clear() + GitPolicyFetcher.repos_last_fetched.clear() + GitPolicyFetcher.repo_locks.clear() + yield + GitPolicyFetcher.repos.clear() + GitPolicyFetcher.repos_last_fetched.clear() + GitPolicyFetcher.repo_locks.clear() + + +@pytest.mark.asyncio +async def test_delete_unique_scope_purges_caches(tmp_path, monkeypatch): + scope = _scope("only", "https://git/repo-a.git") + repo = FakeScopeRepository([scope]) + svc = ScopesService(base_dir=tmp_path, scopes=repo, pubsub_endpoint=None) + + src = scope.policy + sid = GitPolicyFetcher.source_id(src) + clone_path = str(GitPolicyFetcher.repo_clone_path(tmp_path, src)) + GitPolicyFetcher.repos[clone_path] = object() + GitPolicyFetcher.repos_last_fetched[sid] = "ts" + + monkeypatch.setattr( + "opal_server.scopes.service.shutil.rmtree", lambda *a, **k: None + ) + + await svc.delete_scope("only") + + assert clone_path not in GitPolicyFetcher.repos + assert sid not in GitPolicyFetcher.repos_last_fetched + + +@pytest.mark.asyncio +async def test_delete_keeps_caches_when_sibling_shares_source(tmp_path, monkeypatch): + a = _scope("a", "https://git/shared.git") + b = _scope("b", "https://git/shared.git") # same url+branch -> same source_id + repo = FakeScopeRepository([a, b]) + svc = ScopesService(base_dir=tmp_path, scopes=repo, pubsub_endpoint=None) + + sid = GitPolicyFetcher.source_id(a.policy) + clone_path = str(GitPolicyFetcher.repo_clone_path(tmp_path, a.policy)) + GitPolicyFetcher.repos[clone_path] = object() + GitPolicyFetcher.repos_last_fetched[sid] = "ts" + + rmtree_calls = [] + monkeypatch.setattr( + "opal_server.scopes.service.shutil.rmtree", + lambda p, **k: rmtree_calls.append(p), + ) + + await svc.delete_scope("a") + + assert rmtree_calls == [] # sibling shares the source id; clone must survive + assert clone_path in GitPolicyFetcher.repos + assert sid in GitPolicyFetcher.repos_last_fetched + + +@pytest.mark.asyncio +async def test_delete_purges_when_sibling_shares_url_but_not_source( + tmp_path, monkeypatch +): + """Same url, different branch, sharded clones (SCOPES_REPO_CLONES_SHARDS>1) + resolve to different source_ids -> different clone dirs. + + Deleting one must still purge its own clone + caches; the url- + sharing sibling lives elsewhere. + """ + # shards=4: branch "main" -> index 1, "dev" -> index 3 (distinct source_ids). + monkeypatch.setattr( + "opal_server.git_fetcher.opal_server_config.SCOPES_REPO_CLONES_SHARDS", 4 + ) + a = _scope("a", "https://git/shared.git", branch="main") + b = _scope("b", "https://git/shared.git", branch="dev") # same url, diff source_id + assert GitPolicyFetcher.source_id(a.policy) != GitPolicyFetcher.source_id(b.policy) + + repo = FakeScopeRepository([a, b]) + svc = ScopesService(base_dir=tmp_path, scopes=repo, pubsub_endpoint=None) + + sid_a = GitPolicyFetcher.source_id(a.policy) + clone_path_a = str(GitPolicyFetcher.repo_clone_path(tmp_path, a.policy)) + GitPolicyFetcher.repos[clone_path_a] = object() + GitPolicyFetcher.repos_last_fetched[sid_a] = "ts" + + rmtree_calls = [] + monkeypatch.setattr( + "opal_server.scopes.service.shutil.rmtree", + lambda p, **k: rmtree_calls.append(str(p)), + ) + + await svc.delete_scope("a") + + assert rmtree_calls == [clone_path_a] # its own clone dir removed + assert clone_path_a not in GitPolicyFetcher.repos + assert sid_a not in GitPolicyFetcher.repos_last_fetched From 8db1b6bd8be41244a53da98e30ae50718973e686 Mon Sep 17 00:00:00 2001 From: zivxx Date: Tue, 23 Jun 2026 12:53:04 +0300 Subject: [PATCH 22/26] fix(opal-server): clean all finished webhook tasks (no remove-while-iterating) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal_server/policy/watcher/task.py | 9 +++-- .../tests/webhook_task_cleanup_test.py | 35 +++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 packages/opal-server/opal_server/tests/webhook_task_cleanup_test.py diff --git a/packages/opal-server/opal_server/policy/watcher/task.py b/packages/opal-server/opal_server/policy/watcher/task.py index d420d183c..1afba7ef2 100644 --- a/packages/opal-server/opal_server/policy/watcher/task.py +++ b/packages/opal-server/opal_server/policy/watcher/task.py @@ -28,11 +28,10 @@ async def __aexit__(self, exc_type, exc, tb): async def _on_webhook(self, topic: Topic, data: Any): logger.info(f"Webhook listener triggered ({len(self._webhook_tasks)})") - for task in self._webhook_tasks: - if task.done(): - # Clean references to finished tasks - self._webhook_tasks.remove(task) - + # Rebuild rather than remove-while-iterating: list.remove() inside a + # `for t in self._webhook_tasks` loop skips the element after each removal, + # so finished tasks accumulate. + self._webhook_tasks = [t for t in self._webhook_tasks if not t.done()] self._webhook_tasks.append(asyncio.create_task(self.trigger(topic, data))) async def _listen_to_webhook_notifications(self): diff --git a/packages/opal-server/opal_server/tests/webhook_task_cleanup_test.py b/packages/opal-server/opal_server/tests/webhook_task_cleanup_test.py new file mode 100644 index 000000000..a576d924c --- /dev/null +++ b/packages/opal-server/opal_server/tests/webhook_task_cleanup_test.py @@ -0,0 +1,35 @@ +import asyncio + +import pytest +from opal_server.policy.watcher.task import BasePolicyWatcherTask + + +class _Watcher(BasePolicyWatcherTask): + async def trigger(self, topic, data): + return None # fast no-op so the created task finishes quickly + + +@pytest.mark.asyncio +async def test_done_tasks_are_all_removed(): + w = _Watcher(pubsub_endpoint=None) + + async def _done(): + return None + + # three already-finished tasks pre-loaded into the list + finished = [asyncio.create_task(_done()) for _ in range(3)] + await asyncio.gather(*finished) + w._webhook_tasks = list(finished) + + await w._on_webhook("webhook", None) + await asyncio.sleep(0) # let the newly created trigger task finish + + # all 3 done ones removed... + remaining_done = [t for t in w._webhook_tasks if t in finished] + assert remaining_done == [], f"stale done tasks leaked: {remaining_done}" + # ...and exactly the one freshly scheduled trigger task survives. + survivors = [t for t in w._webhook_tasks if t not in finished] + assert len(w._webhook_tasks) == 1 + assert len(survivors) == 1, f"new trigger task not scheduled: {w._webhook_tasks}" + + await asyncio.gather(*survivors) # drain the dangling task From 7a18ab44bc36bad8744d4afdc72d90d2456cd6e5 Mon Sep 17 00:00:00 2001 From: zivxx Date: Wed, 1 Jul 2026 03:28:54 +0300 Subject: [PATCH 23/26] feat(opal-server): add SCOPES_SYNC_CONCURRENCY config (default 10) Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/plans/docs/05-config-reference.md | 1 + packages/opal-server/opal_server/config.py | 8 ++++++++ .../opal_server/tests/sync_concurrency_test.py | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 packages/opal-server/opal_server/tests/sync_concurrency_test.py diff --git a/.claude/plans/docs/05-config-reference.md b/.claude/plans/docs/05-config-reference.md index 19caea4c4..3b331f56b 100644 --- a/.claude/plans/docs/05-config-reference.md +++ b/.claude/plans/docs/05-config-reference.md @@ -12,6 +12,7 @@ instantiation — the bare name is what appears in the table). |---|---|---|---|---| | `OPAL_SCOPES_GIT_FETCH_TIMEOUT` | float (seconds) | `120.0` | Hard timeout for a single scope git clone/fetch. On timeout the operation is abandoned and the scope is marked failed, so one unreachable repo can never block boot or other scopes. `0` = no timeout. | `packages/opal-server/opal_server/config.py:150-156` | | `OPAL_SCOPES_GIT_MAX_WORKERS` | int | `10` | Size of the dedicated `ThreadPoolExecutor` for scope git operations. Isolating git work keeps a hung fetch from starving bundle serving and other server work that uses the default executor. | `packages/opal-server/opal_server/config.py:157-163` | +| `OPAL_SCOPES_SYNC_CONCURRENCY` | int | `10` | Maximum number of scope git repositories fetched concurrently during `sync_scopes` (boot and periodic). Bounds parallel loading so a fleet of repos loads ~N at a time instead of serially. Higher is faster but uses more memory, file descriptors, and network connections at once; tune down for many pods × many sources. | `packages/opal-server/opal_server/config.py:438-444` | > **Caveat (timeout is soft, not a hard kill).** `OPAL_SCOPES_GIT_FETCH_TIMEOUT` is enforced via > `asyncio.wait_for`, which cancels the *await* — unblocking the event loop and the awaiting diff --git a/packages/opal-server/opal_server/config.py b/packages/opal-server/opal_server/config.py index 83b5dbb26..766f674b4 100644 --- a/packages/opal-server/opal_server/config.py +++ b/packages/opal-server/opal_server/config.py @@ -435,6 +435,14 @@ class OpalServerConfig(Confi): description="The max number of local clones to use for the same repo (reused across scopes)", ) + SCOPES_SYNC_CONCURRENCY = confi.int( + "SCOPES_SYNC_CONCURRENCY", + 10, + description="Maximum number of scope git repositories fetched concurrently " + "during sync_scopes (boot and periodic). Higher is faster but uses more " + "memory, file descriptors, and network connections at once.", + ) + REDIS_URL = confi.str( "REDIS_URL", default="redis://localhost", diff --git a/packages/opal-server/opal_server/tests/sync_concurrency_test.py b/packages/opal-server/opal_server/tests/sync_concurrency_test.py new file mode 100644 index 000000000..e9ea68280 --- /dev/null +++ b/packages/opal-server/opal_server/tests/sync_concurrency_test.py @@ -0,0 +1,6 @@ +from opal_server.config import OpalServerConfig + + +def test_sync_concurrency_default(): + clean = OpalServerConfig(prefix="OPAL_") + assert clean.SCOPES_SYNC_CONCURRENCY == 10 From 7ce2a83fb0aadcecd8db3ef6fdb6a80c324932ef Mon Sep 17 00:00:00 2001 From: zivxx Date: Wed, 1 Jul 2026 03:29:37 +0300 Subject: [PATCH 24/26] refactor(opal-server): expose stable per-source lock accessor Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 21 +++++++++++++------ .../tests/sync_concurrency_test.py | 16 ++++++++++++++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index 1ebee00fe..be9cf6048 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -179,14 +179,23 @@ def __init__( f"Initializing git fetcher: scope_id={scope_id}, url={source.url}, branch={self._source.branch}, source_id={self._source_id}" ) - async def _get_repo_lock(self): - # Previous file based implementation worked across multiple processes/threads, but wasn't fair (next acquiree is random) - # This implementation works only within the same process/thread, but is fair (next acquiree is the earliest to enter the lock) - lock = GitPolicyFetcher.repo_locks[ - self._source_id - ] = GitPolicyFetcher.repo_locks.get(self._source_id, asyncio.Lock()) + @staticmethod + def source_lock(source_id: str) -> asyncio.Lock: + """Return the process-shared per-source lock, creating it on first use. + + The same lock object is reused for a given source_id so that a fetch and + a concurrent delete of the same source serialize against each other. + """ + lock = GitPolicyFetcher.repo_locks.get(source_id) + if lock is None: + lock = GitPolicyFetcher.repo_locks[source_id] = asyncio.Lock() return lock + async def _get_repo_lock(self): + # Same-process, fair (FIFO) lock per source. Shared with delete_scope + # via GitPolicyFetcher.source_lock so delete cannot race an in-flight fetch. + return GitPolicyFetcher.source_lock(self._source_id) + async def _was_fetched_after(self, t: datetime.datetime): last_fetched = GitPolicyFetcher.repos_last_fetched.get(self._source_id, None) if last_fetched is None: diff --git a/packages/opal-server/opal_server/tests/sync_concurrency_test.py b/packages/opal-server/opal_server/tests/sync_concurrency_test.py index e9ea68280..77ff9d2a6 100644 --- a/packages/opal-server/opal_server/tests/sync_concurrency_test.py +++ b/packages/opal-server/opal_server/tests/sync_concurrency_test.py @@ -1,6 +1,22 @@ +import asyncio + +import pytest + from opal_server.config import OpalServerConfig +from opal_server.git_fetcher import GitPolicyFetcher def test_sync_concurrency_default(): clean = OpalServerConfig(prefix="OPAL_") assert clean.SCOPES_SYNC_CONCURRENCY == 10 + + +@pytest.mark.asyncio +async def test_source_lock_is_stable_per_source(monkeypatch): + monkeypatch.setattr(GitPolicyFetcher, "repo_locks", {}) + lock_a1 = GitPolicyFetcher.source_lock("src-a") + lock_a2 = GitPolicyFetcher.source_lock("src-a") + lock_b = GitPolicyFetcher.source_lock("src-b") + assert lock_a1 is lock_a2 # same source -> same lock object + assert lock_a1 is not lock_b + assert isinstance(lock_a1, asyncio.Lock) From 170a221737d887745c9978c6df13c47ebbfee05b Mon Sep 17 00:00:00 2001 From: zivxx Date: Wed, 1 Jul 2026 03:30:34 +0300 Subject: [PATCH 25/26] fix(opal-server): serialize scope delete against in-flight fetch via per-source lock Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/scopes/service.py | 20 +++--- .../tests/sync_concurrency_test.py | 61 +++++++++++++++++++ 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/packages/opal-server/opal_server/scopes/service.py b/packages/opal-server/opal_server/scopes/service.py index 64cf3b8a5..a96215921 100644 --- a/packages/opal-server/opal_server/scopes/service.py +++ b/packages/opal-server/opal_server/scopes/service.py @@ -182,14 +182,18 @@ async def delete_scope(self, scope_id: str): for s in other_scopes ) - if source_id_shared: - logger.info( - "Another scope shares the same clone (source id), skipping clone deletion" - ) - else: - shutil.rmtree(scope_dir, ignore_errors=True) - GitPolicyFetcher.forget_repo(str(scope_dir)) - GitPolicyFetcher.repos_last_fetched.pop(deleted_source_id, None) + # Serialize the filesystem + cache mutation against an in-flight + # fetch of the same source so a delete cannot race a concurrent + # sync_scope (see PR4 bounded-concurrency loading). + async with GitPolicyFetcher.source_lock(deleted_source_id): + if source_id_shared: + logger.info( + "Another scope shares the same clone (source id), skipping clone deletion" + ) + else: + shutil.rmtree(scope_dir, ignore_errors=True) + GitPolicyFetcher.forget_repo(str(scope_dir)) + GitPolicyFetcher.repos_last_fetched.pop(deleted_source_id, None) await self._scopes.delete(scope_id) diff --git a/packages/opal-server/opal_server/tests/sync_concurrency_test.py b/packages/opal-server/opal_server/tests/sync_concurrency_test.py index 77ff9d2a6..d9772defc 100644 --- a/packages/opal-server/opal_server/tests/sync_concurrency_test.py +++ b/packages/opal-server/opal_server/tests/sync_concurrency_test.py @@ -2,8 +2,41 @@ import pytest +from opal_common.schemas.policy_source import GitPolicyScopeSource +from opal_common.schemas.scopes import Scope from opal_server.config import OpalServerConfig from opal_server.git_fetcher import GitPolicyFetcher +from opal_server.scopes.scope_repository import ScopeNotFoundError +from opal_server.scopes.service import ScopesService + + +class _FakeRepoStore: + def __init__(self, scopes): + self._s = {x.scope_id: x for x in scopes} + + async def get(self, sid): + if sid not in self._s: + raise ScopeNotFoundError(sid) + return self._s[sid] + + async def all(self): + return list(self._s.values()) + + async def delete(self, sid): + self._s.pop(sid, None) + + +def _git_scope(sid, url): + return Scope( + scope_id=sid, + policy=GitPolicyScopeSource( + source_type="git", + url=url, + branch="main", + auth={"auth_type": "none"}, + ), + data={"entries": []}, + ) def test_sync_concurrency_default(): @@ -20,3 +53,31 @@ async def test_source_lock_is_stable_per_source(monkeypatch): assert lock_a1 is lock_a2 # same source -> same lock object assert lock_a1 is not lock_b assert isinstance(lock_a1, asyncio.Lock) + + +@pytest.mark.asyncio +async def test_delete_waits_for_in_flight_fetch(tmp_path, monkeypatch): + monkeypatch.setattr(GitPolicyFetcher, "repo_locks", {}) + monkeypatch.setattr(GitPolicyFetcher, "repos", {}) + monkeypatch.setattr(GitPolicyFetcher, "repos_last_fetched", {}) + monkeypatch.setattr( + "opal_server.scopes.service.shutil.rmtree", lambda *a, **k: None + ) + + scope = _git_scope("s1", "https://git/repo.git") + svc = ScopesService( + base_dir=tmp_path, scopes=_FakeRepoStore([scope]), pubsub_endpoint=None + ) + sid = GitPolicyFetcher.source_id(scope.policy) + + # simulate an in-flight fetch holding the per-source lock + lock = GitPolicyFetcher.source_lock(sid) + await lock.acquire() + + delete_task = asyncio.create_task(svc.delete_scope("s1")) + await asyncio.sleep(0.05) + assert not delete_task.done(), "delete proceeded while the source lock was held" + + lock.release() + await asyncio.wait_for(delete_task, timeout=2) + assert delete_task.done() From a20489a3a3b59f376b8d969072643abf0c789642 Mon Sep 17 00:00:00 2001 From: zivxx Date: Wed, 1 Jul 2026 03:31:34 +0300 Subject: [PATCH 26/26] feat(opal-server): bounded-concurrency scope sync (parallel boot) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../opal-server/opal_server/git_fetcher.py | 5 +- .../opal-server/opal_server/scopes/service.py | 54 +++++++++---- .../tests/sync_concurrency_test.py | 78 ++++++++++++++++++- 3 files changed, 116 insertions(+), 21 deletions(-) diff --git a/packages/opal-server/opal_server/git_fetcher.py b/packages/opal-server/opal_server/git_fetcher.py index be9cf6048..a6c971c4c 100644 --- a/packages/opal-server/opal_server/git_fetcher.py +++ b/packages/opal-server/opal_server/git_fetcher.py @@ -183,8 +183,9 @@ def __init__( def source_lock(source_id: str) -> asyncio.Lock: """Return the process-shared per-source lock, creating it on first use. - The same lock object is reused for a given source_id so that a fetch and - a concurrent delete of the same source serialize against each other. + The same lock object is reused for a given source_id so that a + fetch and a concurrent delete of the same source serialize + against each other. """ lock = GitPolicyFetcher.repo_locks.get(source_id) if lock is None: diff --git a/packages/opal-server/opal_server/scopes/service.py b/packages/opal-server/opal_server/scopes/service.py index a96215921..c4a37e18e 100644 --- a/packages/opal-server/opal_server/scopes/service.py +++ b/packages/opal-server/opal_server/scopes/service.py @@ -1,3 +1,4 @@ +import asyncio import datetime import shutil from functools import partial @@ -12,6 +13,7 @@ from opal_common.schemas.policy import PolicyUpdateMessageNotification from opal_common.schemas.policy_source import GitPolicyScopeSource from opal_common.topics.publisher import ScopedServerSideTopicPublisher +from opal_server.config import opal_server_config from opal_server.git_fetcher import GitPolicyFetcher, PolicyFetcherCallbacks from opal_server.policy.watcher.callbacks import ( create_policy_update, @@ -204,38 +206,56 @@ async def sync_scopes(self, only_poll_updates=False, notify_on_changes=True): # Only sync scopes that have polling enabled (in a periodic check) scopes = [scope for scope in scopes if scope.policy.poll_updates] + concurrency = max(1, opal_server_config.SCOPES_SYNC_CONCURRENCY) logger.info( - f"OPAL Scopes: syncing {len(scopes)} scopes in the background (polling updates: {only_poll_updates})" + f"OPAL Scopes: syncing {len(scopes)} scopes " + f"(concurrency={concurrency}, polling updates: {only_poll_updates})" ) + # Pass 1: fetch each distinct source once (force_fetch). Pass 2: scopes that + # share an already-fetched source only re-check locally (no network). fetched_source_ids = set() - skipped_scopes = [] + first_pass = [] + second_pass = [] for scope in scopes: src_id = GitPolicyFetcher.source_id(scope.policy) # Give priority to scopes that have a unique url per shard (so we'll clone all repos asap) if src_id in fetched_source_ids: - skipped_scopes.append(scope) - continue + second_pass.append(scope) + else: + fetched_source_ids.add(src_id) + first_pass.append(scope) - try: - await self.sync_scope( - scope=scope, - force_fetch=True, - notify_on_changes=notify_on_changes, - ) - except Exception as e: - logger.exception(f"sync_scope failed for {scope.scope_id}") + await self._sync_scope_batch( + first_pass, + force_fetch=True, + notify_on_changes=notify_on_changes, + concurrency=concurrency, + ) + await self._sync_scope_batch( + second_pass, + force_fetch=False, + notify_on_changes=notify_on_changes, + concurrency=concurrency, + ) - fetched_source_ids.add(src_id) + async def _sync_scope_batch( + self, scopes, *, force_fetch, notify_on_changes, concurrency + ): + if not scopes: + return + semaphore = asyncio.Semaphore(concurrency) - for scope in skipped_scopes: - # No need to refetch the same repo, just check for changes + async def _one(scope): + async with semaphore: try: await self.sync_scope( scope=scope, - force_fetch=False, + force_fetch=force_fetch, notify_on_changes=notify_on_changes, ) - except Exception as e: + except Exception: logger.exception(f"sync_scope failed for {scope.scope_id}") + + await asyncio.gather(*(_one(scope) for scope in scopes)) diff --git a/packages/opal-server/opal_server/tests/sync_concurrency_test.py b/packages/opal-server/opal_server/tests/sync_concurrency_test.py index d9772defc..f41ff7b21 100644 --- a/packages/opal-server/opal_server/tests/sync_concurrency_test.py +++ b/packages/opal-server/opal_server/tests/sync_concurrency_test.py @@ -1,10 +1,9 @@ import asyncio import pytest - from opal_common.schemas.policy_source import GitPolicyScopeSource from opal_common.schemas.scopes import Scope -from opal_server.config import OpalServerConfig +from opal_server.config import OpalServerConfig, opal_server_config from opal_server.git_fetcher import GitPolicyFetcher from opal_server.scopes.scope_repository import ScopeNotFoundError from opal_server.scopes.service import ScopesService @@ -81,3 +80,78 @@ async def test_delete_waits_for_in_flight_fetch(tmp_path, monkeypatch): lock.release() await asyncio.wait_for(delete_task, timeout=2) assert delete_task.done() + + +@pytest.mark.asyncio +async def test_sync_scopes_runs_in_parallel_bounded(tmp_path, monkeypatch): + monkeypatch.setattr(opal_server_config, "SCOPES_SYNC_CONCURRENCY", 3) + + scopes = [_git_scope(f"s{i}", f"https://git/r{i}.git") for i in range(9)] + svc = ScopesService( + base_dir=tmp_path, scopes=_FakeRepoStore(scopes), pubsub_endpoint=None + ) + + in_flight = 0 + peak = 0 + + async def fake_sync_scope(*args, scope=None, **kwargs): + nonlocal in_flight, peak + in_flight += 1 + peak = max(peak, in_flight) + await asyncio.sleep(0.05) + in_flight -= 1 + + monkeypatch.setattr(svc, "sync_scope", fake_sync_scope) + + await svc.sync_scopes() + + assert peak <= 3, f"concurrency exceeded the cap: peak={peak}" + assert peak > 1, "did not actually run in parallel" + + +@pytest.mark.asyncio +async def test_sync_scopes_isolates_one_failure(tmp_path, monkeypatch): + monkeypatch.setattr(opal_server_config, "SCOPES_SYNC_CONCURRENCY", 5) + + scopes = [_git_scope(f"s{i}", f"https://git/r{i}.git") for i in range(5)] + svc = ScopesService( + base_dir=tmp_path, scopes=_FakeRepoStore(scopes), pubsub_endpoint=None + ) + + started = [] + completed = [] + in_flight = 0 + peak = 0 + + async def fake_sync_scope(*args, scope=None, **kwargs): + nonlocal in_flight, peak + started.append(scope.scope_id) + in_flight += 1 + peak = max(peak, in_flight) + try: + # A real suspension point so sibling coroutines actually interleave; + # the failing scope then raises mid-flight while its peers are still + # in-flight — the exact condition the concurrent path must isolate. + await asyncio.sleep(0.05) + if scope.scope_id == "s2": + raise RuntimeError("boom") + completed.append(scope.scope_id) + finally: + in_flight -= 1 + + monkeypatch.setattr(svc, "sync_scope", fake_sync_scope) + + await svc.sync_scopes() # must not raise + + # Every scope was attempted, and s2's mid-flight failure neither stopped the + # batch nor cancelled its concurrently-running siblings (all four completed). + assert set(started) == {f"s{i}" for i in range(5)}, "a failure stopped the batch" + assert set(completed) == { + "s0", + "s1", + "s3", + "s4", + }, "a sibling failure cancelled in-flight peers" + # peak > 1 proves this exercised the concurrent path (a serial loop would + # give peak == 1), so the isolation guarantee is verified against parallelism. + assert peak > 1, "did not actually run in parallel"