From bae3ceea293757bfb05123aa0d045d8b9e295d85 Mon Sep 17 00:00:00 2001 From: Tale Agent Date: Sun, 21 Jun 2026 14:26:27 +0000 Subject: [PATCH] fix(sandbox): verify skipTLSVerify/caFile kubeconfig behavior under Bun (#1849) Empirical testing under Bun 1.3.14 confirms that @kubernetes/client-node@1.4.0 sends requests via node-fetch@2 -> node:https.request with an https.Agent, and that Bun's node:https shim silently ignores the agent's ca and rejectUnauthorized fields at the TLS layer. Consequences: - caFile / caData in the kubeconfig: INERT under Bun (CA not loaded) - skipTLSVerify: true: INERT under Bun (TLS still verified, NOT bypassed) - NODE_EXTRA_CA_CERTS: the only working CA-trust mechanism under Bun Changes: - Remove skipTLSVerify: true from makeK8sClient (dead code that misleadingly looks like a security bypass but does nothing under Bun) - Update the file-level TLS NOTE to enumerate all three inert knobs - Add tests pinning the kubeconfig shape (skipTLSVerify absent, caFile conditional on SANDBOX_K8S_CAFILE, bearer token on user entry) - Add docs/kubernetes.md Bun TLS contract section with findings table and updated NODE_EXTRA_CA_CERTS env table row --- services/sandbox/docs/kubernetes.md | 47 +++++++--- .../src/backend/kubernetes/k8s-client.test.ts | 89 ++++++++++++++++++- .../src/backend/kubernetes/k8s-client.ts | 42 ++++++--- 3 files changed, 151 insertions(+), 27 deletions(-) diff --git a/services/sandbox/docs/kubernetes.md b/services/sandbox/docs/kubernetes.md index b2089c52f..e9001bb13 100644 --- a/services/sandbox/docs/kubernetes.md +++ b/services/sandbox/docs/kubernetes.md @@ -83,22 +83,45 @@ stream. Keep it out so a stray exec call fails closed. ## Environment -| Env | Required | Notes | -| ----------------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `SANDBOX_BACKEND=kubernetes` | yes | Selects this backend. | -| `SANDBOX_SPAWNER_IMAGE` | yes | The spawner's **own** image ref, used for the `stage`/`harvest` containers. Must match the deployed spawner version. | -| `SANDBOX_RUNTIME_IMAGE` | yes | The `runner` image (`tale-sandbox-runtime:`). | -| `SANDBOX_K8S_NAMESPACE` | yes | Namespace the per-exec Pods/Secrets are created in (default `tale-sandbox`). | -| `NODE_EXTRA_CA_CERTS` | in-cluster | Point at the SA `ca.crt` (`/var/run/secrets/kubernetes.io/serviceaccount/ca.crt`). **Bun's fetch ignores the kubeconfig CA**, so without this the apiserver TLS isn't trusted. | -| `SANDBOX_RUNTIME=runsc` | optional | Sets the Pod `runtimeClassName` (gVisor) via `SANDBOX_RUNTIME_CLASS` (default `gvisor`). | -| `SANDBOX_CACHE=pvc` | optional | Mounts per-org RWX cache PVCs on the runner; needs the PVC RBAC above + an RWX StorageClass. Default `none` (installs fresh each run via the egress proxy). | -| `SANDBOX_K8S_WORKSPACE_SIZE_LIMIT` | optional | `sizeLimit` on the per-exec `/user` emptyDir (default `4Gi`). Bounds deps + temp + outputs; exceeding it evicts the Pod. | -| `SANDBOX_EGRESS_PROXY` | optional | The runner's `HTTPS_PROXY`/`HTTP_PROXY` for `pip`/`npm` (default `http://sandbox-egress:3128`). | -| `SANDBOX_K8S_SERVER` / `SANDBOX_K8S_TOKEN` / `SANDBOX_K8S_CAFILE` | dev only | Explicit bearer-token kubeconfig for local Bun dev (kind's client-cert kubeconfig auths as `system:anonymous` under Bun). In-cluster uses the projected SA token automatically. | +| Env | Required | Notes | +| ----------------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `SANDBOX_BACKEND=kubernetes` | yes | Selects this backend. | +| `SANDBOX_SPAWNER_IMAGE` | yes | The spawner's **own** image ref, used for the `stage`/`harvest` containers. Must match the deployed spawner version. | +| `SANDBOX_RUNTIME_IMAGE` | yes | The `runner` image (`tale-sandbox-runtime:`). | +| `SANDBOX_K8S_NAMESPACE` | yes | Namespace the per-exec Pods/Secrets are created in (default `tale-sandbox`). | +| `NODE_EXTRA_CA_CERTS` | in-cluster | Point at the SA `ca.crt` (`/var/run/secrets/kubernetes.io/serviceaccount/ca.crt`). **This is the only working CA-trust mechanism under Bun** — see [Bun TLS note](#bun-tls-contract) below. | +| `SANDBOX_RUNTIME=runsc` | optional | Sets the Pod `runtimeClassName` (gVisor) via `SANDBOX_RUNTIME_CLASS` (default `gvisor`). | +| `SANDBOX_CACHE=pvc` | optional | Mounts per-org RWX cache PVCs on the runner; needs the PVC RBAC above + an RWX StorageClass. Default `none` (installs fresh each run via the egress proxy). | +| `SANDBOX_K8S_WORKSPACE_SIZE_LIMIT` | optional | `sizeLimit` on the per-exec `/user` emptyDir (default `4Gi`). Bounds deps + temp + outputs; exceeding it evicts the Pod. | +| `SANDBOX_EGRESS_PROXY` | optional | The runner's `HTTPS_PROXY`/`HTTP_PROXY` for `pip`/`npm` (default `http://sandbox-egress:3128`). | +| `SANDBOX_K8S_SERVER` / `SANDBOX_K8S_TOKEN` / `SANDBOX_K8S_CAFILE` | dev only | Explicit bearer-token kubeconfig for local Bun dev (kind's client-cert kubeconfig auths as `system:anonymous` under Bun). In-cluster uses the projected SA token automatically. | The Pod sets `automountServiceAccountToken: false` — the runtime never gets an SA token. +## Bun TLS contract + +`@kubernetes/client-node@1.4.0` sends requests via `node-fetch@2`, which calls +`node:https.request` with an `https.Agent` carrying the kubeconfig TLS options. +Empirical testing under **Bun 1.3.x** shows that Bun's `node:https` shim stores +the options on the Agent object but **silently ignores them** at the TLS layer: + +| Kubeconfig knob | `https.Agent` option | Bun 1.3.x behavior | +| ---------------------- | --------------------------- | ----------------------------------- | +| `caFile` / `caData` | `ca: ` | **INERT** — CA is not loaded | +| `skipTLSVerify` | `rejectUnauthorized: false` | **INERT** — TLS is still verified | +| `certFile` / `keyFile` | `cert` / `key` | **INERT** — client cert is not sent | + +**`NODE_EXTRA_CA_CERTS` is the only working CA-trust mechanism under Bun.** Set +it to the cluster CA file (in-cluster: the projected SA `ca.crt`; dev: the file +referenced by `SANDBOX_K8S_CAFILE`). Without it, apiserver TLS verification +fails with an opaque `self signed certificate` error even when `caFile` or +`skipTLSVerify` are set in the kubeconfig. + +`skipTLSVerify` is therefore intentionally absent from the kubeconfig built by +`makeK8sClient`: it provides no security bypass under Bun and would only +mislead operators into thinking TLS verification is disabled. + ## NetworkPolicy (operator-applied, opt-in) The Pod's containers share one network namespace, so the policy is per-Pod. The diff --git a/services/sandbox/src/backend/kubernetes/k8s-client.test.ts b/services/sandbox/src/backend/kubernetes/k8s-client.test.ts index 37a060bc4..a264ffef1 100644 --- a/services/sandbox/src/backend/kubernetes/k8s-client.test.ts +++ b/services/sandbox/src/backend/kubernetes/k8s-client.test.ts @@ -2,11 +2,20 @@ // middleware (the only thing standing between a wedged TCP connection and a // forever-hung execute()) and withRetry's retryability gate. -import { describe, expect, test } from 'bun:test'; +import { afterEach, beforeEach, describe, expect, spyOn, test } from 'bun:test'; -import { HttpMethod, RequestContext } from '@kubernetes/client-node'; +import { + HttpMethod, + KubeConfig, + RequestContext, +} from '@kubernetes/client-node'; -import { apiTimeout, httpStatusCode, withRetry } from './k8s-client.ts'; +import { + apiTimeout, + httpStatusCode, + makeK8sClient, + withRetry, +} from './k8s-client.ts'; /** Await a rejection and hand back the error (typed alternative to .rejects). */ async function rejectionOf(p: Promise): Promise { @@ -57,6 +66,80 @@ describe('httpStatusCode', () => { }); }); +// Captures options passed to KubeConfig.loadFromOptions via a spy. +describe('makeK8sClient kubeconfig shape', () => { + type LoadFromOptionsArg = Parameters[0]; + let capturedOpts: LoadFromOptionsArg | undefined; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + let spy: ReturnType>; + const savedEnv: Record = {}; + + beforeEach(() => { + capturedOpts = undefined; + // Capture the original before spyOn replaces it to avoid infinite recursion. + const originalFn = KubeConfig.prototype.loadFromOptions; + spy = spyOn(KubeConfig.prototype, 'loadFromOptions').mockImplementation( + function (this: KubeConfig, opts: LoadFromOptionsArg) { + capturedOpts = opts; + originalFn.call(this, opts); + }, + ); + for (const k of [ + 'SANDBOX_K8S_SERVER', + 'SANDBOX_K8S_TOKEN', + 'SANDBOX_K8S_CAFILE', + ]) { + savedEnv[k] = process.env[k]; + } + }); + + afterEach(() => { + spy.mockRestore(); + for (const [k, v] of Object.entries(savedEnv)) { + if (v === undefined) delete process.env[k]; + else process.env[k] = v; + } + }); + + test('skipTLSVerify is never set — it is inert under Bun (Bun 1.3.x ignores rejectUnauthorized on https.Agent)', () => { + process.env.SANDBOX_K8S_SERVER = 'https://k8s.example.com'; + process.env.SANDBOX_K8S_TOKEN = 'test-token'; + delete process.env.SANDBOX_K8S_CAFILE; + makeK8sClient('test-ns'); + const cluster = capturedOpts?.clusters?.[0]; + expect(cluster?.skipTLSVerify).toBeUndefined(); + }); + + test('caFile is set from SANDBOX_K8S_CAFILE when provided (non-Bun compat; inert under Bun)', () => { + process.env.SANDBOX_K8S_SERVER = 'https://k8s.example.com'; + process.env.SANDBOX_K8S_TOKEN = 'test-token'; + process.env.SANDBOX_K8S_CAFILE = '/etc/ssl/k8s-ca.crt'; + makeK8sClient('test-ns'); + const cluster = capturedOpts?.clusters?.[0]; + expect(cluster?.caFile).toBe('/etc/ssl/k8s-ca.crt'); + expect(cluster?.skipTLSVerify).toBeUndefined(); + }); + + test('caFile is absent when SANDBOX_K8S_CAFILE is unset', () => { + process.env.SANDBOX_K8S_SERVER = 'https://k8s.example.com'; + process.env.SANDBOX_K8S_TOKEN = 'test-token'; + delete process.env.SANDBOX_K8S_CAFILE; + makeK8sClient('test-ns'); + const cluster = capturedOpts?.clusters?.[0]; + expect(cluster?.caFile).toBeUndefined(); + expect(cluster?.skipTLSVerify).toBeUndefined(); + }); + + test('bearer token is placed on the user entry', () => { + process.env.SANDBOX_K8S_SERVER = 'https://k8s.example.com'; + process.env.SANDBOX_K8S_TOKEN = 'my-sa-token'; + delete process.env.SANDBOX_K8S_CAFILE; + makeK8sClient('ns'); + expect(capturedOpts?.users?.[0]?.token).toBe('my-sa-token'); + expect(capturedOpts?.clusters?.[0]?.server).toBe('https://k8s.example.com'); + }); +}); + describe('withRetry', () => { test('throws definitive 4xx immediately (no second attempt)', async () => { for (const code of [400, 401, 403, 404, 409, 422]) { diff --git a/services/sandbox/src/backend/kubernetes/k8s-client.ts b/services/sandbox/src/backend/kubernetes/k8s-client.ts index 67c2a5472..5f62cfd70 100644 --- a/services/sandbox/src/backend/kubernetes/k8s-client.ts +++ b/services/sandbox/src/backend/kubernetes/k8s-client.ts @@ -6,13 +6,27 @@ // createNamespacedPod / readNamespacedPodLog / deleteNamespacedPod + presigned- // URL I/O done inside the Pod — every primitive below is plain HTTP. // -// AUTH NOTE (Bun): Bun's fetch does NOT apply a kubeconfig's client cert or -// custom CA, so a client-cert cluster (e.g. kind's default kubeconfig) auths -// as system:anonymous. The real in-cluster path uses a ServiceAccount BEARER -// TOKEN (an Authorization header Bun sends fine) + the cluster CA — for Bun to -// trust that CA, the container must set NODE_EXTRA_CA_CERTS to the SA ca.crt -// (/var/run/secrets/kubernetes.io/serviceaccount/ca.crt). Local dev needs a -// token-based kubeconfig, not kind's client-cert one. +// TLS NOTE (Bun + @kubernetes/client-node@1.4.0): The library sends HTTP via +// node-fetch@2, which uses node:https.request with an https.Agent carrying the +// kubeconfig TLS options. Empirical testing (Bun 1.3.14) confirms that Bun's +// node:https shim does NOT apply the agent's `ca` or `rejectUnauthorized` +// fields — they are stored on the Agent object but silently ignored at the TLS +// layer. Concrete consequences: +// +// • caFile / caData in the kubeconfig → INERT; custom CA is NOT loaded. +// Real CA trust requires NODE_EXTRA_CA_CERTS (e.g. pointed at the SA +// ca.crt: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt). +// +// • skipTLSVerify: true in the kubeconfig → INERT; TLS is still verified. +// It looks like a security bypass but does nothing — do not rely on it. +// +// • Client certificates (cert/key) in the kubeconfig → INERT; the cluster +// treats the request as system:anonymous. +// +// The real in-cluster path works because it uses a ServiceAccount bearer token +// (an Authorization header Bun sends correctly) and sets NODE_EXTRA_CA_CERTS +// to the SA ca.crt so Bun trusts the apiserver's TLS certificate. Local dev +// must use a token-based kubeconfig, not kind's default client-cert kubeconfig. import { type ConfigurationOptions, @@ -75,16 +89,20 @@ export function makeK8sClient(namespace: string): K8sClient { const token = process.env.SANDBOX_K8S_TOKEN; if (server && token) { // Explicit bearer-token config (dev / Bun-friendly). Bun can't use a - // client-cert kubeconfig, so point at the apiserver with an SA token; CA - // trust via SANDBOX_K8S_CAFILE + NODE_EXTRA_CA_CERTS, or skipTLSVerify. + // client-cert kubeconfig, so point at the apiserver with an SA token. + // CA trust: set NODE_EXTRA_CA_CERTS to the cluster CA file — caFile here + // is inert under Bun (see TLS NOTE above) but is kept for documentation + // of intent and compatibility with non-Bun runtimes. + // skipTLSVerify is intentionally absent: it is also inert under Bun and + // looks like a security bypass without providing one. kc.loadFromOptions({ clusters: [ { name: 'k', server, - ...(process.env.SANDBOX_K8S_CAFILE - ? { caFile: process.env.SANDBOX_K8S_CAFILE } - : { skipTLSVerify: true }), + ...(process.env.SANDBOX_K8S_CAFILE && { + caFile: process.env.SANDBOX_K8S_CAFILE, + }), }, ], users: [{ name: 'sa', token }],