Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions sdk/mcp/forkd_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def spawn_sandboxes(
per_child_netns: bool = False,
memory_limit_mib: int = 256,
prewarm: bool = False,
live_fork: bool = False,
) -> list[dict[str, Any]]:
"""Fork N children from a parent snapshot.

Expand All @@ -77,6 +78,10 @@ def spawn_sandboxes(
this sandbox to creation time — useful when you have a
BRANCH SLO and fan out N>=3 from the same source. Default
false. See bench/pause-window/RESULTS-v0.2.md.
live_fork: v0.4+. Boot the sandbox with a memfd-backed RAM
region so later branch_sandbox calls can use mode="live"
(UFFD_WP). Requires kernel 5.7+ and the vendored
Firecracker fork — see docs/VENDORED-FIRECRACKER.md.

Returns the spawned SandboxInfo objects (one per child) with their
id, pid, guest_addr, etc.
Expand All @@ -88,6 +93,8 @@ def spawn_sandboxes(
"memory_limit_mib": memory_limit_mib,
"prewarm": prewarm,
}
if live_fork:
body["live_fork"] = True
with _client() as c:
r = c.post("/v1/sandboxes", json=body)
r.raise_for_status()
Expand All @@ -100,6 +107,8 @@ def branch_sandbox(
tag: str | None = None,
diff: bool = False,
measure_diff: bool = False,
mode: str | None = None,
wait: bool = True,
) -> dict[str, Any]:
"""Branch a running sandbox into a new snapshot.

Expand All @@ -116,30 +125,42 @@ def branch_sandbox(
sandbox_id: Id of the source sandbox (see list_sandboxes).
tag: Optional name for the new snapshot. When unset the
daemon generates `branch-<sandbox-id>-<unix-ts>`.
diff: When true (v0.3+), use Firecracker's Diff snapshot
mode instead of writing the full memory.bin under pause.
The user-visible source-pause window collapses to the
diff write (~200 ms idle source, 6-15x speedup on
typical agent workloads, up to 143x ceiling on 4 GiB SSD
idle source). Multi-BRANCH supported in v0.3.1+ via the
previous-output chain. See
bench/pause-window/RESULTS-v0.3.md.
mode: v0.4+ canonical mode selector. One of "full", "diff",
"live". Prefer this over the legacy `diff` boolean.
"live" requires the source to have been spawned with
live_fork=True; source pause drops to sub-50 ms while
memory streams from the running parent (UFFD_WP). Takes
precedence over `diff`: if both are passed, only `mode` is sent
(the daemon itself returns 400 when a request carries both).
diff: Legacy. Equivalent to mode="diff"; kept so this server
can drive v0.3.x daemons that don't understand `mode`.
See bench/pause-window/RESULTS-v0.3.md.
measure_diff: Measurement-only hook. Take a Diff snapshot
inside the existing Full pause to report what diff
would have cost, without changing semantics. Mutually
exclusive with `diff` (400 if both set).
wait: v0.4+, only meaningful with mode="live". Default True
blocks until the background memory copy finishes and the
returned snapshot is status="ready". Set False to return
as soon as the source resumes (~10 ms); snapshot reaches
status="ready" later — poll list_snapshots to detect.

Returns SnapshotInfo: tag, dir, pause_ms, plus diff_ms /
diff_physical_bytes / diff_logical_bytes when diff or
measure_diff was set.
measure_diff was set, and status when mode="live".
"""
body: dict[str, Any] = {}
if tag is not None:
body["tag"] = tag
if diff:
# Prefer canonical `mode` when set; fall back to legacy `diff`.
if mode is not None:
body["mode"] = mode
elif diff:
body["diff"] = True
if measure_diff:
body["measure_diff"] = True
# wait=True is the daemon default; only send when fire-and-forget.
if not wait:
body["wait"] = False
with _client() as c:
r = c.post(f"/v1/sandboxes/{sandbox_id}/branch", json=body)
r.raise_for_status()
Expand Down
10 changes: 8 additions & 2 deletions sdk/python/forkd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@
``Sandbox`` to drive code execution inside one specific child.
"""

from .controller import Controller, ControllerError
from .controller import BranchMode, Controller, ControllerError
from .sandbox import CommandResult, Sandbox

__version__ = "0.3.4"
__all__ = ["Sandbox", "CommandResult", "Controller", "ControllerError"]
__all__ = [
"Sandbox",
"CommandResult",
"Controller",
"ControllerError",
"BranchMode",
]
65 changes: 53 additions & 12 deletions sdk/python/forkd/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,18 @@ class (`forkd.Sandbox`): Controller manages VM lifecycle from the host
import os
import urllib.error
import urllib.request
from typing import Any, Optional
from typing import Any, Literal, Optional

BranchMode = Literal["full", "diff", "live"]
"""Canonical BRANCH mode selector (Phase 7.1+).

- ``"full"`` — copy entire guest RAM under pause (default for v0.x).
- ``"diff"`` — Firecracker Diff snapshot (v0.3+). Sub-second pause for
idle sources; replaces the legacy ``diff=True`` boolean.
- ``"live"`` — UFFD_WP-based live BRANCH (v0.4+). Source pause drops to
sub-50 ms; memory streams from the running parent. Requires the
source to have been spawned with ``live_fork=True``.
"""


class ControllerError(RuntimeError):
Expand Down Expand Up @@ -105,6 +116,7 @@ def spawn_sandboxes(
per_child_netns: bool = False,
memory_limit_mib: Optional[int] = None,
prewarm: bool = False,
live_fork: bool = False,
) -> list[dict]:
"""``POST /v1/sandboxes`` — fork N children from a snapshot tag.

Expand All @@ -117,6 +129,13 @@ def spawn_sandboxes(
for steady-state BRANCH latency on the first user-visible
BRANCH (avoids the 2-9× cold-cache penalty documented in
``bench/pause-window/RESULTS-v0.2.md``).
live_fork:
v0.4+. Boot the sandbox with a memfd-backed RAM region so
later BRANCHes from it can use ``mode="live"`` (UFFD_WP).
Requires kernel 5.7+ and the vendored Firecracker fork —
see ``docs/VENDORED-FIRECRACKER.md``. No effect at spawn
time beyond the backend swap; cost shows up on the first
live BRANCH.

Returns the list of SandboxInfo dicts (id, snapshot_tag, netns,
guest_addr, created_at_unix, pid, memory_limit_mib).
Expand All @@ -130,6 +149,8 @@ def spawn_sandboxes(
body["memory_limit_mib"] = memory_limit_mib
if prewarm:
body["prewarm"] = True
if live_fork:
body["live_fork"] = True
return self._request("POST", "/v1/sandboxes", body)

def list_sandboxes(self) -> list[dict]:
Expand All @@ -150,40 +171,60 @@ def branch_sandbox(
tag: Optional[str] = None,
diff: bool = False,
measure_diff: bool = False,
mode: Optional[BranchMode] = None,
wait: bool = True,
) -> dict:
"""``POST /v1/sandboxes/:id/branch`` — pause + snapshot + resume.

Parameters
----------
mode:
v0.4+ canonical selector. ``"full"``, ``"diff"``, or
``"live"``. When set, takes precedence over the legacy
``diff`` boolean: only ``mode`` is sent and ``diff`` is
omitted, so the daemon's HTTP 400 for requests carrying both
fields is not triggered by this method. Prefer this over
``diff=`` in new code. See :data:`BranchMode`.
diff:
v0.3+: use Firecracker Diff snapshot mode. The source's
pause window collapses to the Diff write only (~200 ms
for an idle source; 6-15× speedup on typical agent
workloads; up to 143× on a 4 GiB sandbox on commodity
SSD — see ``bench/pause-window/RESULTS-v0.3.md``). Multi-
BRANCH on the same source is supported in v0.3.1+ via
the previous-output chain (``last_branch_memory_path``).
**Legacy.** Equivalent to ``mode="diff"``; kept so this SDK
can drive v0.3.x daemons that don't understand ``mode``.
Ignored when ``mode`` is set: the daemon rejects requests that
carry both fields, so only ``mode`` is sent in that case.
measure_diff:
v0.3+: measurement-only hook. Take a Diff snapshot inside
the existing Full pause to report what diff would have
cost, without changing semantics. Mutually exclusive with
``diff`` (daemon returns 400 if both are true).
wait:
v0.4+, only meaningful with ``mode="live"``. Default
``True`` blocks until the background memory copy finishes
and the returned snapshot is ``status="ready"``. Set to
``False`` to return as soon as the source resumes (~10 ms);
the snapshot reaches ``status="ready"`` later — poll
:meth:`list_snapshots` to detect completion.

The source sandbox is paused for the duration of the snapshot
write — typically 0.5-8 s for Full, ~200 ms for Diff — then
resumed. The returned snapshot is independent of the source's
lifecycle.
write — typically 0.5-8 s for Full, ~200 ms for Diff, sub-50 ms
for Live — then resumed. The returned snapshot is independent
of the source's lifecycle.

Returns a SnapshotInfo dict; pass its ``tag`` to
``spawn_sandboxes`` to fork grandchildren from the branch.
"""
body: dict[str, Any] = {}
if tag is not None:
body["tag"] = tag
if diff:
# Prefer canonical `mode` when set; fall back to legacy `diff`
# so older daemons keep working unchanged.
if mode is not None:
body["mode"] = mode
elif diff:
body["diff"] = True
if measure_diff:
body["measure_diff"] = True
# `wait=True` is the daemon default; only send when the caller
# opted into fire-and-forget so the body stays minimal against
# daemons that don't recognize the field.
if not wait:
body["wait"] = False
return self._request("POST", f"/v1/sandboxes/{sandbox_id}/branch", body)

def exec_command(
Expand Down
40 changes: 34 additions & 6 deletions sdk/typescript/src/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,19 @@ export class Controller {
* @param options.prewarm v0.2.5+. Relocates the cold-cache
* penalty from the first BRANCH to
* sandbox-creation time.
* @param options.liveFork v0.4+. Boot with memfd-backed RAM so
* later BRANCHes from this sandbox can
* use `mode: "live"`. Requires kernel
* 5.7+ and the vendored Firecracker
* fork.
*/
async spawnSandboxes(options: {
snapshotTag: string;
n?: number;
perChildNetns?: boolean;
memoryLimitMib?: number;
prewarm?: boolean;
liveFork?: boolean;
}): Promise<SandboxInfo[]> {
const body: SpawnOptions = {
snapshot_tag: options.snapshotTag,
Expand All @@ -137,6 +143,9 @@ export class Controller {
if (options.prewarm !== undefined) {
body.prewarm = options.prewarm;
}
if (options.liveFork !== undefined) {
body.live_fork = options.liveFork;
}
return this.request<SandboxInfo[]>("POST", "/v1/sandboxes", body);
}

Expand All @@ -161,11 +170,20 @@ export class Controller {
/**
* Branch a running sandbox into a new snapshot.
*
* Pauses the source briefly, snapshots, resumes. With
* `options.diff = true` (v0.3+) the user-visible source-pause window
* collapses to the diff write — sub-second across all memory sizes
* for idle sources, 6-15× speedup on typical agent workloads, 143×
* ceiling on 4 GiB SSD.
* Pauses the source briefly, snapshots, resumes. Pause window
* depends on `options.mode`:
*
* - `"full"` (default): 0.5-8 s, whole guest RAM written.
* - `"diff"` (v0.3+): ~200 ms idle source, 6-15× speedup on typical
* agent workloads, 143× ceiling on 4 GiB SSD.
* - `"live"` (v0.4+): sub-50 ms; memory streams from the running
* parent via UFFD_WP. Requires source booted with
* `liveFork: true`. Combine with `wait: false` to return after
* the source resumes (~10 ms) without waiting on the background
* copy.
*
* The legacy `options.diff` boolean still works for v0.3.x daemon
* compat but is mutually exclusive with `options.mode` server-side.
*
* Returns a {@link SnapshotInfo}; pass its `tag` back into
* {@link spawnSandboxes} to fan out grandchildren.
Expand All @@ -176,8 +194,18 @@ export class Controller {
): Promise<SnapshotInfo> {
const body: BranchOptions = {};
if (options.tag !== undefined) body.tag = options.tag;
if (options.diff) body.diff = true;
// Prefer canonical `mode` when set; fall back to legacy `diff`
// so older daemons keep working unchanged.
if (options.mode !== undefined) {
body.mode = options.mode;
} else if (options.diff) {
body.diff = true;
}
if (options.measure_diff) body.measure_diff = true;
// `wait: true` is the daemon default; only send when the caller
// opted into fire-and-forget so the body stays minimal against
// daemons that don't recognize the field.
if (options.wait === false) body.wait = false;
return this.request<SnapshotInfo>(
"POST",
`/v1/sandboxes/${encodeURIComponent(sandboxId)}/branch`,
Expand Down
1 change: 1 addition & 0 deletions sdk/typescript/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
export { Controller, ControllerError, type ControllerOptions } from "./controller.js";
export { Sandbox } from "./sandbox.js";
export type {
BranchMode,
BranchOptions,
EvalResult,
ExecOptions,
Expand Down
47 changes: 43 additions & 4 deletions sdk/typescript/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ export interface SnapshotInfo {
diff_physical_bytes?: number;
/** v0.3+: full guest-RAM size (what a Full snapshot would have written). */
diff_logical_bytes?: number;
/**
* v0.4+: live BRANCH lifecycle marker. `"writing"` while the
* background memory copy is in flight (only seen with `wait=false`),
* `"ready"` once the snapshot is consumable, `"failed"` if the
* background copy hit an error.
*/
status?: "writing" | "ready" | "failed";
}

export interface SandboxInfo {
Expand All @@ -43,16 +50,40 @@ export interface SpawnOptions {
memory_limit_mib?: number;
/** v0.2.5+: pre-warm sandbox after restore to relocate cold-cache. */
prewarm?: boolean;
/**
* v0.4+: boot the sandbox with a memfd-backed RAM region so later
* BRANCHes from it can use `mode: "live"`. Requires kernel 5.7+ and
* the vendored Firecracker fork (see
* `docs/VENDORED-FIRECRACKER.md`).
*/
live_fork?: boolean;
}

/**
* Canonical BRANCH mode (Phase 7.1+).
*
* - `"full"` — copy entire guest RAM under pause (default for v0.x).
* - `"diff"` — Firecracker Diff snapshot (v0.3+). Sub-second pause for
* idle sources; replaces the legacy `diff: true` boolean.
* - `"live"` — UFFD_WP-based live BRANCH (v0.4+). Sub-50 ms source
* pause; memory streams from the running parent. Requires source
* booted with `live_fork: true`.
*/
export type BranchMode = "full" | "diff" | "live";

export interface BranchOptions {
/** Optional tag for the new snapshot. Daemon generates one when unset. */
tag?: string;
/**
* v0.3+: use Firecracker Diff snapshot mode. Source pause window
* collapses to the diff write only (~200 ms idle source, 6-15×
* speedup on typical agent workloads, 143× ceiling on 4 GiB SSD).
* Multi-BRANCH supported in v0.3.1+ via the previous-output chain.
* v0.4+ canonical mode selector. Prefer this over the legacy `diff`
* boolean in new code. Mutually exclusive with `diff` — passing both
* yields HTTP 400.
*/
mode?: BranchMode;
/**
* **Legacy.** Equivalent to `mode: "diff"`. Kept so this SDK can
* drive v0.3.x daemons that don't understand `mode`. Mutually
* exclusive with `mode` server-side.
*/
diff?: boolean;
/**
Expand All @@ -61,6 +92,14 @@ export interface BranchOptions {
* changing semantics. Mutually exclusive with `diff` (400 if both).
*/
measure_diff?: boolean;
/**
* v0.4+, only meaningful with `mode: "live"`. Default `true` blocks
* until the background memory copy finishes and the returned
* snapshot is `status: "ready"`. Set to `false` to return as soon
* as the source resumes (~10 ms); snapshot reaches `status: "ready"`
* later — poll `listSnapshots` to detect completion.
*/
wait?: boolean;
}

export interface ExecOptions {
Expand Down
Loading
Loading