diff --git a/cmd/containerd-shim-urunc-v2/main.go b/cmd/containerd-shim-urunc-v2/main.go index dac1451c2..70d2398ea 100644 --- a/cmd/containerd-shim-urunc-v2/main.go +++ b/cmd/containerd-shim-urunc-v2/main.go @@ -17,11 +17,11 @@ package main import ( "context" - "github.com/containerd/containerd/runtime/v2/runc/manager" "github.com/containerd/containerd/runtime/v2/shim" _ "github.com/urunc-dev/urunc/pkg/containerd-shim" + containerdshim "github.com/urunc-dev/urunc/pkg/containerd-shim" ) func main() { - shim.RunManager(context.Background(), manager.NewShimManager("io.containerd.urunc.v2")) + shim.RunManager(context.Background(), containerdshim.NewShimManager("io.containerd.urunc.v2")) } diff --git a/deployment/urunc-deploy/config.toml b/deployment/urunc-deploy/config.toml index 8eeffaf4e..0330dba96 100644 --- a/deployment/urunc-deploy/config.toml +++ b/deployment/urunc-deploy/config.toml @@ -7,6 +7,9 @@ syslog = false [timestamps] enabled = false +[rootfs_view] +enabled = false + [monitors.qemu] default_memory_mb = 256 default_vcpus = 1 diff --git a/docs/configuration.md b/docs/configuration.md index a2daa4ba4..75dd87f1c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -38,6 +38,9 @@ default_vcpus = 1 [extra_binaries.virtiofsd] path = "/usr/libexec/virtiofsd" options = "--sandbox none" + +[rootfs_view] +enabled = false ``` ## Configuration Sections @@ -89,6 +92,34 @@ destination = "/tmp/urunc-timestamps.log" When enabled, `urunc` will log performance timestamps to help with debugging and optimization. +### Rootfs View Configuration + +The `[rootfs_view]` section controls whether the urunc shim prepares a +per-container containerd rootfs view at task Create (for `devmapper` / +`blockfile` snapshotters). 
+ +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `enabled` | boolean | `false` | Prepare rootfs views for container block rootfs after shim task Create | + +When `enabled = true`, the shim first lets the wrapped task service create the +task so the bundle rootfs is mounted. It then runs `ChooseRootfs` and prepares a +view only if **all** of the following hold: + +1. The container snapshotter is block-based (`devmapper` or `blockfile`). +2. Shim `ChooseRootfs` selected **container block rootfs** (`type=block` with a + non-empty `MountedPath`). + +This matches the block-rootfs boot-artifact path: kernel/initrd are read from a +read-only view instead of being copied out of the container rootfs before attach. + +**Example:** + +```toml +[rootfs_view] +enabled = true +``` + ### Monitor Configuration The `[monitors]` section allows you to configure default settings for different @@ -201,6 +232,9 @@ To create a configuration file, you can: [monitors.spt] default_memory_mb = 256 default_vcpus = 1 + + [rootfs_view] + enabled = false EOF ``` @@ -244,6 +278,9 @@ default_vcpus = 1 default_memory_mb = 256 default_vcpus = 1 # path is not set by default - urunc will search in PATH + +[rootfs_view] +enabled = false ``` ## Notes diff --git a/docs/package/index.md b/docs/package/index.md index 2e772414a..b61f0be64 100644 --- a/docs/package/index.md +++ b/docs/package/index.md @@ -73,6 +73,16 @@ Except of the above, `urunc` accepts the following optional annotations: requests from `urunc` to mount the container's image rootfs in the unikernel (either as a block device or through shared-fs). +Per-container rootfs views are controlled by `[rootfs_view] enabled` in +`/etc/urunc/config.toml`. See +[configuration](../configuration.md#rootfs-view-configuration). When enabled, +the container must also use `com.urunc.unikernel.mountRootfs=true` (typically +from image annotations merged into `config.json` before shim task Create). 
+Supported snapshotters include `devmapper` and `blockfile`. After the wrapped +task service creates the task and mounts the bundle rootfs, the shim runs +`ChooseRootfs` and prepares a view only when that selection is container block +rootfs. + Due to the fact that [Docker](https://www.docker.com/) and some high-level container runtimes do not pass the image annotations to the underlying container runtime, `urunc` can also read the above information from a file inside the diff --git a/pkg/containerd-shim/containerd/annotations.go b/pkg/containerd-shim/containerd/annotations.go index 5d980c961..099c34639 100644 --- a/pkg/containerd-shim/containerd/annotations.go +++ b/pkg/containerd-shim/containerd/annotations.go @@ -86,7 +86,7 @@ func InjectUruncAnnotations(ctx context.Context, session *Session, bundlePath st return nil } - return patchConfigJSON(bundlePath, annotations) + return PatchConfigJSON(bundlePath, annotations) } func (f *annotationFetcher) fetchUruncAnnotations(ctx context.Context) (map[string]string, error) { @@ -152,12 +152,12 @@ func readBlob(ctx context.Context, namespace string, contentClient contentapi.Co return raw, nil } -// patchConfigJSON injects missing annotations into the OCI runtime spec -// stored in the bundle's config.json. +// PatchConfigJSON injects missing annotations into the OCI runtime spec stored in +// the bundle's config.json. // // Existing annotations in config.json are preserved. Only annotation keys that // are not already present in the runtime spec are added. 
-func patchConfigJSON(bundlePath string, annotations map[string]string) error { +func PatchConfigJSON(bundlePath string, annotations map[string]string) error { configPath := filepath.Join(bundlePath, "config.json") fi, err := os.Stat(configPath) diff --git a/pkg/containerd-shim/containerd/rootfs_view.go b/pkg/containerd-shim/containerd/rootfs_view.go new file mode 100644 index 000000000..a43648b39 --- /dev/null +++ b/pkg/containerd-shim/containerd/rootfs_view.go @@ -0,0 +1,267 @@ +// Copyright (c) 2023-2026, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package containerd
+
+import (
+	"context"
+	"fmt"
+
+	leasesapi "github.com/containerd/containerd/api/services/leases/v1"
+	snapshotsapi "github.com/containerd/containerd/api/services/snapshots/v1"
+	cntrtypes "github.com/containerd/containerd/api/types"
+	"github.com/containerd/containerd/errdefs"
+	"github.com/containerd/containerd/mount"
+	"github.com/urunc-dev/urunc/pkg/unikontainers"
+	"github.com/urunc-dev/urunc/pkg/unikontainers/types"
+	"google.golang.org/grpc/metadata"
+)
+
+const (
+	// rootfsViewKeyPrefix is prepended to the container ID to form the view snapshot key.
+	rootfsViewKeyPrefix = "urunc-rootfs-view-"
+	// rootfsViewLeasePrefix is prepended to the container ID to form the lease ID.
+	rootfsViewLeasePrefix = "urunc-rootfs-view-lease-"
+)
+
+// RootfsViewAccessor holds the containerd clients and the container's snapshot
+// identity used to prepare and clean up a per-container rootfs view.
+type RootfsViewAccessor struct {
+	namespace   string
+	containerID string
+	snapshotter string
+	snapshotKey string
+	snapshots   snapshotsapi.SnapshotsClient
+	leases      leasesapi.LeasesClient
+}
+
+// NewRootfsViewAccessor builds an accessor from session. The snapshotter and
+// snapshot key are filled in only when the session carries container metadata
+// with a non-empty snapshot key; otherwise both stay empty.
+func NewRootfsViewAccessor(session *Session) *RootfsViewAccessor {
+	accessor := &RootfsViewAccessor{
+		namespace:   session.GetNamespace(),
+		containerID: session.GetContainerID(),
+		snapshots:   session.snapshotsClient(),
+		leases:      session.leasesClient(),
+	}
+
+	container := session.GetContainer()
+	if container == nil {
+		return accessor
+	}
+	if key := container.GetSnapshotKey(); key != "" {
+		accessor.snapshotter = container.GetSnapshotter()
+		accessor.snapshotKey = key
+	}
+	return accessor
+}
+
+// ShouldPrepare reports whether a rootfs view should be prepared for rootfs.
+//
+// It returns false without touching the configuration unless the accessor has
+// a snapshot key, the snapshotter is "devmapper" or "blockfile", and rootfs is
+// of type "block" with a non-empty MountedPath. Only then is the urunc
+// configuration loaded and its RootfsView.Enabled value returned; a
+// configuration load error is returned to the caller.
+func (a *RootfsViewAccessor) ShouldPrepare(rootfs types.RootfsParams) (bool, error) {
+	if a == nil || a.snapshotKey == "" {
+		return false, nil
+	}
+
+	// An empty snapshotter name falls through to the default branch.
+	switch a.snapshotter {
+	case "devmapper", "blockfile":
+	default:
+		return false, nil
+	}
+
+	if rootfs.Type != "block" || rootfs.MountedPath == "" {
+		return false, nil
+	}
+
+	cfg, err := unikontainers.LoadUruncConfig(unikontainers.UruncConfigPath)
+	if err != nil {
+		return false, err
+	}
+	return cfg.RootfsView.Enabled, nil
+}
+
+// Prepare records a read-only view of the committed rootfs snapshot for runtime use.
+// On success it returns view state for the caller to persist in bundle rootfs-view.json.
+func (a *RootfsViewAccessor) Prepare(ctx context.Context) (types.RootfsViewState, error) {
+	if a == nil {
+		return types.RootfsViewState{}, fmt.Errorf("rootfs view accessor is nil")
+	}
+	// Same precondition as Cleanup: without a container ID the view key and
+	// lease ID would consist of the bare prefixes and would not be
+	// per-container.
+	if a.containerID == "" {
+		return types.RootfsViewState{}, fmt.Errorf("container id is empty")
+	}
+
+	// The view is created on top of a committed snapshot, so resolve the base
+	// from the container's snapshot key first.
+	snapshotKey, err := a.resolveCommittedSnapshotBase(ctx, a.snapshotter, a.snapshotKey)
+	if err != nil {
+		return types.RootfsViewState{}, err
+	}
+
+	viewKey := rootfsViewKeyPrefix + a.containerID
+	leaseID := rootfsViewLeasePrefix + a.containerID
+
+	nsCtx := withNamespace(ctx, a.namespace)
+	if _, err := a.leases.Create(nsCtx, &leasesapi.CreateRequest{ID: leaseID}); err != nil {
+		err = containerdErr(err)
+		// A lease left behind by a retry or partial prepare is reused.
+		if err != nil && !errdefs.IsAlreadyExists(err) {
+			return types.RootfsViewState{}, fmt.Errorf("create rootfs view lease %s: %w", leaseID, err)
+		}
+	}
+
+	// Create the view with the lease attached to the outgoing request.
+	leaseCtx := metadata.AppendToOutgoingContext(nsCtx, "containerd-lease", leaseID)
+	mounts, err := a.createRootfsView(leaseCtx, viewKey, snapshotKey)
+	if err != nil {
+		// Roll back the lease. The original error stays the wrapped cause; a
+		// rollback failure is reported alongside it instead of being dropped.
+		if rbErr := deleteRootfsViewLease(ctx, a.namespace, leaseID, a.leases); rbErr != nil {
+			return types.RootfsViewState{}, fmt.Errorf("%w (rollback of lease %s also failed: %v)", err, leaseID, rbErr)
+		}
+		return types.RootfsViewState{}, err
+	}
+
+	return types.RootfsViewState{
+		Snapshotter: a.snapshotter,
+		Mounts:      mounts,
+	}, nil
+}
+
+// Rootfs view cleanup (call chain):
+//
+//	Delete / Stop: ShouldCleanupRootfsView(bundle) → Cleanup(ctx, snapshotter from bundle)
+//	Create rollback: Cleanup(ctx, "") — snapshotter comes from container metadata on the accessor
+//
+//	Cleanup → removeRootfsViewSnapshotAndLease (view snapshot + lease in containerd)
+//	Prepare failure after lease create → deleteRootfsViewLease (lease only)
+
+// Cleanup removes the per-container rootfs view snapshot and its containerd lease.
+func (a *RootfsViewAccessor) Cleanup(ctx context.Context, snapshotter string) error {
+	switch {
+	case a == nil:
+		return fmt.Errorf("rootfs view accessor is nil")
+	case a.containerID == "":
+		return fmt.Errorf("container id is empty")
+	}
+
+	// An explicit snapshotter wins; otherwise fall back to the one recorded
+	// on the accessor.
+	name := snapshotter
+	if name == "" {
+		name = a.snapshotter
+	}
+	if name == "" {
+		return fmt.Errorf("snapshotter name required for rootfs view cleanup")
+	}
+
+	return removeRootfsViewSnapshotAndLease(ctx, a.namespace, a.containerID, name, a.snapshots, a.leases)
+}
+
+// statSnapshot stats key in snapshotter and returns the snapshot's parent key
+// and whether its kind is COMMITTED. A response without info is an error.
+func (a *RootfsViewAccessor) statSnapshot(ctx context.Context, snapshotter, key string) (parent string, committed bool, err error) {
+	request := &snapshotsapi.StatSnapshotRequest{
+		Snapshotter: snapshotter,
+		Key:         key,
+	}
+	resp, statErr := a.snapshots.Stat(withNamespace(ctx, a.namespace), request)
+	if statErr = containerdErr(statErr); statErr != nil {
+		return "", false, statErr
+	}
+
+	info := resp.GetInfo()
+	if info == nil {
+		return "", false, fmt.Errorf("stat snapshot %s (%s): empty info", key, snapshotter)
+	}
+
+	committed = info.GetKind() == snapshotsapi.Kind_COMMITTED
+	return info.GetParent(), committed, nil
+}
+
+// resolveCommittedSnapshotBase returns the snapshot key to use as the parent
+// of a view. snapshotKey itself is returned when it is committed or has no
+// parent; otherwise the parent chain is walked until a committed snapshot is
+// found. A chain that ends without a committed snapshot is an error.
+func (a *RootfsViewAccessor) resolveCommittedSnapshotBase(ctx context.Context, snapshotter, snapshotKey string) (string, error) {
+	parent, committed, err := a.statSnapshot(ctx, snapshotter, snapshotKey)
+	if err != nil {
+		return "", fmt.Errorf("stat snapshot %s (%s): %w", snapshotKey, snapshotter, err)
+	}
+	if committed || parent == "" {
+		return snapshotKey, nil
+	}
+
+	// Each iteration stats one ancestor; the post statement advances to the
+	// parent reported by that stat.
+	for current := parent; ; current = parent {
+		parent, committed, err = a.statSnapshot(ctx, snapshotter, current)
+		if err != nil {
+			return "", fmt.Errorf("stat snapshot %s (%s parent walk): %w", current, snapshotter, err)
+		}
+		switch {
+		case committed:
+			return current, nil
+		case parent == "":
+			return "", fmt.Errorf("%s snapshot %s has no committed parent in chain", snapshotter, snapshotKey)
+		}
+	}
+}
+
+// createRootfsView asks the snapshotter for a view named viewKey on top of
+// parentKey and returns its mounts. When the view already exists, the mounts
+// of the existing view are fetched and returned instead.
+func (a *RootfsViewAccessor) createRootfsView(ctx context.Context, viewKey, parentKey string) ([]mount.Mount, error) {
+	nsCtx := withNamespace(ctx, a.namespace)
+
+	created, err := a.snapshots.View(nsCtx, &snapshotsapi.ViewSnapshotRequest{
+		Snapshotter: a.snapshotter,
+		Key:         viewKey,
+		Parent:      parentKey,
+	})
+	err = containerdErr(err)
+	switch {
+	case err == nil:
+		return protoMountsToMounts(created.GetMounts()), nil
+	case !errdefs.IsAlreadyExists(err):
+		return nil, fmt.Errorf("create rootfs view %s from %s: %w", viewKey, parentKey, err)
+	}
+
+	// Reuse an existing view left by a retry or partial prepare.
+	existing, err := a.snapshots.Mounts(nsCtx, &snapshotsapi.MountsRequest{
+		Snapshotter: a.snapshotter,
+		Key:         viewKey,
+	})
+	if err = containerdErr(err); err != nil {
+		return nil, fmt.Errorf("create rootfs view %s from %s: %w", viewKey, parentKey, err)
+	}
+	return protoMountsToMounts(existing.GetMounts()), nil
+}
+
+// protoMountsToMounts converts containerd API mounts into mount.Mount values,
+// copying Type, Source, Target and Options. The result is never nil.
+func protoMountsToMounts(mm []*cntrtypes.Mount) []mount.Mount {
+	converted := make([]mount.Mount, 0, len(mm))
+	for _, pm := range mm {
+		converted = append(converted, mount.Mount{
+			Type:    pm.Type,
+			Source:  pm.Source,
+			Target:  pm.Target,
+			Options: pm.Options,
+		})
+	}
+	return converted
+}
+
+// ShouldCleanupRootfsView loads the rootfs view state stored in bundle. It
+// returns true and the recorded snapshotter when the state exists and names a
+// snapshotter, and false when the state is absent or has no snapshotter.
+func ShouldCleanupRootfsView(bundle string) (bool, string, error) {
+	state, err := unikontainers.LoadBundleRootfsView(bundle)
+	switch {
+	case err != nil:
+		return false, "", err
+	case state == nil || state.Snapshotter == "":
+		return false, "", nil
+	}
+	return true, state.Snapshotter, nil
+}
+
+// removeRootfsViewSnapshotAndLease deletes the view snapshot and its lease in containerd.
+func removeRootfsViewSnapshotAndLease( + ctx context.Context, + namespace, containerID, snapshotter string, + snapshots snapshotsapi.SnapshotsClient, + leases leasesapi.LeasesClient, +) error { + if containerID == "" || snapshotter == "" { + return nil + } + nsCtx := withNamespace(ctx, namespace) + _, err := snapshots.Remove(nsCtx, &snapshotsapi.RemoveSnapshotRequest{ + Snapshotter: snapshotter, + Key: rootfsViewKeyPrefix + containerID, + }) + if err = containerdErr(err); err != nil && !errdefs.IsNotFound(err) { + return err + } + return deleteRootfsViewLease(ctx, namespace, rootfsViewLeasePrefix+containerID, leases) +} + +// deleteRootfsViewLease removes only the containerd lease (Prepare rollback after lease create). +func deleteRootfsViewLease(ctx context.Context, namespace, leaseID string, leases leasesapi.LeasesClient) error { + if leaseID == "" { + return nil + } + _, err := leases.Delete(withNamespace(ctx, namespace), &leasesapi.DeleteRequest{ID: leaseID}) + if err = containerdErr(err); err != nil && !errdefs.IsNotFound(err) { + return err + } + return nil +} diff --git a/pkg/containerd-shim/containerd/session.go b/pkg/containerd-shim/containerd/session.go index e7168ffa1..c3ff02ce1 100644 --- a/pkg/containerd-shim/containerd/session.go +++ b/pkg/containerd-shim/containerd/session.go @@ -158,12 +158,10 @@ func (s *Session) contentClient() contentapi.ContentClient { return contentapi.NewContentClient(s.conn) } -//nolint:unused // Used by follow-up feature-specific access constructors. func (s *Session) snapshotsClient() snapshotsapi.SnapshotsClient { return snapshotsapi.NewSnapshotsClient(s.conn) } -//nolint:unused // Used by follow-up feature-specific access constructors. 
func (s *Session) leasesClient() leasesapi.LeasesClient { return leasesapi.NewLeasesClient(s.conn) } diff --git a/pkg/containerd-shim/guest_rootfs.go b/pkg/containerd-shim/guest_rootfs.go index f8982ecf1..496ed362d 100644 --- a/pkg/containerd-shim/guest_rootfs.go +++ b/pkg/containerd-shim/guest_rootfs.go @@ -24,44 +24,38 @@ import ( taskAPI "github.com/containerd/containerd/api/runtime/task/v2" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/urunc-dev/urunc/pkg/unikontainers" + "github.com/urunc-dev/urunc/pkg/unikontainers/types" ) -const annotRootfsParams = "com.urunc.internal.rootfs.params" - var errGuestRootfsChoiceSkipped = errors.New("guest rootfs choice skipped") // chooseGuestRootfs runs the same ChooseRootfs logic as runtime Exec after inner -// task Create (#684) and records the result in annotRootfsParams so Exec knows -// selection already happened. -func chooseGuestRootfs(r *taskAPI.CreateTaskRequest) error { +// task Create (#684). The caller persists the JSON-encoded result in bundle +// config.json so Exec can reuse the selection. 
+func chooseGuestRootfs(r *taskAPI.CreateTaskRequest) (types.RootfsParams, string, error) { configPath := filepath.Join(r.Bundle, "config.json") - info, err := os.Stat(configPath) - if err != nil { - return fmt.Errorf("stat config.json: %w", err) - } - data, err := os.ReadFile(configPath) if err != nil { - return fmt.Errorf("read config.json: %w", err) + return types.RootfsParams{}, "", fmt.Errorf("read config.json: %w", err) } var spec specs.Spec if err := json.Unmarshal(data, &spec); err != nil { - return fmt.Errorf("unmarshal config.json: %w", err) + return types.RootfsParams{}, "", fmt.Errorf("unmarshal config.json: %w", err) } if spec.Root == nil { - return fmt.Errorf("invalid OCI spec: root section is required") + return types.RootfsParams{}, "", fmt.Errorf("invalid OCI spec: root section is required") } config, err := unikontainers.GetUnikernelConfig(filepath.Clean(r.Bundle), &spec) if err != nil { - return fmt.Errorf("%w: %w", errGuestRootfsChoiceSkipped, err) + return types.RootfsParams{}, "", fmt.Errorf("%w: %w", errGuestRootfsChoiceSkipped, err) } annotations := config.Map() uruncCfg, err := unikontainers.LoadUruncConfig(unikontainers.UruncConfigPath) if err != nil && uruncCfg == nil { - return err + return types.RootfsParams{}, "", err } rootfsParams, err := unikontainers.ChooseRootfs( @@ -71,22 +65,12 @@ func chooseGuestRootfs(r *taskAPI.CreateTaskRequest) error { uruncCfg, ) if err != nil { - return err + return types.RootfsParams{}, "", err } encoded, err := json.Marshal(rootfsParams) if err != nil { - return err - } - if spec.Annotations == nil { - spec.Annotations = make(map[string]string) + return types.RootfsParams{}, "", err } - spec.Annotations[annotRootfsParams] = string(encoded) - - patched, err := json.MarshalIndent(spec, "", " ") - if err != nil { - return fmt.Errorf("marshal config.json: %w", err) - } - - return os.WriteFile(configPath, patched, info.Mode()) + return rootfsParams, string(encoded), nil } diff --git 
a/pkg/containerd-shim/shim_manager.go b/pkg/containerd-shim/shim_manager.go new file mode 100644 index 000000000..c8193f8df --- /dev/null +++ b/pkg/containerd-shim/shim_manager.go @@ -0,0 +1,80 @@ +// Copyright (c) 2023-2026, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package containerdshim + +import ( + "context" + "os" + + "github.com/containerd/containerd/runtime/v2/runc/manager" + "github.com/containerd/containerd/runtime/v2/shim" + "github.com/containerd/log" + containerdShim "github.com/urunc-dev/urunc/pkg/containerd-shim/containerd" +) + +const containerdGRPCAddressEnv = "GRPC_ADDRESS" + +func containerdGRPCAddress() string { + return os.Getenv(containerdGRPCAddressEnv) +} + +type shimManager struct { + shim.Manager +} + +func NewShimManager(runtime string) shim.Manager { + return &shimManager{Manager: manager.NewShimManager(runtime)} +} + +func (m *shimManager) Stop(ctx context.Context, id string) (shim.StopStatus, error) { + bundle, err := os.Getwd() + if err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): getwd during delete failed") + return m.Manager.Stop(ctx, id) + } + + shouldCleanup, snapshotter, err := containerdShim.ShouldCleanupRootfsView(bundle) + if err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): read rootfs view cleanup state from bundle during delete failed") + return m.Manager.Stop(ctx, id) + } + if !shouldCleanup { + return m.Manager.Stop(ctx, id) + } + + address := containerdGRPCAddress() + 
if address == "" { + log.G(ctx).Warn("urunc(shim): containerd gRPC address unset during delete; rootfs view cleanup skipped") + return m.Manager.Stop(ctx, id) + } + + session, err := containerdShim.OpenSession(ctx, address, id) + if err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): open containerd session for rootfs view cleanup failed") + return m.Manager.Stop(ctx, id) + } + defer func() { + if err := session.Close(); err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): failed to close containerd session after rootfs view cleanup") + } + }() + + // snapshotter from bundle view state; shim cwd may outlive task Delete. + if err := containerdShim.NewRootfsViewAccessor(session).Cleanup(ctx, snapshotter); err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): rootfs view cleanup during delete failed") + } + + return m.Manager.Stop(ctx, id) +} diff --git a/pkg/containerd-shim/task_plugin.go b/pkg/containerd-shim/task_plugin.go index 85226f383..54dfabddf 100644 --- a/pkg/containerd-shim/task_plugin.go +++ b/pkg/containerd-shim/task_plugin.go @@ -15,6 +15,9 @@ package containerdshim import ( + "os" + "path/filepath" + "github.com/containerd/containerd/pkg/shutdown" "github.com/containerd/containerd/plugin" runcTask "github.com/containerd/containerd/runtime/v2/runc/task" @@ -45,9 +48,15 @@ func init() { return nil, err } + cwd, err := os.Getwd() + if err != nil { + return nil, err + } + return &taskService{ TaskService: inner, containerdAddress: ic.Address, + stateRoot: filepath.Dir(filepath.Dir(cwd)), }, nil }, }) diff --git a/pkg/containerd-shim/task_service.go b/pkg/containerd-shim/task_service.go index fb126c3f0..698719013 100644 --- a/pkg/containerd-shim/task_service.go +++ b/pkg/containerd-shim/task_service.go @@ -17,13 +17,21 @@ package containerdshim import ( "context" "errors" + "fmt" + "path/filepath" taskAPI "github.com/containerd/containerd/api/runtime/task/v2" + "github.com/containerd/containerd/namespaces" "github.com/containerd/log" 
"github.com/containerd/ttrpc" containerdShim "github.com/urunc-dev/urunc/pkg/containerd-shim/containerd" + "github.com/urunc-dev/urunc/pkg/unikontainers" + "github.com/urunc-dev/urunc/pkg/unikontainers/types" ) +// Internal bundle annotation (duplicated in unikontainers; keep in sync). +const annotRootfsParams = "com.urunc.internal.rootfs.params" + // taskService is urunc's shim-side wrapper around containerd's runc task // service. It wires urunc task setup before forwarding calls to the wrapped // service. @@ -31,6 +39,8 @@ type taskService struct { taskAPI.TaskService containerdAddress string + // Used on Delete, where cwd may no longer be the bundle. + stateRoot string } func (s *taskService) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) { @@ -53,9 +63,8 @@ func (s *taskService) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) return resp, err } - // ChooseRootfs after inner task Create so bundle rootfs is mounted; - // params are persisted in bundle config.json for runtime Exec. 
- if err := chooseGuestRootfs(r); err != nil { + rootfsChoice, rootfsParamsJSON, err := chooseGuestRootfs(r) + if err != nil { if errors.Is(err, errGuestRootfsChoiceSkipped) { log.G(ctx).WithError(err).Debug("urunc(shim): guest rootfs choice skipped") return resp, nil @@ -64,14 +73,118 @@ func (s *taskService) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) return nil, err } + rootfsViewPrepared := false + var rootfsViewState types.RootfsViewState + var rootfsViewAccessor *containerdShim.RootfsViewAccessor + if session != nil { + rootfsViewAccessor = containerdShim.NewRootfsViewAccessor(session) + shouldPrepare, shouldPrepareErr := rootfsViewAccessor.ShouldPrepare(rootfsChoice) + if shouldPrepareErr != nil { + log.G(ctx).WithError(shouldPrepareErr).Warn("urunc(shim): failed to load urunc config; rootfs view disabled") + } else if shouldPrepare { + rootfsViewState, err = rootfsViewAccessor.Prepare(ctx) + if err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): failed to prepare rootfs view; falling back to legacy boot artifact extraction") + } else { + rootfsViewPrepared = true + log.G(ctx).Debug("urunc(shim): rootfs view prepared") + } + } else { + log.G(ctx).WithField("rootfs_type", rootfsChoice.Type).Debug("urunc(shim): rootfs view prepare skipped") + } + } + + if err := containerdShim.PatchConfigJSON(r.Bundle, map[string]string{ + annotRootfsParams: rootfsParamsJSON, + }); err != nil { + if rootfsViewPrepared && rootfsViewAccessor != nil { + if cleanupErr := rootfsViewAccessor.Cleanup(ctx, ""); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Warn("urunc(shim): failed to clean up rootfs view after rootfs params persistence failure") + } + } + log.G(ctx).WithError(err).Warn("urunc(shim): failed to persist shim create annotations") + return nil, err + } + + if rootfsViewPrepared { + if err := unikontainers.WriteBundleRootfsView(r.Bundle, rootfsViewState); err != nil { + if rootfsViewAccessor != nil { + if cleanupErr := 
rootfsViewAccessor.Cleanup(ctx, ""); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Warn("urunc(shim): failed to clean up rootfs view after state persistence failure") + } + } + log.G(ctx).WithError(err).Warn("urunc(shim): failed to persist rootfs view state") + return nil, err + } + } + return resp, nil } func (s *taskService) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) { - return s.TaskService.Delete(ctx, r) + shouldCleanup := false + snapshotter := "" + var loadErr error + + if r.ExecID == "" { + bundle, err := s.bundlePathFor(ctx, r.ID) + if err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): resolve bundle path during Delete failed") + loadErr = err + } else { + // Read view state before inner Delete; snapshotter is taken from bundle + // (written at Prepare) because container metadata may be gone after Delete. + shouldCleanup, snapshotter, loadErr = containerdShim.ShouldCleanupRootfsView(bundle) + } + } + + // Delete tears down the monitor namespace before removing the view it may pin. 
+ resp, err := s.TaskService.Delete(ctx, r) + + if loadErr != nil { + if err != nil { + return resp, err + } + return resp, loadErr + } + + if shouldCleanup { + session, sessionErr := containerdShim.OpenSession(ctx, s.containerdAddress, r.ID) + if sessionErr != nil { + log.G(ctx).WithError(sessionErr).Warn("urunc(shim): open containerd session for rootfs view cleanup failed") + if err == nil { + err = sessionErr + } + } else { + defer func() { + if err := session.Close(); err != nil { + log.G(ctx).WithError(err).Warn("urunc(shim): failed to close containerd session after rootfs view cleanup") + } + }() + if cleanupErr := containerdShim.NewRootfsViewAccessor(session).Cleanup(ctx, snapshotter); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Warn("urunc(shim): delete rootfs view during Delete failed") + if err == nil { + err = cleanupErr + } + } + } + } + + return resp, err } func (s *taskService) RegisterTTRPC(server *ttrpc.Server) error { taskAPI.RegisterTaskService(server, s) return nil } + +func (s *taskService) bundlePathFor(ctx context.Context, containerID string) (string, error) { + if s.stateRoot == "" { + return "", fmt.Errorf("task service state root is empty (shim cwd layout assumption violated)") + } + ns, err := namespaces.NamespaceRequired(ctx) + if err != nil { + return "", fmt.Errorf("namespace required: %w", err) + } + return filepath.Join(s.stateRoot, ns, containerID), nil +} diff --git a/pkg/unikontainers/block.go b/pkg/unikontainers/block.go index 1b7bf7892..c429d0aa5 100644 --- a/pkg/unikontainers/block.go +++ b/pkg/unikontainers/block.go @@ -36,15 +36,16 @@ const tmpfsSizeForBlockRootfs = "65536k" var ErrMountpoint = errors.New("no FS is mounted in this mountpoint") type blockRootfs struct { - mounts []specs.Mount - monRootfs string - mountedPath string - path string - kernelPath string - initrdPath string - uruncJSONPath string - guestType string - guest types.Unikernel + mounts []specs.Mount + monRootfs string + mountedPath string + 
path string + kernelPath string + initrdPath string + uruncJSONPath string + guestType string + guest types.Unikernel + rootfsViewState *types.RootfsViewState } // getMountInfo determines whether the provided path is a mount point @@ -122,8 +123,6 @@ func getMountInfo(path string) (types.BlockDevParams, error) { // extractUnikernelFromBlock moves unikernel binary, initrd and urunc.json // files from old rootfsPath to newRootfsPath -// FIXME: This approach fills up /run with unikernel binaries, initrds and urunc.json -// files for each unikernel we run func extractBootFiles(rootfsPath string, newRootfsPath string, unikernel string, uruncJSON string, initrd string) error { currentUnikernelPath := filepath.Join(rootfsPath, unikernel) targetUnikernelPath := filepath.Join(newRootfsPath, unikernel) @@ -148,7 +147,6 @@ func extractBootFiles(rootfsPath string, newRootfsPath string, unikernel string, if err != nil { return fmt.Errorf("could not move %s to %s: %w", currentConfigPath, newRootfsPath, err) } - return nil } @@ -226,24 +224,36 @@ func getBlockVolumes(monRootfs string, mounts []specs.Mount, ukernel types.Unike } func (b blockRootfs) preSetup() error { + // Preserve main's propagation fix: consume boot artifacts and unmount the + // container rootfs before prepareRoot() makes the mount tree private/slave. if b.mountedPath == "" { return nil } - err := copyMountfiles(b.mountedPath, b.mounts) - if err != nil { - return fmt.Errorf("failed to copy files from mount list: %w", err) + useViewPath := b.rootfsViewState != nil + if useViewPath { + // Probe only; the real bind must happen after prepareRoot. 
+ useView, err := probeRootfsViewBootArtifacts(b.rootfsViewState, b.kernelPath, b.initrdPath, b.uruncJSONPath) + if err != nil { + return err + } + if !useView { + useViewPath = false + } } - // FIXME: This approach fills up /run with unikernel binaries and - // urunc.json files for each unikernel instance we run - err = extractBootFiles(b.mountedPath, b.monRootfs, b.kernelPath, b.uruncJSONPath, b.initrdPath) - if err != nil { - return fmt.Errorf("failed to extract boot files from rootfs: %w", err) + if !useViewPath { + err := extractBootFiles(b.mountedPath, b.monRootfs, b.kernelPath, b.uruncJSONPath, b.initrdPath) + if err != nil { + return fmt.Errorf("failed to extract boot files from rootfs: %w", err) + } } - err = mount.Unmount(b.mountedPath) - if err != nil { + if err := copyMountfiles(b.mountedPath, b.mounts); err != nil { + return fmt.Errorf("failed to copy files from mount list: %w", err) + } + + if err := mount.Unmount(b.mountedPath); err != nil { return fmt.Errorf("failed to unmount rootfs: %w", err) } diff --git a/pkg/unikontainers/rootfs_view_boot.go b/pkg/unikontainers/rootfs_view_boot.go new file mode 100644 index 000000000..bd9fbe768 --- /dev/null +++ b/pkg/unikontainers/rootfs_view_boot.go @@ -0,0 +1,181 @@ +// Copyright (c) 2023-2026, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package unikontainers
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/containerd/containerd/mount"
+	"github.com/urunc-dev/urunc/pkg/unikontainers/types"
+	"golang.org/x/sys/unix"
+)
+
+// WriteBundleRootfsView persists shim-prepared view state in the bundle.
+// The state is JSON-encoded and written to rootfsViewFilename inside bundleDir.
+func WriteBundleRootfsView(bundleDir string, state types.RootfsViewState) error {
+	bundleDir = filepath.Clean(bundleDir)
+	data, err := json.Marshal(state)
+	if err != nil {
+		return fmt.Errorf("marshal %s: %w", rootfsViewFilename, err)
+	}
+	path := filepath.Join(bundleDir, rootfsViewFilename)
+	if err := os.WriteFile(path, data, 0o644); err != nil { //nolint:gosec // bundle metadata, same as state.json
+		return fmt.Errorf("write %s: %w", path, err)
+	}
+	return nil
+}
+
+// LoadBundleRootfsView reads view state written by the shim at task Create.
+// Returns (nil, nil) when the file is absent.
+func LoadBundleRootfsView(bundleDir string) (*types.RootfsViewState, error) {
+	bundleDir = filepath.Clean(bundleDir)
+	path := filepath.Join(bundleDir, rootfsViewFilename)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("read %s: %w", path, err)
+	}
+	var state types.RootfsViewState
+	if err := json.Unmarshal(data, &state); err != nil {
+		return nil, fmt.Errorf("unmarshal %s: %w", path, err)
+	}
+	return &state, nil
+}
+
+// rootfsViewBootArtifactBindPaths returns one (src, target) pair for
+// unikernelPath, for uruncJSON and, when non-empty, for initrdPath. Each
+// artifact path is cleaned and stripped of a leading "/"; src is that relative
+// path under viewRoot and target is the same relative path under monRootfs.
+func rootfsViewBootArtifactBindPaths(viewRoot, monRootfs, unikernelPath, initrdPath, uruncJSON string) []struct{ src, target string } {
+	artifactPaths := []string{unikernelPath, uruncJSON}
+	if initrdPath != "" {
+		artifactPaths = append(artifactPaths, initrdPath)
+	}
+	files := make([]struct{ src, target string }, 0, len(artifactPaths))
+	for _, artifactPath := range artifactPaths {
+		rootfsRelPath := strings.TrimPrefix(filepath.Clean(artifactPath), "/")
+		files = append(files, struct{ src, target string }{
+			src:    filepath.Join(viewRoot, rootfsRelPath),
+			target: filepath.Join(monRootfs, rootfsRelPath),
+		})
+	}
+	return files
+}
+
+// rollbackRootfsViewBinds unmounts the given bind targets in reverse order.
+// Failures are logged as warnings and do not stop the rollback.
+func rollbackRootfsViewBinds(targets []string) {
+	for i := len(targets) - 1; i >= 0; i-- {
+		if err := unmountRootfsViewBind(targets[i]); err != nil {
+			uniklog.WithError(err).WithField("target", filepath.Clean(targets[i])).Warn("failed to roll back rootfs view bind mount")
+		}
+	}
+}
+
+// probeRootfsViewBootArtifacts checks that boot artifacts can be bind-mounted
+// from the view. preSetup still has mountedPath; binds are rolled back immediately.
+//
+// It returns (false, nil) when view is nil or the view cannot be mounted (the
+// caller then falls back to legacy extraction), (true, nil) when every
+// artifact could be bound, and an error otherwise. On every exit path after
+// the view was mounted, the probe binds are unmounted and the temporary view
+// mount is released before the temporary directories are removed.
+func probeRootfsViewBootArtifacts(view *types.RootfsViewState, unikernelPath, initrdPath, uruncJSON string) (useView bool, err error) {
+	if view == nil {
+		return false, nil
+	}
+
+	mountpoint, err := os.MkdirTemp("", "urunc-rootfs-view-")
+	if err != nil {
+		return false, fmt.Errorf("create temporary rootfs view mountpoint: %w", err)
+	}
+	// Registered first so it runs last, after the view has been unmounted.
+	defer os.RemoveAll(mountpoint)
+
+	if err := mount.All(view.Mounts, mountpoint); err != nil {
+		uniklog.WithError(err).Warn("rootfs view unavailable; falling back to legacy boot file extraction")
+		return false, nil
+	}
+	// Release the view on every exit path, including early error returns, so
+	// the RemoveAll above never runs against a live mount. An unmount failure
+	// becomes the returned error unless another error is already being
+	// reported, in which case it is only logged.
+	defer func() {
+		uerr := mount.Unmount(mountpoint, 0)
+		if uerr == nil || os.IsNotExist(uerr) || uerr == unix.EINVAL {
+			return
+		}
+		if err != nil {
+			uniklog.WithError(uerr).WithField("target", mountpoint).Warn("failed to unmount temporary rootfs view mountpoint")
+			return
+		}
+		useView, err = false, fmt.Errorf("unmount temporary rootfs view mountpoint: %w", uerr)
+	}()
+
+	probeRoot, err := os.MkdirTemp("", "urunc-rootfs-view-probe-")
+	if err != nil {
+		return false, fmt.Errorf("create temporary rootfs view probe mountpoint: %w", err)
+	}
+	defer os.RemoveAll(probeRoot)
+
+	var bindTargets []string
+	// Wrapped in a closure: arguments of a deferred call are evaluated when
+	// the defer statement runs, so passing bindTargets directly would capture
+	// the still-nil slice and roll back nothing.
+	defer func() { rollbackRootfsViewBinds(bindTargets) }()
+
+	for _, f := range rootfsViewBootArtifactBindPaths(mountpoint, probeRoot, unikernelPath, initrdPath, uruncJSON) {
+		dstPath := f.target
+		if err := bindMountFile(f.src, filepath.Dir(dstPath), dstPath, 0, unix.MS_BIND, false); err != nil {
+			return false, fmt.Errorf("bind view %s -> %s: %w", f.src, f.target, err)
+		}
+		bindTargets = append(bindTargets, dstPath)
+	}
+
+	return true, nil
+}
+
+// prepareRootfsViewBootBinds runs after
prepareRoot, so the binds live in the +// monitor mount namespace and are released with it. +func prepareRootfsViewBootBinds(view *types.RootfsViewState, monRootfs, unikernelPath, initrdPath, uruncJSON string) error { + if view == nil { + return nil + } + + var bindTargets []string + keepBinds := false + defer func() { + if !keepBinds { + rollbackRootfsViewBinds(bindTargets) + } + }() + + mountpoint, err := os.MkdirTemp("", "urunc-rootfs-view-") + if err != nil { + return fmt.Errorf("create temporary rootfs view mountpoint: %w", err) + } + defer os.RemoveAll(mountpoint) + + if err := mount.All(view.Mounts, mountpoint); err != nil { + return fmt.Errorf("mount rootfs view: %w", err) + } + + for _, f := range rootfsViewBootArtifactBindPaths(mountpoint, monRootfs, unikernelPath, initrdPath, uruncJSON) { + dstPath := f.target + if err := bindMountFile(f.src, filepath.Dir(dstPath), dstPath, 0, unix.MS_BIND, false); err != nil { + return fmt.Errorf("bind view %s -> %s: %w", f.src, f.target, err) + } + bindTargets = append(bindTargets, dstPath) + } + + bindErr := mount.Unmount(mountpoint, 0) + + if bindErr != nil && !os.IsNotExist(bindErr) && bindErr != unix.EINVAL { + if len(bindTargets) > 0 { + return fmt.Errorf("rootfs view boot artifact bind completed but cleanup failed: %w", bindErr) + } + return fmt.Errorf("unmount temporary rootfs view mountpoint: %w", bindErr) + } + + keepBinds = true + return nil +} + +func unmountRootfsViewBind(target string) error { + target = filepath.Clean(target) + err := unix.Unmount(target, unix.MNT_DETACH) + if err == nil || err == unix.EINVAL || err == unix.ENOENT || os.IsNotExist(err) { + return nil + } + return fmt.Errorf("failed to unmount rootfs view bind %s: %w", target, err) +} diff --git a/pkg/unikontainers/types/types.go b/pkg/unikontainers/types/types.go index c6388e2cc..6c796dbb7 100644 --- a/pkg/unikontainers/types/types.go +++ b/pkg/unikontainers/types/types.go @@ -15,7 +15,10 @@ //revive:disable:var-naming package types 
-import "golang.org/x/sys/unix" +import ( + "github.com/containerd/containerd/mount" + "golang.org/x/sys/unix" +) type Unikernel interface { Init(UnikernelParams) error @@ -72,6 +75,12 @@ type RootfsParams struct { MonRootfs string // The rootfs for the monitor process } +// RootfsViewState is passed from shim to runtime via bundle rootfs-view.json. +type RootfsViewState struct { + Snapshotter string `json:"snapshotter"` + Mounts []mount.Mount `json:"mounts,omitempty"` +} + // Specific to Linux type ProcessConfig struct { UID uint32 // The uid of the process inside the guest diff --git a/pkg/unikontainers/unikontainers.go b/pkg/unikontainers/unikontainers.go index a3a00bb5f..c3c3bf942 100644 --- a/pkg/unikontainers/unikontainers.go +++ b/pkg/unikontainers/unikontainers.go @@ -306,11 +306,21 @@ func ChooseRootfs(bundle, specRoot string, annot map[string]string, cfg *UruncCo func (u *Unikontainer) Exec(metrics m.Writer) error { metrics.Capture(m.TS15) + // Reload annotations written by the shim after Create. 
+ spec, err := loadSpec(u.State.Bundle) + if err != nil { + return fmt.Errorf("reload bundle spec: %w", err) + } + if spec == nil || spec.Linux == nil { + return fmt.Errorf("invalid OCI spec: linux section is required") + } + u.Spec = spec + // container Paths // Make sure paths are clean bundleDir := filepath.Clean(u.State.Bundle) rootfsDir := filepath.Clean(u.Spec.Root.Path) - rootfsDir, err := resolveAgainstBase(bundleDir, rootfsDir) + rootfsDir, err = resolveAgainstBase(bundleDir, rootfsDir) if err != nil { uniklog.Errorf("could not resolve rootfs directory %s: %v", rootfsDir, err) return err @@ -461,16 +471,21 @@ func (u *Unikontainer) Exec(metrics m.Writer) error { var rfsBuilder rootfsBuilder switch rootfsParams.Type { case "block": + view, err := LoadBundleRootfsView(bundleDir) + if err != nil { + return fmt.Errorf("could not load guest rootfs view: %w", err) + } rfsBuilder = blockRootfs{ - mounts: u.Spec.Mounts, - monRootfs: rootfsParams.MonRootfs, - mountedPath: rootfsParams.MountedPath, - path: rootfsParams.Path, - kernelPath: unikernelPath, - initrdPath: initrdPath, - uruncJSONPath: uruncJSONFilename, - guestType: unikernelType, - guest: unikernel, + mounts: u.Spec.Mounts, + monRootfs: rootfsParams.MonRootfs, + mountedPath: rootfsParams.MountedPath, + path: rootfsParams.Path, + kernelPath: unikernelPath, + initrdPath: initrdPath, + uruncJSONPath: uruncJSONFilename, + guestType: unikernelType, + guest: unikernel, + rootfsViewState: view, } case "initrd": rfsBuilder = initrdRootfs{ @@ -517,6 +532,12 @@ func (u *Unikontainer) Exec(metrics m.Writer) error { return err } + if b, ok := rfsBuilder.(blockRootfs); ok && b.rootfsViewState != nil { + if err := prepareRootfsViewBootBinds(b.rootfsViewState, b.monRootfs, b.kernelPath, b.initrdPath, b.uruncJSONPath); err != nil { + return fmt.Errorf("boot artifact setup after prepareRoot failed: %w", err) + } + } + // Setup the rootfs for the monitor execution, creating necessary // devices and the monitor's binary. 
err = prepareMonRootfs(rootfsParams.MonRootfs, vmm.Path(), u.UruncCfg.Monitors[vmmType].DataPath, vmm.UsesKVM(), withTUNTAP) diff --git a/pkg/unikontainers/urunc_config.go b/pkg/unikontainers/urunc_config.go index 5f21d106e..36436fbdd 100644 --- a/pkg/unikontainers/urunc_config.go +++ b/pkg/unikontainers/urunc_config.go @@ -34,9 +34,15 @@ type UruncTimestamps struct { Destination string `toml:"destination"` // Used to specify a file for timestamps } +// UruncRootfsView configures shim-side per-container rootfs views (devmapper/blockfile). +type UruncRootfsView struct { + Enabled bool `toml:"enabled"` +} + type UruncConfig struct { Log UruncLog `toml:"log"` Timestamps UruncTimestamps `toml:"timestamps"` + RootfsView UruncRootfsView `toml:"rootfs_view"` Monitors map[string]types.MonitorConfig `toml:"monitors"` ExtraBins map[string]types.ExtraBinConfig `toml:"extra_binaries"` } @@ -94,10 +100,15 @@ func defaultExtraBinConfig() map[string]types.ExtraBinConfig { } } +func defaultRootfsViewConfig() UruncRootfsView { + return UruncRootfsView{Enabled: false} +} + func defaultUruncConfig() *UruncConfig { return &UruncConfig{ Log: defaultLogConfig(), Timestamps: defaultTimestampsConfig(), + RootfsView: defaultRootfsViewConfig(), Monitors: defaultMonitorsConfig(), ExtraBins: defaultExtraBinConfig(), } diff --git a/pkg/unikontainers/utils.go b/pkg/unikontainers/utils.go index c53c0fc05..668aab722 100644 --- a/pkg/unikontainers/utils.go +++ b/pkg/unikontainers/utils.go @@ -35,9 +35,10 @@ import ( ) const ( - configFilename = "config.json" - stateFilename = "state.json" - initPidFilename = "init.pid" + configFilename = "config.json" + stateFilename = "state.json" + rootfsViewFilename = "rootfs-view.json" + initPidFilename = "init.pid" uruncJSONFilename = "urunc.json" rootfsDirName = "rootfs" )