From 869e782cac22e48c7e09cbecd3f29985e808a74b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ey=C3=BCp=20Can=20Akman?= Date: Sat, 6 Jun 2026 14:15:12 +0300 Subject: [PATCH] scheduler: reject sticky on a static host volume sticky is only supported on dynamic host volumes. A static host volume requested with sticky passed feasibility, so the allocation was placed and plan apply then failed inserting the volume claim with an empty volume ID ("object missing primary index"). Reject it in the per-volume feasibility loop, and make the sticky checks continue instead of returning early so a later volume in the same task group can't short-circuit the rejection. Fixes #27153 --- .changelog/28097.txt | 3 + scheduler/feasible/feasible.go | 25 +++++-- scheduler/feasible/feasible_test.go | 106 ++++++++++++++++++++++++++++ scheduler/generic_sched_test.go | 60 ++++++++++++++++ 4 files changed, 187 insertions(+), 7 deletions(-) create mode 100644 .changelog/28097.txt diff --git a/.changelog/28097.txt b/.changelog/28097.txt new file mode 100644 index 00000000000..9d8d4bcbd29 --- /dev/null +++ b/.changelog/28097.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Fixed a bug where setting `sticky` on a static host volume could fail the evaluation instead of being rejected during feasibility checking +``` diff --git a/scheduler/feasible/feasible.go b/scheduler/feasible/feasible.go index 91f4cfcd5da..8d1f33e1032 100644 --- a/scheduler/feasible/feasible.go +++ b/scheduler/feasible/feasible.go @@ -355,12 +355,13 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { } if req.Sticky { - // the node is feasible if there are no remaining claims to + // the volume is feasible if there are no remaining claims to // fulfill or if there's an exact match if len(h.unmetClaims) == 0 { - return true + continue } + matched := false for _, c := range h.unmetClaims { if c.VolumeID == vol.ID { // if we have a match for a volume claim, delete this @@ -372,15 +373,25 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { func(claim *structs.TaskGroupHostVolumeClaim) bool { return claim.VolumeID == vol.ID }) - return true + matched = true + break } } + if !matched { + return false + } + } + } else { + // this is a static host volume + + // sticky is only supported on dynamic host volumes + if req.Sticky { return false } - } else if !req.ReadOnly { - // this is a static host volume and can only be mounted ReadOnly, - // validate that no requests for it are ReadWrite. - if volCfg.ReadOnly { + + // a static host volume can only be mounted ReadOnly, so validate + // that no requests for it are ReadWrite + if !req.ReadOnly && volCfg.ReadOnly { return false } } diff --git a/scheduler/feasible/feasible_test.go b/scheduler/feasible/feasible_test.go index 1db3b2631b2..806548e2716 100644 --- a/scheduler/feasible/feasible_test.go +++ b/scheduler/feasible/feasible_test.go @@ -240,6 +240,112 @@ func TestHostVolumeChecker_Static(t *testing.T) { } } +func TestHostVolumeChecker_StaticSticky(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + + node := mock.Node() + node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{ + "foo": {}, // static host volume: ID is empty + } + + // sticky is only supported on dynamic host volumes, so requesting it on a + // static host volume must fail feasibility regardless of ReadOnly. + cases := []struct { + name string + req *structs.VolumeRequest + result bool + }{ + { + name: "sticky read-write", + req: &structs.VolumeRequest{Type: "host", Source: "foo", Sticky: true}, + result: false, + }, + { + name: "sticky read-only", + req: &structs.VolumeRequest{Type: "host", Source: "foo", Sticky: true, ReadOnly: true}, + result: false, + }, + { + name: "not sticky", + req: &structs.VolumeRequest{Type: "host", Source: "foo"}, + result: true, + }, + } + + checker := NewHostVolumeChecker(ctx) + alloc := mock.Alloc() + alloc.NodeID = node.ID + job := mock.Job() + taskGroup := job.TaskGroups[0] + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + checker.SetVolumes(alloc.Name, structs.DefaultNamespace, job.ID, taskGroup.Name, + map[string]*structs.VolumeRequest{"foo": tc.req}) + must.Eq(t, tc.result, checker.Feasible(node)) + }) + } +} + +func TestHostVolumeChecker_StaticStickyMultiVolume(t *testing.T) { + ci.Parallel(t) + + store, ctx := MockContext(t) + + // the node has one dynamic host volume (ID set) and one static host volume + // (ID empty) + dhvID := uuid.Generate() + node := mock.Node() + node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{ + "dyn": {ID: dhvID}, + "stat": {}, + } + must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 1000, node)) + + dhv := &structs.HostVolume{ + Namespace: structs.DefaultNamespace, + ID: dhvID, + Name: "dyn", + NodeID: node.ID, + RequestedCapabilities: []*structs.HostVolumeCapability{{ + AttachmentMode: structs.HostVolumeAttachmentModeFilesystem, + AccessMode: structs.HostVolumeAccessModeSingleNodeSingleWriter, + }}, + State: structs.HostVolumeStateReady, + } + must.NoError(t, store.UpsertHostVolume(1000, dhv)) + + // both volumes are requested sticky. The dynamic one is available with no + // outstanding claims, so on its own it satisfies the per-volume check; the + // static one is invalid for sticky. The node must be infeasible no matter + // which volume is checked first, and volumeReqs is built from a map so the + // order is not deterministic. + reqs := map[string]*structs.VolumeRequest{ + "dyn": { + Type: "host", + Source: "dyn", + Sticky: true, + AccessMode: structs.HostVolumeAccessModeSingleNodeSingleWriter, + AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem, + }, + "stat": {Type: "host", Source: "stat", Sticky: true}, + } + + checker := NewHostVolumeChecker(ctx) + alloc := mock.Alloc() + alloc.NodeID = node.ID + job := mock.Job() + taskGroup := job.TaskGroups[0] + + for i := 0; i < 100; i++ { + checker.SetVolumes(alloc.Name, structs.DefaultNamespace, job.ID, taskGroup.Name, reqs) + must.False(t, checker.Feasible(node), + must.Sprint("sticky on a static host volume must be infeasible regardless of volume order")) + } +} + func TestHostVolumeChecker_Dynamic(t *testing.T) { ci.Parallel(t) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 25260f1573c..56981fa8a75 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/scheduler/feasible" "github.com/hashicorp/nomad/scheduler/reconciler" sstructs "github.com/hashicorp/nomad/scheduler/structs" "github.com/hashicorp/nomad/scheduler/tests" @@ -476,6 +477,65 @@ func TestServiceSched_JobRegister_StickyHostVolumes(t *testing.T) { must.SliceLen(t, 10, newPlanned) } +func TestServiceSched_JobRegister_StickyStaticHostVolume(t *testing.T) { + ci.Parallel(t) + + h := tests.NewHarness(t) + + // A node with a static host volume (its ID is empty). + node := mock.Node() + node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{"foo": {Name: "foo"}} + must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // A job that requests the static volume with sticky, which is only valid for + // dynamic host volumes. + job := mock.Job() + job.TaskGroups[0].Count = 1 + job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{ + "foo": { + Type: "host", + Source: "foo", + Sticky: true, + }, + } + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Eval must process cleanly. Before the fix the alloc was placed and plan + // apply failed inserting the claim with an empty volume ID. + must.NoError(t, h.Process(NewServiceScheduler, eval)) + + // Nothing is placed: sticky on a static host volume is infeasible. + placed := 0 + for _, plan := range h.Plans { + for _, allocs := range plan.NodeAllocation { + placed += len(allocs) + } + } + must.Zero(t, placed, must.Sprint("sticky on a static host volume must not place")) + + // The unplaced allocation spawns a blocked eval, and the node is filtered + // out for the host volume constraint. + must.SliceLen(t, 1, h.CreateEvals) + must.Eq(t, structs.EvalStatusBlocked, h.CreateEvals[0].Status) + + must.SliceLen(t, 1, h.Evals) + outEval := h.Evals[0] + must.MapLen(t, 1, outEval.FailedTGAllocs) + metrics := outEval.FailedTGAllocs[job.TaskGroups[0].Name] + must.NotNil(t, metrics) + must.MapContainsKey(t, metrics.ConstraintFiltered, feasible.FilterConstraintHostVolumes) +} + func TestServiceSched_JobRegister_DiskConstraints(t *testing.T) { ci.Parallel(t)