diff --git a/GNUmakefile b/GNUmakefile index afdd84c5632..62023fbb915 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -108,6 +108,15 @@ endif pkg/windows_%/nomad: GO_OUT = $@.exe pkg/windows_%/nomad: GO_TAGS += timetzdata +# Build the example device plugin for e2e device tests +pkg/%/nomad-device-example: GO_OUT ?= $@ +pkg/%/nomad-device-example: ## Build the example device plugin for GOOS_GOARCH + @echo "==> Building $@..." + @CGO_ENABLED=0 \ + GOOS=$(firstword $(subst _, ,$*)) \ + GOARCH=$(lastword $(subst _, ,$*)) \ + go build -trimpath -o $(GO_OUT) ./plugins/device/cmd/example/cmd + # Define package targets for each of the build targets we actually have on this system define makePackageTarget diff --git a/api/resources.go b/api/resources.go index 415f0a7613c..7a29d641a78 100644 --- a/api/resources.go +++ b/api/resources.go @@ -245,6 +245,10 @@ type NodeDevice struct { // Locality stores HW locality information for the node to optionally be // used when making placement decisions. Locality *NodeDeviceLocality + + // Shared mirrors a string enum on device.DetectedDevice that some + // devices use to report status and presence of sharing subsystems + Shared Shared } // Attribute is used to describe the value of an attribute, optionally @@ -289,6 +293,19 @@ func (a Attribute) String() string { } } +// Shared mirrors the plugin.Shared string enum found +// on Devices.DetectedDevice that some devices use to +// report the status and presence of sharing subsystems + +type Shared string + +const ( + DeviceSharingUnset Shared = "" + DeviceSharingIneligible Shared = "ineligible" + DeviceSharingActive Shared = "active" + DeviceSharingInactive Shared = "inactive" +) + // NodeDeviceLocality stores information about the devices hardware locality on // the node. 
type NodeDeviceLocality struct { @@ -296,6 +313,43 @@ type NodeDeviceLocality struct { PciBusID string } +// ShareDevices indicates whether the task is willing to share its device +type ShareDevices struct { + // Enabled indicates whether the task is willing to share its device + Enabled bool `hcl:"enabled"` + // SharedDeviceId is an optional field for use in environments with + // multiple shared devices, to make the shared device ID available to + // the plugin. If in use alongside the device.id constraint, the two must + // match or the job will not be placed. + SharedDeviceId string `hcl:"shared_device_id,optional"` +} + +// DeviceOption represents a single option in a first_available device selection. +// Each option specifies a count and optional constraints that must be satisfied +// for this option to be selected. +type DeviceOption struct { + // Count is the number of requested devices for this option + Count *uint64 `hcl:"count,optional"` + + // Constraints are a set of constraints to apply when selecting the device + // to use for this option. + Constraints []*Constraint `hcl:"constraint,block"` + + // ShareDevices indicates whether this device option is willing to share + // TODO: determine if ShareDevices should be inherited or if, like count, + // it should only be set on one or the other + ShareDevices *ShareDevices `hcl:"share_devices,block"` +} + +func (o *DeviceOption) Canonicalize() { + if o == nil { + return + } + if o.Count == nil { + o.Count = pointerOf(uint64(1)) + } +} + // RequestedDevice is used to request a device for a task. type RequestedDevice struct { // Name is the request name. The possible values are as follows: @@ -309,20 +363,37 @@ type RequestedDevice struct { // * "nvidia/gpu/GTX2080Ti" Name string `hcl:",label"` - // Count is the number of requested devices + // Count is the number of requested devices. Mutually exclusive with + // FirstAvailable. Count *uint64 `hcl:"count,optional"` // Constraints are a set of constraints to apply when selecting the device - // to use. + // to use.
When FirstAvailable is specified, these constraints are applied + // as base constraints that all options must also satisfy. Constraints []*Constraint `hcl:"constraint,block"` - // Affinities are a set of affinites to apply when selecting the device - // to use. + // Affinities are a set of affinities to apply when selecting the device + // to use. When FirstAvailable is specified, these affinities are applied + // as base affinities for all options. Affinities []*Affinity `hcl:"affinity,block"` + + // ShareDevices reports whether the task should be placed on a shared device + ShareDevices *ShareDevices `hcl:"share_devices,block"` + + // FirstAvailable specifies a prioritized list of device options. The + // scheduler will attempt to satisfy each option in order, selecting the + // first one that can be fulfilled. Mutually exclusive with Count. + FirstAvailable []*DeviceOption `hcl:"first_available,block"` } func (d *RequestedDevice) Canonicalize() { - if d.Count == nil { + // If using first_available, canonicalize each option but don't set default count + if len(d.FirstAvailable) > 0 { + for _, opt := range d.FirstAvailable { + opt.Canonicalize() + } + } else if d.Count == nil { + // Only set default count when not using first_available d.Count = pointerOf(uint64(1)) } diff --git a/api/resources_test.go b/api/resources_test.go index 608c6f7833e..b58d39af6fb 100644 --- a/api/resources_test.go +++ b/api/resources_test.go @@ -116,3 +116,49 @@ func TestNUMAResource_Canonicalize(t *testing.T) { n3.Canonicalize() must.Eq(t, &NUMAResource{Affinity: "require", Devices: nil}, n3) } + +func TestDeviceOption_Canonicalize(t *testing.T) { + testutil.Parallel(t) + + // Nil option + var opt *DeviceOption + opt.Canonicalize() // should not panic + + // Count defaults to 1 + opt2 := &DeviceOption{} + opt2.Canonicalize() + must.Eq(t, uint64(1), *opt2.Count) + + // Explicit count preserved + opt3 := &DeviceOption{Count: pointerOf(uint64(4))} + opt3.Canonicalize() + must.Eq(t,
uint64(4), *opt3.Count) +} + +func TestRequestedDevice_Canonicalize_FirstAvailable(t *testing.T) { + testutil.Parallel(t) + + // With FirstAvailable, Count should NOT be set to default + rd := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{ + {Count: pointerOf(uint64(2))}, + {}, // no count set + }, + } + rd.Canonicalize() + + // Count should remain nil when using FirstAvailable + must.Nil(t, rd.Count) + + // FirstAvailable options should be canonicalized + must.Eq(t, uint64(2), *rd.FirstAvailable[0].Count) + must.Eq(t, uint64(1), *rd.FirstAvailable[1].Count) // defaulted to 1 + + // Without FirstAvailable, Count defaults to 1 + rd2 := &RequestedDevice{ + Name: "nvidia/gpu", + } + rd2.Canonicalize() + must.Eq(t, uint64(1), *rd2.Count) +} diff --git a/client/devicemanager/utils.go b/client/devicemanager/utils.go index 12e5ab5c1e1..2b9bd129698 100644 --- a/client/devicemanager/utils.go +++ b/client/devicemanager/utils.go @@ -82,6 +82,7 @@ func convertDevice(dev *device.Device) *structs.NodeDevice { Healthy: dev.Healthy, HealthDescription: dev.HealthDesc, Locality: convertHwLocality(dev.HwLocality), + Shared: convertShared(dev.Shared), } } @@ -94,3 +95,16 @@ func convertHwLocality(l *device.DeviceLocality) *structs.NodeDeviceLocality { PciBusID: l.PciBusID, } } + +func convertShared(s device.Shared) structs.Shared { + switch s { + case device.SharingIneligible: + return structs.DeviceSharingIneligible + case device.SharingActive: + return structs.DeviceSharingActive + case device.SharingInactive: + return structs.DeviceSharingInactive + default: + } + return structs.DeviceSharingUnset +} diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 6774c11bbdc..586eb29f8fc 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1623,13 +1623,37 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources { if len(in.Devices) > 0 { out.Devices = []*structs.RequestedDevice{} + for _, d := 
range in.Devices { - out.Devices = append(out.Devices, &structs.RequestedDevice{ + rd := &structs.RequestedDevice{ Name: d.Name, - Count: *d.Count, Constraints: ApiConstraintsToStructs(d.Constraints), Affinities: ApiAffinitiesToStructs(d.Affinities), - }) + } + // Only set Count if not using FirstAvailable + if d.Count != nil && len(d.FirstAvailable) == 0 { + rd.Count = *d.Count + } + // Only set ShareDevices if not using FirstAvailable + if d.ShareDevices != nil && len(d.FirstAvailable) == 0 { + rd.ShareDevices = ApiShareDevicesToStructs(d.ShareDevices) + } + // Convert FirstAvailable options + if len(d.FirstAvailable) > 0 { + rd.FirstAvailable = make([]*structs.DeviceOption, len(d.FirstAvailable)) + for i, opt := range d.FirstAvailable { + rd.FirstAvailable[i] = &structs.DeviceOption{ + Constraints: ApiConstraintsToStructs(opt.Constraints), + } + if opt.Count != nil { + rd.FirstAvailable[i].Count = *opt.Count + } + if opt.ShareDevices != nil { + rd.FirstAvailable[i].ShareDevices = ApiShareDevicesToStructs(opt.ShareDevices) + } + } + } + out.Devices = append(out.Devices, rd) } } @@ -1646,6 +1670,16 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources { return out } +// ApiShareDevicesToStructs converts an api.ShareDevices into a structs.ShareDevices; a nil input yields nil. +func ApiShareDevicesToStructs(in *api.ShareDevices) *structs.ShareDevices { + if in == nil { + return nil + } + return &structs.ShareDevices{ + Enabled: in.Enabled, + SharedDeviceId: in.SharedDeviceId, + } +} func ApiNetworkResourceToStructs(in []*api.NetworkResource) []*structs.NetworkResource { var out []*structs.NetworkResource diff --git a/e2e/devices/basic_test.go b/e2e/devices/basic_test.go new file mode 100644 index 00000000000..66a4af5f94e --- /dev/null +++ b/e2e/devices/basic_test.go @@ -0,0 +1,327 @@ +// Copyright IBM Corp.
2015, 2025 +// SPDX-License-Identifier: BUSL-1.1 + +package devices + +import ( + "fmt" + "strings" + "testing" + "time" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" +) + +// TestDeviceScheduling runs end-to-end tests for traditional device scheduling +// (count, constraint, affinity without first_available). These tests require: +// - A Nomad cluster with at least one Linux client +// - The example device plugin (nomad/file/mock) installed and configured +// - Mock device files created in the configured directory +// +// See plugins/device/cmd/example/README.md for setup instructions. +func TestDeviceScheduling(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + e2eutil.WaitForLeader(t, nomadClient) + e2eutil.WaitForNodesReady(t, nomadClient, 1) + + // Check if any nodes have mock devices available + if !hasDevicePlugin(t, nomadClient, "nomad/file/mock") { + t.Skip("skipping: no nodes with nomad/file/mock device plugin") + } + + t.Run("testDeviceCountOnly", testDeviceCountOnly) + t.Run("testDeviceWithConstraint", testDeviceWithConstraint) + t.Run("testDeviceWithAffinity", testDeviceWithAffinity) + t.Run("testDeviceWithConstraintAndAffinity", testDeviceWithConstraintAndAffinity) + t.Run("testDeviceConstraintNoMatch", testDeviceConstraintNoMatch) +} + +// hasDevicePlugin checks if any node in the cluster has the specified device +// plugin available. 
+func hasDevicePlugin(t *testing.T, client *api.Client, deviceName string) bool { + t.Helper() + + nodes, _, err := client.Nodes().List(nil) + must.NoError(t, err) + + for _, nodeStub := range nodes { + node, _, err := client.Nodes().Info(nodeStub.ID, nil) + must.NoError(t, err) + + if node.NodeResources != nil && node.NodeResources.Devices != nil { + for _, device := range node.NodeResources.Devices { + fullName := device.Vendor + "/" + device.Type + "/" + device.Name + if strings.Contains(fullName, deviceName) || + strings.Contains(device.Name, deviceName) { + return true + } + } + } + } + return false +} + +// testDeviceCountOnly tests that a job with only device count specified +// can be successfully scheduled. +func testDeviceCountOnly(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-count-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "./input/device_count_only.hcl", jobID, "") + must.Len(t, 1, allocs, must.Sprint("expected 1 allocation")) + + alloc, _, err := nomadClient.Allocations().Info(allocs[0].ID, nil) + must.NoError(t, err) + must.Eq(t, api.AllocClientStatusRunning, alloc.ClientStatus, + must.Sprintf("allocation status: %s, description: %s", + alloc.ClientStatus, alloc.ClientDescription)) + + // Verify device was allocated + must.NotNil(t, alloc.AllocatedResources) + taskResources := alloc.AllocatedResources.Tasks["sleep"] + must.NotNil(t, taskResources) + must.SliceNotEmpty(t, taskResources.Devices, + must.Sprint("expected devices to be allocated")) + + // Verify exactly 1 device + totalDevices := 0 + for _, deviceResource := range taskResources.Devices { + totalDevices += len(deviceResource.DeviceIDs) + } + must.Eq(t, 1, totalDevices, must.Sprint("expected exactly 1 device")) +} + +// testDeviceWithConstraint tests that a job with device count and constraint +// can be successfully scheduled when the constraint 
is satisfied. +func testDeviceWithConstraint(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-constraint-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "./input/device_with_constraint.hcl", jobID, "") + must.Len(t, 1, allocs, must.Sprint("expected 1 allocation")) + + alloc, _, err := nomadClient.Allocations().Info(allocs[0].ID, nil) + must.NoError(t, err) + must.Eq(t, api.AllocClientStatusRunning, alloc.ClientStatus, + must.Sprintf("allocation status: %s, description: %s", + alloc.ClientStatus, alloc.ClientDescription)) + + // Verify device was allocated + must.NotNil(t, alloc.AllocatedResources) + taskResources := alloc.AllocatedResources.Tasks["sleep"] + must.NotNil(t, taskResources) + must.SliceNotEmpty(t, taskResources.Devices, + must.Sprint("expected devices to be allocated")) +} + +// testDeviceWithAffinity tests that a job with device count and affinity +// can be successfully scheduled. 
+func testDeviceWithAffinity(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-affinity-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "./input/device_with_affinity.hcl", jobID, "") + must.Len(t, 1, allocs, must.Sprint("expected 1 allocation")) + + alloc, _, err := nomadClient.Allocations().Info(allocs[0].ID, nil) + must.NoError(t, err) + must.Eq(t, api.AllocClientStatusRunning, alloc.ClientStatus, + must.Sprintf("allocation status: %s, description: %s", + alloc.ClientStatus, alloc.ClientDescription)) + + // Verify device was allocated + must.NotNil(t, alloc.AllocatedResources) + taskResources := alloc.AllocatedResources.Tasks["sleep"] + must.NotNil(t, taskResources) + must.SliceNotEmpty(t, taskResources.Devices, + must.Sprint("expected devices to be allocated")) +} + +// testDeviceWithConstraintAndAffinity tests that a job with device count, +// constraint, and affinity can be successfully scheduled. 
+func testDeviceWithConstraintAndAffinity(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-both-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "./input/device_with_constraint_and_affinity.hcl", jobID, "") + must.Len(t, 1, allocs, must.Sprint("expected 1 allocation")) + + alloc, _, err := nomadClient.Allocations().Info(allocs[0].ID, nil) + must.NoError(t, err) + must.Eq(t, api.AllocClientStatusRunning, alloc.ClientStatus, + must.Sprintf("allocation status: %s, description: %s", + alloc.ClientStatus, alloc.ClientDescription)) + + // Verify devices were allocated + must.NotNil(t, alloc.AllocatedResources) + taskResources := alloc.AllocatedResources.Tasks["sleep"] + must.NotNil(t, taskResources) + must.SliceNotEmpty(t, taskResources.Devices, + must.Sprint("expected devices to be allocated")) + + // Verify 2 devices were allocated + totalDevices := 0 + for _, deviceResource := range taskResources.Devices { + totalDevices += len(deviceResource.DeviceIDs) + } + must.Eq(t, 2, totalDevices, must.Sprint("expected exactly 2 devices")) +} + +// testDeviceConstraintNoMatch tests that when a device constraint cannot be +// satisfied, the job fails to schedule with appropriate error messages. 
+func testDeviceConstraintNoMatch(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-nomatch-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // Parse and register the job + job, err := e2eutil.Parse2(t, "./input/device_constraint_no_match.hcl") + must.NoError(t, err) + job.ID = &jobID + + resp, _, err := nomadClient.Jobs().Register(job, nil) + must.NoError(t, err) + + evalID := resp.EvalID + + // Wait for the evaluation to complete (it should fail to place) + var eval *api.Evaluation + testutil.WaitForResultRetries(30, func() (bool, error) { + time.Sleep(500 * time.Millisecond) + eval, _, err = nomadClient.Evaluations().Info(evalID, nil) + if err != nil { + return false, err + } + if eval.Status == api.EvalStatusComplete || eval.Status == api.EvalStatusBlocked { + return true, nil + } + return false, fmt.Errorf("eval status: %s", eval.Status) + }, func(err error) { + must.NoError(t, err) + }) + + // The evaluation should have failed task group allocations + must.MapNotEmpty(t, eval.FailedTGAllocs, + must.Sprint("expected failed task group allocations")) + + // Check that the failure is due to device exhaustion or constraint filtering + for _, metrics := range eval.FailedTGAllocs { + exhausted := metrics.NodesExhausted > 0 || + len(metrics.DimensionExhausted) > 0 || + len(metrics.ConstraintFiltered) > 0 + must.True(t, exhausted, + must.Sprintf("expected device exhaustion, got metrics: %+v", metrics)) + } +} + +// TestDeviceParsing tests that traditional device configurations (count, +// constraint, affinity) are parsed correctly. These are unit-style tests +// that don't require a running Nomad cluster. 
+func TestDeviceParsing(t *testing.T) { + t.Run("testParseDeviceCountOnly", testParseDeviceCountOnly) + t.Run("testParseDeviceWithConstraint", testParseDeviceWithConstraint) + t.Run("testParseDeviceWithAffinity", testParseDeviceWithAffinity) + t.Run("testParseDeviceWithConstraintAndAffinity", testParseDeviceWithConstraintAndAffinity) +} + +// testParseDeviceCountOnly verifies parsing of a device with only count. +func testParseDeviceCountOnly(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/device_count_only.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + must.Len(t, 1, job.TaskGroups) + task := job.TaskGroups[0].Tasks[0] + must.NotNil(t, task.Resources) + must.Len(t, 1, task.Resources.Devices) + + device := task.Resources.Devices[0] + must.Eq(t, "nomad/file/mock", device.Name) + must.Eq(t, uint64(1), *device.Count) + must.Len(t, 0, device.Constraints) + must.Len(t, 0, device.Affinities) + must.Len(t, 0, device.FirstAvailable) +} + +// testParseDeviceWithConstraint verifies parsing of a device with count and constraint. +func testParseDeviceWithConstraint(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/device_with_constraint.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + task := job.TaskGroups[0].Tasks[0] + device := task.Resources.Devices[0] + + must.Eq(t, "nomad/file/mock", device.Name) + must.Eq(t, uint64(1), *device.Count) + must.Len(t, 1, device.Constraints) + must.Eq(t, "${device.attr.cool-attribute}", device.Constraints[0].LTarget) + must.Eq(t, "attribute-wearing-sunglasses", device.Constraints[0].RTarget) + must.Len(t, 0, device.Affinities) + must.Len(t, 0, device.FirstAvailable) +} + +// testParseDeviceWithAffinity verifies parsing of a device with count and affinity.
+func testParseDeviceWithAffinity(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/device_with_affinity.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + task := job.TaskGroups[0].Tasks[0] + device := task.Resources.Devices[0] + + must.Eq(t, "nomad/file/mock", device.Name) + must.Eq(t, uint64(1), *device.Count) + must.Len(t, 0, device.Constraints) + must.Len(t, 1, device.Affinities) + must.Eq(t, "${device.attr.cool-attribute}", device.Affinities[0].LTarget) + must.Eq(t, "high", device.Affinities[0].RTarget) + must.Eq(t, int8(100), *device.Affinities[0].Weight) + must.Len(t, 0, device.FirstAvailable) +} + +// testParseDeviceWithConstraintAndAffinity verifies parsing of a device with +// count, constraint, and affinity. +func testParseDeviceWithConstraintAndAffinity(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/device_with_constraint_and_affinity.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + task := job.TaskGroups[0].Tasks[0] + device := task.Resources.Devices[0] + + must.Eq(t, "nomad/file/mock", device.Name) + must.Eq(t, uint64(2), *device.Count) + + // Verify constraint matches the fixture's device block + must.Len(t, 1, device.Constraints) + must.Eq(t, "${device.attr.cool-attribute}", device.Constraints[0].LTarget) + must.Eq(t, "attribute-wearing-sunglasses", device.Constraints[0].RTarget) + + // Verify affinity + must.Len(t, 1, device.Affinities) + must.Eq(t, "${device.attr.priority}", device.Affinities[0].LTarget) + must.Eq(t, "high", device.Affinities[0].RTarget) + must.Eq(t, int8(50), *device.Affinities[0].Weight) + + // No first_available + must.Len(t, 0, device.FirstAvailable) +} diff --git a/e2e/devices/doc.go b/e2e/devices/doc.go new file mode 100644 index 00000000000..85336e16234 --- /dev/null +++ b/e2e/devices/doc.go @@ -0,0 +1,7 @@ +// Copyright IBM Corp. 2015, 2025 +// SPDX-License-Identifier: BUSL-1.1 + +// Package devices provides end-to-end tests for Nomad's device scheduling +// functionality, including the first_available feature for flexible device +// selection.
+package devices diff --git a/e2e/devices/first_available_test.go b/e2e/devices/first_available_test.go new file mode 100644 index 00000000000..aa288f6f4e2 --- /dev/null +++ b/e2e/devices/first_available_test.go @@ -0,0 +1,193 @@ +// Copyright IBM Corp. 2015, 2025 +// SPDX-License-Identifier: BUSL-1.1 + +package devices + +import ( + "fmt" + "testing" + "time" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" +) + +// TestDeviceFirstAvailable runs end-to-end tests for the first_available +// device scheduling feature. These tests require: +// - A Nomad cluster with at least one Linux client +// - The example device plugin (nomad/file/mock) installed and configured +// - Mock device files created in the configured directory +// +// See plugins/device/cmd/example/README.md for setup instructions. +func TestDeviceFirstAvailable(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + e2eutil.WaitForLeader(t, nomadClient) + e2eutil.WaitForNodesReady(t, nomadClient, 1) + + // Check if any nodes have mock devices available + if !hasDevicePlugin(t, nomadClient, "nomad/file/mock") { + t.Skip("skipping: no nodes with nomad/file/mock device plugin") + } + + t.Run("testFirstAvailableSelectsCorrectOption", testFirstAvailableSelectsCorrectOption) + t.Run("testFirstAvailableNoMatch", testFirstAvailableNoMatch) +} + +// testFirstAvailableSelectsCorrectOption tests that first_available correctly +// evaluates options in order and selects the appropriate one. The first option +// has an impossible constraint (should fail), so the scheduler must fall back +// to the second option. We verify by checking that exactly 2 devices were +// allocated (second option's count), not 1 (first option's count) or 3 (third +// option's count). 
+func testFirstAvailableSelectsCorrectOption(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-fa-second-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // Register the job - first option has impossible constraint (should fail), + // second option requests 2 devices (should be selected) + allocs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "./input/first_available_with_basic.hcl", jobID, "") + must.Len(t, 1, allocs, must.Sprint("expected 1 allocation")) + + // Verify the allocation is running (fallback to second option succeeded) + alloc, _, err := nomadClient.Allocations().Info(allocs[0].ID, nil) + must.NoError(t, err) + must.Eq(t, api.AllocClientStatusRunning, alloc.ClientStatus, + must.Sprintf("allocation status: %s, description: %s", + alloc.ClientStatus, alloc.ClientDescription)) + + // Verify devices were allocated + must.NotNil(t, alloc.AllocatedResources) + taskResources := alloc.AllocatedResources.Tasks["sleep"] + must.NotNil(t, taskResources) + must.SliceNotEmpty(t, taskResources.Devices, + must.Sprint("expected devices to be allocated")) + + // Count total devices allocated - should be 2 (second option), not 1 (first option) + totalDevices := 0 + for _, deviceResource := range taskResources.Devices { + totalDevices += len(deviceResource.DeviceIDs) + } + must.Eq(t, 2, totalDevices, + must.Sprint("expected exactly 2 devices from SECOND option, got different count indicating wrong option selected")) +} + +// testFirstAvailableNoMatch tests that when no first_available options can be +// satisfied, the job fails to schedule with appropriate error messages. 
+func testFirstAvailableNoMatch(t *testing.T) { + nomadClient := e2eutil.NomadClient(t) + + jobID := "device-fa-nomatch-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // Parse and register the job + job, err := e2eutil.Parse2(t, "./input/first_available_no_match.hcl") + must.NoError(t, err) + job.ID = &jobID + + resp, _, err := nomadClient.Jobs().Register(job, nil) + must.NoError(t, err) + + evalID := resp.EvalID + + // Wait for the evaluation to complete (it should fail to place) + var eval *api.Evaluation + testutil.WaitForResultRetries(30, func() (bool, error) { + time.Sleep(500 * time.Millisecond) + eval, _, err = nomadClient.Evaluations().Info(evalID, nil) + if err != nil { + return false, err + } + // Wait until eval is complete or blocked + if eval.Status == api.EvalStatusComplete || eval.Status == api.EvalStatusBlocked { + return true, nil + } + return false, fmt.Errorf("eval status: %s", eval.Status) + }, func(err error) { + must.NoError(t, err) + }) + + // The evaluation should have failed task group allocations + must.MapNotEmpty(t, eval.FailedTGAllocs, + must.Sprint("expected failed task group allocations")) + + // Check that the failure is due to device exhaustion + for _, metrics := range eval.FailedTGAllocs { + // Should see nodes exhausted or constraint filtered + exhausted := metrics.NodesExhausted > 0 || + len(metrics.DimensionExhausted) > 0 || + len(metrics.ConstraintFiltered) > 0 + must.True(t, exhausted, + must.Sprintf("expected device exhaustion, got metrics: %+v", metrics)) + } +} + +// TestDeviceFirstAvailableParsing tests that jobs with first_available blocks +// are parsed correctly. These are unit-style tests that don't require a +// running Nomad cluster. 
+func TestDeviceFirstAvailableParsing(t *testing.T) { + t.Run("testParseFirstAvailable", testParseFirstAvailable) + t.Run("testParseWithBaseConstraint", testParseWithBaseConstraint) +} + +// testParseFirstAvailable verifies parsing of first_available with multiple +// options including constraints. +func testParseFirstAvailable(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/first_available_with_basic.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + // Verify the structure was parsed correctly + must.Len(t, 1, job.TaskGroups) + task := job.TaskGroups[0].Tasks[0] + must.NotNil(t, task.Resources) + must.Len(t, 1, task.Resources.Devices) + + device := task.Resources.Devices[0] + must.Eq(t, "nomad/file/mock", device.Name) + must.Len(t, 3, device.FirstAvailable, + must.Sprint("expected 3 first_available options")) + + // Verify first option: count=1, with impossible constraint + opt1 := device.FirstAvailable[0] + must.Eq(t, uint64(1), *opt1.Count) + must.Len(t, 1, opt1.Constraints) + must.Eq(t, "${device.attr.impossible_attr}", opt1.Constraints[0].LTarget) + must.Eq(t, "impossible_value", opt1.Constraints[0].RTarget) + + // Verify second option: count=2, no constraints + opt2 := device.FirstAvailable[1] + must.Eq(t, uint64(2), *opt2.Count) + must.Len(t, 0, opt2.Constraints) + + // Verify third option: count=3, no constraints + opt3 := device.FirstAvailable[2] + must.Eq(t, uint64(3), *opt3.Count) + must.Len(t, 0, opt3.Constraints) +} + +// testParseWithBaseConstraint verifies parsing with base and option constraints. 
+func testParseWithBaseConstraint(t *testing.T) { + job, err := e2eutil.Parse2(t, "./input/first_available_with_base_constraint.hcl") + must.NoError(t, err) + must.NotNil(t, job) + + task := job.TaskGroups[0].Tasks[0] + device := task.Resources.Devices[0] + + // Verify base constraint exists + must.Len(t, 1, device.Constraints, + must.Sprint("expected 1 base constraint")) + must.Eq(t, "${device.attr.cool-attribute}", device.Constraints[0].LTarget) + + // Verify first_available options also have their own constraints + must.Len(t, 2, device.FirstAvailable) + must.Len(t, 1, device.FirstAvailable[0].Constraints) + must.Len(t, 1, device.FirstAvailable[1].Constraints) +} diff --git a/e2e/devices/input/device_constraint_no_match.hcl b/e2e/devices/input/device_constraint_no_match.hcl new file mode 100644 index 00000000000..e7271618a04 --- /dev/null +++ b/e2e/devices/input/device_constraint_no_match.hcl @@ -0,0 +1,36 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for device constraint that cannot be satisfied. +# The job should fail to schedule because no device matches the constraint. + +job "device-constraint-nomatch" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + count = 1 + + constraint { + attribute = "${device.attr.cool-attribute}" + value = "impossible-value-that-will-never-match" + } + } + } + } + } +} diff --git a/e2e/devices/input/device_count_only.hcl b/e2e/devices/input/device_count_only.hcl new file mode 100644 index 00000000000..43192495293 --- /dev/null +++ b/e2e/devices/input/device_count_only.hcl @@ -0,0 +1,30 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Basic test for device scheduling with only count specified. 
+ +job "device-count-only" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + count = 1 + } + } + } + } +} diff --git a/e2e/devices/input/device_with_affinity.hcl b/e2e/devices/input/device_with_affinity.hcl new file mode 100644 index 00000000000..c2d0eee88d7 --- /dev/null +++ b/e2e/devices/input/device_with_affinity.hcl @@ -0,0 +1,36 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for device scheduling with count and affinity. + +job "device-with-affinity" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + count = 1 + + affinity { + attribute = "${device.attr.cool-attribute}" + value = "high" + weight = 100 + } + } + } + } + } +} diff --git a/e2e/devices/input/device_with_constraint.hcl b/e2e/devices/input/device_with_constraint.hcl new file mode 100644 index 00000000000..38bb79b5892 --- /dev/null +++ b/e2e/devices/input/device_with_constraint.hcl @@ -0,0 +1,35 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for device scheduling with count and constraint. 
+ +job "device-with-constraint" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + count = 1 + + constraint { + attribute = "${device.attr.cool-attribute}" + value = "attribute-wearing-sunglasses" + } + } + } + } + } +} diff --git a/e2e/devices/input/device_with_constraint_and_affinity.hcl b/e2e/devices/input/device_with_constraint_and_affinity.hcl new file mode 100644 index 00000000000..410f5c51c77 --- /dev/null +++ b/e2e/devices/input/device_with_constraint_and_affinity.hcl @@ -0,0 +1,41 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for device scheduling with count, constraint, and affinity combined. + +job "device-constraint-affinity" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + count = 2 + + constraint { + attribute = "${device.attr.cool-attribute}" + value = "attribute-wearing-sunglasses" + } + + affinity { + attribute = "${device.attr.priority}" + value = "high" + weight = 50 + } + } + } + } + } +} diff --git a/e2e/devices/input/first_available_no_match.hcl b/e2e/devices/input/first_available_no_match.hcl new file mode 100644 index 00000000000..92bfd9c0465 --- /dev/null +++ b/e2e/devices/input/first_available_no_match.hcl @@ -0,0 +1,44 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for first_available when no options can be satisfied. +# All options have impossible constraints, so the job should fail to schedule. 
+ +job "device-first-available-nomatch" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + first_available { + count = 100 + constraint { + attribute = "${device.attr.nonexistent1}" + value = "impossible1" + } + } + first_available { + count = 100 + constraint { + attribute = "${device.attr.nonexistent2}" + value = "impossible2" + } + } + } + } + } + } +} diff --git a/e2e/devices/input/first_available_with_base_constraint.hcl b/e2e/devices/input/first_available_with_base_constraint.hcl new file mode 100644 index 00000000000..8c3fd57f45a --- /dev/null +++ b/e2e/devices/input/first_available_with_base_constraint.hcl @@ -0,0 +1,51 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test for first_available with base constraints. +# The device block has a base constraint that all options must satisfy, +# plus each first_available option can have additional constraints. 
+ +job "device-first-available-base" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + # Base constraint applied to all first_available options + constraint { + attribute = "${device.attr.cool-attribute}" + value = "attribute-wearing-sunglasses" + } + + first_available { + count = 2 + constraint { + attribute = "${device.attr.type}" + value = "premium" + } + } + first_available { + count = 1 + constraint { + attribute = "${device.attr.type}" + value = "standard" + } + } + } + } + } + } +} diff --git a/e2e/devices/input/first_available_with_basic.hcl b/e2e/devices/input/first_available_with_basic.hcl new file mode 100644 index 00000000000..4cde776897a --- /dev/null +++ b/e2e/devices/input/first_available_with_basic.hcl @@ -0,0 +1,49 @@ +# Copyright IBM Corp. 2015, 2025 +# SPDX-License-Identifier: BUSL-1.1 + +# Test that the SECOND option is selected when the first cannot be satisfied. +# Option 1: 1 device with impossible constraint (should fail) +# Option 2: 2 devices with no constraints (should be selected) +# +# We verify by checking that exactly 2 devices were allocated. 
+ +job "device-first-available-second" { + type = "batch" + + group "test" { + count = 1 + + task "sleep" { + driver = "raw_exec" + + config { + command = "sleep" + args = ["30"] + } + + resources { + cpu = 10 + memory = 64 + + device "nomad/file/mock" { + # First option: impossible constraint (should fail) + first_available { + count = 1 + constraint { + attribute = "${device.attr.impossible_attr}" + value = "impossible_value" + } + } + # Second option: request 2 devices (should be selected) + first_available { + count = 2 + } + # Third option: request 3 devices (should not be selected) + first_available { + count = 3 + } + } + } + } + } +} diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 61cb37ee822..fb385c47b6d 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -16,6 +16,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/consul" _ "github.com/hashicorp/nomad/e2e/csi" _ "github.com/hashicorp/nomad/e2e/deployment" + _ "github.com/hashicorp/nomad/e2e/devices" _ "github.com/hashicorp/nomad/e2e/eval_priority" _ "github.com/hashicorp/nomad/e2e/events" _ "github.com/hashicorp/nomad/e2e/lifecycle" diff --git a/e2e/terraform/main.tf b/e2e/terraform/main.tf index acafb1fc967..76c5ae9f15b 100644 --- a/e2e/terraform/main.tf +++ b/e2e/terraform/main.tf @@ -16,6 +16,7 @@ module "provision-infra" { nomad_local_binary = var.nomad_local_binary nomad_local_binary_client_ubuntu_jammy = var.nomad_local_binary_client_ubuntu_jammy nomad_local_binary_client_windows_2022 = var.nomad_local_binary_client_windows_2022 + device_plugin_local_binary = var.device_plugin_local_binary nomad_license = var.nomad_license consul_license = var.consul_license nomad_region = var.nomad_region diff --git a/e2e/terraform/provision-infra/nomad.tf b/e2e/terraform/provision-infra/nomad.tf index 23dc7813499..43086de7178 100644 --- a/e2e/terraform/provision-infra/nomad.tf +++ b/e2e/terraform/provision-infra/nomad.tf @@ -55,6 +55,8 @@ module "nomad_client_ubuntu_jammy" { nomad_region = var.nomad_region
nomad_local_binary = local.linux_binary + device_plugin_local_binary = var.device_plugin_local_binary + tls_ca_key = tls_private_key.ca.private_key_pem tls_ca_cert = tls_self_signed_cert.ca.cert_pem diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux.hcl index 20f24261a8a..1dfd5e9000a 100644 --- a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux.hcl +++ b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux.hcl @@ -51,3 +51,11 @@ plugin "nomad-driver-exec2" { unveil_paths = ["r:/etc/mime.types"] } } + +plugin "nomad-device-example" { + config { + dir = "/tmp/nomad-device" + list_period = "1s" + unhealthy_perm = "-rwxrwxrwx" + } +} diff --git a/e2e/terraform/provision-infra/provision-nomad/install-linux.tf b/e2e/terraform/provision-infra/provision-nomad/install-linux.tf index c99e9f02c48..afc6a733a85 100644 --- a/e2e/terraform/provision-infra/provision-nomad/install-linux.tf +++ b/e2e/terraform/provision-infra/provision-nomad/install-linux.tf @@ -31,6 +31,37 @@ resource "null_resource" "install_nomad_binary_linux" { } } +resource "null_resource" "install_device_plugin_linux" { + count = var.platform == "linux" && var.role == "client" && var.device_plugin_local_binary != "" ? 
1 : 0 + + connection { + type = "ssh" + user = var.connection.user + host = var.instance.public_ip + port = var.connection.port + private_key = file(var.connection.private_key) + timeout = "5m" + } + + provisioner "file" { + source = var.device_plugin_local_binary + destination = "/tmp/nomad-device-example" + } + provisioner "remote-exec" { + inline = [ + "sudo mv /tmp/nomad-device-example /opt/nomad/plugins/nomad-device-example", + "sudo chmod +x /opt/nomad/plugins/nomad-device-example", + # Create mock device directory and files for e2e device tests + "sudo mkdir -p /tmp/nomad-device", + "sudo touch /tmp/nomad-device/device01", + "sudo touch /tmp/nomad-device/device02", + "sudo touch /tmp/nomad-device/device03", + # Mark device01 as unhealthy + "sudo chmod 0777 /tmp/nomad-device/device01", + ] + } +} + resource "null_resource" "install_consul_configs_linux" { count = var.platform == "linux" ? 1 : 0 diff --git a/e2e/terraform/provision-infra/provision-nomad/variables.tf b/e2e/terraform/provision-infra/provision-nomad/variables.tf index b64fd37cca4..927f41e9d2e 100644 --- a/e2e/terraform/provision-infra/provision-nomad/variables.tf +++ b/e2e/terraform/provision-infra/provision-nomad/variables.tf @@ -99,3 +99,9 @@ variable "keys_dir" { description = "Directory where all the configuration TLS and SSH keys and certificates will be stored for provisioning" default = "" } + +variable "device_plugin_local_binary" { + type = string + description = "Path to the example device plugin binary for e2e device tests" + default = "" +} diff --git a/e2e/terraform/provision-infra/variables.tf b/e2e/terraform/provision-infra/variables.tf index 44345a5a18f..d343fcd0ede 100644 --- a/e2e/terraform/provision-infra/variables.tf +++ b/e2e/terraform/provision-infra/variables.tf @@ -125,3 +125,9 @@ variable "nomad_local_binary_client_windows_2022" { type = string default = "" } + +variable "device_plugin_local_binary" { + description = "Path to the example device plugin binary for e2e 
device tests" + type = string + default = "" +} diff --git a/e2e/terraform/terraform.tfvars b/e2e/terraform/terraform.tfvars index a454167079b..eb0fbbd5549 100644 --- a/e2e/terraform/terraform.tfvars +++ b/e2e/terraform/terraform.tfvars @@ -5,6 +5,10 @@ # with `make dev` or similar (../../ = this repository root) # before running `terraform apply` and created the /pkg/goos_goarch/binary # folder +# +# For the device e2e tests, also build the example device plugin: +# make pkg/linux_amd64/nomad-device-example nomad_local_binary = "../../pkg/linux_amd64/nomad" nomad_local_binary_client_windows_2022 = "../../pkg/windows_amd64/nomad.exe" +device_plugin_local_binary = "../../pkg/linux_amd64/nomad-device-example" diff --git a/e2e/terraform/variables.tf b/e2e/terraform/variables.tf index 7b586013648..e2af8ed5fee 100644 --- a/e2e/terraform/variables.tf +++ b/e2e/terraform/variables.tf @@ -122,3 +122,9 @@ variable "nomad_local_binary_client_windows_2022" { type = string default = "" } + +variable "device_plugin_local_binary" { + description = "Path to the example device plugin binary for e2e device tests" + type = string + default = "" +} diff --git a/nomad/mock/node.go b/nomad/mock/node.go index 2b2e88498c5..96cf3e719db 100644 --- a/nomad/mock/node.go +++ b/nomad/mock/node.go @@ -148,3 +148,42 @@ func NvidiaNode() *structs.Node { _ = n.ComputeClass() return n } + +// SharedNvidiaNode returns a node with two sharing enabled instances of an Nvidia GPU +func SharedNvidiaNode() *structs.Node { + n := Node() + n.NodeResources.Processors.Topology = structs.MockWorkstationTopology() + n.NodeResources.Devices = []*structs.NodeDeviceResource{ + { + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB), + "cuda_cores": psstructs.NewIntAttribute(3584, ""), + "graphics_clock": psstructs.NewIntAttribute(1480, psstructs.UnitMHz), + "memory_bandwidth": psstructs.NewIntAttribute(11, 
psstructs.UnitGBPerS), + }, + Instances: []*structs.NodeDevice{ + { + ID: uuid.Generate(), + Healthy: true, + Locality: &structs.NodeDeviceLocality{ + PciBusID: "0000:02:00.1", // node 0 + }, + Shared: structs.DeviceSharingActive, + }, + { + ID: uuid.Generate(), + Healthy: true, + Locality: &structs.NodeDeviceLocality{ + PciBusID: "0000:02:01.1", // node 0 + }, + Shared: structs.DeviceSharingActive, + }, + }, + }, + } + _ = n.ComputeClass() + return n +} diff --git a/nomad/structs/constraint_test.go b/nomad/structs/constraint_test.go index 716ef167dfd..524faea8d5f 100644 --- a/nomad/structs/constraint_test.go +++ b/nomad/structs/constraint_test.go @@ -87,6 +87,11 @@ func TestValidateConstraintTarget(t *testing.T) { inputTarget: "${device.type}", expectedErrorMsg: "", }, + { + name: "valid device.attr", + inputTarget: "${device.attr.model}", + expectedErrorMsg: "", + }, { name: "missing closing brace", inputTarget: "${node.datacenter", diff --git a/nomad/structs/devices.go b/nomad/structs/devices.go index a1049208a9e..78061251fe4 100644 --- a/nomad/structs/devices.go +++ b/nomad/structs/devices.go @@ -3,7 +3,9 @@ package structs -import "maps" +import ( + "maps" +) // DeviceAccounter is used to account for device usage on a node. It can detect // when a node is oversubscribed and can be used for deciding what devices are @@ -44,6 +46,20 @@ func (dai *DeviceAccounterInstance) Copy() *DeviceAccounterInstance { } } +// GetSharedByID returns the underlying Shared string value of the instance +// of the specific deviceID. +// +// If no instance matching the deviceID is found or if Shared is nil +// an empty string, equivalent to DeviceSharingUnset is returned +func (dai *DeviceAccounterInstance) GetSharedByID(instanceID string) Shared { + for _, instance := range dai.Device.Instances { + if instance.ID == instanceID { + return instance.Shared + } + } + return "" +} + // NewDeviceAccounter returns a new device accounter. 
The node is used to // populate the set of available devices based on what healthy device instances // exist on the node. @@ -90,7 +106,8 @@ func (d *DeviceAccounter) Copy() *DeviceAccounter { // AddAllocs takes a set of allocations and internally marks which devices are // used. If a device is used more than once by the set of passed allocations, -// the collision will be returned as true. +// the collision will be returned as true unless it has been placed on a +// device that explicitly allows sharing. func (d *DeviceAccounter) AddAllocs(allocs []*Allocation) (collision bool) { for _, a := range allocs { // Filter any terminal allocation @@ -107,22 +124,23 @@ func (d *DeviceAccounter) AddAllocs(allocs []*Allocation) (collision bool) { // Go through each task resource for _, tr := range a.AllocatedResources.Tasks { - // Go through each assigned device group - for _, device := range tr.Devices { - devID := device.ID() + for _, allocatedDeviceGroup := range tr.Devices { + devID := allocatedDeviceGroup.ID() // Go through each assigned device - for _, instanceID := range device.DeviceIDs { - + for _, instanceID := range allocatedDeviceGroup.DeviceIDs { // Mark that we are using the device. It may not be in the // map if the device is no longer being fingerprinted, is // unhealthy, etc. - if devInst, ok := d.Devices[*devID]; ok { - if i, ok := devInst.Instances[instanceID]; ok { + if devAccounter, ok := d.Devices[*devID]; ok { + if i, ok := devAccounter.Instances[instanceID]; ok { // Mark that the device is in use - devInst.Instances[instanceID]++ - + devAccounter.Instances[instanceID]++ + shared := devAccounter.GetSharedByID(instanceID) + if shared == DeviceSharingActive { + continue + } if i != 0 { collision = true } @@ -136,30 +154,54 @@ func (d *DeviceAccounter) AddAllocs(allocs []*Allocation) (collision bool) { return } +// willingToShare is called in the loop that marks each reserved instance as used +// in the accounter. 
It takes a deviceID string and uses it to look up +// whether the task requesting the device is willing to share it +func willingToShare(res *AllocatedDeviceResource, deviceID string) bool { + // res.WillShare is nil => return false as default and do reservation as usual + if res.WillShare == nil { + return false + } + // does exist, is true => this is the shared device, it will share => return true + if willing, exists := res.WillShare[deviceID]; willing && exists { + return true + } + // In all remaining cases we return false + return false +} + // AddReserved marks the device instances in the passed device reservation as -// used and returns if there is a collision. +// used, checks the res.WillShare map to see if the createOffer expected the device +// to share. If the device will share we do not report a collision even if it +// has already been used func (d *DeviceAccounter) AddReserved(res *AllocatedDeviceResource) (collision bool) { - // Lookup the device. - devInst, ok := d.Devices[*res.ID()] + // Lookup the deviceAccounter + devAccounter, ok := d.Devices[*res.ID()] if !ok { return false } // For each reserved instance, mark it as used for _, id := range res.DeviceIDs { - cur, ok := devInst.Instances[id] + cur, ok := devAccounter.Instances[id] if !ok { continue } - // It has already been used, so mark that there is a collision + // if offer expects device will share, mark device as used + // and continue without marking collision + if willingToShare(res, id) { + devAccounter.Instances[id]++ + continue + } + + // mark collision if device will not share and has already been used if cur != 0 { collision = true } + devAccounter.Instances[id]++ - devInst.Instances[id]++ } - return } diff --git a/nomad/structs/devices_test.go b/nomad/structs/devices_test.go index 694d98a534a..d6d83176bf6 100644 --- a/nomad/structs/devices_test.go +++ b/nomad/structs/devices_test.go @@ -32,6 +32,13 @@ func nvidiaAlloc() *Allocation { return a } +// sets the supplied
DeviceSharing on the node and returns the node and 1st deviceID +func sharedNodeWithDeviceID(node *Node, sharingStatus Shared) (*Node, string) { + node.NodeResources.Devices[0].Instances[0].Shared = sharingStatus + deviceID := node.NodeResources.Devices[0].Instances[0].ID + return node, deviceID +} + // devNode returns a node containing two devices, an nvidia gpu and an intel // FPGA. func devNode() *Node { @@ -150,20 +157,157 @@ func TestDeviceAccounter_AddAllocs_UnknownID(t *testing.T) { func TestDeviceAccounter_AddAllocs_Collision(t *testing.T) { ci.Parallel(t) - require := require.New(t) - n := devNode() - d := NewDeviceAccounter(n) - require.NotNil(d) + for _, tc := range []struct { + name string + shared bool + expCollision bool + }{ + { + name: "standard", + shared: false, + expCollision: true, + }, { + name: "sharedNode", + shared: true, + expCollision: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + + n := devNode() + if tc.shared { + n.NodeResources.Devices[0].Instances[0].Shared = DeviceSharingActive + n.NodeResources.Devices[0].Instances[1].Shared = DeviceSharingActive + } + d := NewDeviceAccounter(n) + must.NotNil(t, d) + // Create two allocations, both with the same device + a1, a2 := nvidiaAlloc(), nvidiaAlloc() + + nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID + a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} + a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} + + allocs := []*Allocation{a1, a2} + must.Eq(t, tc.expCollision, d.AddAllocs(allocs)) + + }) + } +} - // Create two allocations, both with the same device - a1, a2 := nvidiaAlloc(), nvidiaAlloc() +// Tests that allocs on any shared devices can be double scheduled +// if device and request both agree to share +func TestDeviceAccounter_AllocateAndReserveSharedDevices(t *testing.T) { + ci.Parallel(t) - nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID - 
a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} - a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID} + nvidiaNode, nvidiaSharedDeviceId := sharedNodeWithDeviceID(MockNvidiaNode(), DeviceSharingUnset) + sharedNvidiaNode, sharedNvidiaSharedDeviceId := sharedNodeWithDeviceID(MockNvidiaNode(), DeviceSharingActive) + sharedIntelNode, sharedIntelNodeSharedDeviceId := sharedNodeWithDeviceID(MockIntelNode(), DeviceSharingActive) + genNvidiaOrIntelAllocs := func(isNvidia bool, willShare bool, count int, sharedSharedDeviceId string) []*Allocation { + var ( + allocs []*Allocation + allocated *AllocatedDeviceResource + ) + if isNvidia { + allocated = &AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + } + } else { + allocated = &AllocatedDeviceResource{ + Type: "fpga", + Vendor: "intel", + Name: "F100", + } + } + // function to generate a single intel or nvidia allocation + genAlloc := func(ID string, allocated *AllocatedDeviceResource, willShare bool) *Allocation { + var SharedDeviceId string + if len(ID) == 0 { + SharedDeviceId = uuid.Generate() + } else { + SharedDeviceId = ID + } + allocated.DeviceIDs = []string{SharedDeviceId} + allocated.WillShare = map[string]bool{SharedDeviceId: willShare} + + a := MockAlloc() + a.AllocatedResources.Tasks["web"].Devices = []*AllocatedDeviceResource{allocated} + a.ClientStatus = AllocClientStatusPending + return a + } + + // build []*Allocation + for range count { + allocs = append(allocs, genAlloc(sharedSharedDeviceId, allocated, willShare)) + } + + return allocs + + } + for _, tc := range []struct { + name string + node *Node + SharedDeviceId string + allocs []*Allocation + allocWillCollide bool + reserveWillCollide bool + expectedCount int + }{ + { + name: "shared device- alloc passes, willing request- reservation passes", + node: sharedNvidiaNode, + allocs: genNvidiaOrIntelAllocs(true, true, 2, sharedNvidiaSharedDeviceId), + SharedDeviceId: 
sharedNvidiaSharedDeviceId, + allocWillCollide: false, + reserveWillCollide: false, + expectedCount: 3, + }, + { + name: "intel , reservation passes", + node: sharedIntelNode, + allocs: genNvidiaOrIntelAllocs(false, true, 2, sharedIntelNodeSharedDeviceId), + SharedDeviceId: sharedIntelNodeSharedDeviceId, + allocWillCollide: false, + reserveWillCollide: false, + expectedCount: 3, + }, + { + name: "unshared device- alloc collides, unsharing request- reservation collides", + node: nvidiaNode, + allocs: genNvidiaOrIntelAllocs(true, false, 2, nvidiaSharedDeviceId), + SharedDeviceId: nvidiaSharedDeviceId, + allocWillCollide: true, + reserveWillCollide: true, + expectedCount: 3, + }, + { + name: "shared device- alloc passes, unsharing request - reservation collides", + node: sharedNvidiaNode, + allocs: genNvidiaOrIntelAllocs(true, false, 2, sharedNvidiaSharedDeviceId), + SharedDeviceId: sharedNvidiaSharedDeviceId, + allocWillCollide: false, + reserveWillCollide: true, + expectedCount: 3, + }, + } { + t.Run(tc.name, func(t *testing.T) { + d := NewDeviceAccounter(tc.node) + // create allocations + collision := d.AddAllocs(tc.allocs) + + must.Eq(t, tc.allocWillCollide, collision) + // attempt to reserve one of the previously allocated devices + device := tc.allocs[0].AllocatedResources.Tasks["web"].Devices[0] + + deviceName := DeviceIdTuple{device.Vendor, device.Type, device.Name} + must.Eq(t, tc.reserveWillCollide, d.AddReserved(device)) + //demonstrate the Instance counter was incremented at each attempt + must.Eq(t, tc.expectedCount, d.Devices[deviceName].Instances[tc.SharedDeviceId]) + }) + } - allocs := []*Allocation{a1, a2} - require.True(d.AddAllocs(allocs)) } // Assert that devices are not freed when an alloc's ServerTerminalStatus is diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 589548bc49e..0cbb2e2e70f 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3038,6 +3038,80 @@ func (ns Networks) Modes() *set.Set[string] { }) 
} +// ShareDevices indicates whether the task should be placed on a shared device +// and, optionally, names the shared device ID that should be used. +type ShareDevices struct { + // Enabled is true when the task is willing to be placed on a shared device + Enabled bool `hcl:"enabled"` + // SharedDeviceId is an optional field for use in environments with + // multiple shared devices, to make the shared device ID available to + // the plugin. If in use alongside the device.id constraint, the two must + // match or the job will not be placed. + SharedDeviceId string `hcl:"shared_device_id,optional"` +} + +// DeviceOption represents a single option in a first_available device selection. +// Each option specifies a count and optional constraints that must be satisfied +// for this option to be selected. +type DeviceOption struct { + // Count is the number of requested devices for this option + Count uint64 + + // Constraints are a set of constraints to apply when selecting the device + // to use for this option. + Constraints Constraints + + // ShareDevices indicates whether this device option is willing to share + // TODO: determine if ShareDevices should be inherited or if, like count, + // it should only be set on one or the other + ShareDevices *ShareDevices `hcl:"share_devices,optional"` +} + +func (o *DeviceOption) Equal(other *DeviceOption) bool { + if o == other { + return true + } + if o == nil || other == nil { + return false + } + return o.Count == other.Count && o.Constraints.Equal(&other.Constraints) && + (o.ShareDevices == nil) == (other.ShareDevices == nil) && (o.ShareDevices == nil || *o.ShareDevices == *other.ShareDevices) +} + +func (o *DeviceOption) Copy() *DeviceOption { + if o == nil { + return nil + } + return &DeviceOption{ + Count: o.Count, + ShareDevices: o.ShareDevices, + Constraints: CopySliceConstraints(o.Constraints), + } +} + +func (o *DeviceOption) Validate() error { + if o == nil { + return nil + } + + var mErr multierror.Error + for idx, constr := range o.Constraints { + // Ensure that the constraint doesn't use an operand we do not allow + switch constr.Operand { + case ConstraintDistinctHosts,
ConstraintDistinctProperty: + outer := fmt.Errorf("Constraint %d validation failed: using unsupported operand %q", idx+1, constr.Operand) + _ = multierror.Append(&mErr, outer) + default: + if err := constr.Validate(); err != nil { + outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err) + _ = multierror.Append(&mErr, outer) + } + } + } + + return mErr.ErrorOrNil() +} + // RequestedDevice is used to request a device for a task. type RequestedDevice struct { // Name is the request name. The possible values are as follows: @@ -3051,16 +3125,28 @@ type RequestedDevice struct { // * "nvidia/gpu/GTX2080Ti" Name string - // Count is the number of requested devices + // Count is the number of requested devices. Mutually exclusive with + // FirstAvailable. Count uint64 // Constraints are a set of constraints to apply when selecting the device - // to use. + // to use. When FirstAvailable is specified, these constraints are applied + // as base constraints that all options must also satisfy. Constraints Constraints // Affinities are a set of affinities to apply when selecting the device - // to use. + // to use. When FirstAvailable is specified, these affinities are applied + // as base affinities for all options. Affinities Affinities + + // ShareDevices indicates whether the job should be placed on a shared device + // and is willing to share + ShareDevices *ShareDevices + + // FirstAvailable specifies a prioritized list of device options. The + // scheduler will attempt to satisfy each option in order, selecting the + // first one that can be fulfilled. Mutually exclusive with Count. 
+ FirstAvailable []*DeviceOption } func (r *RequestedDevice) String() string { @@ -3074,10 +3160,25 @@ func (r *RequestedDevice) Equal(o *RequestedDevice) bool { if r == nil || o == nil { return false } - return r.Name == o.Name && - r.Count == o.Count && - r.Constraints.Equal(&o.Constraints) && - r.Affinities.Equal(&o.Affinities) + if r.Name != o.Name || r.Count != o.Count { + return false + } + if !r.Constraints.Equal(&o.Constraints) || !r.Affinities.Equal(&o.Affinities) { + return false + } + // Compare ShareDevices by value; pointer identity would report two identical requests as different + if (r.ShareDevices == nil) != (o.ShareDevices == nil) || (r.ShareDevices != nil && *r.ShareDevices != *o.ShareDevices) { + return false + } + if len(r.FirstAvailable) != len(o.FirstAvailable) { + return false + } + for i, opt := range r.FirstAvailable { + if !opt.Equal(o.FirstAvailable[i]) { + return false + } + } + return true } func (r *RequestedDevice) Copy() *RequestedDevice { @@ -3089,6 +3190,13 @@ func (r *RequestedDevice) Copy() *RequestedDevice { nr.Constraints = CopySliceConstraints(nr.Constraints) nr.Affinities = CopySliceAffinities(nr.Affinities) + if len(r.FirstAvailable) > 0 { + nr.FirstAvailable = make([]*DeviceOption, len(r.FirstAvailable)) + for i, opt := range r.FirstAvailable { + nr.FirstAvailable[i] = opt.Copy() + } + } + return &nr } @@ -3127,6 +3235,12 @@ func (r *RequestedDevice) Validate() error { _ = multierror.Append(&mErr, errors.New("device name must be given as one of the following: type, vendor/type, or vendor/type/name")) } + // Count and FirstAvailable are mutually exclusive + if r.Count > 0 && len(r.FirstAvailable) > 0 { + _ = multierror.Append(&mErr, errors.New("'count' and 'first_available' are mutually exclusive")) + } + + // Validate base constraints for idx, constr := range r.Constraints { // Ensure that the constraint doesn't use an operand we do not allow switch constr.Operand { @@ -3140,6 +3254,8 @@ func (r *RequestedDevice) Validate() error { } } } + + // Validate base affinities for idx, affinity := range r.Affinities { if err := affinity.Validate(); err != nil { outer := fmt.Errorf("Affinity %d validation
failed: %s", idx+1, err) @@ -3147,6 +3263,18 @@ func (r *RequestedDevice) Validate() error { } } + // Validate each first_available option + for idx, opt := range r.FirstAvailable { + if opt == nil { + _ = multierror.Append(&mErr, fmt.Errorf("first_available %d is nil", idx+1)) + continue + } + if err := opt.Validate(); err != nil { + outer := fmt.Errorf("first_available %d validation failed: %s", idx+1, err) + _ = multierror.Append(&mErr, outer) + } + } + return mErr.ErrorOrNil() } @@ -3575,6 +3703,32 @@ func (n *NodeDeviceResource) Equal(o *NodeDeviceResource) bool { return true } +// Shared mirrors the plugin.Shared string enum found +// on Devices.DetectedDevice that some devices use to +// report the status and presence of sharing subsystems +type Shared string + +func (s Shared) String() string { + switch s { + case DeviceSharingInactive: + return "inactive" + case DeviceSharingIneligible: + return "ineligible" + case DeviceSharingActive: + return "active" + case DeviceSharingUnset: + return "" + } + return "" +} + +const ( + DeviceSharingUnset Shared = "" + DeviceSharingIneligible Shared = "ineligible" + DeviceSharingActive Shared = "active" + DeviceSharingInactive Shared = "inactive" +) + // NodeDevice is an instance of a particular device. type NodeDevice struct { // ID is the ID of the device. @@ -3590,6 +3744,10 @@ type NodeDevice struct { // Locality stores HW locality information for the node to optionally be // used when making placement decisions.
Locality *NodeDeviceLocality + + // Shared mirrors a string enum on device.DetectedDevice that some + // devices use to report status and presence of sharing subsystems + Shared Shared } func (n *NodeDevice) Equal(o *NodeDevice) bool { @@ -3609,6 +3767,8 @@ func (n *NodeDevice) Equal(o *NodeDevice) bool { return false } else if !n.Locality.Equal(o.Locality) { return false + } else if n.Shared != o.Shared { + return false } return true @@ -3624,7 +3784,6 @@ func (n *NodeDevice) Copy() *NodeDevice { // Copy the locality nn.Locality = nn.Locality.Copy() - return &nn } @@ -4176,6 +4335,10 @@ type AllocatedDeviceResource struct { // DeviceIDs is the set of allocated devices DeviceIDs []string + + // WillShare maps each allocated device ID to whether the + // requesting task is willing to share that device + WillShare map[string]bool } func (a *AllocatedDeviceResource) ID() *DeviceIdTuple { @@ -4208,6 +4371,11 @@ func (a *AllocatedDeviceResource) Copy() *AllocatedDeviceResource { // Copy the devices na.DeviceIDs = make([]string, len(a.DeviceIDs)) copy(na.DeviceIDs, a.DeviceIDs) + // Copy the sharing preferences so the clone neither loses nor aliases them + na.WillShare = make(map[string]bool, len(a.WillShare)) + for id, willing := range a.WillShare { + na.WillShare[id] = willing + } return &na } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 2f66c9c2de0..9eae90cd5ce 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -6780,6 +6780,180 @@ func TestDevicesEquals(t *testing.T) { } } +func TestDeviceOption_Equal(t *testing.T) { + ci.Parallel(t) + + must.Equal[*DeviceOption](t, nil, nil) + must.NotEqual[*DeviceOption](t, nil, new(DeviceOption)) + + opt1 := &DeviceOption{ + Count: 2, + Constraints: []*Constraint{ + {LTarget: "${attr.kernel.name}", Operand: "=", RTarget: "linux"}, + }, + } + + // Equal copy + opt2 := opt1.Copy() + must.True(t, opt1.Equal(opt2)) + + // Different count + opt3 := opt1.Copy() + opt3.Count = 4 + must.False(t, opt1.Equal(opt3)) + + // Different constraints + opt4 := opt1.Copy() + opt4.Constraints = []*Constraint{ + {LTarget:
"${attr.kernel.name}", Operand: "=", RTarget: "darwin"}, + } + must.False(t, opt1.Equal(opt4)) +} + +func TestDeviceOption_Copy(t *testing.T) { + ci.Parallel(t) + + // Nil copy + var nilOpt *DeviceOption + must.Nil(t, nilOpt.Copy()) + + opt := &DeviceOption{ + Count: 2, + Constraints: []*Constraint{ + {LTarget: "${attr.kernel.name}", Operand: "=", RTarget: "linux"}, + }, + } + + cp := opt.Copy() + must.True(t, opt.Equal(cp)) + + // Modify original, copy should be unchanged + opt.Count = 10 + opt.Constraints[0].RTarget = "darwin" + must.Eq(t, uint64(2), cp.Count) + must.Eq(t, "linux", cp.Constraints[0].RTarget) +} + +func TestDeviceOption_Validate(t *testing.T) { + ci.Parallel(t) + + // Valid option (no constraints - device constraints use ${device.*} which + // is validated at scheduler time, not job submission time) + opt := &DeviceOption{ + Count: 2, + } + must.NoError(t, opt.Validate()) + + // Invalid constraint operand (distinct_hosts not allowed for devices) + opt2 := &DeviceOption{ + Count: 1, + Constraints: []*Constraint{ + {LTarget: "${attr.kernel.name}", Operand: ConstraintDistinctHosts, RTarget: "true"}, + }, + } + err := opt2.Validate() + must.Error(t, err) + must.StrContains(t, err.Error(), "unsupported operand") +} + +func TestRequestedDevice_FirstAvailable_Equal(t *testing.T) { + ci.Parallel(t) + + rd1 := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{ + {Count: 2}, + {Count: 1}, + }, + } + + // Equal copy + rd2 := rd1.Copy() + must.True(t, rd1.Equal(rd2)) + + // Different number of options + rd3 := rd1.Copy() + rd3.FirstAvailable = rd3.FirstAvailable[:1] + must.False(t, rd1.Equal(rd3)) + + // Different option content + rd4 := rd1.Copy() + rd4.FirstAvailable[0].Count = 4 + must.False(t, rd1.Equal(rd4)) +} + +func TestRequestedDevice_FirstAvailable_Copy(t *testing.T) { + ci.Parallel(t) + + rd := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{ + {Count: 2, Constraints: []*Constraint{{LTarget: 
"${attr.kernel.name}", Operand: "=", RTarget: "linux"}}}, + }, + } + + cp := rd.Copy() + must.True(t, rd.Equal(cp)) + + // Modify original, copy should be unchanged + rd.FirstAvailable[0].Count = 10 + rd.FirstAvailable[0].Constraints[0].RTarget = "darwin" + must.Eq(t, uint64(2), cp.FirstAvailable[0].Count) + must.Eq(t, "linux", cp.FirstAvailable[0].Constraints[0].RTarget) +} + +func TestRequestedDevice_FirstAvailable_Validate(t *testing.T) { + ci.Parallel(t) + + // Valid first_available request (no constraints - device constraints use + // ${device.*} which is validated at scheduler time, not job submission time) + rd := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{ + {Count: 2}, + {Count: 1}, + }, + } + must.NoError(t, rd.Validate()) + + // Count and FirstAvailable are mutually exclusive + rd2 := &RequestedDevice{ + Name: "nvidia/gpu", + Count: 2, + FirstAvailable: []*DeviceOption{ + {Count: 1}, + }, + } + err := rd2.Validate() + must.Error(t, err) + must.StrContains(t, err.Error(), "mutually exclusive") + + // Invalid option in FirstAvailable (distinct_hosts not allowed) + rd3 := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{ + { + Count: 1, + Constraints: []*Constraint{ + {LTarget: "${attr.kernel.name}", Operand: ConstraintDistinctHosts, RTarget: "true"}, + }, + }, + }, + } + err = rd3.Validate() + must.Error(t, err) + must.StrContains(t, err.Error(), "first_available 1 validation failed") + + // Nil option in FirstAvailable + rd4 := &RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*DeviceOption{nil}, + } + err = rd4.Validate() + must.Error(t, err) + must.StrContains(t, err.Error(), "is nil") +} + func TestAllocatedPortMapping_Equal(t *testing.T) { ci.Parallel(t) diff --git a/nomad/structs/testing.go b/nomad/structs/testing.go index 1c37ae0ccb9..03cd6b78b97 100644 --- a/nomad/structs/testing.go +++ b/nomad/structs/testing.go @@ -256,6 +256,35 @@ func MockNvidiaNode() *Node { return n } +// 
MockIntelNode returns a mock node exposing one Intel F100 FPGA device group with two instances, one healthy and one unhealthy +func MockIntelNode() *Node { + n := MockNode() + n.NodeResources.Devices = []*NodeDeviceResource{ + { + Type: "fpga", + Vendor: "intel", + Name: "F100", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB), + }, + Instances: []*NodeDevice{ + { + ID: uuid.Generate(), + Healthy: true, + }, + { + ID: uuid.Generate(), + Healthy: false, + }, + }, + }, + } + err := n.ComputeClass() + if err != nil { + panic(fmt.Sprintf("failed to compute node class: %v", err)) + } + return n +} func MockJob() *Job { job := &Job{ Region: "global", diff --git a/plugins/device/device.go b/plugins/device/device.go index cf1df4bfd4a..ea90499aa1d 100644 --- a/plugins/device/device.go +++ b/plugins/device/device.go @@ -16,8 +16,28 @@ import ( const ( // DeviceTypeGPU is a canonical device type for a GPU. DeviceTypeGPU = "gpu" + + SharingUnset Shared = "" + SharingIneligible Shared = "ineligible" + SharingActive Shared = "active" + SharingInactive Shared = "inactive" ) +type Shared string + +func (s Shared) String() string { + switch s { + case SharingInactive: + return "inactive" + case SharingIneligible: + return "ineligible" + case SharingActive: + return "active" + default: + return "unset" + } +} + var ( // ErrPluginDisabled indicates that the device plugin is disabled ErrPluginDisabled = fmt.Errorf("device is not enabled") @@ -133,6 +153,9 @@ type Device struct { // HwLocality captures hardware locality information for the device. HwLocality *DeviceLocality + + // Shared marks whether Device Sharing is enabled + Shared Shared } // Validate validates that the device is valid diff --git a/plugins/device/proto/device.pb.go b/plugins/device/proto/device.pb.go index 815268efc44..a255ad71e39 100644 --- a/plugins/device/proto/device.pb.go +++ b/plugins/device/proto/device.pb.go @@ -27,6 +27,37 @@ var _ = math.Inf // proto package needs to be updated. 
const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package +type Shared int32 + +const ( + Shared_SHARED_UNSET Shared = 0 + Shared_SHARED_ACTIVE Shared = 1 + Shared_SHARED_INACTIVE Shared = 2 + Shared_SHARED_INELIGIBLE Shared = 3 +) + +var Shared_name = map[int32]string{ + 0: "SHARED_UNSET", + 1: "SHARED_ACTIVE", + 2: "SHARED_INACTIVE", + 3: "SHARED_INELIGIBLE", +} + +var Shared_value = map[string]int32{ + "SHARED_UNSET": 0, + "SHARED_ACTIVE": 1, + "SHARED_INACTIVE": 2, + "SHARED_INELIGIBLE": 3, +} + +func (x Shared) String() string { + return proto.EnumName(Shared_name, int32(x)) +} + +func (Shared) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_5edb0c35c07fa415, []int{0} +} + // FingerprintRequest is used to request for devices to be fingerprinted. type FingerprintRequest struct { XXX_NoUnkeyedLiteral struct{} `json:"-"` @@ -193,10 +224,12 @@ type DetectedDevice struct { HealthDescription string `protobuf:"bytes,3,opt,name=health_description,json=healthDescription,proto3" json:"health_description,omitempty"` // hw_locality is optionally set to expose hardware locality information for // more optimal placement decisions. 
- HwLocality *DeviceLocality `protobuf:"bytes,4,opt,name=hw_locality,json=hwLocality,proto3" json:"hw_locality,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` + HwLocality *DeviceLocality `protobuf:"bytes,4,opt,name=hw_locality,json=hwLocality,proto3" json:"hw_locality,omitempty"` + // shared reports on the presence and state of a device sharing daemon + Shared Shared `protobuf:"varint,5,opt,name=shared,proto3,enum=hashicorp.nomad.plugins.device.Shared" json:"shared,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` } func (m *DetectedDevice) Reset() { *m = DetectedDevice{} } @@ -252,6 +285,13 @@ func (m *DetectedDevice) GetHwLocality() *DeviceLocality { return nil } +func (m *DetectedDevice) GetShared() Shared { + if m != nil { + return m.Shared + } + return Shared_SHARED_UNSET +} + // DeviceLocality is used to expose HW locality information about a device. type DeviceLocality struct { // pci_bus_id is the PCI bus ID for the device. 
If reported, it @@ -770,7 +810,48 @@ func (m *DeviceStats) GetTimestamp() *timestamp.Timestamp { return nil } +// DeviceSharing is a representation of the DeviceSharing string enum +type DeviceSharing struct { + Shared Shared `protobuf:"varint,1,opt,name=shared,proto3,enum=hashicorp.nomad.plugins.device.Shared" json:"shared,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *DeviceSharing) Reset() { *m = DeviceSharing{} } +func (m *DeviceSharing) String() string { return proto.CompactTextString(m) } +func (*DeviceSharing) ProtoMessage() {} +func (*DeviceSharing) Descriptor() ([]byte, []int) { + return fileDescriptor_5edb0c35c07fa415, []int{14} +} + +func (m *DeviceSharing) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_DeviceSharing.Unmarshal(m, b) +} +func (m *DeviceSharing) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_DeviceSharing.Marshal(b, m, deterministic) +} +func (m *DeviceSharing) XXX_Merge(src proto.Message) { + xxx_messageInfo_DeviceSharing.Merge(m, src) +} +func (m *DeviceSharing) XXX_Size() int { + return xxx_messageInfo_DeviceSharing.Size(m) +} +func (m *DeviceSharing) XXX_DiscardUnknown() { + xxx_messageInfo_DeviceSharing.DiscardUnknown(m) +} + +var xxx_messageInfo_DeviceSharing proto.InternalMessageInfo + +func (m *DeviceSharing) GetShared() Shared { + if m != nil { + return m.Shared + } + return Shared_SHARED_UNSET +} + func init() { + proto.RegisterEnum("hashicorp.nomad.plugins.device.Shared", Shared_name, Shared_value) proto.RegisterType((*FingerprintRequest)(nil), "hashicorp.nomad.plugins.device.FingerprintRequest") proto.RegisterType((*FingerprintResponse)(nil), "hashicorp.nomad.plugins.device.FingerprintResponse") proto.RegisterType((*DeviceGroup)(nil), "hashicorp.nomad.plugins.device.DeviceGroup") @@ -788,6 +869,7 @@ func init() { proto.RegisterType((*DeviceGroupStats)(nil), 
"hashicorp.nomad.plugins.device.DeviceGroupStats") proto.RegisterMapType((map[string]*DeviceStats)(nil), "hashicorp.nomad.plugins.device.DeviceGroupStats.InstanceStatsEntry") proto.RegisterType((*DeviceStats)(nil), "hashicorp.nomad.plugins.device.DeviceStats") + proto.RegisterType((*DeviceSharing)(nil), "hashicorp.nomad.plugins.device.DeviceSharing") } func init() { @@ -795,68 +877,74 @@ func init() { } var fileDescriptor_5edb0c35c07fa415 = []byte{ - // 965 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x56, 0xef, 0x8e, 0xdb, 0x44, - 0x10, 0x27, 0xc9, 0xe5, 0x92, 0x4c, 0xee, 0xae, 0x65, 0x7b, 0x42, 0xc6, 0x40, 0x7b, 0x58, 0x42, - 0x3a, 0x41, 0xeb, 0x94, 0x14, 0x89, 0x0a, 0x04, 0x52, 0xdb, 0x94, 0x5e, 0xf8, 0xd3, 0xab, 0xb6, - 0x15, 0x52, 0x8b, 0x84, 0xb5, 0x67, 0x2f, 0xf1, 0xb6, 0xf6, 0xda, 0xec, 0xae, 0x53, 0x99, 0x4f, - 0x3c, 0x0e, 0x5f, 0x78, 0x01, 0x1e, 0x86, 0x0f, 0x3c, 0x09, 0xf2, 0xee, 0x3a, 0xf1, 0xfd, 0xe9, - 0x25, 0x81, 0x4f, 0xde, 0x9d, 0x99, 0xdf, 0xcc, 0xec, 0xcc, 0x6f, 0x67, 0x0d, 0x1f, 0xe6, 0x49, - 0x31, 0x63, 0x5c, 0x8e, 0x22, 0x3a, 0x67, 0x21, 0x1d, 0xe5, 0x22, 0x53, 0x99, 0xdd, 0xf8, 0x7a, - 0x83, 0xae, 0xc7, 0x44, 0xc6, 0x2c, 0xcc, 0x44, 0xee, 0xf3, 0x2c, 0x25, 0x91, 0x6f, 0x21, 0xbe, - 0xb1, 0x72, 0x6f, 0xcc, 0xb2, 0x6c, 0x96, 0x58, 0xe8, 0x49, 0xf1, 0xcb, 0x48, 0xb1, 0x94, 0x4a, - 0x45, 0xd2, 0xdc, 0x38, 0x70, 0xaf, 0x9f, 0x35, 0x88, 0x0a, 0x41, 0x14, 0xcb, 0xb8, 0xd5, 0xdf, - 0xac, 0x73, 0x90, 0x31, 0x11, 0x34, 0x1a, 0x49, 0x25, 0x8a, 0x50, 0x49, 0x9b, 0x0b, 0x51, 0x4a, - 0xb0, 0x93, 0x42, 0xd9, 0x74, 0xdc, 0xc3, 0x4b, 0xad, 0xa5, 0x22, 0x4a, 0x1a, 0x4b, 0x6f, 0x1f, - 0xd0, 0x37, 0x8c, 0xcf, 0xa8, 0xc8, 0x05, 0xe3, 0x0a, 0xd3, 0x5f, 0x0b, 0x2a, 0x95, 0x47, 0xe1, - 0xda, 0x29, 0xa9, 0xcc, 0x33, 0x2e, 0x29, 0x7a, 0x0c, 0x3b, 0xe6, 0x3c, 0xc1, 0x4c, 0x64, 0x45, - 0xee, 0xb4, 0x0e, 0x3a, 0x87, 0xc3, 0xf1, 0x27, 0xfe, 0xe5, 0x87, 0xf7, 0x27, 0xfa, 0xf3, 0xa8, - 0x82, 0xe0, 0x61, 0xb4, 
0xdc, 0x78, 0xbf, 0x77, 0x60, 0xd8, 0x50, 0xa2, 0x77, 0x60, 0x7b, 0x4e, - 0x79, 0x94, 0x09, 0xa7, 0x75, 0xd0, 0x3a, 0x1c, 0x60, 0xbb, 0x43, 0x37, 0xc0, 0xc2, 0x02, 0x55, - 0xe6, 0xd4, 0x69, 0x6b, 0x25, 0x18, 0xd1, 0xb3, 0x32, 0xa7, 0x0d, 0x03, 0x4e, 0x52, 0xea, 0x74, - 0x9a, 0x06, 0x8f, 0x49, 0x4a, 0xd1, 0x11, 0xf4, 0xcc, 0x4e, 0x3a, 0x5b, 0x3a, 0x69, 0x7f, 0x75, - 0xd2, 0x8a, 0x86, 0x8a, 0x46, 0x26, 0x3f, 0x5c, 0xc3, 0xd1, 0x4f, 0x00, 0x8b, 0x6a, 0x4b, 0xa7, - 0xab, 0x9d, 0x7d, 0xb9, 0x41, 0x05, 0xfc, 0x7b, 0x0b, 0xf4, 0x43, 0xae, 0x44, 0x89, 0x1b, 0xee, - 0xdc, 0x1c, 0xae, 0x9c, 0x51, 0xa3, 0xab, 0xd0, 0x79, 0x45, 0x4b, 0x5b, 0x90, 0x6a, 0x89, 0x1e, - 0x41, 0x77, 0x4e, 0x92, 0xc2, 0xd4, 0x61, 0x38, 0xfe, 0xf4, 0x8d, 0xc1, 0x4d, 0xf3, 0x7d, 0xdb, - 0xfc, 0x65, 0x60, 0x6c, 0xf0, 0x5f, 0xb4, 0xef, 0xb6, 0xbc, 0xbf, 0x5a, 0xb0, 0x77, 0xfa, 0xa8, - 0x68, 0x0f, 0xda, 0xd3, 0x89, 0x0d, 0xd8, 0x9e, 0x4e, 0x90, 0x03, 0xbd, 0x98, 0x92, 0x44, 0xc5, - 0xa5, 0x8e, 0xd8, 0xc7, 0xf5, 0x16, 0xdd, 0x02, 0x64, 0x96, 0x41, 0x44, 0x65, 0x28, 0x58, 0x5e, - 0x11, 0xd6, 0x56, 0xff, 0x6d, 0xa3, 0x99, 0x2c, 0x15, 0xe8, 0x18, 0x86, 0xf1, 0xeb, 0x20, 0xc9, - 0x42, 0x92, 0x30, 0x55, 0x3a, 0x5b, 0x3a, 0x7d, 0x7f, 0xbd, 0xda, 0x7d, 0x6f, 0x51, 0x18, 0xe2, - 0xd7, 0xf5, 0xda, 0xf3, 0xab, 0xdc, 0x9b, 0x5a, 0xf4, 0x3e, 0x40, 0x1e, 0xb2, 0xe0, 0xa4, 0x90, - 0x01, 0x8b, 0xec, 0x19, 0xfa, 0x79, 0xc8, 0xee, 0x17, 0x72, 0x1a, 0x79, 0x23, 0xd8, 0xc3, 0x54, - 0x52, 0x31, 0xa7, 0x96, 0xe8, 0xe8, 0x03, 0xb0, 0x2c, 0x09, 0x58, 0x24, 0x35, 0x9f, 0x07, 0x78, - 0x60, 0x24, 0xd3, 0x48, 0x7a, 0x09, 0x5c, 0x59, 0x00, 0xec, 0x1d, 0x78, 0x0e, 0xbb, 0x61, 0xc6, - 0x15, 0x61, 0x9c, 0x8a, 0x40, 0x50, 0xa9, 0x83, 0x0c, 0xc7, 0x9f, 0xad, 0x3a, 0xc6, 0x83, 0x1a, - 0x64, 0x1c, 0xea, 0xbb, 0x8d, 0x77, 0xc2, 0x86, 0xd4, 0xfb, 0xa3, 0x0d, 0xfb, 0x17, 0x99, 0x21, - 0x0c, 0x5b, 0x94, 0xcf, 0xa5, 0xbd, 0x6f, 0x5f, 0xff, 0x97, 0x50, 0xfe, 0x43, 0x3e, 0xb7, 0x84, - 0xd3, 0xbe, 0xd0, 0x57, 0xb0, 0x9d, 0x66, 0x05, 0x57, 0xd2, 
0x69, 0x6b, 0xaf, 0x1f, 0xad, 0xf2, - 0xfa, 0x43, 0x65, 0x8d, 0x2d, 0x08, 0x4d, 0x96, 0x17, 0xaa, 0xa3, 0xf1, 0x1f, 0xaf, 0xd7, 0xc7, - 0xa7, 0x39, 0x0d, 0x17, 0x97, 0xc9, 0xfd, 0x1c, 0x06, 0x8b, 0xbc, 0x2e, 0x60, 0xfa, 0x7e, 0x93, - 0xe9, 0x83, 0x26, 0x6d, 0x7f, 0x86, 0xae, 0xce, 0x07, 0xbd, 0x07, 0x03, 0x45, 0xe4, 0xab, 0x20, - 0x27, 0x2a, 0xae, 0xfb, 0x5d, 0x09, 0x9e, 0x10, 0x15, 0x57, 0xca, 0x38, 0x93, 0xca, 0x28, 0x8d, - 0x8f, 0x7e, 0x25, 0xa8, 0x95, 0x82, 0x92, 0x28, 0xc8, 0x78, 0x52, 0x6a, 0xce, 0xf6, 0x71, 0xbf, - 0x12, 0x1c, 0xf3, 0xa4, 0xf4, 0x62, 0x80, 0x65, 0xbe, 0xff, 0x23, 0xc8, 0x01, 0x0c, 0x73, 0x2a, - 0x52, 0x26, 0x25, 0xcb, 0xb8, 0xb4, 0x57, 0xa3, 0x29, 0xf2, 0x5e, 0xc0, 0xce, 0xd3, 0x6a, 0x1e, - 0xd7, 0x8c, 0xfc, 0x16, 0xae, 0x85, 0x59, 0x92, 0xd0, 0xb0, 0xea, 0x5a, 0xc0, 0xb8, 0xaa, 0x3a, - 0x98, 0x58, 0x96, 0xbd, 0xeb, 0x9b, 0x67, 0xc2, 0xaf, 0x9f, 0x09, 0x7f, 0x62, 0x9f, 0x09, 0x8c, - 0x96, 0xa8, 0xa9, 0x05, 0x79, 0xcf, 0x61, 0xd7, 0xfa, 0xb6, 0xe4, 0x3d, 0x82, 0x6d, 0x3d, 0xb9, - 0x6b, 0x2a, 0xdd, 0xde, 0x60, 0x70, 0x19, 0x4f, 0x16, 0xef, 0xfd, 0xd9, 0x86, 0xab, 0x67, 0x95, - 0x6f, 0x9c, 0xdf, 0x08, 0xb6, 0x1a, 0x83, 0x5b, 0xaf, 0x2b, 0x59, 0x63, 0x56, 0xeb, 0x35, 0x7a, - 0x09, 0x7b, 0x8c, 0x4b, 0x45, 0x78, 0x48, 0x03, 0xfd, 0x48, 0xd9, 0x61, 0xfd, 0x60, 0xd3, 0x34, - 0xfd, 0xa9, 0x75, 0xa3, 0x77, 0x86, 0xf6, 0xbb, 0xac, 0x29, 0x73, 0x53, 0x40, 0xe7, 0x8d, 0x2e, - 0xe0, 0xe0, 0xbd, 0xd3, 0xd3, 0x76, 0xcd, 0xc7, 0xce, 0x14, 0xab, 0x41, 0xd8, 0xbf, 0x5b, 0xf5, - 0x53, 0x67, 0x4a, 0xf5, 0x1d, 0xf4, 0x64, 0x91, 0xa6, 0x44, 0x94, 0xb6, 0xb5, 0x6b, 0x8f, 0xf1, - 0x0a, 0xff, 0x63, 0xe5, 0x17, 0xd7, 0x1e, 0xd0, 0x11, 0x74, 0x4d, 0xb9, 0x4c, 0x8e, 0xe3, 0x4d, - 0x5c, 0x1d, 0x9f, 0xbc, 0xa4, 0xa1, 0xc2, 0xc6, 0x01, 0xba, 0x0b, 0x83, 0xc5, 0x9f, 0x89, 0x6e, - 0xcd, 0x70, 0xec, 0x9e, 0xe3, 0xdc, 0xb3, 0xda, 0x02, 0x2f, 0x8d, 0xc7, 0xff, 0xb4, 0x61, 0xc7, - 0x1c, 0xf0, 0x89, 0x0e, 0x86, 0x7e, 0x83, 0x61, 0xe3, 0x1f, 0x02, 0x8d, 0x57, 0x15, 0xee, 0xfc, - 
0x6f, 0x88, 0x7b, 0x67, 0x23, 0x8c, 0xe1, 0xb8, 0xf7, 0xd6, 0xed, 0x16, 0x4a, 0xa0, 0x67, 0xe7, - 0x36, 0x5a, 0xf9, 0xbe, 0x9c, 0x7e, 0x11, 0xdc, 0xd1, 0xda, 0xf6, 0x75, 0x3c, 0x14, 0x43, 0xd7, - 0x34, 0xf5, 0xe6, 0x2a, 0x6c, 0xf3, 0xa6, 0xbb, 0xb7, 0xd6, 0xb4, 0x5e, 0x9e, 0xeb, 0x7e, 0xef, - 0x45, 0xd7, 0x74, 0x61, 0x5b, 0x7f, 0xee, 0xfc, 0x1b, 0x00, 0x00, 0xff, 0xff, 0x11, 0xd4, 0x56, - 0x04, 0x9b, 0x0a, 0x00, 0x00, + // 1061 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x56, 0xdd, 0x6e, 0x1b, 0x45, + 0x14, 0xae, 0xed, 0xd8, 0x89, 0x8f, 0xf3, 0xe3, 0x4c, 0x02, 0x32, 0x06, 0xda, 0xb0, 0x12, 0x28, + 0x2a, 0xed, 0xba, 0xb8, 0x48, 0x54, 0x20, 0x2a, 0x25, 0xb1, 0x49, 0x0c, 0x21, 0xa9, 0x26, 0xa1, + 0x52, 0x8a, 0xc4, 0x6a, 0xb2, 0x3b, 0x78, 0xa7, 0xdd, 0x9d, 0x5d, 0x76, 0x66, 0x5d, 0x99, 0x2b, + 0x1e, 0x87, 0x1b, 0x5e, 0x89, 0x0b, 0x1e, 0x83, 0x2b, 0xb4, 0x33, 0xb3, 0xf6, 0xe6, 0xa7, 0xb5, + 0x5d, 0xae, 0x76, 0xe6, 0x9c, 0xf3, 0x7d, 0x73, 0xe6, 0xcc, 0xf9, 0x59, 0xf8, 0x24, 0x0e, 0xd2, + 0x21, 0xe3, 0xa2, 0xe3, 0xd1, 0x11, 0x73, 0x69, 0x27, 0x4e, 0x22, 0x19, 0x99, 0x8d, 0xad, 0x36, + 0xe8, 0xae, 0x4f, 0x84, 0xcf, 0xdc, 0x28, 0x89, 0x6d, 0x1e, 0x85, 0xc4, 0xb3, 0x0d, 0xc4, 0xd6, + 0x56, 0xed, 0x7b, 0xc3, 0x28, 0x1a, 0x06, 0x06, 0x7a, 0x99, 0xfe, 0xda, 0x91, 0x2c, 0xa4, 0x42, + 0x92, 0x30, 0xd6, 0x04, 0xed, 0xbb, 0xd7, 0x0d, 0xbc, 0x34, 0x21, 0x92, 0x45, 0xdc, 0xe8, 0x1f, + 0xe4, 0x3e, 0x08, 0x9f, 0x24, 0xd4, 0xeb, 0x08, 0x99, 0xa4, 0xae, 0x14, 0xc6, 0x17, 0x22, 0x65, + 0xc2, 0x2e, 0x53, 0x69, 0xdc, 0x69, 0xef, 0xbe, 0xd5, 0x5a, 0x48, 0x22, 0x85, 0xb6, 0xb4, 0xb6, + 0x01, 0x7d, 0xc7, 0xf8, 0x90, 0x26, 0x71, 0xc2, 0xb8, 0xc4, 0xf4, 0xb7, 0x94, 0x0a, 0x69, 0x51, + 0xd8, 0xba, 0x22, 0x15, 0x71, 0xc4, 0x05, 0x45, 0x27, 0xb0, 0xaa, 0xef, 0xe3, 0x0c, 0x93, 0x28, + 0x8d, 0x5b, 0xa5, 0x9d, 0xca, 0x6e, 0xa3, 0xfb, 0xb9, 0xfd, 0xf6, 0xcb, 0xdb, 0x3d, 0xf5, 0x39, + 0xcc, 0x20, 0xb8, 0xe1, 0x4d, 0x37, 0xd6, 0x1f, 0x15, 
0x68, 0x14, 0x94, 0xe8, 0x7d, 0xa8, 0x8d, + 0x28, 0xf7, 0xa2, 0xa4, 0x55, 0xda, 0x29, 0xed, 0xd6, 0xb1, 0xd9, 0xa1, 0x7b, 0x60, 0x60, 0x8e, + 0x1c, 0xc7, 0xb4, 0x55, 0x56, 0x4a, 0xd0, 0xa2, 0xf3, 0x71, 0x4c, 0x0b, 0x06, 0x9c, 0x84, 0xb4, + 0x55, 0x29, 0x1a, 0x9c, 0x90, 0x90, 0xa2, 0x23, 0x58, 0xd6, 0x3b, 0xd1, 0x5a, 0x52, 0x4e, 0xdb, + 0xb3, 0x9d, 0x96, 0xd4, 0x95, 0xd4, 0xd3, 0xfe, 0xe1, 0x1c, 0x8e, 0x7e, 0x06, 0x98, 0x44, 0x5b, + 0xb4, 0xaa, 0x8a, 0xec, 0x9b, 0x05, 0x22, 0x60, 0xef, 0x4d, 0xd0, 0x7d, 0x2e, 0x93, 0x31, 0x2e, + 0xd0, 0xb5, 0x63, 0xd8, 0xb8, 0xa6, 0x46, 0x4d, 0xa8, 0xbc, 0xa2, 0x63, 0x13, 0x90, 0x6c, 0x89, + 0x0e, 0xa1, 0x3a, 0x22, 0x41, 0xaa, 0xe3, 0xd0, 0xe8, 0x7e, 0xf1, 0xc6, 0xc3, 0xf5, 0xe3, 0xdb, + 0xe6, 0xf1, 0xa7, 0x07, 0x63, 0x8d, 0xff, 0xba, 0xfc, 0xa4, 0x64, 0xfd, 0x5b, 0x82, 0xf5, 0xab, + 0x57, 0x45, 0xeb, 0x50, 0x1e, 0xf4, 0xcc, 0x81, 0xe5, 0x41, 0x0f, 0xb5, 0x60, 0xd9, 0xa7, 0x24, + 0x90, 0xfe, 0x58, 0x9d, 0xb8, 0x82, 0xf3, 0x2d, 0x7a, 0x08, 0x48, 0x2f, 0x1d, 0x8f, 0x0a, 0x37, + 0x61, 0x71, 0x96, 0xb0, 0x26, 0xfa, 0x9b, 0x5a, 0xd3, 0x9b, 0x2a, 0xd0, 0x29, 0x34, 0xfc, 0xd7, + 0x4e, 0x10, 0xb9, 0x24, 0x60, 0x72, 0xdc, 0x5a, 0x52, 0xee, 0xdb, 0xf3, 0xc5, 0xee, 0xd8, 0xa0, + 0x30, 0xf8, 0xaf, 0xf3, 0x35, 0x7a, 0x0a, 0x35, 0x7d, 0xc7, 0x56, 0x75, 0xa7, 0xb4, 0xbb, 0xde, + 0xfd, 0x6c, 0x16, 0xd7, 0x99, 0xb2, 0xc6, 0x06, 0x65, 0xd9, 0xd9, 0xdd, 0x8b, 0xec, 0xe8, 0x23, + 0x80, 0xd8, 0x65, 0xce, 0x65, 0x2a, 0x1c, 0xe6, 0x99, 0x18, 0xac, 0xc4, 0x2e, 0xdb, 0x4f, 0xc5, + 0xc0, 0xb3, 0x3a, 0xb0, 0x8e, 0xa9, 0xa0, 0xc9, 0x88, 0x9a, 0x42, 0x41, 0x1f, 0x83, 0xc9, 0x32, + 0x87, 0x79, 0x42, 0xd5, 0x43, 0x1d, 0xd7, 0xb5, 0x64, 0xe0, 0x09, 0x2b, 0x80, 0x8d, 0x09, 0xc0, + 0xd4, 0xd0, 0x05, 0xac, 0xb9, 0x11, 0x97, 0x84, 0x71, 0x9a, 0x38, 0x09, 0x15, 0xea, 0x90, 0x46, + 0xf7, 0xcb, 0x59, 0xae, 0x1f, 0xe4, 0x20, 0x4d, 0xa8, 0x7a, 0x03, 0x5e, 0x75, 0x0b, 0x52, 0xeb, + 0xcf, 0x32, 0x6c, 0xdf, 0x66, 0x86, 0x30, 0x2c, 0x51, 0x3e, 0x12, 0xa6, 0x5e, 0x9f, 0xbe, 
0xcb, + 0x51, 0x76, 0x9f, 0x8f, 0x4c, 0xc2, 0x2a, 0x2e, 0xf4, 0x2d, 0xd4, 0xc2, 0x28, 0xe5, 0x52, 0xb4, + 0xca, 0x8a, 0xf5, 0xd3, 0x59, 0xac, 0x3f, 0x66, 0xd6, 0xd8, 0x80, 0x50, 0x6f, 0x5a, 0x90, 0x15, + 0x85, 0xbf, 0x3f, 0x5f, 0x1e, 0x9c, 0xc5, 0xd4, 0x9d, 0x14, 0x63, 0xfb, 0x2b, 0xa8, 0x4f, 0xfc, + 0xba, 0xa5, 0x52, 0xb6, 0x8b, 0x95, 0x52, 0x2f, 0xa6, 0xfd, 0x2f, 0x50, 0x55, 0xfe, 0xa0, 0x0f, + 0xa1, 0x2e, 0x89, 0x78, 0xe5, 0xc4, 0x44, 0xfa, 0xf9, 0x7b, 0x67, 0x82, 0x67, 0x44, 0xfa, 0x99, + 0xd2, 0x8f, 0x84, 0xd4, 0x4a, 0xcd, 0xb1, 0x92, 0x09, 0x72, 0x65, 0x42, 0x89, 0xe7, 0x44, 0x3c, + 0x18, 0xab, 0x9c, 0x5f, 0xc1, 0x2b, 0x99, 0xe0, 0x94, 0x07, 0x63, 0xcb, 0x07, 0x98, 0xfa, 0xfb, + 0x3f, 0x0e, 0xd9, 0x81, 0x46, 0x4c, 0x93, 0x90, 0x09, 0xc1, 0x22, 0x2e, 0x4c, 0x69, 0x15, 0x45, + 0xd6, 0x0b, 0x58, 0x3d, 0xcb, 0xfa, 0x79, 0x9e, 0x91, 0xdf, 0xc3, 0x96, 0x1b, 0x05, 0x01, 0x75, + 0xb3, 0x57, 0x73, 0x18, 0x97, 0xd9, 0x0b, 0x06, 0x26, 0xcb, 0x3e, 0xb0, 0xf5, 0x98, 0xb1, 0xf3, + 0x31, 0x63, 0xf7, 0xcc, 0x98, 0xc1, 0x68, 0x8a, 0x1a, 0x18, 0x90, 0x75, 0x01, 0x6b, 0x86, 0xdb, + 0x24, 0xef, 0x11, 0xd4, 0x54, 0xe7, 0xcf, 0x53, 0xe9, 0xd1, 0x02, 0x8d, 0x4f, 0x33, 0x19, 0xbc, + 0xf5, 0x57, 0x19, 0x9a, 0xd7, 0x95, 0x6f, 0xec, 0xff, 0x08, 0x96, 0x0a, 0x8d, 0x5f, 0xad, 0x33, + 0x59, 0xa1, 0xd7, 0xab, 0x35, 0x7a, 0x09, 0xeb, 0x8c, 0x0b, 0x49, 0xb8, 0x4b, 0x1d, 0x35, 0xe4, + 0x4c, 0xb3, 0x3f, 0x58, 0xd4, 0x4d, 0x7b, 0x60, 0x68, 0xd4, 0x4e, 0xa7, 0xfd, 0x1a, 0x2b, 0xca, + 0xda, 0x21, 0xa0, 0x9b, 0x46, 0xb7, 0xe4, 0xe0, 0xde, 0xd5, 0x6e, 0x3d, 0xe7, 0xb0, 0xd4, 0xc1, + 0x2a, 0x24, 0xec, 0xdf, 0xa5, 0x7c, 0x54, 0xea, 0x50, 0xfd, 0x00, 0xcb, 0x22, 0x0d, 0x43, 0x92, + 0x8c, 0xcd, 0xd3, 0xce, 0x3d, 0x06, 0x32, 0xfc, 0xf3, 0x8c, 0x17, 0xe7, 0x0c, 0xe8, 0x08, 0xaa, + 0x3a, 0x5c, 0xda, 0xc7, 0xee, 0x22, 0x54, 0xa7, 0x97, 0x2f, 0xa9, 0x2b, 0xb1, 0x26, 0x40, 0x4f, + 0xa0, 0x3e, 0xf9, 0xb3, 0x51, 0x4f, 0xd3, 0xe8, 0xb6, 0x6f, 0xe4, 0xdc, 0x79, 0x6e, 0x81, 0xa7, + 0xc6, 0xd6, 0x29, 0xac, 0x99, 
0xfb, 0xf9, 0x24, 0x61, 0x7c, 0x58, 0x68, 0xee, 0xa5, 0x77, 0x69, + 0xee, 0xf7, 0x2f, 0xa0, 0xa6, 0x25, 0xa8, 0x09, 0xab, 0x67, 0x47, 0x7b, 0xb8, 0xdf, 0x73, 0x7e, + 0x3a, 0x39, 0xeb, 0x9f, 0x37, 0xef, 0xa0, 0x4d, 0x58, 0x33, 0x92, 0xbd, 0x83, 0xf3, 0xc1, 0xf3, + 0x7e, 0xb3, 0x84, 0xb6, 0x60, 0xc3, 0x88, 0x06, 0x27, 0x46, 0x58, 0x46, 0xef, 0xc1, 0xe6, 0x44, + 0xd8, 0x3f, 0x1e, 0x1c, 0x0e, 0xf6, 0x8f, 0xfb, 0xcd, 0x4a, 0xf7, 0x9f, 0x32, 0xac, 0x6a, 0x67, + 0x9f, 0x29, 0x17, 0xd0, 0xef, 0xd0, 0x28, 0xfc, 0x2f, 0xa1, 0xee, 0x2c, 0x57, 0x6f, 0xfe, 0x72, + 0xb5, 0x1f, 0x2f, 0x84, 0xd1, 0xf5, 0x68, 0xdd, 0x79, 0x54, 0x42, 0x01, 0x2c, 0x9b, 0x19, 0x83, + 0x66, 0xce, 0xd2, 0xab, 0xd3, 0xab, 0xdd, 0x99, 0xdb, 0x3e, 0x3f, 0x0f, 0xf9, 0x50, 0xd5, 0x09, + 0xf8, 0x60, 0xe6, 0x73, 0x14, 0xba, 0x52, 0xfb, 0xe1, 0x9c, 0xd6, 0xd3, 0x7b, 0xed, 0x2f, 0xbf, + 0xa8, 0xea, 0x8c, 0xa9, 0xa9, 0xcf, 0xe3, 0xff, 0x02, 0x00, 0x00, 0xff, 0xff, 0x32, 0x18, 0xe8, + 0x73, 0x87, 0x0b, 0x00, 0x00, } // Reference imports to suppress errors if they are not otherwise used. diff --git a/plugins/device/proto/device.proto b/plugins/device/proto/device.proto index 2387371dfdb..87db0828870 100644 --- a/plugins/device/proto/device.proto +++ b/plugins/device/proto/device.proto @@ -74,6 +74,9 @@ message DetectedDevice { // hw_locality is optionally set to expose hardware locality information for // more optimal placement decisions. DeviceLocality hw_locality = 4; + + // shared reports on the presence and state of a device sharing daemon + Shared shared = 5; } // DeviceLocality is used to expose HW locality information about a device. @@ -177,3 +180,15 @@ message DeviceStats { // timestamp is the time the statistics were collected. 
google.protobuf.Timestamp timestamp = 3; } + +// DeviceSharing is a representation of the DeviceSharing string enum +message DeviceSharing { + Shared shared = 1; +} + +enum Shared { + SHARED_UNSET = 0; + SHARED_ACTIVE = 1; + SHARED_INACTIVE = 2; + SHARED_INELIGIBLE = 3; +} diff --git a/plugins/device/util.go b/plugins/device/util.go index 24e99516e0c..234b41bb9b7 100644 --- a/plugins/device/util.go +++ b/plugins/device/util.go @@ -63,6 +63,7 @@ func convertProtoDevice(in *proto.DetectedDevice) *Device { Healthy: in.Healthy, HealthDesc: in.HealthDescription, HwLocality: convertProtoDeviceLocality(in.HwLocality), + Shared: convertProtoDeviceShared(in.GetShared()), } } @@ -77,6 +78,19 @@ func convertProtoDeviceLocality(in *proto.DeviceLocality) *DeviceLocality { } } +// convertProtoDeviceShared converts a proto Shared enum value into this package's Shared string enum +func convertProtoDeviceShared(in proto.Shared) Shared { + switch in { + case proto.Shared_SHARED_INELIGIBLE: + return SharingIneligible + case proto.Shared_SHARED_ACTIVE: + return SharingActive + case proto.Shared_SHARED_INACTIVE: + return SharingInactive + } + return SharingUnset +} + // convertProtoContainerReservation is used to convert between a proto and struct // ContainerReservation func convertProtoContainerReservation(in *proto.ContainerReservation) *ContainerReservation { @@ -199,6 +213,7 @@ func convertStructDevice(in *Device) *proto.DetectedDevice { Healthy: in.Healthy, HealthDescription: in.HealthDesc, HwLocality: convertStructDeviceLocality(in.HwLocality), + Shared: convertShared(in.Shared), } } @@ -388,3 +403,16 @@ func convertStructDeviceStats(in *DeviceStats) *proto.DeviceStats { Timestamp: ts, } } + +// convertShared converts this package's Shared string enum into the proto Shared enum; unrecognized values map to SHARED_UNSET +func convertShared(s Shared) proto.Shared { + switch s { + case SharingIneligible: + return proto.Shared_SHARED_INELIGIBLE + case SharingActive: + return proto.Shared_SHARED_ACTIVE + case SharingInactive: + return proto.Shared_SHARED_INACTIVE + } + return proto.Shared_SHARED_UNSET +} diff --git a/scheduler/feasible/device.go 
b/scheduler/feasible/device.go index 66f083f7c92..6c8588595a2 100644 --- a/scheduler/feasible/device.go +++ b/scheduler/feasible/device.go @@ -102,50 +102,112 @@ func (m *memoryNodeMatcher) Matches(instanceID string, device *structs.NodeDevic // createOffer takes a device request and returns an assignment as well as a // score for the assignment. If no assignment is possible, an error is -// returned explaining why. -func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.RequestedDevice) (out *structs.AllocatedDeviceResource, score float64, err error) { +// returned explaining why. The returned sumMatchedAffinityWeights is the sum +// of affinity weights that matched, and totalAffinityWeight is the sum of +// absolute values of all affinity weights considered (for normalization). +func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.RequestedDevice) (out *structs.AllocatedDeviceResource, sumMatchedAffinityWeights float64, totalAffinityWeight float64, err error) { // Try to hot path if len(d.Devices) == 0 { - return nil, 0.0, fmt.Errorf("no devices available") + return nil, 0.0, 0.0, fmt.Errorf("no devices available") } + // Handle first_available selection + if len(ask.FirstAvailable) > 0 { + return d.createOfferFirstAvailable(mem, ask) + } + if ask.Count == 0 { - return nil, 0.0, fmt.Errorf("invalid request of zero devices") + return nil, 0.0, 0.0, fmt.Errorf("invalid request of zero devices") + } + + return d.createOfferWithParams(mem, ask.ID(), ask.Count, ask.Constraints, ask.Affinities, ask.ShareDevices) +} + +// createOfferFirstAvailable tries each option in the FirstAvailable list in order, +// returning the first successful offer. 
+func (d *deviceAllocator) createOfferFirstAvailable(mem *memoryNodeMatcher, ask *structs.RequestedDevice) (out *structs.AllocatedDeviceResource, sumMatchedAffinityWeights float64, totalAffinityWeight float64, err error) { + var lastErr error + + for _, opt := range ask.FirstAvailable { + if opt.Count == 0 { + continue + } + + // Combine base constraints with option-specific constraints + combinedConstraints := make(structs.Constraints, 0, len(ask.Constraints)+len(opt.Constraints)) + combinedConstraints = append(combinedConstraints, ask.Constraints...) + combinedConstraints = append(combinedConstraints, opt.Constraints...) + + offer, matchedWeights, totalWeight, offerErr := d.createOfferWithParams(mem, ask.ID(), opt.Count, + combinedConstraints, ask.Affinities, opt.ShareDevices) + if offer != nil { + return offer, matchedWeights, totalWeight, nil + } + lastErr = offerErr + } + + // None of the options could be satisfied + if lastErr != nil { + return nil, 0.0, 0.0, fmt.Errorf("no first_available option could be satisfied: %v", lastErr) } + return nil, 0.0, 0.0, fmt.Errorf("no first_available options defined") +} +// createOfferWithParams is the core offer creation logic that can be used for both +// standard requests and first_available options. 
+func (d *deviceAllocator) createOfferWithParams(mem *memoryNodeMatcher, deviceID *structs.DeviceIdTuple, count uint64, + constraints structs.Constraints, affinities structs.Affinities, shareDevices *structs.ShareDevices) (out *structs.AllocatedDeviceResource, sumMatchedAffinityWeights float64, totalAffinityWeight float64, err error) { // Hold the current best offer var offer *structs.AllocatedDeviceResource var offerScore float64 var matchedWeights float64 + // Calculate the total weight of all affinities (for normalization purposes) + var totalWeight float64 + for _, a := range affinities { + totalWeight += math.Abs(float64(a.Weight)) + } + // Determine the devices that are feasible based on availability and // constraints for id, devInst := range d.Devices { - // Check if the device works - if !nodeDeviceMatches(d.ctx, devInst.Device, ask) { + // Check if the device works (name/type match and constraints) + if !d.deviceMatchesWithConstraints(devInst.Device, deviceID, constraints) { continue } // Check if we have enough unused instances to use this assignable := []string{} - for instanceID, v := range devInst.Instances { - if v != 0 { + willShare := make(map[string]bool) + + for instanceID, claimCount := range devInst.Instances { + if claimCount != 0 && devInst.GetSharedByID(instanceID) != structs.DeviceSharingActive { continue } + if !mem.Matches(instanceID, devInst.Device) { continue } - if d.deviceIDMatchesConstraint(instanceID, ask.Constraints, devInst.Device) { - assignable = append(assignable, instanceID) + + if !d.deviceIDMatchesConstraint(instanceID, constraints, devInst.Device) { + continue + } + if !d.sharedDeviceIDMatches(instanceID, shareDevices) { + continue } + // if the task is willing to share, document in deviceAllocator + if d.deviceIDAllowsSharing(instanceID, shareDevices, devInst.Device) { + //only update willShare map if assignable & willing to share + willShare[instanceID] = true + } + assignable = append(assignable, instanceID) // Don't 
assign more than the ask - if len(assignable) == int(ask.Count) { + if len(assignable) == int(count) { break } } - // This device doesn't have enough instances - if len(assignable) < int(ask.Count) { + if len(assignable) < int(count) { continue } @@ -155,15 +217,11 @@ func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.Reque // Track the sum of matched affinity weights in a separate variable // We return this if this device had the best score compared to other devices considered var sumMatchedWeights float64 - if l := len(ask.Affinities); l != 0 { - totalWeight := 0.0 - for _, a := range ask.Affinities { + if len(affinities) != 0 { + for _, a := range affinities { // Resolve the targets lVal, lOk := resolveDeviceTarget(a.LTarget, devInst.Device) rVal, rOk := resolveDeviceTarget(a.RTarget, devInst.Device) - - totalWeight += math.Abs(float64(a.Weight)) - // Check if satisfied if !checkAttributeAffinity(d.ctx, a.Operand, lVal, rVal, lOk, rOk) { continue @@ -173,9 +231,10 @@ func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.Reque } // normalize - choiceScore /= totalWeight + if totalWeight > 0 { + choiceScore /= totalWeight + } } - // Only use the device if it is a higher score than we have already seen if offer != nil && choiceScore < offerScore { continue @@ -193,15 +252,39 @@ func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.Reque Type: id.Type, Name: id.Name, DeviceIDs: assignable, + WillShare: willShare, } } // Failed to find a match if offer == nil { - return nil, 0.0, fmt.Errorf("no devices match request") + return nil, 0.0, 0.0, fmt.Errorf("no devices match request") + } + + return offer, matchedWeights, totalWeight, nil +} + +// deviceMatchesWithConstraints checks if a device matches the given device ID +// and constraints. This is used for offer creation where we have explicit +// parameters rather than a full RequestedDevice. 
+func (d *deviceAllocator) deviceMatchesWithConstraints(device *structs.NodeDeviceResource, deviceID *structs.DeviceIdTuple, constraints structs.Constraints) bool { + if !device.ID().Matches(deviceID) { + return false + } + + // Check constraints + for _, c := range constraints { + // Resolve the targets + lVal, lOk := resolveDeviceTarget(c.LTarget, device) + rVal, rOk := resolveDeviceTarget(c.RTarget, device) + + // Check if satisfied + if !checkAttributeConstraint(d.ctx, c.Operand, lVal, rVal, lOk, rOk) { + return false + } } - return offer, matchedWeights, nil + return true } // deviceIDMatchesConstraint checks a device instance ID against the constraints @@ -234,3 +317,49 @@ func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints struc return true } + +// deviceIDAllowsSharing checks a device instance ID against the device's +// Shared status to ensure we're only assigning devices that can share +func (d *deviceAllocator) deviceIDAllowsSharing(id string, shareDevices *structs.ShareDevices, device *structs.NodeDeviceResource) bool { + canShare := false + if shareDevices == nil { + return canShare + } + for _, dev := range device.Instances { + if dev.ID != id { + continue + } + // return true if the device has sharing active and the task will share + if shareDevices.Enabled && dev.Shared == structs.DeviceSharingActive { + canShare = true + } + + } + + return canShare +} +func (d *deviceAllocator) sharedDeviceIDMatches(instanceID string, shareDevices *structs.ShareDevices) bool { + if shareDevices == nil { + return true + } + // if we're targeting a specific GPU confirm its the one we want + if shareDevices.SharedDeviceId != "" && shareDevices.SharedDeviceId != instanceID { + return false + } + return true +} + +//// deviceIDConstraintAndSharingChecks returns a single boolean to report whether +//// device ID matches all of the constraints and if applicable all of the +//// requested sharing modes +//func (d *deviceAllocator) 
deviceIDConstraintAndSharingChecks(id string, constraints structs.Constraints, sharing *structs.ShareDevices, device *structs.NodeDeviceResource) bool { +// if passesConstraint := d.deviceIDMatchesConstraint(id, constraints, device); !passesConstraint { +// return false +// } +// if sharing != nil { +// if passesSharing := d.deviceIDAllowsSharing(id, sharing, device); !passesSharing { +// return false +// } +// } +// return true +//} diff --git a/scheduler/feasible/device_test.go b/scheduler/feasible/device_test.go index 14f52cfb13d..c4536560840 100644 --- a/scheduler/feasible/device_test.go +++ b/scheduler/feasible/device_test.go @@ -112,7 +112,7 @@ func TestDeviceAllocator_Allocate_GenericRequest(t *testing.T) { ask := deviceRequest("gpu", 1, nil, nil) mem := anyMemoryNodeMatcher() - out, score, err := d.createOffer(mem, ask) + out, score, _, err := d.createOffer(mem, ask) must.NotNil(t, out) must.Zero(t, score) must.NoError(t, err) @@ -135,7 +135,7 @@ func TestDeviceAllocator_Allocate_FullyQualifiedRequest(t *testing.T) { ask := deviceRequest("intel/fpga/F100", 1, nil, nil) mem := anyMemoryNodeMatcher() - out, score, err := d.createOffer(mem, ask) + out, score, _, err := d.createOffer(mem, ask) must.NotNil(t, out) must.Zero(t, score) must.NoError(t, err) @@ -158,7 +158,7 @@ func TestDeviceAllocator_Allocate_NotEnoughInstances(t *testing.T) { ask := deviceRequest("gpu", 4, nil, nil) mem := anyMemoryNodeMatcher() - out, _, err := d.createOffer(mem, ask) + out, _, _, err := d.createOffer(mem, ask) must.Nil(t, out) must.ErrorContains(t, err, "no devices match request") } @@ -177,7 +177,7 @@ func TestDeviceAllocator_Allocate_NUMA_available(t *testing.T) { topology: structs.MockWorkstationTopology(), devices: set.From([]string{"nvidia/gpu/1080ti"}), } - out, _, err := d.createOffer(mem, ask) + out, _, _, err := d.createOffer(mem, ask) must.NoError(t, err) must.SliceLen(t, 2, out.DeviceIDs) // DeviceIDs are actually instance ids } @@ -210,7 +210,7 @@ func 
TestDeviceAllocator_Allocate_NUMA_node1(t *testing.T) { topology: structs.MockWorkstationTopology(), devices: set.From([]string{"xilinx/fpga/7XA"}), } - out, _, err := d.createOffer(mem, ask) + out, _, _, err := d.createOffer(mem, ask) must.NoError(t, err) must.SliceLen(t, 1, out.DeviceIDs) } @@ -332,7 +332,7 @@ func TestDeviceAllocate_Constraints_NoMemoryMatch(t *testing.T) { ask := deviceRequest(c.Name, 1, c.Constraints, nil) mem := anyMemoryNodeMatcher() - out, score, err := d.createOffer(mem, ask) + out, score, _, err := d.createOffer(mem, ask) if c.NoPlacement { must.Nil(t, out) } else { @@ -380,7 +380,7 @@ func TestDeviceAllocate_Constraints_MemoryMatch(t *testing.T) { }, devices: set.From([]string{nvidia0.ID().String()}), } - out, _, err := d.createOffer(mem, ask) + out, _, _, err := d.createOffer(mem, ask) // the first memoryNodeMatcher does not have the correct memoryNode must.ErrorContains(t, err, "no devices match") @@ -388,7 +388,7 @@ func TestDeviceAllocate_Constraints_MemoryMatch(t *testing.T) { // change to the correct node mem.memoryNode = 2 - out, _, err = d.createOffer(mem, ask) + out, _, _, err = d.createOffer(mem, ask) must.NoError(t, err) must.Len(t, 1, out.DeviceIDs) @@ -485,7 +485,7 @@ func TestDeviceAllocator_Affinities(t *testing.T) { ask := deviceRequest(c.Name, 1, nil, c.Affinities) mem := anyMemoryNodeMatcher() - out, score, err := d.createOffer(mem, ask) + out, score, _, err := d.createOffer(mem, ask) must.NotNil(t, out) must.NoError(t, err) if c.ZeroScore { @@ -501,6 +501,286 @@ func TestDeviceAllocator_Affinities(t *testing.T) { } } +// Test FirstAvailable: first option is selected when it can be satisfied +func TestDeviceAllocator_FirstAvailable_SelectsFirstOption(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := multipleNvidiaNode() + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + nvidia0 := n.NodeResources.Devices[0] // 1080ti with 2 instances + nvidia1 := n.NodeResources.Devices[1] // 2080ti with 2 
instances + + // Build a request that prefers 1080ti first, then falls back to 2080ti + ask := &structs.RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "1080ti", + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "2080ti", + }, + }, + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, _, _, err := d.createOffer(mem, ask) + must.NoError(t, err) + must.NotNil(t, out) + + // Should select 1080ti (first option) + must.Eq(t, "1080ti", out.Name) + must.SliceLen(t, 1, out.DeviceIDs) + must.SliceContains(t, collectInstanceIDs(nvidia0), out.DeviceIDs[0]) + _ = nvidia1 // silence unused warning +} + +// Test FirstAvailable: falls back to second option when first cannot be satisfied +func TestDeviceAllocator_FirstAvailable_FallsBackToSecondOption(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := multipleNvidiaNode() + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + nvidia1 := n.NodeResources.Devices[1] // 2080ti with 2 instances + + // Build a request where first option cannot be satisfied (no H100) + // but second option can (2080ti exists) + ask := &structs.RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "H100", // doesn't exist + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "2080ti", + }, + }, + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, _, _, err := d.createOffer(mem, ask) + must.NoError(t, err) + must.NotNil(t, out) + + // Should select 2080ti (second option since first failed) + must.Eq(t, "2080ti", out.Name) + must.SliceLen(t, 1, out.DeviceIDs) + must.SliceContains(t, 
collectInstanceIDs(nvidia1), out.DeviceIDs[0]) +} + +// Test FirstAvailable: count requirements are respected +func TestDeviceAllocator_FirstAvailable_CountRequirements(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := multipleNvidiaNode() + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + nvidia1 := n.NodeResources.Devices[1] // 2080ti with 2 instances + + // Build a request where first option needs 4 GPUs (not available) + // but second option only needs 2 + ask := &structs.RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 4, // can't satisfy - not enough instances + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "1080ti", + }, + }, + }, + { + Count: 2, // can satisfy + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "2080ti", + }, + }, + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, _, _, err := d.createOffer(mem, ask) + must.NoError(t, err) + must.NotNil(t, out) + + // Should select 2080ti with 2 instances + must.Eq(t, "2080ti", out.Name) + must.SliceLen(t, 2, out.DeviceIDs) + must.SliceContainsSubset(t, collectInstanceIDs(nvidia1), out.DeviceIDs) +} + +// Test FirstAvailable: base constraints are applied to all options +func TestDeviceAllocator_FirstAvailable_BaseConstraints(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := devNode() // has nvidia/gpu/1080ti and intel/fpga/F100 + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + // Build a request with a base constraint that limits to nvidia vendor + // First option asks for a model that doesn't exist, second asks for 1080ti + ask := &structs.RequestedDevice{ + Name: "gpu", + // Base constraint: must be nvidia + Constraints: []*structs.Constraint{ + { + LTarget: "${device.vendor}", + Operand: "=", + RTarget: "nvidia", + }, + }, + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: 
[]*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "H100", // doesn't exist + }, + }, + }, + { + Count: 1, + // No additional constraints - should match nvidia/gpu/1080ti + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, _, _, err := d.createOffer(mem, ask) + must.NoError(t, err) + must.NotNil(t, out) + + // Should select nvidia device (second option) + must.Eq(t, "nvidia", out.Vendor) +} + +// Test FirstAvailable: all options fail returns error +func TestDeviceAllocator_FirstAvailable_AllOptionsFail(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := devNode() + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + // Build a request where no option can be satisfied + ask := &structs.RequestedDevice{ + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "H100", // doesn't exist + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + LTarget: "${device.model}", + Operand: "=", + RTarget: "GH200", // doesn't exist either + }, + }, + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, _, _, err := d.createOffer(mem, ask) + must.Nil(t, out) + must.ErrorContains(t, err, "no first_available option could be satisfied") +} + +// Test FirstAvailable: base affinities are applied to all options +func TestDeviceAllocator_FirstAvailable_BaseAffinities(t *testing.T) { + ci.Parallel(t) + + _, ctx := MockContext(t) + n := multipleNvidiaNode() + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + + // Build a request with base affinities that apply to all first_available options + ask := &structs.RequestedDevice{ + Name: "nvidia/gpu", + // Base affinity applies to whichever option is selected + Affinities: []*structs.Affinity{ + { + LTarget: "${device.attr.memory}", + Operand: ">", + RTarget: "10 GiB", + Weight: 50, + }, + }, + FirstAvailable: []*structs.DeviceOption{ + { + 
Count: 1, + }, + }, + } + + mem := anyMemoryNodeMatcher() + out, sumMatched, totalWeight, err := d.createOffer(mem, ask) + must.NoError(t, err) + must.NotNil(t, out) + + // Base affinity should have been considered + must.Eq(t, 50.0, totalWeight) + // sumMatched depends on which device was selected and matched + must.True(t, sumMatched >= 0) +} + func Test_equalBusID(t *testing.T) { must.True(t, equalBusID("0000:03:00.1", "00000000:03:00.1")) must.False(t, equalBusID("0000:03:00.1", "0000:03:00.0")) @@ -615,3 +895,104 @@ func Test_memoryNodeMatcher(t *testing.T) { }) } } + +func TestDeviceAllocator_Allocate_SharedDevices(t *testing.T) { + ci.Parallel(t) + + n := mock.SharedNvidiaNode() + nvidia0 := n.NodeResources.Devices[0] + SharedDeviceId0 := n.NodeResources.Devices[0].Instances[0] + SharedDeviceId1 := n.NodeResources.Devices[0].Instances[1] + _, ctx := MockContext(t) + d := newDeviceAllocator(ctx, n) + must.NotNil(t, d) + mem := &memoryNodeMatcher{ + memoryNode: -1, // we are not testing + } + + for _, tc := range []struct { + name string + deviceName string + deviceID string + shareDevices *structs.ShareDevices + count uint64 + expectedErr string + }{ + { + name: "happy path", + deviceName: "nvidia/gpu", + deviceID: SharedDeviceId0.ID, + shareDevices: &structs.ShareDevices{Enabled: true}, + count: 1, + }, + { + name: "structs.ShareDevices can be nil", + deviceName: "nvidia/gpu", + deviceID: SharedDeviceId0.ID, + shareDevices: nil, + count: 1, + }, + { + name: "if shareDevices enabled, device must be sharable", + deviceName: "nvidia/gpu", + deviceID: SharedDeviceId0.ID, + shareDevices: &structs.ShareDevices{Enabled: true}, + count: 1, + }, + { + name: "if present, SharedDeviceID must match allocated device", + deviceName: "nvidia/gpu", + deviceID: SharedDeviceId0.ID, + shareDevices: &structs.ShareDevices{Enabled: true, SharedDeviceId: SharedDeviceId1.ID}, + count: 1, + expectedErr: "no devices match request", + }, + { + name: "sharing passes, constraint doesn't 
match", + deviceName: "nvidia/gpu", + deviceID: "notanID", + shareDevices: &structs.ShareDevices{Enabled: true}, + count: 1, + expectedErr: "no devices match request", + }, + } { + t.Run(tc.name, func(t *testing.T) { + testConstraints := []*structs.Constraint{ + { + LTarget: "${device.ids}", + Operand: "set_contains", + RTarget: tc.deviceID, + }, + } + ask := &structs.RequestedDevice{ + Name: tc.deviceName, + Count: tc.count, + Constraints: testConstraints, + ShareDevices: tc.shareDevices, + } + + out, _, _, err := d.createOffer(mem, ask) + if tc.expectedErr != "" { + must.ErrorContains(t, err, tc.expectedErr) + must.Nil(t, out) + return + } + must.NoError(t, err) + must.NotNil(t, out) + must.Len(t, 1, out.DeviceIDs) + // validate expected instance and device IDs + must.SliceContains(t, collectInstanceIDs(nvidia0), out.DeviceIDs[0]) + must.SliceContains(t, out.DeviceIDs, nvidia0.Instances[0].ID) + must.Eq(t, tc.deviceID, out.DeviceIDs[0]) + + if tc.shareDevices != nil { + must.MapContainsKey(t, out.WillShare, out.DeviceIDs[0]) + if tc.shareDevices.SharedDeviceId != "" { + must.SliceContains(t, out.DeviceIDs, tc.shareDevices.SharedDeviceId) + } + } + + }) + } + +} diff --git a/scheduler/feasible/feasible.go b/scheduler/feasible/feasible.go index 91f4cfcd5da..4e37ecfff71 100644 --- a/scheduler/feasible/feasible.go +++ b/scheduler/feasible/feasible.go @@ -1567,12 +1567,32 @@ func (c *DeviceChecker) hasDevices(option *structs.Node) bool { // Go through the required devices trying to find matches OUTER: for _, req := range c.required { - // Determine how many there are to place + // Handle first_available selection + if len(req.FirstAvailable) > 0 { + if c.canSatisfyFirstAvailable(req, available) { + continue OUTER + } + // None of the first_available options could be satisfied + return false + } + + // Standard device request - determine how many there are to place desiredCount := req.Count + var willShare bool + if req.ShareDevices != nil { + willShare = 
req.ShareDevices.Enabled + } // Go through the device resources and see if we have a match for d, unused := range available { - if unused == 0 { + sharable := false + if willShare { + s, ok := d.Attributes["shared"].GetString() + if ok && s == "active" { + sharable = true + } + } + if unused == 0 { // don't need to change this because we only decrement if device & task are not sharable // Depleted continue } @@ -1580,13 +1600,17 @@ OUTER: // Check the constraints if nodeDeviceMatches(c.ctx, d, req) { for desiredCount > 0 && available[d] > 0 { - available[d] -= 1 desiredCount -= 1 + // consume device if not sharable + if !sharable { + available[d] -= 1 + } } if desiredCount == 0 { continue OUTER } + } } @@ -1599,6 +1623,99 @@ OUTER: return true } +// canSatisfyFirstAvailable checks if any of the first_available options can be +// satisfied given the available devices. It tries each option in order and +// returns true if any option can be satisfied. If an option is satisfied, the +// available counts are decremented accordingly. +func (c *DeviceChecker) canSatisfyFirstAvailable(req *structs.RequestedDevice, available map[*structs.NodeDeviceResource]uint64) bool { + for _, opt := range req.FirstAvailable { + // Try to satisfy this option + if c.canSatisfyDeviceOption(req, opt, available) { + return true + } + } + return false +} + +// canSatisfyDeviceOption checks if a single device option can be satisfied. +// It combines the base constraints from the request with the option-specific +// constraints and checks if enough devices match. 
+func (c *DeviceChecker) canSatisfyDeviceOption(req *structs.RequestedDevice, opt *structs.DeviceOption, available map[*structs.NodeDeviceResource]uint64) bool { + desiredCount := opt.Count + var willShare bool + if opt.ShareDevices != nil { + willShare = opt.ShareDevices.Enabled + } + // Create a snapshot of available counts to restore if this option fails + snapshot := make(map[*structs.NodeDeviceResource]uint64, len(available)) + for k, v := range available { + snapshot[k] = v + } + + for d, unused := range available { + sharable := false + if willShare { + s, ok := d.Attributes["shared"].GetString() + if ok && s == "active" { + sharable = true + } + } + if unused == 0 { // don't need to change this because we only decrement if device & task are not sharable + // Depleted + continue + } + + // Check if device matches base requirements (name/type) + if !d.ID().Matches(req.ID()) { + continue + } + + // Check base constraints from the RequestedDevice + if !deviceMatchesConstraints(c.ctx, d, req.Constraints) { + continue + } + + // Check option-specific constraints + if !deviceMatchesConstraints(c.ctx, d, opt.Constraints) { + continue + } + + // This device type matches, consume instances + for desiredCount > 0 && available[d] > 0 { + desiredCount -= 1 + if !sharable { + available[d] -= 1 + } + } + + if desiredCount == 0 { + return true + } + + } + + // Failed to satisfy this option - restore available counts + for k, v := range snapshot { + available[k] = v + } + return false +} + +// deviceMatchesConstraints checks if a device satisfies a set of constraints. 
+func deviceMatchesConstraints(ctx Context, d *structs.NodeDeviceResource, constraints structs.Constraints) bool { + for _, c := range constraints { + // Resolve the targets + lVal, lOk := resolveDeviceTarget(c.LTarget, d) + rVal, rOk := resolveDeviceTarget(c.RTarget, d) + + // Check if satisfied + if !checkAttributeConstraint(ctx, c.Operand, lVal, rVal, lOk, rOk) { + return false + } + } + return true +} + // nodeDeviceMatches checks if the device matches the request and its // constraints. It doesn't check the count. func nodeDeviceMatches(ctx Context, d *structs.NodeDeviceResource, req *structs.RequestedDevice) bool { diff --git a/scheduler/feasible/feasible_test.go b/scheduler/feasible/feasible_test.go index 1db3b2631b2..bba10029810 100644 --- a/scheduler/feasible/feasible_test.go +++ b/scheduler/feasible/feasible_test.go @@ -3193,7 +3193,23 @@ func TestDeviceChecker(t *testing.T) { }, } } - + // will create a taskgroup with with len(devices) tasks, each task will request + // all devices + getSharedTg := func(devices ...*structs.RequestedDevice) *structs.TaskGroup { + var tasks []*structs.Task + + for range devices { + tasks = append(tasks, &structs.Task{ + Resources: &structs.Resources{ + Devices: devices, + }, + }) + } + return &structs.TaskGroup{ + Name: "example", + Tasks: tasks, + } + } // Just type gpuTypeReq := &structs.RequestedDevice{ Name: "gpu", @@ -3235,7 +3251,6 @@ func TestDeviceChecker(t *testing.T) { n.NodeResources.Devices = devices return n } - nvidia_A := &structs.NodeDeviceResource{ Vendor: "nvidia", Type: "gpu", @@ -3277,7 +3292,12 @@ func TestDeviceChecker(t *testing.T) { }, }, } - + makeDeviceSharable := func(device *structs.NodeDeviceResource) *structs.NodeDeviceResource { + for _, v := range device.Instances { + v.Shared = structs.DeviceSharingActive + } + return device + } nvidiaUnhealthy := &structs.NodeDeviceResource{ Vendor: "nvidia", Type: "gpu", @@ -3299,6 +3319,7 @@ func TestDeviceChecker(t *testing.T) { Result bool NodeDevices 
[]*structs.NodeDeviceResource RequestedDevices []*structs.RequestedDevice + isShared bool }{ { Name: "no devices on node", @@ -3360,6 +3381,20 @@ func TestDeviceChecker(t *testing.T) { NodeDevices: []*structs.NodeDeviceResource{nvidia_A}, RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq}, }, + { + Name: "shared device and two tasks", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{makeDeviceSharable(nvidia_A)}, + RequestedDevices: []*structs.RequestedDevice{gpuTypeReq, gpuTypeReq}, + isShared: true, + }, + { + Name: "unshared device and two tasks", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{nvidia_A}, + RequestedDevices: []*structs.RequestedDevice{gpuTypeReq, gpuTypeReq}, + isShared: true, + }, { Name: "meets constraints requirement", Result: true, @@ -3564,12 +3599,170 @@ func TestDeviceChecker(t *testing.T) { }, }, }, + { + Name: "first_available first option satisfied", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{nvidia_A}, + RequestedDevices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "1080ti", + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "2080ti", + }, + }, + }, + }, + }, + }, + }, + { + Name: "first_available fallback to second option", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{nvidia_B}, // only has 2080ti + RequestedDevices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "1080ti", // not available + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "2080ti", // available + }, + }, + }, + 
}, + }, + }, + }, + { + Name: "first_available no options satisfy", + Result: false, + NodeDevices: []*structs.NodeDeviceResource{nvidia_A}, + RequestedDevices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "H100", // not available + }, + }, + }, + { + Count: 1, + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "GH200", // not available + }, + }, + }, + }, + }, + }, + }, + { + Name: "first_available with base constraint applied", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{nvidia_A, nvidia_B}, + RequestedDevices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu", + // Base constraint that must be satisfied + Constraints: []*structs.Constraint{ + { + Operand: ">", + LTarget: "${device.attr.memory}", + RTarget: "3 GiB", + }, + }, + FirstAvailable: []*structs.DeviceOption{ + { + Count: 2, // need 2 devices + }, + }, + }, + }, + }, + { + Name: "first_available count not satisfiable falls back", + Result: true, + NodeDevices: []*structs.NodeDeviceResource{nvidia_A}, // only has 2 instances + RequestedDevices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu", + FirstAvailable: []*structs.DeviceOption{ + { + Count: 4, // can't satisfy - need 4 but only 2 available + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "1080ti", + }, + }, + }, + { + Count: 1, // can satisfy + Constraints: []*structs.Constraint{ + { + Operand: "=", + LTarget: "${device.model}", + RTarget: "1080ti", + }, + }, + }, + }, + }, + }, + }, } for _, c := range cases { t.Run(c.Name, func(t *testing.T) { _, ctx := MockContext(t) checker := NewDeviceChecker(ctx) + var tg *structs.TaskGroup + tg = getTg(c.RequestedDevices...) + if c.isShared { + getSharedTg(c.RequestedDevices...) 
+ } + checker.SetTaskGroup(tg) + checker.SetTaskGroup(getTg(c.RequestedDevices...)) if act := checker.Feasible(getNode(c.NodeDevices...)); act != c.Result { t.Fatalf("got %v; want %v", act, c.Result) diff --git a/scheduler/feasible/rank.go b/scheduler/feasible/rank.go index 294ad58ca82..5656dfd8d00 100644 --- a/scheduler/feasible/rank.go +++ b/scheduler/feasible/rank.go @@ -208,7 +208,6 @@ func (iter *BinPackIterator) SetSchedulerConfiguration(schedConfig *structs.Sche } func (iter *BinPackIterator) Next() *RankedNode { - NEXTNODE: for { // Get the next potential option @@ -224,7 +223,6 @@ NEXTNODE: iter.ctx.Logger().Named("binpack").Error("failed retrieving proposed allocations", "error", err) continue } - // Index the existing network usage. // This should never collide, since it represents the current state of // the node. If it does collide though, it means we found a bug! So @@ -286,7 +284,6 @@ NEXTNODE: currentPreemptions = append(currentPreemptions, allocs...) } preemptor.SetPreemptions(currentPreemptions) - // Check if we need task group network resource if len(iter.taskGroup.Networks) > 0 { ask := iter.taskGroup.Networks[0].Copy() @@ -509,7 +506,8 @@ NEXTNODE: var offer *structs.AllocatedDeviceResource var sumAffinities float64 - offer, sumAffinities, err = devAllocator.createOffer(memory, device) + var deviceTotalWeight float64 + offer, sumAffinities, deviceTotalWeight, err = devAllocator.createOffer(memory, device) if offer == nil || err != nil { devAllocator = devAllocatorSnapshot taskResources.Devices = taskResourcesSnapshot @@ -522,11 +520,10 @@ NEXTNODE: devAllocator.AddReserved(offer) taskResources.Devices = append(taskResources.Devices, offer) - // Add the scores - if len(device.Affinities) != 0 { - for _, a := range device.Affinities { - totalDeviceAffinityWeight += math.Abs(float64(a.Weight)) - } + // Add the scores - use returned weights which correctly + // handle first_available option-specific affinities + if deviceTotalWeight > 0 { + 
totalDeviceAffinityWeight += deviceTotalWeight sumMatchingAffinities += sumAffinities } count++ @@ -555,7 +552,6 @@ NEXTNODE: // and devices WITH leveraging preemption. We will have already // made attempts without preemption. - // If preemption is not enabled, then this node is exhausted. if !iter.evict { // surface err from createOffer() iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err)) @@ -596,7 +592,7 @@ NEXTNODE: devices: set.From(task.Resources.NUMA.GetDevices()), } - offer, sumAffinities, err := devAllocator.createOffer(memory, device) + offer, sumAffinities, deviceTotalWeight, err := devAllocator.createOffer(memory, device) if offer == nil { offerErr = err @@ -631,7 +627,7 @@ NEXTNODE: devAllocatorEvict.AddAllocs(proposed) // attempt the offer again - offerEvict, sumAffinitiesEvict, err := devAllocatorEvict.createOffer(memory, device) + offerEvict, sumAffinitiesEvict, deviceTotalWeightEvict, err := devAllocatorEvict.createOffer(memory, device) if offerEvict == nil || err != nil { // we cannot acquire this device even with preemption iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create device offer after considering preemption", "error", err) @@ -642,17 +638,17 @@ NEXTNODE: offer = offerEvict sumAffinities = sumAffinitiesEvict devAllocator = devAllocatorEvict + deviceTotalWeight = deviceTotalWeightEvict } // assign the offer for this device to our allocator devAllocator.AddReserved(offer) taskResources.Devices = append(taskResources.Devices, offer) - // Add the scores - if len(device.Affinities) != 0 { - for _, a := range device.Affinities { - totalDeviceAffinityWeight += math.Abs(float64(a.Weight)) - } + // Add the scores - use returned weights which correctly + // handle first_available option-specific affinities + if deviceTotalWeight > 0 { + totalDeviceAffinityWeight += deviceTotalWeight sumMatchingAffinities += sumAffinities } count++ @@ -1002,7 +998,8 @@ type ScoreNormalizationIterator struct { func 
NewScoreNormalizationIterator(ctx Context, source RankIterator) *ScoreNormalizationIterator { return &ScoreNormalizationIterator{ ctx: ctx, - source: source} + source: source, + } } func (iter *ScoreNormalizationIterator) Reset() { diff --git a/scheduler/feasible/rank_test.go b/scheduler/feasible/rank_test.go index 42e57e5b869..050d384eb47 100644 --- a/scheduler/feasible/rank_test.go +++ b/scheduler/feasible/rank_test.go @@ -1648,6 +1648,20 @@ func TestBinPackIterator_Devices(t *testing.T) { }, } + sharedNvidiaNode := mock.SharedNvidiaNode() + sharedDevs := sharedNvidiaNode.NodeResources.Devices[0].Instances + sharedNvidiaDevices := []string{sharedDevs[0].ID, sharedDevs[1].ID} + + sharedNvidiaDev0 := mock.Alloc() + sharedNvidiaDev0.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{ + { + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{sharedNvidiaDevices[0]}, + WillShare: map[string]bool{sharedNvidiaDevices[0]: true}, + }, + } type devPlacementTuple struct { Count int ExcludeIDs []string @@ -1887,6 +1901,66 @@ func TestBinPackIterator_Devices(t *testing.T) { }, PlannedAllocs: []*structs.Allocation{nvidiaDev0}, }, + { + Name: "shared request with planned uses", + Node: sharedNvidiaNode, + TaskGroup: &structs.TaskGroup{ + EphemeralDisk: &structs.EphemeralDisk{}, + Tasks: []*structs.Task{ + { + Name: "web2", + Resources: &structs.Resources{ + CPU: 1024, + MemoryMB: 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu/1080ti", + Count: 1, + ShareDevices: &structs.ShareDevices{Enabled: true}, + }, + }, + }, + }, + { + Name: "web3", + Resources: &structs.Resources{ + CPU: 1024, + MemoryMB: 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu/1080ti", + Count: 1, + ShareDevices: &structs.ShareDevices{Enabled: true}, + }, + }, + }, + }, + }, + }, + ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{ + "web2": { + { + Vendor: "nvidia", + Type: "gpu", + Name: "1080ti", 
+ }: { + Count: 1, + ExcludeIDs: []string{sharedNvidiaDevices[1]}, + }, + }, + "web3": { + { + Vendor: "nvidia", + Type: "gpu", + Name: "1080ti", + }: { + Count: 1, + ExcludeIDs: []string{sharedNvidiaDevices[1]}, + }, + }, + }, + PlannedAllocs: []*structs.Allocation{sharedNvidiaDev0}, + }, } for _, c := range cases { @@ -1958,9 +2032,9 @@ func TestBinPackIterator_Devices(t *testing.T) { } // Tests that bin packing iterator fails due to overprovisioning of devices +// when devices are not shared. Demonstrates shared devices do not fail // This test has devices at task level func TestBinPackIterator_Device_Failure_With_Eviction(t *testing.T) { - _, ctx := MockContext(t) nodes := []*RankedNode{ { Node: &structs.Node{ @@ -1999,71 +2073,124 @@ func TestBinPackIterator_Device_Failure_With_Eviction(t *testing.T) { }, } - // Add a planned alloc that takes up a gpu - plan := ctx.Plan() - plan.NodeAllocation[nodes[0].Node.ID] = []*structs.Allocation{ + for _, tc := range []struct { + name string + nodes []*RankedNode + deviceShared bool + taskWillShare bool + allocWillShare bool + rankedNodes int + exhaustedNodes int + }{ { - AllocatedResources: &structs.AllocatedResources{ - Tasks: map[string]*structs.AllocatedTaskResources{ - "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 2048, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 2048, - }, - Networks: []*structs.NetworkResource{}, - Devices: []*structs.AllocatedDeviceResource{ - { - Vendor: "nvidia", - Type: "gpu", - Name: "SOME-GPU", - DeviceIDs: []string{"1"}, + name: "expect failure", + nodes: nodes, + rankedNodes: 0, + exhaustedNodes: 1, + }, + { + name: "shared device, expect success", + nodes: nodes, + deviceShared: true, + rankedNodes: 1, + exhaustedNodes: 0, + }, + } { + t.Run(tc.name, func(t *testing.T) { + _, ctx := MockContext(t) + nodes := tc.nodes + // , existing allocation, and task + if tc.deviceShared { + var n []*RankedNode + + //mark gpu as SharingActive on RankedNodes + for _, v := 
range tc.nodes { + newNode := v.Node.Copy() + newNode.NodeResources.Devices[0].Instances[0].Shared = structs.DeviceSharingActive + n = append(n, &RankedNode{ + Node: newNode, + }) + } + //overwrite RankedNodes + nodes = n + } + + plan := ctx.Plan() + plan.NodeAllocation[nodes[0].Node.ID] = []*structs.Allocation{ + { + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 2048, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 2048, + }, + Networks: []*structs.NetworkResource{}, + Devices: []*structs.AllocatedDeviceResource{ + { + Vendor: "nvidia", + Type: "gpu", + Name: "SOME-GPU", + DeviceIDs: []string{"1"}, + }, + }, }, }, + Shared: structs.AllocatedSharedResources{}, }, }, - Shared: structs.AllocatedSharedResources{}, - }, - }, - } - static := NewStaticRankIterator(ctx, nodes) - - // Create a task group with gpu device specified - taskGroup := &structs.TaskGroup{ - EphemeralDisk: &structs.EphemeralDisk{}, - Tasks: []*structs.Task{ - { - Name: "web", - Resources: &structs.Resources{ - CPU: 1024, - MemoryMB: 1024, - Networks: []*structs.NetworkResource{}, - Devices: structs.ResourceDevices{ - { - Name: "nvidia/gpu", - Count: 1, + } + tg := &structs.TaskGroup{ + EphemeralDisk: &structs.EphemeralDisk{}, + Tasks: []*structs.Task{ + { + Name: "web", + Resources: &structs.Resources{ + CPU: 1024, + MemoryMB: 1024, + Networks: []*structs.NetworkResource{}, + Devices: structs.ResourceDevices{ + { + Name: "nvidia/gpu", + Count: 1, + }, + }, + NUMA: &structs.NUMA{Affinity: structs.NoneNUMA}, + }, + }, + { + Name: "web", + Resources: &structs.Resources{ + CPU: 1024, + MemoryMB: 1024, + Networks: []*structs.NetworkResource{}, + Devices: structs.ResourceDevices{ + { + Name: "nvidia/gpu", + Count: 1, + }, + }, + NUMA: &structs.NUMA{Affinity: structs.NoneNUMA}, }, }, - NUMA: &structs.NUMA{Affinity: structs.NoneNUMA}, }, - }, - }, - Networks: 
[]*structs.NetworkResource{}, - } - - binp := NewBinPackIterator(ctx, static, true, 0) - binp.SetTaskGroup(taskGroup) - binp.SetSchedulerConfiguration(testSchedulerConfig) - - scoreNorm := NewScoreNormalizationIterator(ctx, binp) + Networks: []*structs.NetworkResource{}, + } + static := NewStaticRankIterator(ctx, nodes) + binp := NewBinPackIterator(ctx, static, true, 0) + binp.SetTaskGroup(tg) + binp.SetSchedulerConfiguration(testSchedulerConfig) - out := collectRanked(scoreNorm) + scoreNorm := NewScoreNormalizationIterator(ctx, binp) + out := collectRanked(scoreNorm) - // We expect a placement failure because we need 1 GPU device - // and the other one is taken - must.SliceEmpty(t, out) - must.Eq(t, 1, ctx.metrics.DimensionExhausted["devices: no devices match request"]) + // check if we get the expected number of rankedNodes (0 or 1) + must.SliceLen(t, tc.rankedNodes, out) + must.Eq(t, tc.exhaustedNodes, ctx.metrics.DimensionExhausted["devices: no devices match request"]) + }) + } } func TestBinPackIterator_Device_Preemption_MultipleDeviceRequests(t *testing.T) {