Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ endif
pkg/windows_%/nomad: GO_OUT = $@.exe
pkg/windows_%/nomad: GO_TAGS += timetzdata

# Build the example device plugin for e2e device tests
#
# The pattern stem ($*) is split on "_": its first word is passed to the Go
# toolchain as GOOS and its last word as GOARCH. The binary is built with cgo
# disabled and -trimpath, and is written to GO_OUT, which defaults to the
# target path ($@) unless already set.
pkg/%/nomad-device-example: GO_OUT ?= $@
pkg/%/nomad-device-example: ## Build the example device plugin for GOOS_GOARCH
	@echo "==> Building $@..."
	@CGO_ENABLED=0 \
		GOOS=$(firstword $(subst _, ,$*)) \
		GOARCH=$(lastword $(subst _, ,$*)) \
		go build -trimpath -o $(GO_OUT) ./plugins/device/cmd/example/cmd

# Define package targets for each of the build targets we actually have on this system
define makePackageTarget

Expand Down
81 changes: 76 additions & 5 deletions api/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,10 @@ type NodeDevice struct {
// Locality stores HW locality information for the node to optionally be
// used when making placement decisions.
Locality *NodeDeviceLocality

// Shared mirrors a string enum on device.DetectedDevice that some
// devices use to report status and presence of sharing subsystems
Shared Shared
}

// Attribute is used to describe the value of an attribute, optionally
Expand Down Expand Up @@ -289,13 +293,63 @@ func (a Attribute) String() string {
}
}

// Shared is a string enum that mirrors the plugin.Shared string enum found
// on Devices.DetectedDevice. Some devices use it to report the status and
// presence of sharing subsystems.
type Shared string

// Values a Shared field can hold.
const (
	// DeviceSharingUnset is the zero value of Shared (the empty string).
	DeviceSharingUnset Shared = ""
	// DeviceSharingIneligible is the "ineligible" sharing state.
	// NOTE(review): presumably the device cannot be shared at all — confirm.
	DeviceSharingIneligible Shared = "ineligible"
	// DeviceSharingActive is the "active" sharing state.
	// NOTE(review): presumably sharing is supported and turned on — confirm.
	DeviceSharingActive Shared = "active"
	// DeviceSharingInactive is the "inactive" sharing state.
	// NOTE(review): presumably sharing is supported but turned off — confirm.
	DeviceSharingInactive Shared = "inactive"
)

// NodeDeviceLocality stores information about the device's hardware locality
// on the node.
type NodeDeviceLocality struct {
	// PciBusID is the PCI Bus ID for the device.
	PciBusID string
}

// ShareDevices indicates whether the task is willing to share its device.
type ShareDevices struct {
	// Enabled reports whether the task is willing to share its device.
	Enabled bool `hcl:"enabled"`
	// SharedDeviceId is an optional field for use in environments with
	// multiple shared devices, to make the shared device ID available to
	// the plugin. If in use alongside the device.id constraint, the two must
	// match or the job will not be placed.
	//
	// NOTE(review): Go naming convention would spell this SharedDeviceID;
	// renaming it would change the exported API, so it is left as-is here.
	SharedDeviceId string `hcl:"shared_device_id,optional"`
}

// DeviceOption represents a single option in a first_available device
// selection. Each option specifies a count and optional constraints that must
// be satisfied for this option to be selected.
type DeviceOption struct {
	// Count is the number of requested devices for this option. It is
	// optional in HCL; Canonicalize fills in 1 when it is left unset.
	Count *uint64 `hcl:"count,optional"`

	// Constraints are a set of constraints to apply when selecting the device
	// to use for this option.
	Constraints []*Constraint `hcl:"constraint,block"`

	// ShareDevices indicates whether this device option is willing to share.
	//
	// TODO: determine if ShareDevices should be inherited from the enclosing
	// device request or if, like Count, it should only be set on one or the
	// other.
	ShareDevices *ShareDevices `hcl:"share_devices,block"`
}

// Canonicalize fills in defaults on the option: a missing Count becomes 1.
// It is safe to call on a nil receiver, in which case it does nothing.
func (o *DeviceOption) Canonicalize() {
	if o != nil && o.Count == nil {
		o.Count = pointerOf(uint64(1))
	}
}

// RequestedDevice is used to request a device for a task.
type RequestedDevice struct {
// Name is the request name. The possible values are as follows:
Expand All @@ -309,20 +363,37 @@ type RequestedDevice struct {
// * "nvidia/gpu/GTX2080Ti"
Name string `hcl:",label"`

// Count is the number of requested devices
// Count is the number of requested devices. Mutually exclusive with
// FirstAvailable.
Count *uint64 `hcl:"count,optional"`

// Constraints are a set of constraints to apply when selecting the device
// to use.
// to use. When FirstAvailable is specified, these constraints are applied
// as base constraints that all options must also satisfy.
Constraints []*Constraint `hcl:"constraint,block"`

// Affinities are a set of affinites to apply when selecting the device
// to use.
// Affinities are a set of affinities to apply when selecting the device
// to use. When FirstAvailable is specified, these affinities are applied
// as base affinities for all options.
Affinities []*Affinity `hcl:"affinity,block"`

// ShareDevices reports whether the task should be placed on a shared device
ShareDevices *ShareDevices `hcl:"share_devices,block"`

// FirstAvailable specifies a prioritized list of device options. The
// scheduler will attempt to satisfy each option in order, selecting the
// first one that can be fulfilled. Mutually exclusive with Count.
FirstAvailable []*DeviceOption `hcl:"first_available,block"`
}

func (d *RequestedDevice) Canonicalize() {
if d.Count == nil {
// If using first_available, canonicalize each option but don't set default count
if len(d.FirstAvailable) > 0 {
for _, opt := range d.FirstAvailable {
opt.Canonicalize()
}
} else if d.Count == nil {
// Only set default count when not using first_available
d.Count = pointerOf(uint64(1))
}

Expand Down
46 changes: 46 additions & 0 deletions api/resources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,49 @@ func TestNUMAResource_Canonicalize(t *testing.T) {
n3.Canonicalize()
must.Eq(t, &NUMAResource{Affinity: "require", Devices: nil}, n3)
}

// TestDeviceOption_Canonicalize checks the nil-receiver, default-count and
// explicit-count behaviour of DeviceOption.Canonicalize.
func TestDeviceOption_Canonicalize(t *testing.T) {
	testutil.Parallel(t)

	// Canonicalizing a nil option must be a harmless no-op.
	var nilOpt *DeviceOption
	nilOpt.Canonicalize()

	// An option without a count ends up requesting a single device.
	defaulted := new(DeviceOption)
	defaulted.Canonicalize()
	must.Eq(t, uint64(1), *defaulted.Count)

	// A count supplied by the caller is left untouched.
	explicit := &DeviceOption{Count: pointerOf(uint64(4))}
	explicit.Canonicalize()
	must.Eq(t, uint64(4), *explicit.Count)
}

// TestRequestedDevice_Canonicalize_FirstAvailable checks how
// RequestedDevice.Canonicalize treats Count with and without first_available
// options present.
func TestRequestedDevice_Canonicalize_FirstAvailable(t *testing.T) {
	testutil.Parallel(t)

	// A request carrying first_available options: one with an explicit count
	// and one that leaves the count unset.
	withOptions := &RequestedDevice{
		Name: "nvidia/gpu",
		FirstAvailable: []*DeviceOption{
			{Count: pointerOf(uint64(2))},
			{},
		},
	}
	withOptions.Canonicalize()

	// The request-level count is not defaulted when options are present.
	must.Nil(t, withOptions.Count)

	// Each option is canonicalized on its own: the explicit count survives
	// and the missing one is defaulted to 1.
	explicit, defaulted := withOptions.FirstAvailable[0], withOptions.FirstAvailable[1]
	must.Eq(t, uint64(2), *explicit.Count)
	must.Eq(t, uint64(1), *defaulted.Count)

	// A plain request with no options gets the default count of 1.
	plain := &RequestedDevice{Name: "nvidia/gpu"}
	plain.Canonicalize()
	must.Eq(t, uint64(1), *plain.Count)
}
14 changes: 14 additions & 0 deletions client/devicemanager/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func convertDevice(dev *device.Device) *structs.NodeDevice {
Healthy: dev.Healthy,
HealthDescription: dev.HealthDesc,
Locality: convertHwLocality(dev.HwLocality),
Shared: convertShared(dev.Shared),
}
}

Expand All @@ -94,3 +95,16 @@ func convertHwLocality(l *device.DeviceLocality) *structs.NodeDeviceLocality {
PciBusID: l.PciBusID,
}
}

// convertShared maps a device.Shared value onto the matching structs.Shared
// constant. Any value other than the three recognized sharing states maps to
// structs.DeviceSharingUnset.
func convertShared(s device.Shared) structs.Shared {
	switch s {
	case device.SharingActive:
		return structs.DeviceSharingActive
	case device.SharingInactive:
		return structs.DeviceSharingInactive
	case device.SharingIneligible:
		return structs.DeviceSharingIneligible
	default:
		return structs.DeviceSharingUnset
	}
}
40 changes: 37 additions & 3 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -1623,13 +1623,37 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources {

if len(in.Devices) > 0 {
out.Devices = []*structs.RequestedDevice{}

for _, d := range in.Devices {
out.Devices = append(out.Devices, &structs.RequestedDevice{
rd := &structs.RequestedDevice{
Name: d.Name,
Count: *d.Count,
Constraints: ApiConstraintsToStructs(d.Constraints),
Affinities: ApiAffinitiesToStructs(d.Affinities),
})
}
// Only set Count if not using FirstAvailable
if d.Count != nil && len(d.FirstAvailable) == 0 {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

port to #27391

rd.Count = *d.Count
}
// Only set ShareDevices if not using FirstAvailable
if d.ShareDevices != nil && len(d.FirstAvailable) == 0 {
rd.ShareDevices = ApiShareDevicesToStructs(d.ShareDevices)
}
// Convert FirstAvailable options
if len(d.FirstAvailable) > 0 {
rd.FirstAvailable = make([]*structs.DeviceOption, len(d.FirstAvailable))
for i, opt := range d.FirstAvailable {
rd.FirstAvailable[i] = &structs.DeviceOption{
Constraints: ApiConstraintsToStructs(opt.Constraints),
}
if opt.Count != nil {
rd.FirstAvailable[i].Count = *opt.Count
}
if opt.ShareDevices != nil {
rd.FirstAvailable[i].ShareDevices = ApiShareDevicesToStructs(opt.ShareDevices)
}
}
}
out.Devices = append(out.Devices, rd)
}
}

Expand All @@ -1646,6 +1670,16 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources {

return out
}
// ApiShareDevicesToStructs converts an api.ShareDevices into a newly allocated
// structs.ShareDevices, copying the Enabled and SharedDeviceId fields. A nil
// input yields a nil result.
func ApiShareDevicesToStructs(in *api.ShareDevices) *structs.ShareDevices {
	if in == nil {
		return nil
	}

	out := new(structs.ShareDevices)
	out.Enabled = in.Enabled
	out.SharedDeviceId = in.SharedDeviceId
	return out
}

func ApiNetworkResourceToStructs(in []*api.NetworkResource) []*structs.NetworkResource {
var out []*structs.NetworkResource
Expand Down
Loading
Loading