Skip to content

Commit 2ed17ac

Browse files
authored
Merge pull request #6029 from jdzikowski/master
Update gang scheduling KEPs with updated Workload API
2 parents 64d1906 + 4f7722b commit 2ed17ac

3 files changed

Lines changed: 64 additions & 27 deletions

File tree

keps/sig-apps/5547-integrate-workload-with-job/README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,9 @@ metadata:
168168
uid: <workload-uid>
169169
spec:
170170
podGroupTemplateRef:
171-
workloadName: <workload-name>
172-
podGroupTemplateName: <podGroup-template-name>
171+
workload:
172+
workloadName: <workload-name>
173+
podGroupTemplateName: <podGroup-template-name>
173174
schedulingPolicy:
174175
gang:
175176
minCount: 8 # Equal to Job.spec.parallelism
@@ -253,7 +254,7 @@ A `Workload` is considered the Workload for this Job object if:
253254

254255
Similarly, a `PodGroup` is considered the `PodGroup` for this Job if:
255256
- The `PodGroup` is in the Job’s namespace
256-
- Its `spec.podGroupTemplateReference.workloadName` equals the name of the `Workload` for this Job.
257+
- Its `spec.podGroupTemplateRef.workload.workloadName` equals the name of the `Workload` for this Job.
257258

258259
#### Controller Workflow
259260

keps/sig-scheduling/4671-gang-scheduling/README.md

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -587,20 +587,46 @@ type PodGroupSpec struct {
587587
SchedulingPolicy *PodGroupSchedulingPolicy
588588
}
589589
590+
// PodGroupStatus represents information about the status of a pod group.
590591
type PodGroupStatus struct {
591592
// Conditions represent the latest observations of the PodGroup's state.
593+
//
594+
// Known condition types:
595+
// - "PodGroupScheduled": Indicates whether the scheduling requirement has been satisfied.
596+
// - "DisruptionTarget": Indicates whether the PodGroup is about to be terminated
597+
// due to disruption such as preemption.
598+
//
599+
// Known reasons for the PodGroupScheduled condition:
600+
// - "Unschedulable": The PodGroup cannot be scheduled due to resource constraints,
601+
// affinity/anti-affinity rules, or insufficient capacity for the gang.
602+
// - "SchedulerError": The PodGroup cannot be scheduled due to some internal error
603+
// that happened during scheduling, for example due to nodeAffinity parsing errors.
604+
//
605+
// Known reasons for the DisruptionTarget condition:
606+
// - "PreemptionByScheduler": The PodGroup was preempted by the scheduler to make room for
607+
// higher-priority PodGroups or Pods.
608+
//
592609
// +optional
593610
Conditions []metav1.Condition
594611
}
595612
596-
// PodGroupTemplateReferenve references the PodGroupTemplate within the Workload object.
613+
// PodGroupTemplateReference references a PodGroup template defined in some object (e.g. Workload).
614+
// Exactly one reference must be set.
597615
type PodGroupTemplateReference struct {
616+
// Workload references the PodGroupTemplate within the Workload object that was used to create
617+
// the PodGroup.
618+
// +optional
619+
Workload *WorkloadPodGroupTemplateReference
620+
}
621+
622+
// WorkloadPodGroupTemplateReference references the PodGroupTemplate within the Workload object.
623+
type WorkloadPodGroupTemplateReference struct {
598624
// WorkloadName defines the name of the Workload object.
599-
// +optional
600-
WorkloadName string
601-
625+
// +required
626+
WorkloadName string
627+
602628
// PodGroupTemplateName defines the PodGroupTemplate name within the Workload object.
603-
// +optional
629+
// +required
604630
PodGroupTemplateName string
605631
}
606632
```

keps/sig-scheduling/5832-decouple-podgroup-api/README.md

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,9 @@ metadata:
131131
namespace: ns-1
132132
spec:
133133
podGroupTemplateRef:
134-
workloadName: training-workload
135-
podGroupTemplateName: pd-1-template
134+
workload:
135+
workloadName: training-workload
136+
podGroupTemplateName: pd-1-template
136137
schedulingPolicy:
137138
gang:
138139
minCount: 2
@@ -294,35 +295,44 @@ type PodGroupSpec struct {
294295
SchedulingPolicy *PodGroupSchedulingPolicy
295296
}
296297

297-
// PodGroupTemplateReference references the PodGroupTemplate name within
298-
// the Workload object.
298+
// PodGroupTemplateReference references a PodGroup template defined in some object (e.g. Workload).
299+
// Exactly one reference must be set.
299300
type PodGroupTemplateReference struct {
300-
// WorkloadName defines the name of the Workload object this PodGroup is part of.
301-
//
302-
// +optional
303-
WorkloadName string
301+
// Workload references the PodGroupTemplate within the Workload object that was used to create
302+
// the PodGroup.
303+
// +optional
304+
Workload *WorkloadPodGroupTemplateReference
305+
}
304306

305-
// PodGroupTemplateName references the PodGroupTemplate name that was used to
306-
// create this PodGroup.
307-
//
308-
// +optional
309-
PodGroupTemplateName string
307+
// WorkloadPodGroupTemplateReference references the PodGroupTemplate within the Workload object.
308+
type WorkloadPodGroupTemplateReference struct {
309+
// WorkloadName defines the name of the Workload object.
310+
// +required
311+
WorkloadName string
312+
313+
// PodGroupTemplateName defines the PodGroupTemplate name within the Workload object.
314+
// +required
315+
PodGroupTemplateName string
310316
}
311317

318+
// PodGroupStatus represents information about the status of a pod group.
312319
type PodGroupStatus struct {
313320
// Conditions represent the latest observations of the PodGroup's state.
314321
//
315322
// Known condition types:
316323
// - "PodGroupScheduled": Indicates whether the scheduling requirement has been satisfied.
317-
// - Status=True: All required pods have been assigned to nodes.
318-
// - Status=False: Scheduling failed (i.e., timeout, unschedulable, etc.).
324+
// - "DisruptionTarget": Indicates whether the PodGroup is about to be terminated
325+
// due to disruption such as preemption.
319326
//
320-
// Known reasons for PodGroupScheduled condition:
321-
// - "Scheduled": All required pods have been successfully scheduled.
327+
// Known reasons for the PodGroupScheduled condition:
322328
// - "Unschedulable": The PodGroup cannot be scheduled due to resource constraints,
323329
// affinity/anti-affinity rules, or insufficient capacity for the gang.
324-
// - "Preempted": The PodGroup was preempted to make room for higher-priority workloads.
325-
// - "Timeout": The PodGroup failed to schedule within the configured timeout.
330+
// - "SchedulerError": The PodGroup cannot be scheduled due to some internal error
331+
// that happened during scheduling, for example due to nodeAffinity parsing errors.
332+
//
333+
// Known reasons for the DisruptionTarget condition:
334+
// - "PreemptionByScheduler": The PodGroup was preempted by the scheduler to make room for
335+
// higher-priority PodGroups or Pods.
326336
//
327337
// +optional
328338
Conditions []metav1.Condition

0 commit comments

Comments
 (0)