@@ -6,11 +6,24 @@ import (
66 v2pb "github.com/michelangelo-ai/michelangelo/proto-go/api/v2"
77 rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
88 corev1 "k8s.io/api/core/v1"
9+ "k8s.io/apimachinery/pkg/api/resource"
910 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1011 "k8s.io/apimachinery/pkg/runtime"
1112 k8sptr "k8s.io/utils/ptr"
1213)
1314
// LogPersistenceConfig holds platform-level configuration for log persistence.
// This is loaded from YAML config at the mapper level, not from per-job proto.
type LogPersistenceConfig struct {
	// Enabled gates sidecar injection: mapRayCluster only calls
	// InjectCollectorSidecar when this is true.
	Enabled bool `yaml:"enabled"`
	// StorageEndpoint is passed to the collector as the S3_ENDPOINT_URL env var.
	StorageEndpoint string `yaml:"storageEndpoint"`
	// Bucket is passed to the collector as the S3_BUCKET env var.
	Bucket string `yaml:"bucket"`
	// PathPrefix is a prefix for persisted log paths.
	// NOTE(review): InjectCollectorSidecar does not currently read this field;
	// confirm whether the collector is expected to receive it.
	PathPrefix string `yaml:"pathPrefix"`
	// Region is passed to the collector as the S3_REGION env var.
	Region string `yaml:"region"`
	// CredentialsSecret is the name of the Kubernetes Secret from which the
	// collector reads the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY keys.
	CredentialsSecret string `yaml:"credentialsSecret"`
	// CollectorImage is the container image used for the collector sidecar.
	CollectorImage string `yaml:"collectorImage"`
}
26+
1427func (m Mapper ) mapRay (rayJob * v2pb.RayJob , jobClusterObject runtime.Object , cluster * v2pb.Cluster ) (runtime.Object , error ) {
1528 if jobClusterObject == nil {
1629 return nil , fmt .Errorf ("ray job requires associated RayCluster object" )
@@ -52,6 +65,14 @@ func (m Mapper) mapRay(rayJob *v2pb.RayJob, jobClusterObject runtime.Object, clu
5265func (m Mapper ) mapRayCluster (rayCluster * v2pb.RayCluster ) (runtime.Object , error ) {
5366 workerGroupSpecs := getWorkerGroupSpecs (rayCluster .GetName (), rayCluster .GetSpec ().Workers )
5467 headGroupSpec := getHeadGroupSpec (rayCluster .GetSpec ().Head )
68+
69+ if m .LogPersistence .Enabled {
70+ InjectCollectorSidecar (& headGroupSpec .Template , m .LogPersistence , rayCluster .GetName (), "Head" )
71+ for i := range workerGroupSpecs {
72+ InjectCollectorSidecar (& workerGroupSpecs [i ].Template , m .LogPersistence , rayCluster .GetName (), "Worker" )
73+ }
74+ }
75+
5576 rayV1Cluster := & rayv1.RayCluster {
5677 TypeMeta : metav1.TypeMeta {
5778 Kind : RayClusterKind ,
@@ -94,6 +115,124 @@ func getWorkerGroupSpecs(clusterName string, workers []*v2pb.RayWorkerSpec) []ra
94115 return workerGroupSpecsJSON
95116}
96117
const (
	// rayLogsVolumeName is the name of the emptyDir volume shared between the
	// Ray containers and the collector sidecar.
	rayLogsVolumeName = "ray-logs"
	// rayLogsPath is where the shared volume is mounted in every container; it
	// is also passed to the collector as --ray-root-dir.
	rayLogsPath = "/tmp/ray"
	// collectorPort is the port the collector sidecar exposes for Ray events;
	// it is used for the container port, the --events-port argument, and the
	// localhost event-export URL given to the Ray containers.
	collectorPort = 8084
)
123+
124+ // InjectCollectorSidecar injects a KubeRay History Server collector sidecar container
125+ // into the pod template. It adds a shared emptyDir volume for Ray logs, mounts it to
126+ // all existing containers, configures Ray event export env vars, adds a lifecycle hook
127+ // to write the node ID, and appends the collector sidecar container.
128+ func InjectCollectorSidecar (podTemplate * corev1.PodTemplateSpec , config LogPersistenceConfig , clusterName string , role string ) {
129+ // 1. Add ray-logs emptyDir volume
130+ podTemplate .Spec .Volumes = append (podTemplate .Spec .Volumes , corev1.Volume {
131+ Name : rayLogsVolumeName ,
132+ VolumeSource : corev1.VolumeSource {
133+ EmptyDir : & corev1.EmptyDirVolumeSource {},
134+ },
135+ })
136+
137+ rayLogsVolumeMount := corev1.VolumeMount {
138+ Name : rayLogsVolumeName ,
139+ MountPath : rayLogsPath ,
140+ }
141+
142+ eventExportEnvVars := []corev1.EnvVar {
143+ {
144+ Name : "RAY_enable_core_worker_ray_event_to_aggregator" ,
145+ Value : "1" ,
146+ },
147+ {
148+ Name : "RAY_DASHBOARD_AGGREGATOR_AGENT_EVENTS_EXPORT_ADDR" ,
149+ Value : fmt .Sprintf ("http://localhost:%d/events" , collectorPort ),
150+ },
151+ }
152+
153+ // 2-3, 5. Update all existing containers: add volume mount, env vars, lifecycle hook
154+ for i := range podTemplate .Spec .Containers {
155+ c := & podTemplate .Spec .Containers [i ]
156+ c .VolumeMounts = append (c .VolumeMounts , rayLogsVolumeMount )
157+ c .Env = append (c .Env , eventExportEnvVars ... )
158+
159+ // Add postStart lifecycle hook to write node ID
160+ if c .Lifecycle == nil {
161+ c .Lifecycle = & corev1.Lifecycle {}
162+ }
163+ c .Lifecycle .PostStart = & corev1.LifecycleHandler {
164+ Exec : & corev1.ExecAction {
165+ Command : []string {"/bin/sh" , "-c" , "echo $RAY_NODE_ID > /tmp/ray/init.log" },
166+ },
167+ }
168+ }
169+
170+ // 4. Add collector sidecar container
171+ collectorContainer := corev1.Container {
172+ Name : "collector" ,
173+ Image : config .CollectorImage ,
174+ Args : []string {
175+ fmt .Sprintf ("--role=%s" , role ),
176+ "--runtime-class-name=s3" ,
177+ fmt .Sprintf ("--ray-cluster-name=%s" , clusterName ),
178+ fmt .Sprintf ("--ray-root-dir=%s" , rayLogsPath ),
179+ fmt .Sprintf ("--events-port=%d" , collectorPort ),
180+ },
181+ Env : []corev1.EnvVar {
182+ {
183+ Name : "AWS_ACCESS_KEY_ID" ,
184+ ValueFrom : & corev1.EnvVarSource {
185+ SecretKeyRef : & corev1.SecretKeySelector {
186+ LocalObjectReference : corev1.LocalObjectReference {
187+ Name : config .CredentialsSecret ,
188+ },
189+ Key : "AWS_ACCESS_KEY_ID" ,
190+ },
191+ },
192+ },
193+ {
194+ Name : "AWS_SECRET_ACCESS_KEY" ,
195+ ValueFrom : & corev1.EnvVarSource {
196+ SecretKeyRef : & corev1.SecretKeySelector {
197+ LocalObjectReference : corev1.LocalObjectReference {
198+ Name : config .CredentialsSecret ,
199+ },
200+ Key : "AWS_SECRET_ACCESS_KEY" ,
201+ },
202+ },
203+ },
204+ {
205+ Name : "S3_ENDPOINT_URL" ,
206+ Value : config .StorageEndpoint ,
207+ },
208+ {
209+ Name : "S3_BUCKET" ,
210+ Value : config .Bucket ,
211+ },
212+ {
213+ Name : "S3_REGION" ,
214+ Value : config .Region ,
215+ },
216+ },
217+ Ports : []corev1.ContainerPort {
218+ {
219+ Name : "events" ,
220+ ContainerPort : int32 (collectorPort ),
221+ Protocol : corev1 .ProtocolTCP ,
222+ },
223+ },
224+ Resources : corev1.ResourceRequirements {
225+ Requests : corev1.ResourceList {
226+ corev1 .ResourceCPU : resource .MustParse ("100m" ),
227+ corev1 .ResourceMemory : resource .MustParse ("128Mi" ),
228+ },
229+ },
230+ VolumeMounts : []corev1.VolumeMount {rayLogsVolumeMount },
231+ }
232+
233+ podTemplate .Spec .Containers = append (podTemplate .Spec .Containers , collectorContainer )
234+ }
235+
// getRayClusterStateFromKubeRayState maps KubeRay v1 cluster state to our internal v2pb.RayClusterState.
98237func getRayClusterStateFromKubeRayState (kubeRayState rayv1.ClusterState ) v2pb.RayClusterState {
99238 switch kubeRayState {
0 commit comments