-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathagent.yaml
More file actions
46 lines (41 loc) · 1.2 KB
/
agent.yaml
File metadata and controls
46 lines (41 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# DGPU Agent Configuration
# Agent identity and liveness settings.
agent:
  # Agent ID (unique identifier).
  # If empty, it will be auto-generated from the hostname.
  id: ""
  # Heartbeat interval in seconds
  heartbeat_interval: 5
# Scheduler connection settings.
scheduler:
  # Primary scheduler address (host:port)
  master_address: "scheduler-master:9090"
  # Standby scheduler address (host:port)
  standby_address: "scheduler-standby:9090"
  # Connection timeout in seconds
  connection_timeout: 10
  # Reconnection retry interval in seconds
  retry_interval: 2
  # Maximum retry interval in seconds (exponential backoff)
  max_retry_interval: 30
# GPU detection and health monitoring settings.
gpu:
  # GPU detection method: nvml or nvidia-smi
  detection_method: "nvml"
  # Health check interval in seconds
  health_check_interval: 30
# Task execution settings.
executor:
  # Task execution method: docker or process
  execution_method: "docker"
  # Working directory for tasks
  work_dir: "/var/lib/dgpu-agent/tasks"
  # Docker configuration (only relevant if execution_method is docker)
  docker:
    # Docker socket path
    socket: "/var/run/docker.sock"
    # Default image
    # NOTE(review): confirm this tag is actually published; CUDA 12 images
    # appear to use full X.Y.Z versions (e.g. 12.0.1-runtime-ubuntu22.04).
    default_image: "nvidia/cuda:12.0-runtime-ubuntu22.04"
# Logging settings.
logging:
  # Log level: debug, info, warn, error
  level: "info"
  # Log format: json, text
  format: "json"
  # Log output: stdout, stderr, or file path
  output: "stdout"