Project-HAMi · archlitchi · Jun 1, 2026 · May 21, 2026
diff --git a/charts/hami-webui/templates/configmap.yaml b/charts/hami-webui/templates/configmap.yaml
@@ -8,13 +8,16 @@ data:
     server:
       http:
         addr: 0.0.0.0:8000
-        timeout: 1s
+        timeout: {{ .Values.backend.http.timeout | default "60s" }}
       grpc:
         addr: 0.0.0.0:9000
-        timeout: 1s
+        timeout: {{ .Values.backend.grpc.timeout | default "60s" }}
     prometheus:
       address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
-      timeout: 1m
+      timeout: {{ .Values.externalPrometheus.timeout | default "1m" }}
+    exporter:
+      interval: {{ .Values.metricsExporter.interval | default "30s" }}
+      timeout:  {{ .Values.metricsExporter.timeout  | default "60s" }}
     node_selectors:
     {{- range $key, $value := .Values.vendorNodeSelectors }}
       {{ $key }}: {{ $value }}

diff --git a/charts/hami-webui/templates/deployment.yaml b/charts/hami-webui/templates/deployment.yaml
@@ -59,6 +59,16 @@ spec:
           args:
             - "--conf"
             - "/apps/config/config.yaml"
+          {{- if .Values.backend.readinessProbe.enabled }}
+          readinessProbe:
+            httpGet:
+              path: /readyz
+              port: metrics
+            initialDelaySeconds: {{ .Values.backend.readinessProbe.initialDelaySeconds }}
+            periodSeconds: {{ .Values.backend.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.backend.readinessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.backend.readinessProbe.failureThreshold }}
+          {{- end }}
           resources:
             {{- toYaml .Values.resources.backend | nindent 12 }}
           volumeMounts:

diff --git a/charts/hami-webui/values.yaml b/charts/hami-webui/values.yaml
@@ -162,4 +162,44 @@ kube-prometheus-stack:
 externalPrometheus:
   enabled: false
   # If externalPrometheus.enabled is true, this address will be used
-  address: "http://prometheus-kube-prometheus-prometheus.prometheus.svc.cluster.local:9090"
+  address: "http://prometheus-kube-prometheus-prometheus.prometheus.svc.cluster.local:9090"
+  # Single PromQL upstream timeout (sent to Prometheus / VictoriaMetrics as the &timeout= parameter).
+  timeout: "1m"
+
+# Kratos server timeouts. These bound the deadline placed on each incoming HTTP/gRPC
+# request context. They no longer affect /metrics generation (which runs in the background),
+# but they still gate the page-side APIs, some of which legitimately take a few seconds
+# against a large Prometheus / VictoriaMetrics cluster.
+backend:
+  http:
+    timeout: "60s"
+  grpc:
+    timeout: "60s"
+  # Readiness probe on the backend's /readyz. The backend only starts listening
+  # after its k8s informer caches have synced, so gating the pod on /readyz keeps
+  # it out of the Service until the backend can serve — this prevents the frontend
+  # BFF from hitting "connection refused" against the not-yet-listening backend
+  # during startup. failureThreshold * periodSeconds should comfortably exceed the
+  # worst-case informer sync time on a large cluster.
+  readinessProbe:
+    enabled: true
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 3
+    failureThreshold: 60
+
+# Background /metrics collector. The collector runs independently of Prometheus scrape;
+# scrapes only serialize the in-memory registry, so they are O(1) and immune to scrape
+# timeouts. Tune interval if you need fresher data or want to reduce upstream load further.
+metricsExporter:
+  # How often the collector refreshes hami_* metrics from the upstream Prometheus / VictoriaMetrics.
+  interval: "30s"
+  # Hard cap on a single refresh cycle. Should be >= externalPrometheus.timeout and <= interval * 2.
+  timeout: "60s"
+
+# The frontend axios client timeout (default 60000ms) is compiled into the JS
+# bundle at image build time from the VUE_APP_REQUEST_TIMEOUT env var, as Vite
+# evaluates process.env at build, not at runtime. To change it for a deployed
+# cluster you need to rebuild packages/web with a different VUE_APP_REQUEST_TIMEOUT
+# (set in packages/web/.env.production or as a build-arg). It cannot be tuned
+# through this values.yaml.
diff --git a/packages/web/.env.development b/packages/web/.env.development
@@ -4,3 +4,6 @@ ENV = 'development'
 # base api
 VUE_APP_BASE_API = '/'
 
+# axios global timeout in ms; baked into the bundle at build time, so override it via build env (not chart values)
+VUE_APP_REQUEST_TIMEOUT = 60000
+
diff --git a/packages/web/.env.production b/packages/web/.env.production
@@ -3,3 +3,6 @@ ENV = 'production'
 
 # base api
 VUE_APP_BASE_API = './'
+
+# axios global timeout in ms; baked into the bundle at build time, so override it via build env (not chart values)
+VUE_APP_REQUEST_TIMEOUT = 60000
diff --git a/packages/web/src/utils/request.js b/packages/web/src/utils/request.js
@@ -3,9 +3,17 @@ import axios from 'axios';
 import { ElMessage, ElMessageBox, ElNotification } from 'element-plus';
 import i18n from '@/locales';
 
+// Default request timeout in ms. Override at build time via VUE_APP_REQUEST_TIMEOUT
+// (set in .env.* or passed as a build env var; it is not a chart value). 60s is large
+// enough for the slowest known page-side API (/v1/nodes can take a few seconds
+// against a large VictoriaMetrics cluster) while still bounding hung requests.
+const DEFAULT_REQUEST_TIMEOUT = 60000;
+const requestTimeout =
+  Number.parseInt(process.env.VUE_APP_REQUEST_TIMEOUT, 10) || DEFAULT_REQUEST_TIMEOUT;
+
 const service = axios.create({
   baseURL: process.env.VUE_APP_BASE_API, // url = base url + request url
-  timeout: 5000,
+  timeout: requestTimeout,
   validateStatus: function (status) {
     return (status >= 200 && status < 300) || status > 520;
   },

diff --git a/server/cmd/server/main.go b/server/cmd/server/main.go
@@ -9,6 +9,7 @@ import (
 	"github.com/go-kratos/kratos/v2/transport/http"
 	"os"
 	"vgpu/internal/conf"
+	"vgpu/internal/exporter"
 
 	_ "go.uber.org/automaxprocs"
 )
@@ -44,7 +45,9 @@ func main() {
 	}
 }
 
-func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Server) *kratos.App {
+// newApp wires the background metrics collector in as a kratos transport.Server so
+// its goroutine is started and stopped by the app lifecycle, alongside HTTP and gRPC.
+func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Server, mc *exporter.MetricsGenerator) *kratos.App {
 	return kratos.New(
 		kratos.Context(ctx),
 		kratos.ID(id),
@@ -55,6 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
 		kratos.Server(
 			gs,
 			hs,
+			mc,
 		),
 	)
 }

diff --git a/server/config/config.yaml b/server/config/config.yaml
@@ -1,16 +1,19 @@
 server:
   http:
     addr: 0.0.0.0:8000
-    timeout: 1s
+    timeout: 60s
   grpc:
     addr: 0.0.0.0:9000
-    timeout: 1s
+    timeout: 60s
 prometheus:
   address: http://localhost:9090
   timeout: 1m
+exporter:
+  interval: 30s
+  timeout: 60s
 node_selectors:
   NVIDIA: gpu=on
   Ascend: ascend=on
   DCU: dcu=on
   MLU: mlu=on
-  Metax: metax-tech.com/gpu.installed=true
+  Metax: metax-tech.com/gpu.installed=true
diff --git a/server/internal/conf/conf.proto b/server/internal/conf/conf.proto
@@ -9,6 +9,7 @@ message Bootstrap {
   Server server = 1;
   Prometheus prometheus = 2;
   map<string, string> node_selectors = 3;
+  Exporter exporter = 4;
 }
 
 message Server {
@@ -31,3 +32,14 @@ message Prometheus {
   string timeout = 2;
   string auth = 3;
 }
+
+// Exporter controls the background /metrics collector that periodically refreshes the
+// in-memory Prometheus registry from upstream Prometheus / VictoriaMetrics. The /metrics
+// HTTP handler only serializes that registry, so it is no longer tied to the per-request
+// HTTP timeout and no longer fans out PromQL on every scrape.
+message Exporter {
+  // Interval between two refresh cycles. Defaults to 30s if unset.
+  google.protobuf.Duration interval = 1;
+  // Hard cap on a single refresh cycle. Defaults to 60s if unset.
+  google.protobuf.Duration timeout = 2;
+}
diff --git a/server/internal/data/node.go b/server/internal/data/node.go
@@ -98,7 +98,9 @@ func (r *nodeRepo) updateLocalNodes() {
 				}
 			}
 		}
+		r.mutex.Lock()
 		r.nodes = n
+		r.mutex.Unlock()
 	}
 }
 
@@ -156,6 +158,8 @@ func (r *nodeRepo) fetchNodeInfo(node *corev1.Node) *biz.Node {
 }
 
 func (r *nodeRepo) ListAll(context.Context) ([]*biz.Node, error) {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
 	var nodeList []*biz.Node
 	for _, node := range r.nodes {
 		nodeList = append(nodeList, node)
@@ -164,13 +168,18 @@ func (r *nodeRepo) ListAll(context.Context) ([]*biz.Node, error) {
 }
 
 func (r *nodeRepo) GetNode(_ context.Context, uid string) (*biz.Node, error) {
-	if _, ok := r.nodes[k8stypes.UID(uid)]; !ok {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
+	node, ok := r.nodes[k8stypes.UID(uid)]
+	if !ok {
 		return nil, errors.New("node not found")
 	}
-	return r.nodes[k8stypes.UID(uid)], nil
+	return node, nil
 }
 
 func (r *nodeRepo) ListAllDevices(context.Context) ([]*biz.DeviceInfo, error) {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
 	var deviceList []*biz.DeviceInfo
 	for _, node := range r.nodes {
 		deviceList = append(deviceList, node.Devices...)
@@ -179,6 +188,8 @@ func (r *nodeRepo) ListAllDevices(context.Context) ([]*biz.DeviceInfo, error) {
 }
 
 func (r *nodeRepo) FindDeviceByAliasId(aliasId string) (*biz.DeviceInfo, error) {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
 	for _, node := range r.nodes {
 		for _, d := range node.Devices {
 			if d.AliasId == aliasId {

diff --git a/server/internal/data/pod.go b/server/internal/data/pod.go
@@ -184,6 +184,8 @@ func (r *podRepo) GetStartTime(pod *corev1.Pod) time.Time {
 }
 
 func (r *podRepo) ListAll(context.Context) ([]*biz.Container, error) {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
 	var containerList []*biz.Container
 	for _, pod := range r.pods {
 		containerList = append(containerList, pod.Ctrs...)
@@ -196,7 +198,13 @@ func (r *podRepo) FindOne(_ context.Context, podUID string, name string) (*biz.C
 		return nil, fmt.Errorf("podUID or name is empty")
 	}
 
-	for _, container := range r.pods[k8stypes.UID(podUID)].Ctrs {
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
+	pod, ok := r.pods[k8stypes.UID(podUID)]
+	if !ok {
+		return nil, fmt.Errorf("not found")
+	}
+	for _, container := range pod.Ctrs {
 		if container.Name == name {
 			return container, nil
 		}

diff --git a/server/internal/exporter/cells.go b/server/internal/exporter/cells.go
@@ -0,0 +1,97 @@
+package exporter
+
+import (
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Diff-based cell tracking for the background /metrics collector.
+//
+// Why this exists: the old reset()+populate cycle (called from a synchronous HTTP
+// handler) was safe because scrape only ever observed the fully-populated state.
+// Once the cycle moved to a background goroutine, every Prometheus scrape that
+// landed between reset() and the end of populate saw partial / empty data, which
+// surfaces in the UI as "the vGPU allocation rate sometimes has data and sometimes
+// has none" — series flickering in and out at scrape boundaries.
+//
+// The fix: instead of wiping the GaugeVec at the start of each cycle, every Set
+// goes through MetricsGenerator.set, which both writes the value AND records the
+// (gauge, label tuple) it touched. After a cycle completes successfully, we walk
+// the previous cycle's recorded set and DeleteLabelValues for any tuple that
+// disappeared this round. Existing series are atomically overwritten in place,
+// brand-new series appear when their Set runs, vanished series disappear at the
+// end-of-cycle prune. There is no window where a known device is missing.
+
+// cellKey is the map key for one tracked series: the gauge vector it lives in
+// plus its label values flattened into a single string. The flattened form is
+// for lookup only; the cell value keeps the []string for DeleteLabelValues.
+type cellKey struct {
+	gauge  *prometheus.GaugeVec // vector that owns the series
+	joined string               // label values joined with labelSep
+}
+
+type cell struct {
+	gauge  *prometheus.GaugeVec // vector that owns the series
+	labels []string             // private copy of the label values passed to set; replayed into DeleteLabelValues
+}
+
+// labelSep is a single NUL byte. Label values are not expected to contain NUL,
+// so joining them with it yields a distinct key per label tuple.
+const labelSep = "\x00"
+
+// set stores value in the series of g selected by labels and remembers that
+// series as touched during the cycle in progress. The bookkeeping map is
+// guarded by cellMu, so concurrent callers are safe.
+func (s *MetricsGenerator) set(g *prometheus.GaugeVec, value float64, labels ...string) {
+	g.WithLabelValues(labels...).Set(value)
+
+	// Keep a private copy of the labels: the variadic slice may alias a buffer
+	// that the caller overwrites on its next iteration.
+	entry := cell{gauge: g, labels: append([]string(nil), labels...)}
+	key := cellKey{gauge: g, joined: strings.Join(labels, labelSep)}
+
+	s.cellMu.Lock()
+	defer s.cellMu.Unlock()
+	if s.current == nil {
+		s.current = map[cellKey]cell{}
+	}
+	s.current[key] = entry
+}
+
+// commitCycle promotes the current cycle to "previous" and removes any label
+// tuple that existed in the previous cycle but not this one. Call ONLY when the
+// cycle completed without being cut short by ctx cancellation: pruning on a
+// partial map would erroneously delete cells that just weren't re-Set this time.
+func (s *MetricsGenerator) commitCycle() {
+	s.cellMu.Lock()
+	defer s.cellMu.Unlock()
+	if s.current == nil {
+		// Nothing was written this cycle; leave prev intact.
+		return
+	}
+	deleted := 0
+	for k, c := range s.prev {
+		if _, ok := s.current[k]; ok {
+			continue
+		}
+		if c.gauge.DeleteLabelValues(c.labels...) {
+			deleted++
+		}
+	}
+	if deleted > 0 {
+		s.log.Debugw("msg", "pruned stale metric cells", "count", deleted)
+	}
+	s.prev = s.current
+	s.current = nil
+}
+
+// dropCurrentCycle abandons the in-progress cycle without pruning anything. Use
+// it when a cycle ran into ctx cancellation or any other partial-completion
+// path, where the recorded map is incomplete and must not drive a prune.
+//
+// The cells recorded so far are folded into prev instead of being forgotten.
+// Every set call has already written its series into the GaugeVec, including
+// series that did not exist before this cycle. If such a series were dropped
+// from the bookkeeping and never written again, it would be in neither prev nor
+// any later current map, so no commitCycle could ever delete it. Keeping it in
+// prev means the next completed cycle prunes it unless it is re-Set.
+func (s *MetricsGenerator) dropCurrentCycle() {
+	s.cellMu.Lock()
+	defer s.cellMu.Unlock()
+	if len(s.current) == 0 {
+		s.current = nil
+		return
+	}
+	if s.prev == nil {
+		s.prev = make(map[cellKey]cell, len(s.current))
+	}
+	for k, c := range s.current {
+		s.prev[k] = c
+	}
+	s.current = nil
+}