Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions charts/hami-webui/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@ data:
server:
http:
addr: 0.0.0.0:8000
timeout: 1s
timeout: {{ .Values.backend.http.timeout | default "60s" }}
grpc:
addr: 0.0.0.0:9000
timeout: 1s
timeout: {{ .Values.backend.grpc.timeout | default "60s" }}
prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m
timeout: {{ .Values.externalPrometheus.timeout | default "1m" }}
exporter:
interval: {{ .Values.metricsExporter.interval | default "30s" }}
timeout: {{ .Values.metricsExporter.timeout | default "60s" }}
node_selectors:
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value }}
Expand Down
10 changes: 10 additions & 0 deletions charts/hami-webui/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ spec:
args:
- "--conf"
- "/apps/config/config.yaml"
{{- if .Values.backend.readinessProbe.enabled }}
readinessProbe:
httpGet:
path: /readyz
port: metrics
initialDelaySeconds: {{ .Values.backend.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.backend.readinessProbe.periodSeconds }}
timeoutSeconds: {{ .Values.backend.readinessProbe.timeoutSeconds }}
failureThreshold: {{ .Values.backend.readinessProbe.failureThreshold }}
{{- end }}
resources:
{{- toYaml .Values.resources.backend | nindent 12 }}
volumeMounts:
Expand Down
42 changes: 41 additions & 1 deletion charts/hami-webui/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,44 @@ kube-prometheus-stack:
externalPrometheus:
enabled: false
# If externalPrometheus.enabled is true, this address will be used
address: "http://prometheus-kube-prometheus-prometheus.prometheus.svc.cluster.local:9090"
address: "http://prometheus-kube-prometheus-prometheus.prometheus.svc.cluster.local:9090"
# Single PromQL upstream timeout (sent to Prometheus / VictoriaMetrics as the &timeout= parameter).
timeout: "1m"

# Kratos server timeouts. These bound the deadline placed on each incoming HTTP/gRPC
# request context. They no longer affect /metrics generation (which runs in the background),
# but they still gate the page-side APIs, some of which legitimately take a few seconds
# against a large Prometheus / VictoriaMetrics cluster.
backend:
http:
timeout: "60s"
grpc:
timeout: "60s"
# Readiness probe on the backend's /readyz. The backend only starts listening
# after its k8s informer caches have synced, so gating the pod on /readyz keeps
# it out of the Service until the backend can serve — this prevents the frontend
# BFF from hitting "connection refused" against the not-yet-listening backend
# during startup. failureThreshold * periodSeconds should comfortably exceed the
# worst-case informer sync time on a large cluster.
readinessProbe:
enabled: true
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 60

# Background /metrics collector. The collector runs independently of Prometheus scrape;
# scrapes only serialize the in-memory registry, so they are O(1) and immune to scrape
# timeouts. Tune interval if you need fresher data or want to reduce upstream load further.
metricsExporter:
# How often the collector refreshes hami_* metrics from the upstream Prometheus / VictoriaMetrics.
interval: "30s"
# Hard cap on a single refresh cycle. Should be >= externalPrometheus.timeout and < interval * 2.
timeout: "60s"

# The frontend axios client timeout (default 60000ms) is compiled into the JS
# bundle at image build time from the VUE_APP_REQUEST_TIMEOUT env var, as Vite
# evaluates process.env at build, not at runtime. To change it for a deployed
# cluster you need to rebuild packages/web with a different VUE_APP_REQUEST_TIMEOUT
# (set in packages/web/.env.production or as a build-arg). It cannot be tuned
# through this values.yaml.
3 changes: 3 additions & 0 deletions packages/web/.env.development
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ ENV = 'development'
# base api
VUE_APP_BASE_API = '/'

# axios global timeout in ms; compiled into the JS bundle at build time, so override it via the build env (it cannot be tuned through Helm chart values)
VUE_APP_REQUEST_TIMEOUT = 60000

3 changes: 3 additions & 0 deletions packages/web/.env.production
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ ENV = 'production'

# base api
VUE_APP_BASE_API = './'

# axios global timeout in ms; compiled into the JS bundle at build time, so override it via the build env (it cannot be tuned through Helm chart values)
VUE_APP_REQUEST_TIMEOUT = 60000
10 changes: 9 additions & 1 deletion packages/web/src/utils/request.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@ import axios from 'axios';
import { ElMessage, ElMessageBox, ElNotification } from 'element-plus';
import i18n from '@/locales';

// Default request timeout in ms. Override at build time via VUE_APP_REQUEST_TIMEOUT
// (set in .env.* or passed as a build env var). The value is compiled into the bundle,
// so it cannot be changed through Helm chart values at deploy time. 60s is large
// enough for the slowest known page-side API (/v1/nodes can take a few seconds
// against a large VictoriaMetrics cluster) while still bounding hung requests.
const DEFAULT_REQUEST_TIMEOUT = 60000;
const requestTimeout =
Number.parseInt(process.env.VUE_APP_REQUEST_TIMEOUT, 10) || DEFAULT_REQUEST_TIMEOUT;

const service = axios.create({
baseURL: process.env.VUE_APP_BASE_API, // url = base url + request url
timeout: 5000,
timeout: requestTimeout,
validateStatus: function (status) {
return (status >= 200 && status < 300) || status > 520;
},
Expand Down
6 changes: 5 additions & 1 deletion server/cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/http"
"os"
"vgpu/internal/conf"
"vgpu/internal/exporter"

_ "go.uber.org/automaxprocs"
)
Expand Down Expand Up @@ -44,7 +45,9 @@ func main() {
}
}

func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Server) *kratos.App {
// newApp wires the background metrics collector in as a kratos transport.Server so
// its goroutine is started and stopped by the app lifecycle, alongside HTTP and gRPC.
func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Server, mc *exporter.MetricsGenerator) *kratos.App {
return kratos.New(
kratos.Context(ctx),
kratos.ID(id),
Expand All @@ -55,6 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
kratos.Server(
gs,
hs,
mc,
),
)
}
Expand Down
9 changes: 6 additions & 3 deletions server/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
server:
http:
addr: 0.0.0.0:8000
timeout: 1s
timeout: 60s
grpc:
addr: 0.0.0.0:9000
timeout: 1s
timeout: 60s
prometheus:
address: http://localhost:9090
timeout: 1m
exporter:
interval: 30s
timeout: 60s
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
Metax: metax-tech.com/gpu.installed=true
Metax: metax-tech.com/gpu.installed=true
12 changes: 12 additions & 0 deletions server/internal/conf/conf.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ message Bootstrap {
Server server = 1;
Prometheus prometheus = 2;
map<string, string> node_selectors = 3;
Exporter exporter = 4;
}

message Server {
Expand All @@ -31,3 +32,14 @@ message Prometheus {
string timeout = 2;
string auth = 3;
}

// Exporter controls the background /metrics collector that periodically refreshes the
// in-memory Prometheus registry from upstream Prometheus / VictoriaMetrics. The /metrics
// HTTP handler only serializes that registry, so it is no longer tied to the per-request
// HTTP timeout and no longer fans out PromQL on every scrape.
message Exporter {
// Interval between two refresh cycles. Defaults to 30s if unset.
google.protobuf.Duration interval = 1;
// Hard cap on a single refresh cycle. Defaults to 60s if unset.
google.protobuf.Duration timeout = 2;
}
15 changes: 13 additions & 2 deletions server/internal/data/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ func (r *nodeRepo) updateLocalNodes() {
}
}
}
r.mutex.Lock()
r.nodes = n
r.mutex.Unlock()
}
}

Expand Down Expand Up @@ -156,6 +158,8 @@ func (r *nodeRepo) fetchNodeInfo(node *corev1.Node) *biz.Node {
}

func (r *nodeRepo) ListAll(context.Context) ([]*biz.Node, error) {
r.mutex.RLock()
defer r.mutex.RUnlock()
var nodeList []*biz.Node
for _, node := range r.nodes {
nodeList = append(nodeList, node)
Expand All @@ -164,13 +168,18 @@ func (r *nodeRepo) ListAll(context.Context) ([]*biz.Node, error) {
}

func (r *nodeRepo) GetNode(_ context.Context, uid string) (*biz.Node, error) {
if _, ok := r.nodes[k8stypes.UID(uid)]; !ok {
r.mutex.RLock()
defer r.mutex.RUnlock()
node, ok := r.nodes[k8stypes.UID(uid)]
if !ok {
return nil, errors.New("node not found")
}
return r.nodes[k8stypes.UID(uid)], nil
return node, nil
}

func (r *nodeRepo) ListAllDevices(context.Context) ([]*biz.DeviceInfo, error) {
r.mutex.RLock()
defer r.mutex.RUnlock()
var deviceList []*biz.DeviceInfo
for _, node := range r.nodes {
deviceList = append(deviceList, node.Devices...)
Expand All @@ -179,6 +188,8 @@ func (r *nodeRepo) ListAllDevices(context.Context) ([]*biz.DeviceInfo, error) {
}

func (r *nodeRepo) FindDeviceByAliasId(aliasId string) (*biz.DeviceInfo, error) {
r.mutex.RLock()
defer r.mutex.RUnlock()
for _, node := range r.nodes {
for _, d := range node.Devices {
if d.AliasId == aliasId {
Expand Down
10 changes: 9 additions & 1 deletion server/internal/data/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ func (r *podRepo) GetStartTime(pod *corev1.Pod) time.Time {
}

func (r *podRepo) ListAll(context.Context) ([]*biz.Container, error) {
r.mutex.RLock()
defer r.mutex.RUnlock()
var containerList []*biz.Container
for _, pod := range r.pods {
containerList = append(containerList, pod.Ctrs...)
Expand All @@ -196,7 +198,13 @@ func (r *podRepo) FindOne(_ context.Context, podUID string, name string) (*biz.C
return nil, fmt.Errorf("podUID or name is empty")
}

for _, container := range r.pods[k8stypes.UID(podUID)].Ctrs {
r.mutex.RLock()
defer r.mutex.RUnlock()
pod, ok := r.pods[k8stypes.UID(podUID)]
if !ok {
return nil, fmt.Errorf("not found")
}
for _, container := range pod.Ctrs {
if container.Name == name {
return container, nil
}
Expand Down
97 changes: 97 additions & 0 deletions server/internal/exporter/cells.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package exporter

import (
"strings"

"github.com/prometheus/client_golang/prometheus"
)

// Diff-based cell tracking for the background /metrics collector.
//
// Why this exists: the old reset()+populate cycle (called from a synchronous HTTP
// handler) was safe because scrape only ever observed the fully-populated state.
// Once the cycle moved to a background goroutine, every Prometheus scrape that
// landed between reset() and the end of populate saw partial / empty data, which
// surfaces in the UI as "the vGPU allocation rate sometimes has data and sometimes
// does not" — series flickering in and out at scrape boundaries.
//
// The fix: instead of wiping the GaugeVec at the start of each cycle, every Set
// goes through MetricsGenerator.set, which both writes the value AND records the
// (gauge, label tuple) it touched. After a cycle completes successfully, we walk
// the previous cycle's recorded set and DeleteLabelValues for any tuple that
// disappeared this round. Existing series are atomically overwritten in place,
// brand-new series appear when their Set runs, vanished series disappear at the
// end-of-cycle prune. There is no window where a known device is missing.

// cellKey identifies a single observation: one gauge vector plus one concrete
// label tuple. It is the key type of the per-cycle map that set populates.
// The joined string is only a map-key encoding of the labels (set builds it
// with strings.Join and labelSep); the original []string is kept on the cell
// value so it can be passed to DeleteLabelValues.
type cellKey struct {
// gauge is the vector the observation was written to.
gauge *prometheus.GaugeVec
// joined is the label values concatenated with labelSep.
joined string
}

// cell is the value stored per cellKey: the gauge vector and the label values
// of one recorded observation. commitCycle uses both to call
// DeleteLabelValues for a tuple that was not written again.
type cell struct {
gauge *prometheus.GaugeVec
// labels is a private copy of the label values passed to set.
labels []string
}

// labelSep is a single NUL byte used to join label values into a map key.
// NUL is not expected to appear in normal Prometheus label values, so
// strings.Join produces an unambiguous key.
// NOTE(review): nothing in this file enforces that; confirm the label sources
// never emit a NUL byte.
const labelSep = "\x00"

// set stores value in the gauge child selected by labels and remembers that
// (gauge, labels) pair in the map for the cycle currently in progress.
//
// The gauge write happens first and outside cellMu; only the bookkeeping map
// is guarded by the mutex, so the method may be called from several
// goroutines even though the collector currently uses just one.
func (s *MetricsGenerator) set(g *prometheus.GaugeVec, value float64, labels ...string) {
	g.WithLabelValues(labels...).Set(value)

	// Callers reuse the backing array of labels between iterations, so the
	// recorded cell must own its own copy.
	owned := append([]string(nil), labels...)
	key := cellKey{gauge: g, joined: strings.Join(labels, labelSep)}

	s.cellMu.Lock()
	defer s.cellMu.Unlock()
	if s.current == nil {
		// Lazily create the per-cycle map on the first write of a cycle.
		s.current = map[cellKey]cell{}
	}
	s.current[key] = cell{gauge: g, labels: owned}
}

// commitCycle finishes a successful refresh cycle. Every label tuple recorded
// in the previous cycle but absent from the current one is removed from its
// gauge vector, and the current cycle's map then becomes the new "previous"
// snapshot.
//
// Call it ONLY after a cycle that ran to completion. Pruning against a map
// cut short by ctx cancellation would delete cells that simply were not
// re-Set this time. If nothing was written during the cycle (current is nil)
// the method returns without touching prev.
func (s *MetricsGenerator) commitCycle() {
	s.cellMu.Lock()
	defer s.cellMu.Unlock()

	fresh := s.current
	if fresh == nil {
		// Empty cycle: keep the last snapshot as the pruning reference.
		return
	}

	var pruned int
	for key, old := range s.prev {
		_, stillLive := fresh[key]
		// DeleteLabelValues reports whether a series was actually removed.
		if !stillLive && old.gauge.DeleteLabelValues(old.labels...) {
			pruned++
		}
	}
	if pruned > 0 {
		s.log.Debugw("msg", "pruned stale metric cells", "count", pruned)
	}

	s.prev, s.current = fresh, nil
}

// dropCurrentCycle abandons the in-progress cycle without pruning anything.
// Use it when a cycle ran into ctx cancellation or any other
// partial-completion path, so no series is deleted on the basis of an
// incomplete view.
//
// Set() calls that landed before the abort stay in the GaugeVec. Most of them
// overwrite tuples that already existed, but an aborted cycle can also create
// brand-new series. Those would be tracked by neither prev nor current once
// the map is discarded, and if they never reappear no later commitCycle could
// prune them. To avoid that leak, cells written during the aborted cycle are
// added to prev before current is cleared. Entries already in prev are left
// untouched, so prev becomes "last known-good snapshot plus everything
// written since"; nothing is removed from it here.
func (s *MetricsGenerator) dropCurrentCycle() {
	s.cellMu.Lock()
	defer s.cellMu.Unlock()

	if len(s.current) > 0 {
		if s.prev == nil {
			s.prev = make(map[cellKey]cell, len(s.current))
		}
		for k, c := range s.current {
			if _, tracked := s.prev[k]; !tracked {
				s.prev[k] = c
			}
		}
	}
	s.current = nil
}
Loading
Loading