Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions docs/installation/helm/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,22 @@ The WebUI can only be accessed by your localhost, so you need to connect your lo

The HAMi-WebUI open-source community offers Helm Charts for running it on Kubernetes. Please be aware that the code is provided without any warranties. If you encounter any problems, you can report them to the [Official GitHub repository](https://github.com/hami-webui/helm-charts/).

## Prequisities
## Prerequisites

To install HAMi-WebUI using Helm, ensure you meet these requirements:

1. `kubectl` installed on your local machine

2. [HAMi](https://github.com/Project-HAMi/HAMi?tab=readme-ov-file#quick-start) >= 2.4.0
2. [HAMi](https://github.com/Project-HAMi/HAMi?tab=readme-ov-file#quick-start) (see version compatibility below)

### Version compatibility

> _**Important**_: HAMi-WebUI v1.1.1+ switches to the HAMi 2.9.0 metrics schema (renamed metrics/labels). If you upgrade HAMi-WebUI without also upgrading HAMi to 2.9.0 or later, the WebUI will query metric and label names that HAMi does not export, and dashboards may break.

| HAMi-WebUI version | Supported HAMi version | Metrics schema | Notes |
| --- | --- | --- | --- |
| <= v1.1.0 | >= 2.4.0, < 2.9.0 | old labels: `deviceuuid`, `devicetype`, `podnamespace`, `podname`, `ctrname` | For existing HAMi deployments before the metrics rename |
| v1.1.1+ | >= 2.9.0 | new labels: `device_uuid`, `device_type`, `namespace`, `pod`, `container` | Required after the HAMi 2.9.0 metrics rename |

3. Prometheus > 2.8.0

Expand Down
2 changes: 1 addition & 1 deletion packages/web/projects/vgpu/components/Detail.vue
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
</block-box>
</div>

<block-box title="显卡列表" v-if="type !== 'deviceuuid'">
<block-box title="显卡列表" v-if="type !== 'device_uuid'">
<CardList :hideTitle="true" :filters="filters" />
</block-box>

Expand Down
6 changes: 3 additions & 3 deletions packages/web/projects/vgpu/components/previewBar.vue
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ const gpuMemoryTop5 = computed(() => ({
}));

const pieConfig = {
deviceuuid: {
device_uuid: {
query:
'count by (devicetype) (sum by (deviceuuid, devicetype) (hami_vgpu_count))',
key: 'devicetype',
'count by (device_type) (sum by (device_uuid, device_type) (hami_vgpu_count))',
key: 'device_type',
},
node: {
query: 'count by (provider) (sum by (node,provider) (hami_vgpu_count))',
Expand Down
32 changes: 16 additions & 16 deletions packages/web/projects/vgpu/views/card/admin/Detail.vue
Original file line number Diff line number Diff line change
Expand Up @@ -364,38 +364,38 @@ const _gaugeConfigBase = [
{
titleKey: 'dashboard.computeAllocRate',
percent: 0,
query: `avg(sum(hami_container_vcore_allocated{deviceuuid=~"$deviceuuid"}) by (instance))`,
totalQuery: `avg(sum(hami_core_size{deviceuuid=~"$deviceuuid"}) by (instance))`,
percentQuery: `avg(sum(hami_container_vcore_allocated{deviceuuid=~"$deviceuuid"}) by (instance))/avg(sum(hami_core_size{deviceuuid=~"$deviceuuid"}) by (instance)) *100`,
query: `avg(sum(hami_container_vcore_allocated{device_uuid=~"$device_uuid"}) by (instance))`,
totalQuery: `avg(sum(hami_core_size{device_uuid=~"$device_uuid"}) by (instance))`,
percentQuery: `avg(sum(hami_container_vcore_allocated{device_uuid=~"$device_uuid"}) by (instance))/avg(sum(hami_core_size{device_uuid=~"$device_uuid"}) by (instance)) *100`,
total: 0,
used: 0,
unit: ' ',
},
{
titleKey: 'dashboard.memAllocRate',
percent: 0,
query: `avg(sum(hami_container_vmemory_allocated{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024`,
totalQuery: `avg(sum(hami_memory_size{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024`,
percentQuery: `(avg(sum(hami_container_vmemory_allocated{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024 )/(avg(sum(hami_memory_size{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024) *100 `,
query: `avg(sum(hami_container_vmemory_allocated{device_uuid=~"$device_uuid"}) by (instance)) / 1024`,
totalQuery: `avg(sum(hami_memory_size{device_uuid=~"$device_uuid"}) by (instance)) / 1024`,
percentQuery: `(avg(sum(hami_container_vmemory_allocated{device_uuid=~"$device_uuid"}) by (instance)) / 1024 )/(avg(sum(hami_memory_size{device_uuid=~"$device_uuid"}) by (instance)) / 1024) *100 `,
total: 0,
used: 0,
unit: 'GiB',
},
{
titleKey: 'dashboard.computeUsageRate',
percent: 0,
query: `avg(sum(hami_core_util{deviceuuid=~"$deviceuuid"}) by (instance))`,
percentQuery: `avg(sum(hami_core_util_avg{deviceuuid=~"$deviceuuid"}) by (instance))`,
query: `avg(sum(hami_core_util{device_uuid=~"$device_uuid"}) by (instance))`,
percentQuery: `avg(sum(hami_core_util_avg{device_uuid=~"$device_uuid"}) by (instance))`,
total: 100,
used: 0,
unit: ' ',
},
{
titleKey: 'dashboard.memUsageRate',
percent: 0,
query: `avg(sum(hami_memory_used{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024`,
totalQuery: `avg(sum(hami_memory_size{deviceuuid=~"$deviceuuid"}) by (instance))/1024`,
percentQuery: `(avg(sum(hami_memory_used{deviceuuid=~"$deviceuuid"}) by (instance)) / 1024)/(avg(sum(hami_memory_size{deviceuuid=~"$deviceuuid"}) by (instance))/1024)*100`,
query: `avg(sum(hami_memory_used{device_uuid=~"$device_uuid"}) by (instance)) / 1024`,
totalQuery: `avg(sum(hami_memory_size{device_uuid=~"$device_uuid"}) by (instance))/1024`,
percentQuery: `(avg(sum(hami_memory_used{device_uuid=~"$device_uuid"}) by (instance)) / 1024)/(avg(sum(hami_memory_size{device_uuid=~"$device_uuid"}) by (instance))/1024)*100`,
total: 0,
used: 0,
unit: 'GiB',
Expand All @@ -404,7 +404,7 @@ const _gaugeConfigBase = [

const gaugeData = useInstantVector(
_gaugeConfigBase.map(item => ({ ...item, title: t(item.titleKey) })),
(query) => query.replaceAll(`$deviceuuid`, route.params.uuid),
(query) => query.replaceAll(`$device_uuid`, route.params.uuid),
times,
);

Expand Down Expand Up @@ -477,7 +477,7 @@ const memoryUsagePercentText = computed(() => (memoryUsagePercentRaw.value === u
const lineTools = ref([
{
titleKey: 'card.detail.gpuPowerTrend',
query: `avg by (device_no,driver_version) (hami_device_power{deviceuuid=~"$deviceuuid"})`,
query: `avg by (device_no,driver_version) (hami_device_power{device_uuid=~"$device_uuid"})`,
data: [],
unit: 'W',
gaugeUnit: 'W',
Expand All @@ -488,7 +488,7 @@ const lineTools = ref([
},
{
titleKey: 'card.detail.gpuTemperatureTrend',
query: `avg by (device_no,driver_version) (hami_device_temperature{deviceuuid=~"$deviceuuid"})`,
query: `avg by (device_no,driver_version) (hami_device_temperature{device_uuid=~"$device_uuid"})`,
data: [],
unit: '℃',
gaugeUnit: '℃',
Expand All @@ -515,7 +515,7 @@ const fetchLineData = async () => {
end: timeParse(times.value[1]),
step: calculatePrometheusStep(times.value[0], times.value[1]),
},
query: item.query.replaceAll(`$deviceuuid`, route.params.uuid),
query: item.query.replaceAll(`$device_uuid`, route.params.uuid),
})
.then((res) => {
const first = res.data?.[0];
Expand All @@ -531,7 +531,7 @@ const fetchLineData = async () => {

cardApi
.getInstantVector({
query: item.query.replaceAll(`$deviceuuid`, route.params.uuid),
query: item.query.replaceAll(`$device_uuid`, route.params.uuid),
})
.then((res) => {
lineTools.value[index].percent = res.data?.[0]?.value;
Expand Down
2 changes: 1 addition & 1 deletion packages/web/projects/vgpu/views/card/admin/Detail2.vue
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<Detail
title="显卡"
:detailColumns="columns"
type="deviceuuid"
type="device_uuid"
:detail="detail"
:name="detail.uuid"
:filters="{ deviceId: detail.uuid }"
Expand Down
2 changes: 1 addition & 1 deletion packages/web/projects/vgpu/views/card/admin/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<div class="card-admin-top-wrap" v-if="!hideTitle">
<preview-bar
:title="$t('dashboard.card')"
type="deviceuuid"
type="device_uuid"
:handle-click="handleClick"
:handle-pie-click="handlePieClick"
:currentName="currentType"
Expand Down
8 changes: 4 additions & 4 deletions packages/web/projects/vgpu/views/task/admin/top.vue
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ const handleChartClick = async (params) => {
} else {
ElMessage.error(t('node.nodeNotFound'));
}
} else if (activeTabKey === 'deviceuuid') {
} else if (activeTabKey === 'device_uuid') {
router.push({
path: `/admin/vgpu/card/admin/${name}`,
});
Expand Down Expand Up @@ -59,12 +59,12 @@ const topConfig = computed(() => [
},
{
tab: t('dashboard.card'),
key: 'deviceuuid',
key: 'device_uuid',
data: [],
nameKey: 'deviceuuid',
nameKey: 'device_uuid',
unit: ' ',
query:
'topk(5, count by (deviceuuid) (sum by (container_pod_uuid, deviceuuid) (hami_container_vcore_allocated)))',
'topk(5, count by (device_uuid) (sum by (container_pod_uuid, device_uuid) (hami_container_vcore_allocated)))',
},
],
},
Expand Down
10 changes: 5 additions & 5 deletions server/internal/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -382,12 +382,12 @@ func (s *MetricsGenerator) taskCoreUsed(ctx context.Context, provider, namespace
query := ""
switch provider {
case biz.NvidiaGPUDevice:
//query = fmt.Sprintf("avg(Device_utilization_desc_of_container{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"})", deviceUUID, namespace, pod, container)
// queryTemplate := `last_over_time((Device_utilization_desc_of_container{deviceuuid="%s", podnamespace="%s", podname="%s", ctrname="%s"} != 0)[1m:])
//query = fmt.Sprintf("avg(hami_container_device_utilization_ratio{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"})", deviceUUID, namespace, pod, container)
// queryTemplate := `last_over_time((hami_container_device_utilization_ratio{device_uuid="%s", namespace="%s", pod="%s", container="%s"} != 0)[1m:])
//or
//last_over_time(Device_utilization_desc_of_container{deviceuuid="%s", podnamespace="%s", podname="%s", ctrname="%s"}[1m:])`
//last_over_time(hami_container_device_utilization_ratio{device_uuid="%s", namespace="%s", pod="%s", container="%s"}[1m:])`
// query = fmt.Sprintf(queryTemplate, deviceUUID, namespace, pod, container, deviceUUID, namespace, pod, container)
queryTemplate := fmt.Sprintf("Device_utilization_desc_of_container{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"}", deviceUUID, namespace, pod, container)
queryTemplate := fmt.Sprintf("hami_container_device_utilization_ratio{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"}", deviceUUID, namespace, pod, container)
query = fmt.Sprintf("sum_over_time(%s[1m]) == 0 or (sum_over_time(%s[10m:]) / count_over_time(( %s !=0)[10m:])) ", queryTemplate, queryTemplate, queryTemplate)
//query = queryTemplate
case biz.CambriconGPUDevice:
Expand All @@ -412,7 +412,7 @@ func (s *MetricsGenerator) taskMemoryUsed(ctx context.Context, provider, namespa
query := ""
switch provider {
case biz.NvidiaGPUDevice:
query = fmt.Sprintf("avg(vGPU_device_memory_usage_in_bytes{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"})", deviceUUID, namespace, pod, container)
query = fmt.Sprintf("avg(hami_vgpu_memory_used_bytes{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"})", deviceUUID, namespace, pod, container)
case biz.CambriconGPUDevice:
query = fmt.Sprintf("avg(mlu_memory_utilization * on(uuid) group_right mlu_container{namespace=\"%s\",pod=\"%s\",container=\"%s\",type=\"mlu370.smlu.vmemory\"})", namespace, pod, container)
case biz.AscendGPUDevice:
Expand Down
52 changes: 26 additions & 26 deletions server/internal/exporter/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,132 +80,132 @@ var (
HamiVCoreScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_vcore_scaling",
Help: "GPU virtual core Scaling",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiVMemoryScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_vmemory_scaling",
Help: "GPU virtual memory Scaling",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_vgpu_count",
Help: "Total vGPU count",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiVmemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_vmemory_size",
Help: "Total vMemory size",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiVcoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_vcore_size",
Help: "Total vCore size",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_memory_used",
Help: "Actual memory usage, unit is 'MB' ",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiMemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_memory_size",
Help: "Actual memory size, unit is 'MB' ",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_memory_util",
Help: "Actual Memory Util percent 0-100",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiCoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_core_size",
Help: "Actual core size",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_core_used",
Help: "Actual Core Used",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_core_util",
Help: "Actual Core Util percent 0-100",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiCoreUsedAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_core_used_avg",
Help: "Actual Core Used period avg",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiCoreUtilAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_core_util_avg",
Help: "Actual Core Util percent 0-100 period avg",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDeviceTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_temperature",
Help: "gpu temperature",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDeviceMemoryTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_memory_temperature",
Help: "gpu memory temperature",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDevicePower = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_power",
Help: "gpu power",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDeviceHardwareHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_hardware_health",
Help: "gpu hardware health",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDeviceFanSpeedP = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_fan_speed_p",
Help: "gpu fan speed percent 0-100",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiDeviceFanSpeedR = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_device_fan_speed_r",
Help: "gpu fan speed rpm",
}, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"})
}, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"})

HamiContainerVgpuAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_vgpu_allocated",
Help: "task allocated vGPU count",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})

HamiContainerVmemoryAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_vmemory_allocated",
Help: "task allocated vMemory size",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})

HamiContainerVcoreAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_vcore_allocated",
Help: "task allocated vCore size",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"})

HamiContainerMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_memory_used",
Help: "task used memory unit MB",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"})

HamiContainerMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_memory_util",
Help: "task memory util percent 0-100",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"})

HamiContainerCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_core_used",
Help: "task used core ",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"})

HamiContainerCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_container_core_util",
Help: "task core util percent 0-100",
}, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"})
}, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"})

HamiPoolVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "hami_pool_vgpu_count",
Expand Down
Loading
Loading