From 87d4b5636015d1dc9b0be4d571277d38b3059ccc Mon Sep 17 00:00:00 2001 From: Shenhan11 Date: Thu, 16 Apr 2026 15:49:54 +0800 Subject: [PATCH] optimize monitor overview charts and add workload insights Signed-off-by: Shenhan11 --- docs/installation/helm/index.md | 13 +- packages/web/package.json | 1 + .../web/projects/vgpu/components/Detail.vue | 216 ----- .../web/projects/vgpu/components/TabTop.vue | 3 +- .../web/projects/vgpu/components/config.js | 54 +- .../web/projects/vgpu/components/gauge.vue | 6 +- .../projects/vgpu/components/previewBar.vue | 8 +- .../projects/vgpu/views/card/admin/Detail.vue | 606 +++++++------ .../vgpu/views/card/admin/Detail2.vue | 82 -- .../admin/components/WorkloadSemiProgress.vue | 43 + .../projects/vgpu/views/card/admin/index.vue | 47 +- .../vgpu/views/monitor/overview/Block.vue | 4 +- .../vgpu/views/monitor/overview/getOptions.js | 40 +- .../vgpu/views/monitor/overview/index.vue | 280 +++++- .../vgpu/views/monitor/overview/style.scss | 104 ++- .../projects/vgpu/views/node/admin/Detail.vue | 634 +++++++------- .../vgpu/views/node/admin/getOptions.js | 75 +- .../projects/vgpu/views/node/admin/index.vue | 23 +- .../projects/vgpu/views/task/admin/Detail.vue | 810 ++++++++++++------ .../projects/vgpu/views/task/admin/index.vue | 147 ++-- .../projects/vgpu/views/task/admin/top.vue | 8 +- packages/web/src/App.vue | 11 +- packages/web/src/components/BackHeader.vue | 40 - packages/web/src/components/BlockBox.vue | 2 +- packages/web/src/components/PageHeader.vue | 88 ++ packages/web/src/components/index.js | 2 - packages/web/src/icons/svg/cpu-limit.svg | 5 + packages/web/src/icons/svg/gpu-ascend.svg | 10 + packages/web/src/icons/svg/gpu-aws.svg | 7 + packages/web/src/icons/svg/gpu-metax.svg | 12 + packages/web/src/icons/svg/gpu-nvidia.svg | 10 + packages/web/src/icons/svg/help-circle.svg | 6 + packages/web/src/icons/svg/node-cpu-total.svg | 4 + .../web/src/icons/svg/node-memory-total.svg | 16 + .../web/src/icons/svg/related-gpu-eye.svg | 3 + packages/web/src/icons/svg/vgpu-workload.svg | 10 + packages/web/src/locales/en.js | 52 +- packages/web/src/locales/zh.js | 55 +- packages/web/src/main.js | 2 + packages/web/src/plugins/echarts.js | 8 + pnpm-lock.yaml | 45 + server/api/v1/container.proto | 1 + server/internal/biz/pod.go | 1 + server/internal/data/pod.go | 1 + server/internal/exporter/exporter.go | 10 +- server/internal/exporter/metrics.go | 52 +- server/internal/service/card.go | 4 +- server/internal/service/container.go | 30 + 48 files changed, 2237 insertions(+), 1454 deletions(-) delete mode 100644 packages/web/projects/vgpu/components/Detail.vue delete mode 100644 packages/web/projects/vgpu/views/card/admin/Detail2.vue create mode 100644 packages/web/projects/vgpu/views/card/admin/components/WorkloadSemiProgress.vue delete mode 100644 packages/web/src/components/BackHeader.vue create mode 100644 packages/web/src/components/PageHeader.vue create mode 100644 packages/web/src/icons/svg/cpu-limit.svg create mode 100644 packages/web/src/icons/svg/gpu-ascend.svg create mode 100644 packages/web/src/icons/svg/gpu-aws.svg create mode 100644 packages/web/src/icons/svg/gpu-metax.svg create mode 100644 packages/web/src/icons/svg/gpu-nvidia.svg create mode 100644 packages/web/src/icons/svg/help-circle.svg create mode 100644 packages/web/src/icons/svg/node-cpu-total.svg create mode 100644 packages/web/src/icons/svg/node-memory-total.svg create mode 100644 packages/web/src/icons/svg/related-gpu-eye.svg create mode 100644 packages/web/src/icons/svg/vgpu-workload.svg create mode 100644 packages/web/src/plugins/echarts.js diff --git a/docs/installation/helm/index.md b/docs/installation/helm/index.md index b7cdb179..93cdfa17 100644 --- a/docs/installation/helm/index.md +++ b/docs/installation/helm/index.md @@ -8,13 +8,22 @@ The WebUI can only be accessed by your localhost, so you need to connect your lo The HAMi-WebUI open-source community offers Helm Charts for running it on Kubernetes. Please be aware that the code is provided without any warranties. If you encounter any problems, you can report them to the [Official GitHub repository](https://github.com/hami-webui/helm-charts/). -## Prequisities +## Prerequisites To install HAMi-WebUI using Helm, ensure you meet these requirements: 1. Kubectl on your localhost -2. [HAMi](https://github.com/Project-HAMi/HAMi?tab=readme-ov-file#quick-start) >= 2.4.0 +2. [HAMi](https://github.com/Project-HAMi/HAMi?tab=readme-ov-file#quick-start) (see version compatibility below) + +### Version compatibility + +> _**Important**_: HAMi-WebUI v1.1.1+ switches to the HAMi 2.9.0 metrics schema (renamed metrics/labels). If you upgrade HAMi-WebUI without upgrading HAMi, dashboards may break. + +| HAMi-WebUI version | Supported HAMi version | Metrics schema | Notes | +| --- | --- | --- | --- | +| <= v1.1.0 | >= 2.4.0, < 2.9.0 | old labels: `deviceuuid`, `devicetype`, `podnamespace`, `podname`, `ctrname` | For existing HAMi deployments before the metrics rename | +| v1.1.1+ | >= 2.9.0 | new labels: `device_uuid`, `device_type`, `namespace`, `pod`, `container` | Required after the HAMi 2.9.0 metrics rename | 3. Prometheus > 2.8.0 diff --git a/packages/web/package.json b/packages/web/package.json index d8b57a8a..e20371b8 100644 --- a/packages/web/package.json +++ b/packages/web/package.json @@ -46,6 +46,7 @@ "vue": "^3.2.13", "vue-clipboard3": "2.0.0", "vue-count-to": "1.0.13", + "vue-echarts": "^6.7.3", "vue-i18n": "9", "vue-native-websocket-vue3": "^3.1.7", "vue-router": "^4.0.3", diff --git a/packages/web/projects/vgpu/components/Detail.vue b/packages/web/projects/vgpu/components/Detail.vue deleted file mode 100644 index 3c0636b9..00000000 --- a/packages/web/projects/vgpu/components/Detail.vue +++ /dev/null @@ -1,216 +0,0 @@ - - - - - diff --git a/packages/web/projects/vgpu/components/TabTop.vue b/packages/web/projects/vgpu/components/TabTop.vue index 8ec8980f..1eed5b72 100644 --- a/packages/web/projects/vgpu/components/TabTop.vue +++ b/packages/web/projects/vgpu/components/TabTop.vue @@ -192,7 +192,8 @@ watch( transition: background-color 0.15s ease; &:hover { - background-color: #f3f4f6; + background-color: transparent; + box-shadow: none; } } diff --git a/packages/web/projects/vgpu/components/config.js b/packages/web/projects/vgpu/components/config.js index 18ed5050..e3f257d5 100644 --- a/packages/web/projects/vgpu/components/config.js +++ b/packages/web/projects/vgpu/components/config.js @@ -219,28 +219,51 @@ export const getTopOptions = ({ core, memory }) => { }; }; -export const getLineOptions = ({ data = [], unit = '%' }) => { +export const getLineOptions = ({ data = [], unit = '%', seriesName, animation = true }) => { return { + animation, tooltip: { trigger: 'axis', axisPointer: { - type: 'cross', + type: 'line', + lineStyle: { + type: 'dashed', + color: '#8A8A8A', + }, }, formatter: function (params) { - var res = params[0].name + '
'; - for (var i = 0; i < params.length; i++) { - res += - params[i].marker + (+params[i].value).toFixed(0) + ` ${unit}
`; + if (!Array.isArray(params) || params.length === 0) return ''; + + let result = `
${params[0]?.name ?? ''}
`; + for (let i = 0; i < params.length; i++) { + const item = params[i]; + const raw = Array.isArray(item?.value) ? item.value[item.value.length - 1] : item?.value; + const num = Number(raw); + const value = Number.isFinite(num) ? `${num.toFixed(1)} ${unit}` : '-'; + result += ` +
+ + ${item?.seriesName || '-'}:  + ${value} +
+ `; } - return res; + return result; }, }, grid: { - top: 7, // 上边距 - bottom: 20, // 下边距 - left: '7%', // 左边距 - right: 10, // 右边距 + top: 20, // 上边距 + bottom: 30, // 下边距 + left: 30, // 左边距 + right: 30, // 右边距 }, + dataZoom: [ + { + type: 'inside', + xAxisIndex: 0, + filterMode: 'none', + }, + ], xAxis: { type: 'category', data: data.map((item) => timeParse(+item.timestamp)), @@ -255,10 +278,19 @@ export const getLineOptions = ({ data = [], unit = '%' }) => { }, series: [ { + name: seriesName || '', data: data.map((item) => { return item.value.toFixed(1); }), type: 'line', + lineStyle: { + width: 3, + color: '#5B8FF9', + }, + itemStyle: { + color: '#5B8FF9', + borderColor: '#5B8FF9', + }, }, ], }; diff --git a/packages/web/projects/vgpu/components/gauge.vue b/packages/web/projects/vgpu/components/gauge.vue index a0ebc818..984990b6 100644 --- a/packages/web/projects/vgpu/components/gauge.vue +++ b/packages/web/projects/vgpu/components/gauge.vue @@ -18,7 +18,7 @@ {{ title.includes('使用') || title.includes('Usage') ? $t('dashboard.usage') : $t('dashboard.allocation') }} ({{ unit }}) : - {{ used.toFixed(1) }} / {{ total.toFixed() }} + {{ displayUsed.toFixed(1) }} / {{ displayTotal.toFixed() }} @@ -38,6 +38,10 @@ const props = defineProps([ ]); const showProgress = computed(() => props.showProgress !== false); +const isComputeTitle = computed(() => props.title?.includes('算力') || props.title?.includes('Compute')); +const displayDivisor = computed(() => (isComputeTitle.value ? 100 : 1)); +const displayUsed = computed(() => Number(props.used || 0) / displayDivisor.value); +const displayTotal = computed(() => Number(props.total || 0) / displayDivisor.value); const progressColor = computed(() => { const value = Number(props.percent); diff --git a/packages/web/projects/vgpu/components/previewBar.vue b/packages/web/projects/vgpu/components/previewBar.vue index 2235ca17..2e56b9ac 100644 --- a/packages/web/projects/vgpu/components/previewBar.vue +++ b/packages/web/projects/vgpu/components/previewBar.vue @@ -156,10 +156,10 @@ const gpuMemoryTop5 = computed(() => ({ })); const pieConfig = { - deviceuuid: { + device_uuid: { query: - 'count by (devicetype) (sum by (deviceuuid, devicetype) (hami_vgpu_count))', - key: 'devicetype', + 'count by (device_type) (sum by (device_uuid, device_type) (hami_vgpu_count))', + key: 'device_type', }, node: { query: 'count by (provider) (sum by (node,provider) (hami_vgpu_count))', @@ -197,7 +197,7 @@ onMounted(async () => { query: thisPieConfig.query, }); - const colors = ['#5470c6', '#91cc75', '#2563EB', '#16A34A', '#7dd3fc', '#86efac']; + const colors = ['#76B900', '#9FCB98', '#F59E0B', '#4F8F87', '#14B8A6', '#6B7280']; pieData.value = data.map((item, index) => { return { name: item.metric[thisPieConfig.key], diff --git a/packages/web/projects/vgpu/views/card/admin/Detail.vue b/packages/web/projects/vgpu/views/card/admin/Detail.vue index 5c84640f..eab99860 100644 --- a/packages/web/projects/vgpu/views/card/admin/Detail.vue +++ b/packages/web/projects/vgpu/views/card/admin/Detail.vue @@ -1,25 +1,67 @@ diff --git a/packages/web/projects/vgpu/views/card/admin/components/WorkloadSemiProgress.vue b/packages/web/projects/vgpu/views/card/admin/components/WorkloadSemiProgress.vue new file mode 100644 index 00000000..cb1151af --- /dev/null +++ b/packages/web/projects/vgpu/views/card/admin/components/WorkloadSemiProgress.vue @@ -0,0 +1,43 @@ + + + + + diff --git a/packages/web/projects/vgpu/views/card/admin/index.vue b/packages/web/projects/vgpu/views/card/admin/index.vue index e3d6aecb..a8bc6a44 100644 --- a/packages/web/projects/vgpu/views/card/admin/index.vue +++ b/packages/web/projects/vgpu/views/card/admin/index.vue @@ -5,7 +5,7 @@
String(locale.value || '').startsWith('en')); +const { t } = useI18n(); const parseTypeFromQuery = (value) => { if (typeof value === 'string') return value || undefined; if (Array.isArray(value) && typeof value[0] === 'string') return value[0] || undefined; @@ -150,15 +149,16 @@ const getCardStatusDisplay = ({ health, isExternal }) => { return { icon: 'status-unschedulable', text: t('card.abnormal') }; }; -const getRemainingTotalText = ({ total, used, unit = '' }) => { +const getRemainingTotalText = ({ total, used, unit = '', divisor = 1 }) => { const totalNum = Number(total || 0); if (!totalNum) return null; const usedNum = Number(used || 0); + const normalizedDivisor = Number(divisor) > 0 ? Number(divisor) : 1; const remaining = Math.max(0, totalNum - usedNum); const unitText = unit ? ` ${unit}` : ''; return { - remaining: roundToDecimal(remaining, 1), - total: roundToDecimal(totalNum, 1), + remaining: roundToDecimal(remaining / normalizedDivisor, 1), + total: roundToDecimal(totalNum / normalizedDivisor, 1), unitText, }; }; @@ -190,7 +190,7 @@ const baseColumns = computed(() => [ { title: t('task.status'), dataIndex: 'health', - width: 100, + width: 150, render: ({ health, isExternal }) => { const { icon, text } = getCardStatusDisplay({ health, isExternal }); return ( @@ -201,44 +201,23 @@ const baseColumns = computed(() => [ ); }, }, - { - title: t('card.mode'), - dataIndex: 'mode', - width: 120, - render: ({ mode, type }) => ( - - {type?.split('-')[0] === "NVIDIA" ? mode : 'default'} - - ) - }, { title: t('card.node'), dataIndex: 'nodeName', - width: 170, + width: 200, hideTooltip: true, render: ({ nodeName }) => ( ), }, - { - title: t('card.vgpu'), - key: 'card-vgpu', - dataIndex: 'used', - width: 100, - render: ({ vgpuTotal, vgpuUsed, isExternal }) => ( - - {isExternal ? '--' : vgpuUsed}/{isExternal ? '--' : vgpuTotal} - - ), - }, { title: t('card.computeRemainingTotal'), key: 'card-compute-remaining-total', dataIndex: 'used', - width: isEnglish.value ? 220 : 180, + width: 220, render: ({ coreTotal, coreUsed, isExternal }) => { if (isExternal || !coreTotal) return --; - const stats = getRemainingTotalText({ total: coreTotal, used: coreUsed }); + const stats = getRemainingTotalText({ total: coreTotal, used: coreUsed, divisor: 100 }); if (!stats) return --; return (
@@ -254,7 +233,7 @@ const baseColumns = computed(() => [ title: t('card.computeAllocTotal'), key: 'card-compute-allocation', dataIndex: 'used', - width: isEnglish.value ? 170 : 140, + width: 180, render: ({ coreTotal, coreUsed, isExternal }) => { if (isExternal || !coreTotal) return --; const percent = Math.max( @@ -281,7 +260,7 @@ const baseColumns = computed(() => [ title: t('card.memoryRemainingTotal'), key: 'card-memory-remaining-total', dataIndex: 'used', - width: isEnglish.value ? 210 : 180, + width: 220, render: ({ memoryTotal, memoryUsed, isExternal }) => { if (isExternal || !memoryTotal) return --; const stats = getRemainingTotalText({ @@ -304,7 +283,7 @@ const baseColumns = computed(() => [ title: t('card.memoryAllocTotal'), key: 'card-memory-allocation', dataIndex: 'w', - width: isEnglish.value ? 160 : 140, + width: 180, render: ({ memoryTotal, memoryUsed, isExternal }) => { if (isExternal || !memoryTotal) return --; const percent = Math.max( diff --git a/packages/web/projects/vgpu/views/monitor/overview/Block.vue b/packages/web/projects/vgpu/views/monitor/overview/Block.vue index 8f237f54..6c2f88e2 100644 --- a/packages/web/projects/vgpu/views/monitor/overview/Block.vue +++ b/packages/web/projects/vgpu/views/monitor/overview/Block.vue @@ -16,9 +16,9 @@ defineProps(['title']); diff --git a/packages/web/projects/vgpu/views/node/admin/getOptions.js b/packages/web/projects/vgpu/views/node/admin/getOptions.js index 22b04c8b..b55587d9 100644 --- a/packages/web/projects/vgpu/views/node/admin/getOptions.js +++ b/packages/web/projects/vgpu/views/node/admin/getOptions.js @@ -1,8 +1,43 @@ import { timeParse } from '@/utils'; +const normalizePoints = (points = []) => { + return points + .map((point) => { + if (Array.isArray(point)) { + return { timestamp: Number(point[0]), value: Number(point[1]) }; + } + return { + timestamp: Number(point?.timestamp), + value: Number(point?.value), + }; + }) + .filter((item) => Number.isFinite(item.timestamp) && Number.isFinite(item.value)); +}; + +const buildPercentTooltipFormatter = () => { + return (params) => { + if (!Array.isArray(params) || params.length === 0) return ''; + let result = `
${params[0]?.axisValueLabel || params[0]?.name || ''}
`; + for (let i = 0; i < params.length; i++) { + const item = params[i]; + const num = Number(item?.value); + const value = Number.isFinite(num) ? `${num.toFixed(3)}%` : '-'; + result += `
+ + ${item?.seriesName || '-'}:  + ${value} +
`; + } + return result; + }; +}; + export const getRangeOptions = ({ allocation = [], usage = [] }, t = (v) => v) => { - const xDataSource = allocation?.length ? allocation : usage; + const normalizedAllocation = normalizePoints(allocation); + const normalizedUsage = normalizePoints(usage); + const xDataSource = normalizedAllocation.length ? normalizedAllocation : normalizedUsage; return { + animation: false, legend: { bottom: 10, left: 'center', @@ -12,20 +47,7 @@ export const getRangeOptions = ({ allocation = [], usage = [] }, t = (v) => v) = axisPointer: { type: 'cross', }, - formatter: function (params) { - if (!params || params.length === 0) return ''; - var res = params[0].name + '
'; - for (var i = 0; i < params.length; i++) { - res += - params[i].marker + - params[i].seriesName + - ' : ' + - (+params[i].value).toFixed(0) + - '
'; - } - - return res; - }, + formatter: buildPercentTooltipFormatter(), }, grid: { top: 20, // 上边距 @@ -33,9 +55,16 @@ export const getRangeOptions = ({ allocation = [], usage = [] }, t = (v) => v) = left: '7%', // 左边距 right: 10, // 右边距 }, + dataZoom: [ + { + type: 'inside', + xAxisIndex: 0, + filterMode: 'none', + }, + ], xAxis: { type: 'category', - data: xDataSource.map((item) => timeParse(+item.timestamp)), + data: xDataSource.map((item) => timeParse(item.timestamp)), axisLabel: { formatter: function (value) { return timeParse(value, 'HH:mm'); @@ -53,24 +82,26 @@ export const getRangeOptions = ({ allocation = [], usage = [] }, t = (v) => v) = series: [ { name: t('dashboard.allocRateLegend'), - data: allocation, + data: normalizedAllocation.map((item) => item.value), type: 'line', itemStyle: { - color: 'rgb(84, 112, 198)', + color: '#5B8FF9', }, lineStyle: { - color: 'rgb(84, 112, 198)', + width: 3, + color: '#5B8FF9', }, }, { name: t('dashboard.usageRateLegend'), - data: usage, + data: normalizedUsage.map((item) => item.value), type: 'line', itemStyle: { - color: 'rgb(145, 204, 117)', + color: '#42C090', }, lineStyle: { - color: 'rgb(145, 204, 117)', + width: 3, + color: '#42C090', }, }, ], diff --git a/packages/web/projects/vgpu/views/node/admin/index.vue b/packages/web/projects/vgpu/views/node/admin/index.vue index 48691a57..ed5615af 100644 --- a/packages/web/projects/vgpu/views/node/admin/index.vue +++ b/packages/web/projects/vgpu/views/node/admin/index.vue @@ -140,6 +140,7 @@ const getNodeStatusDisplay = ({ isSchedulable, isExternal }) => { const baseColumns = computed(() => [ { title: t('node.name'), + minWidth: 200, dataIndex: 'name', render: ({ uid, name }) => { const to = `/admin/vgpu/node/admin/${uid}?nodeName=${name}`; @@ -157,6 +158,7 @@ const baseColumns = computed(() => [ }, { title: t('task.status'), + minWidth: 150, dataIndex: 'isSchedulable', render: ({ isSchedulable, isExternal }) => { const { icon, text } = getNodeStatusDisplay({ isSchedulable, isExternal }); @@ -170,29 +172,13 @@ const baseColumns = computed(() => [ }, { title: t('node.ip'), + minWidth: 100, dataIndex: 'ip', }, - { - title: t('node.cardModel'), - dataIndex: 'type', - }, - { - title: t('node.cardCount'), - dataIndex: 'cardCnt', - }, - { - title: t('node.vgpu'), - key: 'node-vgpu', - dataIndex: 'used', - render: ({ vgpuTotal, vgpuUsed, isExternal }) => ( - - {isExternal ? '--' : vgpuUsed}/{isExternal ? '--' : vgpuTotal} - - ), - }, { title: t('node.computeAllocTotal'), key: 'node-compute-allocation', + minWidth: 280, dataIndex: 'used', render: ({ coreTotal, coreUsed, isExternal }) => { if (isExternal || !coreTotal) return --; @@ -218,6 +204,7 @@ const baseColumns = computed(() => [ { title: t('node.memoryAllocTotal'), key: 'node-memory-allocation', + minWidth: 280, dataIndex: 'used', render: ({ memoryTotal, memoryUsed, isExternal }) => { if (isExternal || !memoryTotal) return --; diff --git a/packages/web/projects/vgpu/views/task/admin/Detail.vue b/packages/web/projects/vgpu/views/task/admin/Detail.vue index f062fadb..4d43f8a9 100644 --- a/packages/web/projects/vgpu/views/task/admin/Detail.vue +++ b/packages/web/projects/vgpu/views/task/admin/Detail.vue @@ -1,41 +1,153 @@
@@ -57,21 +173,21 @@ diff --git a/packages/web/src/components/BackHeader.vue b/packages/web/src/components/BackHeader.vue deleted file mode 100644 index 212113a5..00000000 --- a/packages/web/src/components/BackHeader.vue +++ /dev/null @@ -1,40 +0,0 @@ - - - - - diff --git a/packages/web/src/components/BlockBox.vue b/packages/web/src/components/BlockBox.vue index 1195b0ad..9ce7f755 100644 --- a/packages/web/src/components/BlockBox.vue +++ b/packages/web/src/components/BlockBox.vue @@ -16,7 +16,7 @@ defineProps(['title']); diff --git a/packages/web/src/components/index.js b/packages/web/src/components/index.js index 496ff74b..8535cafc 100644 --- a/packages/web/src/components/index.js +++ b/packages/web/src/components/index.js @@ -10,7 +10,6 @@ import FormItem from './FormPlus/FormItem.vue'; import FormRender from './FormPlus/FormRender.vue'; import FormGroup from './FormPlus/FormGroup.vue'; import ItemGroup from './ItemGroup.vue'; -import BackHeader from './BackHeader.vue'; import InfoPreview from './InfoPreview.vue'; import ButtonGroup from './ButtonGroup.vue'; import DetailDrawer from './DetailDrawer.vue'; @@ -37,7 +36,6 @@ export default (app) => { app.component('FormRender', FormRender); app.component('FormGroup', FormGroup); app.component('ItemGroup', ItemGroup); - app.component('BackHeader', BackHeader); app.component('InfoPreview', InfoPreview); app.component('ButtonGroup', ButtonGroup); app.component('DetailDrawer', DetailDrawer); diff --git a/packages/web/src/icons/svg/cpu-limit.svg b/packages/web/src/icons/svg/cpu-limit.svg new file mode 100644 index 00000000..19a51eb8 --- /dev/null +++ b/packages/web/src/icons/svg/cpu-limit.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/packages/web/src/icons/svg/gpu-ascend.svg b/packages/web/src/icons/svg/gpu-ascend.svg new file mode 100644 index 00000000..ed594ed8 --- /dev/null +++ b/packages/web/src/icons/svg/gpu-ascend.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/packages/web/src/icons/svg/gpu-aws.svg b/packages/web/src/icons/svg/gpu-aws.svg new file mode 100644 index 00000000..7b2b590b --- /dev/null +++ b/packages/web/src/icons/svg/gpu-aws.svg @@ -0,0 +1,7 @@ + + + + + + + diff --git a/packages/web/src/icons/svg/gpu-metax.svg b/packages/web/src/icons/svg/gpu-metax.svg new file mode 100644 index 00000000..dd21e707 --- /dev/null +++ b/packages/web/src/icons/svg/gpu-metax.svg @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/packages/web/src/icons/svg/gpu-nvidia.svg b/packages/web/src/icons/svg/gpu-nvidia.svg new file mode 100644 index 00000000..5689ee9a --- /dev/null +++ b/packages/web/src/icons/svg/gpu-nvidia.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/packages/web/src/icons/svg/help-circle.svg b/packages/web/src/icons/svg/help-circle.svg new file mode 100644 index 00000000..8161988f --- /dev/null +++ b/packages/web/src/icons/svg/help-circle.svg @@ -0,0 +1,6 @@ + + + diff --git a/packages/web/src/icons/svg/node-cpu-total.svg b/packages/web/src/icons/svg/node-cpu-total.svg new file mode 100644 index 00000000..a7a4470d --- /dev/null +++ b/packages/web/src/icons/svg/node-cpu-total.svg @@ -0,0 +1,4 @@ + + + + diff --git a/packages/web/src/icons/svg/node-memory-total.svg b/packages/web/src/icons/svg/node-memory-total.svg new file mode 100644 index 00000000..cb41241f --- /dev/null +++ b/packages/web/src/icons/svg/node-memory-total.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/packages/web/src/icons/svg/related-gpu-eye.svg b/packages/web/src/icons/svg/related-gpu-eye.svg new file mode 100644 index 00000000..53cd1b28 --- /dev/null +++ b/packages/web/src/icons/svg/related-gpu-eye.svg @@ -0,0 +1,3 @@ + + + diff --git a/packages/web/src/icons/svg/vgpu-workload.svg b/packages/web/src/icons/svg/vgpu-workload.svg new file mode 100644 index 00000000..925b3809 --- /dev/null +++ b/packages/web/src/icons/svg/vgpu-workload.svg @@ -0,0 +1,10 @@ + + + + diff --git a/packages/web/src/locales/en.js b/packages/web/src/locales/en.js index 1f893124..bf579fd2 100644 --- a/packages/web/src/locales/en.js +++ b/packages/web/src/locales/en.js @@ -74,10 +74,14 @@ export default { nodeTotal: 'Total Nodes', card: 'GPU', gpuCardCount: 'GPU Cards', + workloadCount: 'Workloads', vgpu: 'vGPU', compute: 'Compute', + cpuTotalCores: 'Total CPU Cores', + cpuCoreUnit: 'Core', + systemMemoryTotal: 'Total Memory', memory: 'Memory', - memoryTotal: 'Total Memory', + memoryTotal: 'Total GPU Memory', schedulable: 'Schedulable', unschedulable: 'Unschedulable', allocated: 'Allocated', @@ -87,10 +91,24 @@ export default { allocRateLegend: 'Alloc Rate', gpuComputeAllocUsageTrend: 'GPU Compute Alloc/Usage Trend (%)', gpuMemAllocUsageTrend: 'GPU Memory Alloc/Usage Trend (%)', + cpuAllocUsageTrend: 'CPU Alloc/Usage Trend (%)', + memAllocUsageTrend: 'Memory Alloc/Usage Trend (%)', + cpuAllocRate: 'CPU Allocation Rate', + cpuUsageRate: 'CPU Usage Rate', + systemMemAllocRate: 'Memory Allocation Rate', + systemMemUsageRate: 'Memory Usage Rate', nodeResourceUsageTop5: 'Node Usage Top5', nodeResourceAllocTop5: 'Node Alloc Top5', nodeComputeTop5: 'Node Compute Top5', nodeMemoryTop5: 'Node Memory Top5', + nodeWorkloadTop5: 'Node Workload Count Top5', + nodeWorkloadDistribution: 'Node Workload Distribution', + nodeWorkloadDistributionDesc1: + 'Show the distribution of all cluster nodes by workload-count ranges.', + nodeWorkloadDistributionDesc2: + 'X-axis is workload-count range per node, and Y-axis is number of nodes in that range.', + workload24hTrendTop5: '24h Workload Usage Trend Top5 (%)', + workloadRange: 'Workload Range', gpuComputeTop5: 'GPU Compute Top5', gpuMemoryTop5: 'GPU Memory Top5', resourceAllocTrendTitle: 'Alloc Trend', @@ -139,6 +157,8 @@ export default { resourceOverview: 'Resource Overview', title: 'Node Detail', detailInfo: 'Details', + nodeOverview: 'Node Overview', + workloadDistribution: 'Workload Distribution', nodeSchedule: 'Node Schedule', resourceAllocTrend: 'Alloc Trend (%)', resourceUsageTrend: 'Usage Trend (%)', @@ -150,7 +170,7 @@ export default { vgpuOvercommit: 'vGPU Overcommit', computeOvercommit: 'Compute Overcommit', memoryOvercommit: 'Memory Overcommit', - nodeIpAddress: 'Node IP Address', + nodeIpAddress: 'Node IP', nodeUuid: 'Node UUID', osType: 'Operating System Type', architecture: 'Architecture', @@ -191,14 +211,14 @@ export default { unmanaged: 'Unmanaged', detail: { title: 'GPU', - detailInfo: 'Details', + detailInfo: 'Basic Info', resourceOverview: 'Resource Overview', + workloadCount: 'Workloads', + workloadCountTip: + 'Each accelerator card supports sharing by multiple workloads at the same time.\nThe chart shows [Allocated Count / Maximum Supported Count].', resourceAllocTrend: 'Alloc Trend (%)', resourceUsageTrend: 'Usage Trend (%)', - taskList: 'Workloads', noAllocData: 'No allocation data', - unmanagedNoTask: 'GPU unmanaged; task data unavailable', - noTaskData: 'No task data', uuid: 'GPU UUID', temperature: 'Temperature', powerUsage: 'Power Usage', @@ -227,6 +247,11 @@ export default { allocatedVgpu: 'vGPU', allocatedCompute: 'Compute', allocatedMemory: 'Memory', + gpuCardCount: 'GPU Cards', + computePowerLimit: 'Compute Power Limit', + singleCardMemory: 'Single-card Memory', + cpuLimit: 'CPU Limit', + memoryLimit: 'Memory Limit', startTime: 'Start Time', duration: 'Duration', namespace: 'Namespace', @@ -240,17 +265,26 @@ export default { card: 'GPU', allCards: 'All GPUs', cardType: 'GPU Type', + gpuModel: 'GPU Model', + relatedGpu: 'Related GPU', + relatedGpuCards: '{count} GPUs', + image: 'Image', + cpu: 'CPU', + memory: 'Memory', + priority: 'Priority', appName: 'App Name', createTime: 'Creation Time', times: 'x', - computeUsageTrend: 'GPU Compute Usage Trend (%)', - memUsageTrend: 'GPU Memory Usage Trend (%)', + computeUsageTrend: 'GPU Compute Utilization (%)', + memUsageTrend: 'GPU Memory Utilization (%)', + cpuUsageTrend: 'CPU Usage (%)', + memoryUsageTrend: 'Memory Usage (%)', noMonitorSupport: 'Vendor does not support task-level monitoring', topCount: 'Workload Count Top5', topApply: 'Workload Requests Top5', detail: { title: 'Workload', - detailInfo: 'Details', + detailInfo: 'Basic Info', resourceOverview: 'Resource Overview', containerInfo: 'Container', podName: 'Pod Name', diff --git a/packages/web/src/locales/zh.js b/packages/web/src/locales/zh.js index 35d5452c..83225670 100644 --- a/packages/web/src/locales/zh.js +++ b/packages/web/src/locales/zh.js @@ -74,10 +74,14 @@ export default { nodeTotal: '节点总数', card: 'GPU', gpuCardCount: 'GPU 卡数', + workloadCount: '工作负载数量', vgpu: 'vGPU', compute: '算力', + cpuTotalCores: 'CPU 总核', + cpuCoreUnit: 'Core', + systemMemoryTotal: '内存总量', memory: '显存', - memoryTotal: '显存总量', + memoryTotal: 'GPU 显存总量', schedulable: '可调度', unschedulable: '禁止调度', allocated: '已分配', @@ -87,10 +91,22 @@ export default { allocRateLegend: '分配率', gpuComputeAllocUsageTrend: 'GPU 算力分配/使用趋势(%)', gpuMemAllocUsageTrend: 'GPU 显存分配/使用趋势(%)', + cpuAllocUsageTrend: 'CPU 分配/使用趋势(%)', + memAllocUsageTrend: '内存分配/使用趋势(%)', + cpuAllocRate: 'CPU 分配率', + cpuUsageRate: 'CPU 使用率', + systemMemAllocRate: '内存分配率', + systemMemUsageRate: '内存使用率', nodeResourceUsageTop5: '节点使用率 Top5', nodeResourceAllocTop5: '节点分配率 Top5', nodeComputeTop5: '节点算力 Top5', nodeMemoryTop5: '节点显存 Top5', + nodeWorkloadTop5: '节点工作负载数量 Top5', + nodeWorkloadDistribution: '节点工作负载分布(台)', + nodeWorkloadDistributionDesc1: '展示集群中所有节点的工作负载在不同数量区间的分布情况。', + nodeWorkloadDistributionDesc2: '横轴是节点的工作负载数量范围,纵轴表示落入该范围的节点数量。', + workload24hTrendTop5: '24 小时工作负载使用趋势 Top5 (%)', + workloadRange: '工作负载区间', gpuComputeTop5: 'GPU 算力 Top5', gpuMemoryTop5: 'GPU 显存 Top5', resourceAllocTrendTitle: '分配趋势', @@ -139,6 +155,8 @@ export default { resourceOverview: '资源概览', title: '节点详情', detailInfo: '详细信息', + nodeOverview: '节点概览', + workloadDistribution: '工作负载分布', nodeSchedule: '节点调度', resourceAllocTrend: '分配趋势(%)', resourceUsageTrend: '使用趋势(%)', @@ -150,13 +168,13 @@ export default { vgpuOvercommit: 'vGPU 超配', computeOvercommit: '算力超配', memoryOvercommit: '显存超配', - nodeIpAddress: '节点 IP 地址', + nodeIpAddress: '节点 IP', nodeUuid: '节点 UUID', osType: '操作系统类型', - architecture: '系统架构', + architecture: '架构', kubeletVersion: 'kubelet 版本', - osVersion: '操作系统版本', - kernelVersion: '内核版本', + osVersion: '操作系统', + kernelVersion: 'Kernel 版本', kubeProxyVersion: 'kube-proxy 版本', containerRuntime: '容器运行时', creationTime: '创建时间', @@ -191,14 +209,13 @@ export default { unmanaged: '未纳管', detail: { title: 'GPU', - detailInfo: '详细信息', + detailInfo: '基本信息', resourceOverview: '资源概览', + workloadCount: '工作负载数', + workloadCountTip: '每张算力卡支持被多个工作负载同时共享。\n图中展示的是【已分配数量 / 最大支持数量】。', resourceAllocTrend: '资源分配趋势(%)', resourceUsageTrend: '资源使用趋势(%)', - taskList: '工作负载', noAllocData: '暂无资源分配数据', - unmanagedNoTask: '由于 GPU 未纳管,无法获取到任务数据', - noTaskData: '暂无任务数据', uuid: 'GPU UUID', temperature: '温度', powerUsage: '功耗', @@ -227,6 +244,11 @@ export default { allocatedVgpu: 'vGPU', allocatedCompute: '算力', allocatedMemory: '显存', + gpuCardCount: 'GPU 卡数', + computePowerLimit: '算力限制', + singleCardMemory: '单卡显存', + cpuLimit: 'CPU 限制', + memoryLimit: '内存限制', startTime: '启动时间', duration: '运行时长', namespace: '命名空间', @@ -240,17 +262,26 @@ export default { card: '所属显卡', allCards: '全部显卡', cardType: '显卡类型', + gpuModel: 'GPU 型号', + relatedGpu: '关联 GPU', + relatedGpuCards: '共 {count} 张', + image: '镜像', + cpu: 'CPU', + memory: '内存', + priority: '优先级', appName: '应用名称', createTime: '创建时间', times: '倍', - computeUsageTrend: '算力使用趋势(%)', - memUsageTrend: '显存使用趋势(%)', + computeUsageTrend: 'GPU 算力使用率(%)', + memUsageTrend: 'GPU 显存使用率(%)', + cpuUsageTrend: 'CPU 使用率(%)', + memoryUsageTrend: '内存使用率(%)', noMonitorSupport: '该设备厂商暂不支持任务维度监控', topCount: '任务数量分布 Top5', topApply: '任务资源申请 Top5', detail: { title: '任务管理', - detailInfo: '详细信息', + detailInfo: '基础信息', resourceOverview: '资源概览', containerInfo: '容器信息', podName: 'Pod 名称', diff --git a/packages/web/src/main.js b/packages/web/src/main.js index c7b72834..e51dfb11 100644 --- a/packages/web/src/main.js +++ b/packages/web/src/main.js @@ -4,6 +4,7 @@ import router from './router'; import store from './store'; import installElementPlus from './plugins/element'; import installTDesign from './plugins/tdesign'; +import installEcharts from './plugins/echarts'; import 'normalize.css/normalize.css'; // a modern alternative to CSS resets import '@/styles/index.scss'; // global css @@ -20,6 +21,7 @@ const app = createApp(App); app.use(i18n); installElementPlus(app); installTDesign(app); +installEcharts(); installIcons(app); app.use(components); diff --git a/packages/web/src/plugins/echarts.js b/packages/web/src/plugins/echarts.js new file mode 100644 index 00000000..d74296a8 --- /dev/null +++ b/packages/web/src/plugins/echarts.js @@ -0,0 +1,8 @@ +import { LineChart, BarChart } from 'echarts/charts'; +import { GridComponent, TooltipComponent, LegendComponent, DataZoomComponent } from 'echarts/components'; +import { use } from 'echarts/core'; +import { CanvasRenderer } from 'echarts/renderers'; + +export default function installEcharts() { + use([CanvasRenderer, LineChart, BarChart, GridComponent, TooltipComponent, LegendComponent, DataZoomComponent]); +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f300656e..c8998f70 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -213,6 +213,9 @@ importers: vue-count-to: specifier: 1.0.13 version: 1.0.13 + vue-echarts: + specifier: ^6.7.3 + version: 6.7.3(@vue/runtime-core@3.5.25)(echarts@5.4.3)(vue@3.5.25(typescript@5.9.3)) vue-i18n: specifier: '9' version: 9.14.5(vue@3.5.25(typescript@5.9.3)) @@ -5929,6 +5932,9 @@ packages: requires-port@1.0.0: resolution: {integrity: sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==} + resize-detector@0.3.0: + resolution: {integrity: sha512-R/tCuvuOHQ8o2boRP6vgx8hXCCy87H1eY9V5imBYeVNyNVpuL9ciReSccLj2gDcax9+2weXy3bc8Vv+NRXeEvQ==} + resolve-cwd@3.0.0: resolution: {integrity: sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==} engines: {node: '>=8'} @@ -6930,6 +6936,17 @@ packages: vue-count-to@1.0.13: resolution: {integrity: sha512-6R4OVBVNtQTlcbXu6SJ8ENR35M2/CdWt3Jmv57jOUM+1ojiFmjVGvZPH8DfHpMDSA+ITs+EW5V6qthADxeyYOQ==} + vue-demi@0.13.11: + resolution: {integrity: sha512-IR8HoEEGM65YY3ZJYAjMlKygDQn25D5ajNFNoKh9RSDMQtlzCxtfQjdQgv9jjK+m3377SsJXY8ysq8kLCZL25A==} + engines: {node: '>=12'} + hasBin: true + peerDependencies: + '@vue/composition-api': ^1.0.0-rc.1 + vue: ^3.0.0-0 || ^2.6.0 + peerDependenciesMeta: + '@vue/composition-api': + optional: true + vue-demi@0.14.10: resolution: {integrity: sha512-nMZBOwuzabUO0nLgIcc6rycZEebF6eeUfaiQx9+WSk8e29IbLvPU9feI6tqW4kTo3hvoYAJkMh8n8D0fuISphg==} engines: {node: '>=12'} @@ -6941,6 +6958,19 @@ packages: '@vue/composition-api': optional: true + vue-echarts@6.7.3: + resolution: {integrity: sha512-vXLKpALFjbPphW9IfQPOVfb1KjGZ/f8qa/FZHi9lZIWzAnQC1DgnmEK3pJgEkyo6EP7UnX6Bv/V3Ke7p+qCNXA==} + peerDependencies: + '@vue/composition-api': ^1.0.5 + '@vue/runtime-core': ^3.0.0 + echarts: ^5.4.1 + vue: ^2.6.12 || ^3.1.1 + peerDependenciesMeta: + '@vue/composition-api': + optional: true + '@vue/runtime-core': + optional: true + vue-eslint-parser@8.3.0: resolution: {integrity: sha512-dzHGG3+sYwSf6zFBa0Gi9ZDshD7+ad14DGOdTLjruRVgZXe2J+DcZ9iUhyR48z5g1PqRa20yt3Njna/veLJL/g==} engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} @@ -13942,6 +13972,8 @@ snapshots: requires-port@1.0.0: {} + resize-detector@0.3.0: {} + resolve-cwd@3.0.0: dependencies: resolve-from: 5.0.0 @@ -15091,10 +15123,23 @@ snapshots: vue-count-to@1.0.13: {} + vue-demi@0.13.11(vue@3.5.25(typescript@5.9.3)): + dependencies: + vue: 3.5.25(typescript@5.9.3) + vue-demi@0.14.10(vue@3.5.25(typescript@5.9.3)): dependencies: vue: 3.5.25(typescript@5.9.3) + vue-echarts@6.7.3(@vue/runtime-core@3.5.25)(echarts@5.4.3)(vue@3.5.25(typescript@5.9.3)): + dependencies: + echarts: 5.4.3 + resize-detector: 0.3.0 + vue: 3.5.25(typescript@5.9.3) + vue-demi: 0.13.11(vue@3.5.25(typescript@5.9.3)) + optionalDependencies: + '@vue/runtime-core': 3.5.25 + vue-eslint-parser@8.3.0(eslint@8.56.0): dependencies: debug: 4.4.3 diff --git a/server/api/v1/container.proto b/server/api/v1/container.proto index 8445f188..04d97d73 100644 --- a/server/api/v1/container.proto +++ b/server/api/v1/container.proto @@ -66,6 +66,7 @@ message ContainerReply { string priority = 16; string namespace = 17; repeated string device_ids = 18; + repeated string images = 19; } message ContainersReply { diff --git a/server/internal/biz/pod.go b/server/internal/biz/pod.go index 15aa6c52..4162db75 100644 --- a/server/internal/biz/pod.go +++ b/server/internal/biz/pod.go @@ -15,6 +15,7 @@ type Container struct { NodeName string PodUID string PodName string + Image string ContainerDevices ContainerDevices Status string CreateTime time.Time diff --git a/server/internal/data/pod.go b/server/internal/data/pod.go index bfc71e8c..1cb2db78 100644 --- a/server/internal/data/pod.go +++ b/server/internal/data/pod.go @@ -140,6 +140,7 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container { NodeName: pod.Spec.NodeName, PodName: pod.Name, PodUID: string(pod.UID), + Image: ctr.Image, Status: containerStat[ctr.Name], NodeUID: r.GetNodeUUID(pod), Namespace: pod.Namespace, diff --git a/server/internal/exporter/exporter.go b/server/internal/exporter/exporter.go index 87583965..51361457 100644 --- a/server/internal/exporter/exporter.go +++ b/server/internal/exporter/exporter.go @@ -382,12 +382,12 @@ func (s *MetricsGenerator) taskCoreUsed(ctx context.Context, provider, namespace query := "" switch provider { case biz.NvidiaGPUDevice: - //query = fmt.Sprintf("avg(Device_utilization_desc_of_container{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"})", deviceUUID, namespace, pod, container) - // queryTemplate := `last_over_time((Device_utilization_desc_of_container{deviceuuid="%s", podnamespace="%s", podname="%s", ctrname="%s"} != 0)[1m:]) + //query = fmt.Sprintf("avg(hami_container_device_utilization_ratio{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"})", deviceUUID, namespace, pod, container) + // queryTemplate := `last_over_time((hami_container_device_utilization_ratio{device_uuid="%s", namespace="%s", pod="%s", container="%s"} != 0)[1m:]) //or - //last_over_time(Device_utilization_desc_of_container{deviceuuid="%s", podnamespace="%s", podname="%s", ctrname="%s"}[1m:])` + //last_over_time(hami_container_device_utilization_ratio{device_uuid="%s", namespace="%s", pod="%s", container="%s"}[1m:])` // query = fmt.Sprintf(queryTemplate, deviceUUID, namespace, pod, container, deviceUUID, namespace, pod, container) - queryTemplate := fmt.Sprintf("Device_utilization_desc_of_container{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"}", deviceUUID, namespace, pod, container) + queryTemplate := fmt.Sprintf("hami_container_device_utilization_ratio{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"}", deviceUUID, namespace, pod, container) query = fmt.Sprintf("sum_over_time(%s[1m]) == 0 or (sum_over_time(%s[10m:]) / count_over_time(( %s !=0)[10m:])) ", queryTemplate, queryTemplate, queryTemplate) //query = queryTemplate case biz.CambriconGPUDevice: @@ -412,7 +412,7 @@ func (s *MetricsGenerator) taskMemoryUsed(ctx context.Context, provider, namespa query := "" switch provider { case biz.NvidiaGPUDevice: - query = fmt.Sprintf("avg(vGPU_device_memory_usage_in_bytes{deviceuuid=\"%s\", podnamespace=\"%s\", podname=\"%s\", ctrname=\"%s\"})", deviceUUID, namespace, pod, container) + query = fmt.Sprintf("avg(hami_vgpu_memory_used_bytes{device_uuid=\"%s\", namespace=\"%s\", pod=\"%s\", container=\"%s\"})", deviceUUID, namespace, pod, container) case biz.CambriconGPUDevice: query = fmt.Sprintf("avg(mlu_memory_utilization * on(uuid) group_right mlu_container{namespace=\"%s\",pod=\"%s\",container=\"%s\",type=\"mlu370.smlu.vmemory\"})", namespace, pod, container) case biz.AscendGPUDevice: diff --git a/server/internal/exporter/metrics.go b/server/internal/exporter/metrics.go index 50c33bdb..c00e1fda 100644 --- a/server/internal/exporter/metrics.go +++ b/server/internal/exporter/metrics.go @@ -80,132 +80,132 @@ var ( HamiVCoreScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vcore_scaling", Help: "GPU virtual core Scaling", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiVMemoryScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vmemory_scaling", Help: "GPU virtual memory Scaling", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vgpu_count", Help: "Total vGPU count", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiVmemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vmemory_size", Help: "Total vMemory size", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiVcoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vcore_size", Help: "Total vCore size", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_used", Help: "Actual memory usage, unit is 'MB' ", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiMemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_size", Help: "Actual memory size, unit is 'MB' ", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_util", Help: "Actual Memory Util percent 0-100", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiCoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_size", Help: "Actual core size", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_used", Help: "Actual Core Used", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_util", Help: "Actual Core Util percent 0-100", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiCoreUsedAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_used_avg", Help: "Actual Core Used period avg", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiCoreUtilAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_util_avg", Help: "Actual Core Util percent 0-100 period avg", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDeviceTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_temperature", Help: "gpu temperature", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDeviceMemoryTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_memory_temperature", Help: "gpu memory temperature", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDevicePower = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_power", Help: "gpu power", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDeviceHardwareHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_hardware_health", Help: "gpu hardware health", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDeviceFanSpeedP = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_fan_speed_p", Help: "gpu fan speed percent 0-100", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiDeviceFanSpeedR = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_fan_speed_r", Help: "gpu fan speed rpm", - }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) + }, []string{"node", "provider", "device_type", "device_uuid", "driver_version", "device_no"}) HamiContainerVgpuAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vgpu_allocated", Help: "task allocated vGPU count", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerVmemoryAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vmemory_allocated", Help: "task allocated vMemory size", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerVcoreAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vcore_allocated", Help: "task allocated vCore size", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_memory_used", Help: "task used memory unit MB", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"}) HamiContainerMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_memory_util", Help: "task memory util percent 0-100", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"}) HamiContainerCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_core_used", Help: "task used core ", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"}) HamiContainerCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_core_util", Help: "task core util percent 0-100", - }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) + }, []string{"node", "provider", "device_type", "device_uuid", "pod_name", "container_name", "namespace_name"}) HamiPoolVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_pool_vgpu_count", diff --git a/server/internal/service/card.go b/server/internal/service/card.go index 6569c1fb..155ba756 100644 --- a/server/internal/service/card.go +++ b/server/internal/service/card.go @@ -58,11 +58,11 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p gpu.CoreUsed = core gpu.MemoryUsed = memory } - resp, err := s.ms.QueryInstant(ctx, &pb.QueryInstantRequest{Query: fmt.Sprintf("avg(hami_core_size{deviceuuid=~\"%s\"})", device.Id)}) + resp, err := s.ms.QueryInstant(ctx, &pb.QueryInstantRequest{Query: fmt.Sprintf("avg(hami_core_size{device_uuid=~\"%s\"})", device.Id)}) if err == nil && len(resp.Data) > 0 { gpu.CoreTotal = int32(resp.Data[0].Value) } - resp, err = s.ms.QueryInstant(ctx, &pb.QueryInstantRequest{Query: fmt.Sprintf("avg(hami_memory_size{deviceuuid=~\"%s\"})", device.Id)}) + resp, err = s.ms.QueryInstant(ctx, &pb.QueryInstantRequest{Query: fmt.Sprintf("avg(hami_memory_size{device_uuid=~\"%s\"})", device.Id)}) if err == nil && len(resp.Data) > 0 { gpu.MemoryTotal = int32(resp.Data[0].Value) } diff --git a/server/internal/service/container.go b/server/internal/service/container.go index 4bde7f2a..c108586d 100644 --- a/server/internal/service/container.go +++ b/server/internal/service/container.go @@ -27,6 +27,23 @@ func NewContainerService(node *biz.NodeUsecase, pod *biz.PodUseCase) *ContainerS return &ContainerService{node: node, pod: pod} } +func uniqueNonEmpty(values []string) []string { + seen := make(map[string]struct{}, len(values)) + result := make([]string, 0, len(values)) + for _, value := range values { + v := strings.TrimSpace(value) + if v == "" { + continue + } + if _, ok := seen[v]; ok { + continue + } + seen[v] = struct{}{} + result = append(result, v) + } + return result +} + func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllContainersReq) (*pb.ContainersReply, error) { filters := req.Filters containers, err := s.pod.ListAllContainers(ctx) @@ -58,6 +75,7 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC containerReply.Name = container.Name containerReply.Status = container.Status containerReply.AppName = container.PodName + containerReply.Images = uniqueNonEmpty([]string{container.Image}) containerReply.NodeName = container.NodeName containerReply.PodUid = container.PodUID containerReply.NodeUid = container.NodeUID @@ -109,6 +127,18 @@ func (s *ContainerService) GetContainer(ctx context.Context, req *pb.GetContaine ctrReply.NodeUid = container.NodeUID ctrReply.Namespace = container.Namespace ctrReply.Priority = container.Priority + allContainers, err := s.pod.ListAllContainers(ctx) + if err == nil { + images := make([]string, 0) + for _, item := range allContainers { + if item.PodUID == container.PodUID { + images = append(images, item.Image) + } + } + ctrReply.Images = uniqueNonEmpty(images) + } else { + ctrReply.Images = uniqueNonEmpty([]string{container.Image}) + } for _, containerDevice := range container.ContainerDevices { if req.DeviceId != "" && req.DeviceId != containerDevice.UUID { continue