From 83b827df6b7fc6c2548a3bd9474af315dd43123b Mon Sep 17 00:00:00 2001
From: Arindam Samanta
Date: Wed, 3 Jun 2026 12:08:03 +0530
Subject: [PATCH 1/2] Add interactive GPU benchmark data dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Complete dataset from all 54 benchmark tables
- Dark mode support with localStorage persistence
- CSV download functionality for all data
- Metric selection dropdown (Vector Add, BMM, Softmax Fwd/Bwd)
- Performance heatmap visualization
- Speedup calculations for duration metrics
- Static HTML/JS/CSS for GitHub Pages deployment

Made with ❤️ by Red Hat PyTorch Engineering Interns
---
 data_dashboard/app.js     |  296 ++++
 data_dashboard/data.js    | 2123 +++++++++++++++++++++++++++++++++++++
 data_dashboard/index.html |  463 ++++++++
 3 files changed, 2882 insertions(+)
 create mode 100644 data_dashboard/app.js
 create mode 100644 data_dashboard/data.js
 create mode 100644 data_dashboard/index.html

diff --git a/data_dashboard/app.js b/data_dashboard/app.js
new file mode 100644
index 0000000..9fb46fb
--- /dev/null
+++ b/data_dashboard/app.js
@@ -0,0 +1,296 @@
+const kernelSelect = document.getElementById('kernelSelect');
+const metricSelect = document.getElementById('metricSelect');
+const tableHead = document.getElementById('tableHead');
+const tableBody = document.getElementById('tableBody');
+const avgSpeedup = document.getElementById('avgSpeedup');
+const maxSpeedup = document.getElementById('maxSpeedup');
+const darkModeToggle = document.getElementById('darkModeToggle');
+const modeIcon = document.getElementById('modeIcon');
+const modeText = document.getElementById('modeText');
+const downloadBtn = document.getElementById('downloadBtn');
+
+// Dark mode functionality
+
+/**
+ * Restore the dark-mode preference saved by the toggle handler.
+ * Only the stored string 'true' switches the page to dark mode.
+ */
+function initDarkMode() {
+    let savedMode = null;
+    try {
+        savedMode = localStorage.getItem('darkMode');
+    } catch (err) {
+        // Storage access can throw when the browser blocks it; fall back to
+        // the default theme instead of aborting the rest of this script.
+        console.warn('Could not read dark mode preference:', err);
+    }
+    if (savedMode === 'true') {
+        document.body.classList.add('dark-mode');
+        modeIcon.textContent = '☀️';
+        modeText.textContent = 'Light';
+    }
+}
+
+darkModeToggle.addEventListener('click', () => {
+    document.body.classList.toggle('dark-mode');
+    const isDark = document.body.classList.contains('dark-mode');
+    try {
+        // setItem stores the boolean as the string 'true' / 'false'.
+        localStorage.setItem('darkMode', isDark);
+    } catch (err) {
+        // Persisting is best-effort; the toggle itself must keep working.
+        console.warn('Could not save dark mode preference:', err);
+    }
+    modeIcon.textContent = isDark ? '☀️' : '🌙';
+    modeText.textContent = isDark ? 'Light' : 'Dark';
+});
+
+// CSV Download functionality
+
+/**
+ * Quote one text field for CSV output, doubling any embedded double quotes.
+ */
+function csvField(text) {
+    return `"${String(text).replace(/"/g, '""')}"`;
+}
+
+/**
+ * Serialize one measurement for CSV. A missing value becomes an empty cell
+ * rather than the literal text "undefined".
+ */
+function csvNumber(value) {
+    return value === undefined || value === null ? '' : String(value);
+}
+
+downloadBtn.addEventListener('click', () => {
+    const rows = ['Kernel,Metric,Configuration,CUDA,Triton,Helion'];
+
+    for (const [kernelKey, kernelData] of Object.entries(DATA)) {
+        // e.g. 'softmax_forward' -> 'Softmax Forward'
+        const kernelName = kernelKey.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
+
+        for (const metricData of Object.values(kernelData.metrics)) {
+            kernelData.configs.forEach((config, idx) => {
+                const configName = config.detail ? `${config.name} (${config.detail})` : config.name;
+
+                rows.push([
+                    csvField(kernelName),
+                    csvField(metricData.name),
+                    csvField(configName),
+                    csvNumber(metricData.cuda[idx]),
+                    csvNumber(metricData.triton[idx]),
+                    csvNumber(metricData.helion[idx])
+                ].join(','));
+            });
+        }
+    }
+
+    // Each record, including the last one, ends with a newline.
+    const csvContent = `${rows.join('\n')}\n`;
+    const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' });
+    const url = URL.createObjectURL(blob);
+    const link = document.createElement('a');
+    link.href = url;
+    link.download = 'gpu_benchmark_complete_data.csv';
+    link.click();
+    // Release the blob once the click has been dispatched; the URL would
+    // otherwise stay alive until the page is unloaded.
+    setTimeout(() => URL.revokeObjectURL(url), 0);
+});
+
+/**
+ * Rebuild the metric dropdown from the metrics of the selected kernel.
+ */
+function populateMetrics() {
+    const kernel = kernelSelect.value;
+    const metrics = DATA[kernel].metrics;
+
+    metricSelect.innerHTML = '';
+    Object.keys(metrics).forEach(metricKey => {
+        const option = document.createElement('option');
+        option.value = metricKey;
+        option.textContent = metrics[metricKey].name;
+        metricSelect.appendChild(option);
+    });
+}
+
+/**
+ * Return a copy of `values` ordered best-first for the metric direction.
+ */
+function sortBestFirst(values, lowerIsBetter) {
+    return [...values].sort((a, b) => lowerIsBetter ? a - b : b - a);
+}
+
+/**
+ * Pick the class for one cell: 'best', 'second' or 'worst'.
+ * Equal values share the better class.
+ */
+function calculateCellClass(value, values, lowerIsBetter) {
+    const sorted = sortBestFirst(values, lowerIsBetter);
+    if (value === sorted[0]) return 'best';
+    if (value === sorted[1]) return 'second';
+    return 'worst';
+}
+
+/**
+ * Format a measurement for display, using fewer decimals for larger magnitudes.
+ * Returns 'N/A' for a missing or non-finite value instead of throwing.
+ */
+function formatValue(value) {
+    if (typeof value !== 'number' || !Number.isFinite(value)) return 'N/A';
+    // Compare the magnitude so negative values get the same precision.
+    const magnitude = Math.abs(value);
+    if (magnitude >= 1e6) return value.toExponential(2);
+    if (magnitude >= 1000) return value.toFixed(0);
+    if (magnitude >= 10) return value.toFixed(1);
+    if (magnitude >= 1) return value.toFixed(2);
+    return value.toFixed(4);
+}
+
+/**
+ * Ratio between the worst and the best entry of `values`; for positive
+ * inputs the result is always >= 1. Works for any number of entries.
+ */
+function calculateSpeedup(values, lowerIsBetter) {
+    const sorted = sortBestFirst(values, lowerIsBetter);
+    const best = sorted[0];
+    const worst = sorted[sorted.length - 1];
+    return lowerIsBetter ? worst / best : best / worst;
+}
+
+/**
+ * Name the implementation with the best value. `values` is ordered
+ * [CUDA, Triton, Helion]; on a tie the earlier entry wins.
+ */
+function getWinner(values, lowerIsBetter) {
+    const best = lowerIsBetter ? Math.min(...values) : Math.max(...values);
+    if (values[0] === best) return 'CUDA';
+    if (values[1] === best) return 'Triton';
+    return 'Helion';
+}
+
+/**
+ * Render the table for the selected kernel and metric, then update the
+ * average / maximum speedup stats (shown for the 'duration' metric only).
+ */
+function renderTable() {
+    const kernel = kernelSelect.value;
+    const metric = metricSelect.value;
+    const kernelData = DATA[kernel];
+    const metricData = kernelData.metrics[metric];
+
+    // Extract unit from metric name
+    const metricName = metricData.name;
+    let unit = '';
+    const unitMatch = metricName.match(/\(([^)]+)\)/);
+    if (unitMatch) {
+        unit = unitMatch[1];
+    } else {
+        // For metrics without explicit units in parentheses
+        if (metric === 'registers_per_thread') {
+            unit = 'registers';
+        } else if (metric === 'register_limited_blocks') {
+            unit = 'blocks';
+        } else if (metric === 'gflops') {
+            unit = 'GFLOPs';
+        }
+    }
+
+    // Only show speedup for duration metric
+    const isDuration = metric === 'duration';
+
+    let speedups = [];
+    // NOTE(review): `winners` is tallied below but never read - confirm
+    // whether it is still needed.
+    let winners = { CUDA: 0, Triton: 0, Helion: 0 };
+
+    // Build header with conditional speedup column and units
+    tableHead.innerHTML = `
+
+            Configuration
+
+                CUDA
+                ${unit ? `${unit}` : ''}
+
+
+                Triton
+                ${unit ? `${unit}` : ''}
+
+
+                Helion
+                ${unit ? `${unit}` : ''}
+
+            ${isDuration ? 'Speedup×' : ''}
+
+    `;
+
+    tableBody.innerHTML = '';
+    kernelData.configs.forEach((config, idx) => {
+        const cuda = metricData.cuda[idx];
+        const triton = metricData.triton[idx];
+        const helion = metricData.helion[idx];
+        const values = [cuda, triton, helion];
+
+        // NOTE(review): these classes and `speedupClass` are computed but not
+        // referenced by the templates as they appear here - confirm the markup
+        // that is meant to consume them.
+        const cudaClass = calculateCellClass(cuda, values, metricData.lower_is_better);
+        const tritonClass = calculateCellClass(triton, values, metricData.lower_is_better);
+        const helionClass = calculateCellClass(helion, values, metricData.lower_is_better);
+
+        let speedupCell = '';
+        if (isDuration) {
+            const speedup = calculateSpeedup(values, metricData.lower_is_better);
+            speedups.push(speedup);
+
+            // calculateSpeedup() returns worst / best, so with positive
+            // durations the 'speedup-regression' case below cannot occur.
+            let speedupClass = 'speedup-medium';
+            if (speedup > 50) speedupClass = 'speedup-extreme';
+            else if (speedup > 10) speedupClass = 'speedup-high';
+            else if (speedup < 1) speedupClass = 'speedup-regression';
+
+            speedupCell = `${speedup.toFixed(1)}x`;
+        }
+
+        const winner = getWinner(values, metricData.lower_is_better);
+        winners[winner]++;
+
+        const row = document.createElement('tr');
+        row.innerHTML = `
+
+                ${config.name}
+                ${config.detail ? `${config.detail}` : ''}
+
+            ${formatValue(cuda)}
+            ${formatValue(triton)}
+            ${formatValue(helion)}
+            ${speedupCell}
+        `;
+        tableBody.appendChild(row);
+    });
+
+    // Update stats - show speedup stats only for duration
+    if (isDuration && speedups.length > 0) {
+        const avgSpeed = speedups.reduce((a, b) => a + b, 0) / speedups.length;
+        const maxSpeed = Math.max(...speedups);
+        avgSpeedup.textContent = avgSpeed.toFixed(1) + 'x';
+        maxSpeedup.textContent = maxSpeed.toFixed(1) + 'x';
+        avgSpeedup.parentElement.style.display = 'block';
+        maxSpeedup.parentElement.style.display = 'block';
+    } else {
+        avgSpeedup.parentElement.style.display = 'none';
+        maxSpeedup.parentElement.style.display = 'none';
+    }
+}
+
+kernelSelect.addEventListener('change', () => {
+    populateMetrics();
+    renderTable();
+});
+
+metricSelect.addEventListener('change', renderTable);
+
+initDarkMode();
+populateMetrics();
+renderTable();
diff --git a/data_dashboard/data.js b/data_dashboard/data.js
new file mode 100644
index 0000000..dc1702d
--- /dev/null
+++ b/data_dashboard/data.js
@@ -0,0 +1,2123 @@
+const DATA = {
+  "vector_addition": {
+    "configs": [
+      {
+        "name": "Config A",
+        "detail": "B=32, S=4K, H=256"
+      },
+      {
+        "name": "Config B",
+        "detail": "B=64, S=8K, H=512"
+      },
+      {
+        "name": "BF16",
+        "detail": "B=64, S=8K, H=1024"
+      },
+      {
+        "name": "FP32",
+        "detail": "B=64, S=8K, H=1024"
+      },
+      {
+        "name": "FP16",
+        "detail": "B=64, S=8K, H=1024"
+      },
+      {
+        "name": "Config G",
+        "detail": "B=48, S=12K, H=1536"
+      },
+      {
+        "name": "Config C",
+        "detail": "B=64, S=16K, H=1024"
+      },
+      {
+        "name": "FP16 Large",
+        "detail": "B=64, S=16K, H=2048"
+      },
+      {
+        "name": "Config H",
+        "detail": "B=24, S=48K, H=2048"
+      },
+      {
+        "name": "Config D",
+        "detail": "B=32, S=32K, H=2048"
+      },
+      {
+        "name": "Config E",
+        "detail": "B=16, S=65K, H=4096"
+      },
+      {
+        "name": "Config F",
+        "detail": "B=8, S=131K, H=8192"
+      }
+    ],
+    "metrics": {
+      "duration": {
+        "name": "Duration (ms)",
+        "lower_is_better": true,
+        "cuda": [
+ 0.10573, + 0.83632, + 1.67, + 1.95, + 2.33, + 2.64, + 4.58, + 5.1, + 7.21, + 8.78, + 15.6, + 31.2 + ], + "triton": [ + 0.08842, + 0.71251, + 0.734, + 1.42, + 1.97, + 2.41, + 3.98, + 2.85, + 6.42, + 6.71, + 11.71, + 23.47 + ], + "helion": [ + 0.088, + 0.70326, + 0.71, + 1.4, + 1.75, + 2.41, + 4.09, + 2.85, + 6.42, + 6.98, + 11.94, + 23.9 + ] + }, + "dram_throughput": { + "name": "DRAM Throughput (%)", + "lower_is_better": false, + "cuda": [ + 36.67, + 38.92, + 40.06, + 66.99, + 28.6, + 81.78, + 58.42, + 50.15, + 79.83, + 60.96, + 68.66, + 68.66 + ], + "triton": [ + 86.64, + 89.38, + 90.8, + 89.72, + 33.9, + 89.67, + 67.23, + 89.86, + 89.72, + 79.73, + 91.44, + 91.26 + ], + "helion": [ + 88.35, + 92.36, + 92.48, + 92.85, + 38.12, + 89.58, + 65.44, + 89.8, + 89.7, + 76.63, + 89.65, + 89.61 + ] + }, + "compute_throughput": { + "name": "Compute Throughput (%)", + "lower_is_better": false, + "cuda": [ + 38.09, + 38.01, + 38.21, + 29.39, + 23.7, + 18.52, + 23.63, + 35.75, + 16.64, + 26.17, + 29.5, + 29.34 + ], + "triton": [ + 14.94, + 14.66, + 22.51, + 14.55, + 11.02, + 7.43, + 11.1, + 12.73, + 8.88, + 11.85, + 12.41, + 12.33 + ], + "helion": [ + 15.2, + 14.86, + 14.88, + 14.78, + 11.43, + 7.27, + 10.68, + 14.55, + 6.16, + 8.26, + 5.72, + 5.71 + ] + }, + "no_eligible_warps": { + "name": "No Eligible Warps (%)", + "lower_is_better": false, + "cuda": [ + 61.27, + 61.82, + 61.72, + 70.68, + 63.18, + 81.46, + 71.18, + 67.2, + 83.34, + 71.19, + 70.5, + 70.65 + ], + "triton": [ + 89.18, + 89.18, + 77.34, + 89.76, + 82.78, + 92.55, + 89.78, + 87.25, + 91.12, + 89.07, + 87.67, + 87.68 + ], + "helion": [ + 89.68, + 90.2, + 90.21, + 90.27, + 86.15, + 94.91, + 92.32, + 92.35, + 93.83, + 90.17, + 95.92, + 95.93 + ] + } + } + }, + "batched_matmul": { + "configs": [ + { + "name": "Config A", + "detail": "B=32, S=4K, H=256" + }, + { + "name": "Config B", + "detail": "B=64, S=8K, H=512" + }, + { + "name": "FP32", + "detail": "B=64, S=8K, H=1024" + }, + { + "name": "FP16", + "detail": 
"B=64, S=8K, H=1024" + }, + { + "name": "Config G", + "detail": "B=48, S=12K, H=1536" + }, + { + "name": "Config C", + "detail": "B=64, S=16K, H=1024" + }, + { + "name": "FP16 Large", + "detail": "B=64, S=16K, H=2048" + }, + { + "name": "Config D", + "detail": "B=32, S=32K, H=2048" + }, + { + "name": "Config H", + "detail": "B=24, S=48K, H=2048" + }, + { + "name": "Config E", + "detail": "B=16, S=65K, H=4096" + }, + { + "name": "Config F", + "detail": "B=8, S=131K, H=8192" + } + ], + "metrics": { + "duration": { + "name": "Duration (ms)", + "lower_is_better": true, + "cuda": [ + 2.78, + 44.19, + 176.13, + 174.52, + 520.74, + 352.45, + 1830, + 1410, + 2160, + 5590, + 22370 + ], + "triton": [ + 0.238, + 7.95, + 32.35, + 2.8, + 72.66, + 65.47, + 26.95, + 236.11, + 249.6, + 916.01, + 3640 + ], + "helion": [ + 0.198, + 2.44, + 101.14, + 2.27, + 47.65, + 17.83, + 16.37, + 69.17, + 168.9, + 605.19, + 2420 + ] + }, + "aggregate_memory": { + "name": "Aggregate Memory Throughput (%)", + "lower_is_better": false, + "cuda": [ + 94.42, + 94.57, + 94.6, + 95.31, + 95.63, + 94.6, + 95.34, + 94.66, + 95.56, + 95.28, + 95.23 + ], + "triton": [ + 68.77, + 88.1, + 92.42, + 70.87, + 88.91, + 92.43, + 93.9, + 87.76, + 88.53, + 87.33, + 87.53 + ], + "helion": [ + 74.1, + 58.96, + 78.72, + 68.59, + 79.98, + 60.35, + 79.39, + 59.81, + 79.38, + 79.34, + 80.39 + ] + }, + "l1_tex_throughput": { + "name": "L1/TEX Cache Throughput (%)", + "lower_is_better": false, + "cuda": [ + 94.54, + 94.58, + 94.6, + 95.32, + 95.65, + 94.6, + 95.34, + 94.66, + 95.58, + 95.28, + 95.23 + ], + "triton": [ + 71.85, + 88.21, + 92.43, + 45.1, + 88.57, + 92.43, + 68.56, + 87.76, + 88.57, + 87.32, + 87.53 + ], + "helion": [ + 76.21, + 60.06, + 78.74, + 66.82, + 80.4, + 60.7, + 64.83, + 59.94, + 79.44, + 79.22, + 80.42 + ] + }, + "l2_throughput": { + "name": "L2 Cache Throughput (%)", + "lower_is_better": false, + "cuda": [ + 14.22, + 14.81, + 14.65, + 15.12, + 13.84, + 14.63, + 12.79, + 14.26, + 11.9, + 27.56, + 
27.56 + ], + "triton": [ + 72.5, + 38.87, + 50.06, + 88.88, + 20.12, + 49.75, + 93.89, + 47.4, + 20.12, + 41.65, + 37.98 + ], + "helion": [ + 55.41, + 50.75, + 46.94, + 68.87, + 19.98, + 45.22, + 78.66, + 43.01, + 21.1, + 22.15, + 26.1 + ] + }, + "dram_throughput": { + "name": "DRAM Throughput (%)", + "lower_is_better": false, + "cuda": [ + 1.96, + 1.04, + 0.54, + 0.27, + 0.35, + 0.52, + 0.11, + 0.26, + 0.2, + 16.47, + 16.41 + ], + "triton": [ + 59.42, + 27.08, + 32.89, + 70.87, + 2.09, + 32.65, + 6.73, + 30.85, + 2.09, + 28.39, + 26.93 + ], + "helion": [ + 27.12, + 26.64, + 29.88, + 20.96, + 13.07, + 25.37, + 17.03, + 23.69, + 13.49, + 14.72, + 17.13 + ] + }, + "compute_throughput": { + "name": "Compute Throughput (%)", + "lower_is_better": false, + "cuda": [ + 73.61, + 73.92, + 74.05, + 74.75, + 74.85, + 74.06, + 74.79, + 74.16, + 74.83, + 74.63, + 74.61 + ], + "triton": [ + 19.8, + 15.91, + 14.26, + 56.34, + 11.36, + 14.26, + 62.92, + 15.05, + 11.5, + 14.8, + 14.71 + ], + "helion": [ + 36.37, + 30.38, + 53.38, + 67.16, + 15.98, + 33.82, + 72.98, + 34.87, + 15.94, + 16.03, + 16.3 + ] + }, + "no_eligible_warps": { + "name": "No Eligible Warps (%)", + "lower_is_better": false, + "cuda": [ + 47.7, + 48.75, + 49.26, + 47.21, + 48.03, + 49.26, + 46.65, + 49.49, + 48.3, + 45.34, + 45.48 + ], + "triton": [ + 80.97, + 84.07, + 85.74, + 76.02, + 86.63, + 85.74, + 37.19, + 84.95, + 88.5, + 85.18, + 85.29 + ], + "helion": [ + 76.1, + 87.45, + 80.06, + 77.41, + 99.02, + 89.44, + 79.01, + 89.3, + 96.24, + 96.41, + 96.43 + ] + } + } + }, + "softmax_forward": { + "configs": [ + { + "name": "Batch (B=1)", + "detail": "" + }, + { + "name": "Batch (B=8)", + "detail": "" + }, + { + "name": "Batch (B=32)", + "detail": "" + }, + { + "name": "Batch (B=128)", + "detail": "" + }, + { + "name": "Data (32, 16K, 1024)", + "detail": "" + }, + { + "name": "Data (32, 4K, 256)", + "detail": "" + }, + { + "name": "Data (64, 8K, 1K)", + "detail": "" + }, + { + "name": "Data (64, 8K, 1K, BF16)", 
+ "detail": "" + }, + { + "name": "Data (64, 8K, 1K, FP16)", + "detail": "" + }, + { + "name": "Data (64, 8K, 512)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 1K)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 4K)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 8K)", + "detail": "" + }, + { + "name": "Seq (32, 4K, 4K)", + "detail": "" + }, + { + "name": "Seq (32, 8K, 4K)", + "detail": "" + }, + { + "name": "Seq (32, 16K, 4K)", + "detail": "" + } + ], + "metrics": { + "duration": { + "name": "Duration (ms)", + "lower_is_better": true, + "cuda": [ + 0.0494, + 0.3755, + 1.49, + 5.93, + 14.23, + 0.3867, + 7.12, + 7.17, + 7.16, + 3.14, + 0.892, + 1.49, + 2.29, + 2.97, + 5.93, + 11.88 + ], + "triton": [ + 0.0177, + 0.1322, + 0.5309, + 2.15, + 2.02, + 0.1249, + 1.02, + 0.8622, + 0.7968, + 0.6206, + 0.131, + 0.5249, + 1.07, + 1.05, + 2.14, + 4.31 + ], + "helion": [ + 0.0174, + 0.1318, + 0.5076, + 1.98, + 1.99, + 0.0622, + 0.9964, + 0.5413, + 0.5054, + 0.4941, + 0.1919, + 0.5159, + 0.9963, + 0.9951, + 1.98, + 4.2 + ] + }, + "compute_throughput": { + "name": "Compute Throughput (%)", + "lower_is_better": false, + "cuda": [ + 57.7, + 60.52, + 60.94, + 61.04, + 69.95, + 84.42, + 69.94, + 70.0, + 69.83, + 80.32, + 69.75, + 61.07, + 56.19, + 61.12, + 61.05, + 61.16 + ], + "triton": [ + 54.46, + 62.29, + 58.58, + 57.46, + 76.41, + 83.98, + 76.34, + 89.51, + 93.53, + 82.72, + 77.92, + 59.01, + 58.83, + 57.59, + 57.35, + 57.24 + ], + "helion": [ + 34.42, + 58.0, + 63.68, + 64.16, + 78.01, + 62.4, + 78.23, + 75.23, + 78.99, + 67.72, + 59.61, + 57.55, + 60.78, + 64.49, + 64.7, + 69.77 + ] + }, + "memory_throughput": { + "name": "Memory Throughput (%)", + "lower_is_better": false, + "cuda": [ + 19.67, + 28.05, + 29.77, + 30.01, + 22.38, + 28.05, + 22.38, + 22.11, + 22.21, + 26.16, + 22.32, + 29.09, + 38.76, + 29.26, + 30.01, + 29.38 + ], + "triton": [ + 52.09, + 79.5, + 83.71, + 83.53, + 86.1, + 70.65, + 85.11, + 50.22, + 55.5, + 71.24, + 80.07, + 82.71, + 83.03, + 
82.94, + 83.7, + 81.76 + ], + "helion": [ + 47.6, + 79.2, + 87.76, + 89.81, + 87.45, + 79.86, + 87.18, + 79.86, + 87.31, + 89.22, + 77.96, + 83.76, + 89.15, + 87.4, + 89.78, + 83.19 + ] + }, + "l1_tex_throughput": { + "name": "L1/TEX Throughput (%)", + "lower_is_better": false, + "cuda": [ + 21.2, + 20.6, + 20.69, + 20.68, + 22.39, + 28.23, + 22.39, + 22.12, + 22.22, + 26.18, + 22.39, + 20.66, + 19.77, + 20.65, + 20.68, + 20.65 + ], + "triton": [ + 46.5, + 45.56, + 42.64, + 42.22, + 58.08, + 71.91, + 57.84, + 48.69, + 53.0, + 69.61, + 60.57, + 42.17, + 41.81, + 41.95, + 41.8, + 41.18 + ], + "helion": [ + 40.8, + 49.76, + 41.25, + 40.43, + 57.55, + 67.05, + 57.72, + 35.87, + 38.78, + 64.92, + 47.98, + 48.54, + 39.26, + 40.52, + 40.45, + 50.66 + ] + }, + "l2_throughput": { + "name": "L2 Throughput (%)", + "lower_is_better": false, + "cuda": [ + 33.35, + 36.51, + 37.21, + 37.17, + 15.9, + 18.07, + 15.95, + 8.65, + 8.66, + 18.03, + 15.73, + 37.12, + 47.62, + 37.12, + 37.17, + 37.09 + ], + "triton": [ + 80.32, + 88.71, + 90.25, + 90.1, + 92.91, + 53.62, + 92.15, + 61.93, + 66.63, + 82.63, + 90.88, + 89.85, + 88.17, + 90.4, + 90.39, + 90.18 + ], + "helion": [ + 81.11, + 90.52, + 93.69, + 94.08, + 93.69, + 92.15, + 93.7, + 89.55, + 92.94, + 93.65, + 89.42, + 92.21, + 93.63, + 93.78, + 94.07, + 89.76 + ] + }, + "sm_busy": { + "name": "SM Busy (%)", + "lower_is_better": false, + "cuda": [ + 62.19, + 61.08, + 60.94, + 61.04, + 69.96, + 84.96, + 69.96, + 70.03, + 69.83, + 80.32, + 69.97, + 61.21, + 56.19, + 61.19, + 61.05, + 61.18 + ], + "triton": [ + 64.78, + 62.93, + 58.58, + 57.46, + 76.67, + 85.49, + 76.31, + 89.72, + 93.53, + 82.72, + 79.86, + 58.49, + 58.83, + 58.3, + 57.35, + 57.36 + ], + "helion": [ + 41.93, + 59.0, + 63.68, + 64.16, + 78.17, + 67.19, + 78.38, + 75.73, + 78.99, + 67.72, + 59.91, + 57.66, + 60.78, + 64.7, + 64.7, + 69.84 + ] + }, + "no_eligible_warps": { + "name": "No Eligible Warps (%)", + "lower_is_better": false, + "cuda": [ + 37.84, + 38.86, + 
38.92, + 38.91, + 30.04, + 15.08, + 30.04, + 29.98, + 30.15, + 19.63, + 30.04, + 38.79, + 43.73, + 38.82, + 38.92, + 38.82 + ], + "triton": [ + 35.38, + 35.91, + 41.01, + 42.7, + 23.35, + 14.68, + 23.77, + 10.14, + 6.18, + 17.07, + 20.65, + 41.24, + 40.97, + 40.69, + 42.91, + 42.73 + ], + "helion": [ + 57.97, + 40.41, + 35.66, + 35.26, + 21.84, + 32.87, + 21.77, + 24.23, + 20.86, + 31.58, + 39.9, + 42.15, + 39.53, + 35.38, + 35.5, + 30.25 + ] + }, + "gflops": { + "name": "GFLOPs", + "lower_is_better": false, + "cuda": [ + 0.00472, + 0.0378, + 0.151, + 0.605, + 1.51, + 0.0507, + 0.757, + 0.757, + 0.757, + 0.387, + 0.0946, + 0.151, + 0.227, + 0.302, + 0.605, + 1.21 + ], + "triton": [ + 0.00339, + 0.0271, + 0.109, + 0.434, + 0.461, + 0.0199, + 0.231, + 0.231, + 0.231, + 0.13, + 0.0288, + 0.109, + 0.218, + 0.217, + 0.434, + 0.868 + ], + "helion": [ + 0.00463, + 0.0417, + 0.246, + 0.985, + 1.03, + 0.0229, + 0.516, + 0.295, + 0.295, + 0.185, + 0.0685, + 0.167, + 0.482, + 0.492, + 0.985, + 2.06 + ] + }, + "registers_per_thread": { + "name": "Registers per Thread", + "lower_is_better": true, + "cuda": [ + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18 + ], + "triton": [ + 80, + 32, + 64, + 64, + 32, + 32, + 32, + 64, + 64, + 32, + 47, + 32, + 57, + 64, + 64, + 40 + ], + "helion": [ + 36, + 36, + 34, + 34, + 25, + 21, + 25, + 31, + 31, + 22, + 25, + 36, + 39, + 36, + 34, + 36 + ] + }, + "register_limited_blocks": { + "name": "Register-Limited Blocks", + "lower_is_better": true, + "cuda": [ + 2, + 2, + 2, + 2, + 2, + 10, + 2, + 2, + 2, + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "triton": [ + 6, + 4, + 4, + 4, + 16, + 32, + 16, + 32, + 32, + 16, + 40, + 4, + 4, + 4, + 4, + 3 + ], + "helion": [ + 6, + 6, + 6, + 6, + 16, + 21, + 16, + 16, + 16, + 21, + 16, + 6, + 3, + 6, + 6, + 6 + ] + }, + "memory_bandwidth": { + "name": "Memory Bandwidth (GB/s)", + "lower_is_better": false, + "cuda": [ + 954.6, + 1380, + 1430, + 1440, + 602.39, + 
646.03, + 600.86, + 297.02, + 297.64, + 678.67, + 580.95, + 1430, + 1870, + 1440, + 1440, + 1440 + ], + "triton": [ + 2560, + 3910, + 4030, + 4010, + 4230, + 1990, + 4190, + 2470, + 2670, + 3430, + 3930, + 4060, + 4000, + 4080, + 4030, + 4020 + ], + "helion": [ + 2330, + 3890, + 4220, + 4320, + 4300, + 3920, + 4280, + 3920, + 4200, + 4290, + 3830, + 4120, + 4290, + 4300, + 4320, + 4090 + ] + } + } + }, + "softmax_backward": { + "configs": [ + { + "name": "Batch (B=1)", + "detail": "" + }, + { + "name": "Batch (B=8)", + "detail": "" + }, + { + "name": "Batch (B=32)", + "detail": "" + }, + { + "name": "Batch (B=128)", + "detail": "" + }, + { + "name": "Data (32, 16K, 1024)", + "detail": "" + }, + { + "name": "Data (32, 4K, 256)", + "detail": "" + }, + { + "name": "Data (64, 8K, 1K)", + "detail": "" + }, + { + "name": "Data (64, 8K, 1K, BF16)", + "detail": "" + }, + { + "name": "Data (64, 8K, 1K, FP16)", + "detail": "" + }, + { + "name": "Data (64, 8K, 512)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 1K)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 4K)", + "detail": "" + }, + { + "name": "Hidden (32, 2K, 8K)", + "detail": "" + }, + { + "name": "Seq (32, 4K, 4K)", + "detail": "" + }, + { + "name": "Seq (32, 8K, 4K)", + "detail": "" + }, + { + "name": "Seq (32, 16K, 4K)", + "detail": "" + } + ], + "metrics": { + "duration": { + "name": "Duration (ms)", + "lower_is_better": true, + "cuda": [ + 0.0442, + 0.33, + 1.31, + 5.2, + 9.22, + 0.2556, + 4.6, + 4.51, + 4.45, + 2.12, + 0.5773, + 1.3, + 2.3, + 2.58, + 5.21, + 10.43 + ], + "triton": [ + 0.0289, + 0.2394, + 0.9745, + 4.22, + 3.13, + 0.1129, + 1.55, + 0.7338, + 0.7295, + 0.7284, + 0.1865, + 0.9757, + 1.59, + 1.97, + 4.2, + 7.96 + ], + "helion": [ + 0.0233, + 0.1876, + 0.7271, + 2.88, + 2.87, + 0.0881, + 1.43, + 0.7287, + 0.7174, + 0.7222, + 0.3209, + 0.7596, + 1.45, + 1.5, + 2.88, + 13.88 + ] + }, + "compute_throughput": { + "name": "Compute Throughput (%)", + "lower_is_better": false, + "cuda": [ + 
45.74, + 48.03, + 47.91, + 48.4, + 62.96, + 74.27, + 63.07, + 66.8, + 67.01, + 68.76, + 62.84, + 48.77, + 43.21, + 49.18, + 48.4, + 48.57 + ], + "triton": [ + 17.54, + 17.32, + 15.63, + 14.34, + 27.78, + 64.47, + 28.02, + 64.21, + 57.92, + 41.77, + 30.52, + 15.83, + 19.38, + 15.68, + 14.4, + 15.47 + ], + "helion": [ + 7.45, + 21.36, + 15.12, + 15.37, + 30.08, + 24.63, + 28.24, + 52.2, + 41.83, + 31.32, + 54.2, + 26.69, + 18.47, + 25.45, + 15.2, + 9.79 + ] + }, + "memory_throughput": { + "name": "Memory Throughput (%)", + "lower_is_better": false, + "cuda": [ + 40.78, + 48.89, + 50.99, + 51.4, + 28.41, + 31.11, + 28.42, + 25.57, + 25.87, + 31.44, + 27.98, + 50.13, + 58.23, + 50.77, + 51.37, + 50.25 + ], + "triton": [ + 66.8, + 79.61, + 81.79, + 76.22, + 84.55, + 70.45, + 84.87, + 88.94, + 91.36, + 91.5, + 86.56, + 79.97, + 88.19, + 79.13, + 76.42, + 78.6 + ], + "helion": [ + 72.11, + 85.8, + 91.54, + 92.74, + 91.35, + 87.42, + 91.18, + 89.51, + 92.67, + 92.24, + 84.31, + 85.91, + 92.33, + 86.95, + 92.75, + 62.96 + ] + }, + "l1_tex_throughput": { + "name": "L1/TEX Throughput (%)", + "lower_is_better": false, + "cuda": [ + 25.9, + 25.41, + 25.12, + 25.44, + 25.57, + 31.18, + 25.61, + 25.58, + 25.88, + 28.45, + 25.68, + 25.5, + 26.01, + 25.79, + 25.48, + 25.51 + ], + "triton": [ + 36.24, + 30.71, + 28.54, + 26.43, + 42.05, + 62.81, + 42.3, + 54.68, + 54.85, + 55.17, + 46.75, + 28.84, + 35.73, + 28.55, + 28.1, + 27.75 + ], + "helion": [ + 31.56, + 44.21, + 32.99, + 32.97, + 46.3, + 43.74, + 46.48, + 52.76, + 38.99, + 53.96, + 45.64, + 45.32, + 42.17, + 39.69, + 33.02, + 29.19 + ] + }, + "l2_throughput": { + "name": "L2 Throughput (%)", + "lower_is_better": false, + "cuda": [ + 56.3, + 61.89, + 62.47, + 62.71, + 36.78, + 41.09, + 36.93, + 21.18, + 21.4, + 39.97, + 36.77, + 62.74, + 70.49, + 63.45, + 62.67, + 62.56 + ], + "triton": [ + 82.27, + 89.71, + 92.07, + 85.44, + 92.43, + 84.67, + 92.97, + 94.31, + 94.55, + 94.59, + 93.27, + 91.87, + 94.35, + 91.4, + 88.1, + 91.4 
+ ], + "helion": [ + 88.48, + 92.37, + 94.09, + 94.61, + 95.07, + 93.86, + 95.04, + 94.43, + 94.94, + 94.67, + 89.94, + 92.17, + 94.25, + 92.22, + 94.62, + 80.49 + ] + }, + "sm_busy": { + "name": "SM Busy (%)", + "lower_is_better": false, + "cuda": [ + 49.4, + 48.56, + 47.91, + 48.4, + 62.98, + 74.97, + 63.1, + 66.83, + 67.01, + 68.76, + 63.19, + 48.87, + 43.21, + 49.21, + 48.4, + 48.6 + ], + "triton": [ + 19.9, + 16.99, + 15.63, + 14.34, + 27.87, + 65.96, + 28.06, + 64.19, + 57.92, + 41.77, + 30.9, + 15.99, + 19.38, + 15.85, + 14.4, + 15.43 + ], + "helion": [ + 8.39, + 21.41, + 15.12, + 15.37, + 30.04, + 25.89, + 28.33, + 52.44, + 41.83, + 31.32, + 55.76, + 26.49, + 17.17, + 24.45, + 15.2, + 5.8 + ] + }, + "no_eligible_warps": { + "name": "No Eligible Warps (%)", + "lower_is_better": false, + "cuda": [ + 50.33, + 51.1, + 51.87, + 51.52, + 37.01, + 24.89, + 36.9, + 33.18, + 32.97, + 31.15, + 36.88, + 51.0, + 56.69, + 50.75, + 51.53, + 51.4 + ], + "triton": [ + 79.93, + 83.37, + 84.3, + 84.62, + 71.93, + 33.77, + 71.85, + 35.74, + 41.96, + 58.19, + 69.12, + 84.08, + 80.66, + 84.37, + 85.49, + 84.52 + ], + "helion": [ + 91.55, + 78.51, + 84.81, + 84.69, + 70.02, + 73.96, + 71.62, + 47.58, + 57.88, + 68.51, + 43.86, + 73.52, + 82.77, + 75.5, + 84.62, + 94.2 + ] + }, + "fused_operations": { + "name": "Fused Operations (instruction-mix)", + "lower_is_better": false, + "cuda": [ + 262000.0, + 2100000.0, + 8390000.0, + 33600000.0, + 33600000.0, + 1050000.0, + 16800000.0, + 16800000.0, + 16800000.0, + 8390000.0, + 2100000.0, + 8390000.0, + 16800000.0, + 16800000.0, + 33600000.0, + 67100000.0 + ], + "triton": [ + 246000.0, + 1970000.0, + 7860000.0, + 31500000.0, + 29400000.0, + 524000.0, + 14700000.0, + 14700000.0, + 14700000.0, + 6290000.0, + 1840000.0, + 7860000.0, + 15700000.0, + 15700000.0, + 31500000.0, + 62900000.0 + ], + "helion": [ + 254000.0, + 1840000.0, + 7340000.0, + 29400000.0, + 25200000.0, + 786000.0, + 12600000.0, + 0, + 0, + 6290000.0, + 1570000.0, + 
6290000.0, + 6290000.0, + 12600000.0, + 29400000.0, + 33600000.0 + ] + }, + "non_fused_operations": { + "name": "Non-Fused Operations (instruction-mix)", + "lower_is_better": false, + "cuda": [ + 862000.0, + 6900000.0, + 27600000.0, + 110000000.0, + 240000000.0, + 8000000.0, + 120000000.0, + 120000000.0, + 120000000.0, + 61300000.0, + 15000000.0, + 27600000.0, + 44400000.0, + 55200000.0, + 110000000.0, + 221000000.0 + ], + "triton": [ + 688000.0, + 5510000.0, + 22000000.0, + 88100000.0, + 105000000.0, + 6820000.0, + 52400000.0, + 52400000.0, + 52400000.0, + 35700000.0, + 6550000.0, + 22000000.0, + 45100000.0, + 44000000.0, + 88100000.0, + 176000000.0 + ], + "helion": [ + 598000.0, + 7080000.0, + 26200000.0, + 105000000.0, + 151000000.0, + 4190000.0, + 75500000.0, + 35700000.0, + 35700000.0, + 35700000.0, + 7860000.0, + 41900000.0, + 58700000.0, + 83900000.0, + 105000000.0, + 302000000.0 + ] + }, + "gflops": { + "name": "GFLOPs", + "lower_is_better": false, + "cuda": [ + 0.00139, + 0.0111, + 0.0444, + 0.177, + 0.307, + 0.0101, + 0.154, + 0.154, + 0.154, + 0.0781, + 0.0192, + 0.0444, + 0.0779, + 0.0887, + 0.177, + 0.355 + ], + "triton": [ + 0.00118, + 0.00944, + 0.0377, + 0.151, + 0.164, + 0.00786, + 0.0818, + 0.0818, + 0.0818, + 0.0482, + 0.0102, + 0.0371, + 0.0765, + 0.0755, + 0.151, + 0.302 + ], + "helion": [ + 0.00111, + 0.0107, + 0.0409, + 0.164, + 0.201, + 0.00577, + 0.101, + 0.0357, + 0.0357, + 0.0482, + 0.011, + 0.0545, + 0.0881, + 0.109, + 0.164, + 0.369 + ] + }, + "registers_per_thread": { + "name": "Registers per Thread", + "lower_is_better": true, + "cuda": [ + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16 + ], + "triton": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 30, + 32, + 32, + 32, + 32, + 40, + 32, + 32, + 32 + ], + "helion": [ + 78, + 46, + 54, + 54, + 22, + 22, + 22, + 22, + 30, + 21, + 22, + 21, + 32, + 22, + 54, + 48 + ] + }, + "register_limited_blocks": { + "name": "Register-Limited Blocks", + 
"lower_is_better": true, + "cuda": [ + 4, + 4, + 4, + 4, + 4, + 16, + 4, + 4, + 4, + 8, + 4, + 4, + 4, + 4, + 4, + 4 + ], + "triton": [ + 8, + 8, + 8, + 8, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 8, + 3, + 8, + 8, + 8 + ], + "helion": [ + 6, + 2, + 2, + 2, + 5, + 42, + 5, + 5, + 4, + 10, + 5, + 2, + 2, + 2, + 2, + 10 + ] + }, + "memory_bandwidth": { + "name": "Memory Bandwidth (GB/s)", + "lower_is_better": false, + "cuda": [ + 2000, + 2470, + 2450, + 2400, + 1400, + 1530, + 1400, + 712.17, + 720.66, + 1510, + 1370, + 1970, + 4220, + 4440, + 3090, + 4270 + ], + "triton": [ + 3540, + 4460, + 4400, + 4220, + 4490, + 4300, + 4480, + 4400, + 4460, + 4440, + 4140, + 4130, + 3930, + 4240, + 3860, + 3890 + ], + "helion": [ + 3280, + 3670, + 3940, + 3910, + 4160, + 3460, + 4170, + 4370, + 4400, + 4400, + 4250, + 2460, + 2800, + 2470, + 2500, + 2470 + ] + } + } + } +}; \ No newline at end of file diff --git a/data_dashboard/index.html b/data_dashboard/index.html new file mode 100644 index 0000000..fa0cc82 --- /dev/null +++ b/data_dashboard/index.html @@ -0,0 +1,463 @@ + + + + + + GPU Kernel Performance Dashboard + + + +
+
+

GPU Kernel Performance: COMPLETE Dataset

+

NVIDIA H200 • CUDA vs Triton vs Helion • All 54 Tables

+
+ + +
+
+ +
+
+ + +
+ +
+ + +
+
+ +
+
+
-
+
Avg Speedup
+
+
+
-
+
Max Speedup
+
+
+ +
+
+
+ Best +
+
+
+ Second +
+
+
+ Worst +
+
+ +
+ + + +
+
+ +
+

Made with ❤️ by Red Hat PyTorch Engineering Interns

+
+
+ + + + + From a4ca5b59d58ff991e02ec30f1fc00dbdef9bceff Mon Sep 17 00:00:00 2001 From: Arindam Samanta Date: Wed, 3 Jun 2026 12:20:50 +0530 Subject: [PATCH 2/2] Enhance dashboard UI with glassmorphism and improvements - Add elegant frosted glass effect to sticky header with 40px blur - Center-align all data cells including speedup badges - Update title to 'GPU Kernel Profiling Data' - Simplify subtitle (remove '54 tables' reference) - Multi-layer shadows and gradient effects for premium look - Enhanced backdrop filter with saturation and brightness - Smooth fade effect below sticky header --- data_dashboard/index.html | 81 +++++++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/data_dashboard/index.html b/data_dashboard/index.html index fa0cc82..9d947fb 100644 --- a/data_dashboard/index.html +++ b/data_dashboard/index.html @@ -226,16 +226,76 @@ } .perf-table th { - background: var(--bg-secondary); - color: var(--text-primary); - padding: 16px; + background: linear-gradient(180deg, + rgba(29, 29, 31, 0.98) 0%, + rgba(29, 29, 31, 0.95) 100%); + backdrop-filter: blur(40px) saturate(200%) brightness(1.1); + -webkit-backdrop-filter: blur(40px) saturate(200%) brightness(1.1); + color: white; + padding: 18px 16px; text-align: center; font-weight: 600; font-size: 12px; - border-bottom: 2px solid var(--border-color); + letter-spacing: 0.8px; + border-bottom: 1px solid rgba(255, 255, 255, 0.15); + position: sticky; + top: -1px; + z-index: 1000; + box-shadow: + 0 8px 32px rgba(0, 0, 0, 0.4), + 0 2px 8px rgba(0, 0, 0, 0.2), + inset 0 1px 0 rgba(255, 255, 255, 0.1), + inset 0 -1px 0 rgba(0, 0, 0, 0.2); + transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); + } + + body.dark-mode .perf-table th { + background: linear-gradient(180deg, + rgba(10, 10, 10, 0.98) 0%, + rgba(15, 15, 15, 0.95) 100%); + backdrop-filter: blur(40px) saturate(200%) brightness(0.9); + -webkit-backdrop-filter: blur(40px) saturate(200%) brightness(0.9); + color: 
#f5f5f7; + border-bottom: 1px solid rgba(255, 255, 255, 0.08); + box-shadow: + 0 8px 32px rgba(0, 0, 0, 0.6), + 0 2px 8px rgba(0, 0, 0, 0.4), + inset 0 1px 0 rgba(255, 255, 255, 0.05), + inset 0 -1px 0 rgba(0, 0, 0, 0.3); + } + + .perf-table thead { position: sticky; top: 0; - z-index: 10; + z-index: 1000; + } + + .perf-table thead::after { + content: ''; + position: absolute; + bottom: -8px; + left: 0; + right: 0; + height: 8px; + background: linear-gradient(180deg, + rgba(0, 0, 0, 0.15) 0%, + transparent 100%); + pointer-events: none; + } + + .perf-table th::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + height: 2px; + background: linear-gradient(90deg, + transparent 0%, + rgba(255, 255, 255, 0.15) 20%, + rgba(255, 255, 255, 0.15) 80%, + transparent 100%); + opacity: 0.6; } .perf-table th:first-child { @@ -263,6 +323,11 @@ border-bottom: 1px solid var(--border-color); font-weight: 500; color: var(--text-primary); + text-align: center; + } + + .perf-table td:first-child { + text-align: left; } .perf-table tr:last-child td { @@ -273,6 +338,8 @@ font-weight: 600; font-size: 13px; text-align: left; + background: var(--bg-tertiary); + color: var(--text-primary); } .config-detail { @@ -386,8 +453,8 @@
-

GPU Kernel Performance: COMPLETE Dataset

-

NVIDIA H200 • CUDA vs Triton vs Helion • All 54 Tables

+

GPU Kernel Profiling Data

+

NVIDIA H200 • CUDA vs Triton vs Helion