From 8a868aff17c262106fde9b09f73dcf5e5b9a6f94 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Wed, 17 Jun 2026 13:45:41 +0000
Subject: [PATCH 1/2] [ROCm] Add AMD GPU support via HIP for tiny-vllm

This adds AMD GPU support to tiny-vllm through ROCm/HIP while leaving the
existing NVIDIA/CUDA build unchanged.

The CUDA kernels and host code are reused as-is. A new src/cuda_to_hip.h
compatibility header keeps the CUDA spellings in the source and aliases them
to their HIP equivalents (runtime calls, cuBLAS -> hipBLAS, and the
__nv_bfloat16 -> __hip_bfloat16 type) when building for AMD. The only
kernel-source change is the warp-shuffle mask: HIP requires a 64-bit lane mask
for __shfl_*_sync, so the hardcoded 0xffffffff becomes a WARP_FULL_MASK macro
(0xffffffffffffffffULL on HIP, 0xffffffff on CUDA). The paged-attention
reduction is wave-size agnostic, so the same source runs correctly on wave64
(gfx90a) and wave32 (gfx1100, gfx1201).

CMakeLists.txt gains a USE_HIP option (default OFF). When OFF, the build is the
existing CUDA configuration, unchanged. When ON, it enables the HIP language,
compiles the sources with hipcc, and links hipBLAS. The GPU architecture is
selected by the caller via CMAKE_HIP_ARCHITECTURES (it is not hardcoded), e.g.:

    cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -G Ninja
    cmake --build build

The README's setup section documents the AMD build path alongside the existing
NVIDIA instructions.

Validation: built and exercised on real AMD GPUs -- gfx90a (MI250X), gfx1100
(Radeon Pro W7800), and gfx1201 (RX 9070 XT). On each, the HIP runtime, the
bf16 embedding-gather kernel, the 64-bit-mask warp-shuffle reduction at 64
threads/block, and the hipBLAS bf16 GEMM all pass (the 64-bit mask fix
confirmed on both wave64 and wave32). On gfx1100, full end-to-end inference was
additionally validated: loading Llama 3.2 1B Instruct weights and running
prefill+decode produces coherent, correct output (for example "What is 2+2?"
-> 4 and "Capital of France?" -> Paris), exercising the complete path
(embedding, 16 transformer layers with hipBLAS GEMMs, paged attention, SwiGLU
MLP, lm_head). The CUDA build path is unchanged.

Test Plan:

    cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -G Ninja
    cmake --build build
    ./build/tiny-vllm   # with Llama 3.2 1B Instruct model.safetensors in CWD

On gfx1100 (ROCm 7.2.1) the four built-in prompts return correct answers. The
targeted GPU component tests (runtime, embedding-gather, 64-bit-mask shuffle,
hipBLAS bf16 GEMM) pass on gfx90a, gfx1100, and gfx1201.

Authored with the assistance of Claude.
---
 CMakeLists.txt    | 69 +++++++++++++++++++++++++++++++++++++----------
 README.md         |  9 +++++++
 src/cuda_to_hip.h | 64 +++++++++++++++++++++++++++++++++++++++++++
 src/kernels.cu    | 11 ++++----
 src/kernels.cuh   |  7 +++++
 src/main.cpp      |  3 +--
 6 files changed, 142 insertions(+), 21 deletions(-)
 create mode 100644 src/cuda_to_hip.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b1f9ed..e6560ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,37 +1,78 @@
 cmake_minimum_required(VERSION 3.24)
-set(CMAKE_CUDA_COMPILER "/opt/cuda/bin/nvcc")
-set(CUDACXX "/opt/cuda/bin/nvcc")
-project(tiny-vllm LANGUAGES CXX CUDA)
+
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
+
+if(NOT USE_HIP)
+    set(CMAKE_CUDA_COMPILER "/opt/cuda/bin/nvcc")
+    set(CUDACXX "/opt/cuda/bin/nvcc")
+endif()
+
+if(USE_HIP)
+    project(tiny-vllm LANGUAGES CXX HIP)
+else()
+    project(tiny-vllm LANGUAGES CXX CUDA)
+endif()
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
-set(CMAKE_CUDA_ARCHITECTURES 120)
+if(USE_HIP)
+    # HIP architecture: default to gfx90a if not specified
+    if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+        set(CMAKE_HIP_ARCHITECTURES "gfx90a")
+    endif()
+else()
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    set(CMAKE_CUDA_ARCHITECTURES 120)
+endif()
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
 set(CMAKE_CXX_FLAGS_RELEASE "-O2")
-set(CMAKE_CUDA_FLAGS_RELEASE "-O2")
+if(USE_HIP)
+    set(CMAKE_HIP_FLAGS_RELEASE "-O2")
+    set(CMAKE_HIP_FLAGS_DEBUG "-g -DDEBUG")
+else()
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O2")
+    set(CMAKE_CUDA_FLAGS_DEBUG "-G -g -DDEBUG")
+endif()
 
-set(CMAKE_CUDA_FLAGS_DEBUG "-G -g -DDEBUG")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -DDEBUG")
 
-find_package(CUDAToolkit REQUIRED)
+if(USE_HIP)
+    find_package(hipblas REQUIRED)
+    find_package(hip REQUIRED)
+else()
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 add_executable(tiny-vllm
     src/main.cpp
     src/kernels.cu
 )
 
+if(USE_HIP)
+    # Both main.cpp and kernels.cu need HIP compilation because they use bfloat16 types
+    # which require the HIP compiler (hip_bf16.h uses clang-specific builtins)
+    set_source_files_properties(src/main.cpp src/kernels.cu PROPERTIES LANGUAGE HIP)
+    target_compile_definitions(tiny-vllm PRIVATE USE_HIP)
+    set_target_properties(tiny-vllm PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+endif()
+
 target_include_directories(tiny-vllm PRIVATE src)
 target_include_directories(tiny-vllm PRIVATE include)
 
-target_link_libraries(tiny-vllm PRIVATE
-    CUDA::cublas
-    CUDA::cudart
-)
-
+if(USE_HIP)
+    target_link_libraries(tiny-vllm PRIVATE
+        hip::host
+        roc::hipblas
+    )
+else()
+    target_link_libraries(tiny-vllm PRIVATE
+        CUDA::cublas
+        CUDA::cudart
+    )
+endif()
diff --git a/README.md b/README.md
index d7c7d94..7df2066 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,15 @@ The exact setup on which I develop and test it:
 
 Install the dependencies and run the program with `./test.sh` - it will build it and immediately execute it
 
+It also runs on AMD GPUs through ROCm/HIP. Pass `-DUSE_HIP=ON` to CMake and it builds with hipcc against hipBLAS instead of nvcc and cuBLAS; the CUDA sources are reused as-is through a thin `src/cuda_to_hip.h` compatibility header. Pick your GPU's architecture with `-DCMAKE_HIP_ARCHITECTURES` (for example `gfx90a` for MI200, `gfx1100` for RDNA3, `gfx1201` for RDNA4) - it is not hardcoded, so set it to match your card:
+
+```bash
+cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -G Ninja
+cmake --build build
+```
+
+I tested the AMD path on gfx90a, gfx1100, and gfx1201. The default build (no `-DUSE_HIP`) is unchanged and still targets NVIDIA through CUDA.
+
 If you fail to build or run it and your AI of choice won't be able to help, please open an Issue on GitHub - I will try to help. Make sure to provide all useful context
 
 ## Safetensors and your model
diff --git a/src/cuda_to_hip.h b/src/cuda_to_hip.h
new file mode 100644
index 0000000..0a4f8e1
--- /dev/null
+++ b/src/cuda_to_hip.h
@@ -0,0 +1,64 @@
+#pragma once
+
+// Copyright (c) 2026 Advanced Micro Devices, Inc.
+// Author: Jeff Daily <jeff.daily@amd.com>
+//
+// CUDA-to-HIP compatibility header for tiny-vllm
+// Keeps CUDA spellings in source and aliases them to HIP on AMD GPUs
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_bf16.h>
+#include <hipblas/hipblas.h>
+
+// bfloat16 type mappings
+#define __nv_bfloat16 __hip_bfloat16
+#define nv_bfloat16   __hip_bfloat16
+
+// CUDA runtime -> HIP runtime
+#define cudaMalloc              hipMalloc
+#define cudaFree                hipFree
+#define cudaMemcpy              hipMemcpy
+#define cudaMemcpyHostToDevice  hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost  hipMemcpyDeviceToHost
+#define cudaGetLastError        hipGetLastError
+#define cudaDeviceSynchronize   hipDeviceSynchronize
+#define cudaGetDeviceCount      hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaMemGetInfo          hipMemGetInfo
+#define cudaDeviceProp          hipDeviceProp_t
+#define cudaError               hipError_t
+#define cudaError_t             hipError_t
+#define cudaSuccess             hipSuccess
+
+// cuBLAS -> hipBLAS
+#define cublasHandle_t          hipblasHandle_t
+#define cublasStatus_t          hipblasStatus_t
+#define cublasCreate            hipblasCreate
+#define cublasDestroy           hipblasDestroy
+#define cublasGemmEx            hipblasGemmEx
+#define CUBLAS_STATUS_SUCCESS   HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_OP_N             HIPBLAS_OP_N
+#define CUBLAS_OP_T             HIPBLAS_OP_T
+#define CUBLAS_COMPUTE_32F      HIPBLAS_COMPUTE_32F
+#define CUBLAS_GEMM_DEFAULT     HIPBLAS_GEMM_DEFAULT
+
+// Data types for GEMM
+#define CUDA_R_16BF             HIP_R_16BF
+
+// Warp shuffle mask for HIP (64-bit required)
+// HIP requires 64-bit masks for __shfl_* functions
+#define WARP_FULL_MASK 0xffffffffffffffffULL
+
+#else
+
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+#include <cublas_v2.h>
+
+// On CUDA, use the standard 32-bit mask
+#define WARP_FULL_MASK 0xffffffff
+
+#endif
diff --git a/src/kernels.cu b/src/kernels.cu
index f3a5d98..c55fef3 100644
--- a/src/kernels.cu
+++ b/src/kernels.cu
@@ -1,3 +1,4 @@
+#include "cuda_to_hip.h"
 #include "kernels.cuh"
 #include <iostream>
 
@@ -407,11 +408,11 @@ __global__ void pagedAttentionKernel(int layer, int num_active_slots, __nv_bfloa
             float qk = (float)q * (float)*k;
             // tree reduction within current warp, thread 0 gets sum of all 32 elements within warp
             // could be done with __syncthreads but accessing memory of other threads in warp is op
-            qk += __shfl_down_sync(0xffffffff, qk, 16);
-            qk += __shfl_down_sync(0xffffffff, qk, 8);
-            qk += __shfl_down_sync(0xffffffff, qk, 4);
-            qk += __shfl_down_sync(0xffffffff, qk, 2);
-            qk += __shfl_down_sync(0xffffffff, qk, 1);
+            qk += __shfl_down_sync(WARP_FULL_MASK, qk, 16);
+            qk += __shfl_down_sync(WARP_FULL_MASK, qk, 8);
+            qk += __shfl_down_sync(WARP_FULL_MASK, qk, 4);
+            qk += __shfl_down_sync(WARP_FULL_MASK, qk, 2);
+            qk += __shfl_down_sync(WARP_FULL_MASK, qk, 1);
             if (thread_id == 0)
             {
                 dot_products[0] = qk;
diff --git a/src/kernels.cuh b/src/kernels.cuh
index 19ff54e..958acd6 100644
--- a/src/kernels.cuh
+++ b/src/kernels.cuh
@@ -1,5 +1,12 @@
 #pragma once
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_bf16.h>
+#define __nv_bfloat16 __hip_bfloat16
+#define nv_bfloat16   __hip_bfloat16
+#else
 #include <cuda_bf16.h>
+#endif
 
 // prefill
 void embeddingGather(int *gpu_input_tokens, __nv_bfloat16 *gpu_input_embeds, __nv_bfloat16 *embed_tokens, int num_input_tokens);
diff --git a/src/main.cpp b/src/main.cpp
index 3a901b3..db7072f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,8 +1,7 @@
 #include <iostream>
 #include <numeric>
 #include <fstream>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
+#include "cuda_to_hip.h"
 #include <queue>
 #define JSON_USE_IMPLICIT_CONVERSIONS 0
 #include "json.hpp"

From a67a67399bb71ea13421521b67626e1d43533d70 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 19 Jun 2026 19:45:44 +0000
Subject: [PATCH 2/2] [ROCm] docs: note CMAKE_PREFIX_PATH for the HIP build

The documented HIP build did not tell CMake where ROCm is, so on a clean
install with /opt/rocm/bin not on PATH the configure fails to find the hip
and hipBLAS packages (hip_DIR-NOTFOUND). Add -DCMAKE_PREFIX_PATH=/opt/rocm
to the example so the command works as written.

Authored with the assistance of an AI coding agent.
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7df2066..ff41b1d 100644
--- a/README.md
+++ b/README.md
@@ -91,11 +91,11 @@ Install the dependencies and run the program with `./test.sh` - it will build it
 It also runs on AMD GPUs through ROCm/HIP. Pass `-DUSE_HIP=ON` to CMake and it builds with hipcc against hipBLAS instead of nvcc and cuBLAS; the CUDA sources are reused as-is through a thin `src/cuda_to_hip.h` compatibility header. Pick your GPU's architecture with `-DCMAKE_HIP_ARCHITECTURES` (for example `gfx90a` for MI200, `gfx1100` for RDNA3, `gfx1201` for RDNA4) - it is not hardcoded, so set it to match your card:
 
 ```bash
-cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -G Ninja
+cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -DCMAKE_PREFIX_PATH=/opt/rocm -G Ninja
 cmake --build build
 ```
 
-I tested the AMD path on gfx90a, gfx1100, and gfx1201. The default build (no `-DUSE_HIP`) is unchanged and still targets NVIDIA through CUDA.
+The `-DCMAKE_PREFIX_PATH=/opt/rocm` lets CMake find the hip and hipBLAS packages; drop it if `/opt/rocm/bin` is already on your `PATH`, or change it if ROCm lives elsewhere. I tested the AMD path on gfx90a, gfx1100, and gfx1201. The default build (no `-DUSE_HIP`) is unchanged and still targets NVIDIA through CUDA.
 
 If you fail to build or run it and your AI of choice won't be able to help, please open an Issue on GitHub - I will try to help. Make sure to provide all useful context