diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b1f9ed..e6560ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,37 +1,78 @@ cmake_minimum_required(VERSION 3.24) -set(CMAKE_CUDA_COMPILER "/opt/cuda/bin/nvcc") -set(CUDACXX "/opt/cuda/bin/nvcc") -project(tiny-vllm LANGUAGES CXX CUDA) + +option(USE_HIP "Build with HIP for AMD GPUs" OFF) + +if(NOT USE_HIP) + set(CMAKE_CUDA_COMPILER "/opt/cuda/bin/nvcc") + set(CUDACXX "/opt/cuda/bin/nvcc") +endif() + +if(USE_HIP) + project(tiny-vllm LANGUAGES CXX HIP) +else() + project(tiny-vllm LANGUAGES CXX CUDA) +endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_ARCHITECTURES 120) +if(USE_HIP) + # HIP architecture: default to gfx90a if not specified + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "gfx90a") + endif() +else() + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_ARCHITECTURES 120) +endif() if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() set(CMAKE_CXX_FLAGS_RELEASE "-O2") -set(CMAKE_CUDA_FLAGS_RELEASE "-O2") +if(USE_HIP) + set(CMAKE_HIP_FLAGS_RELEASE "-O2") + set(CMAKE_HIP_FLAGS_DEBUG "-g -DDEBUG") +else() + set(CMAKE_CUDA_FLAGS_RELEASE "-O2") + set(CMAKE_CUDA_FLAGS_DEBUG "-G -g -DDEBUG") +endif() -set(CMAKE_CUDA_FLAGS_DEBUG "-G -g -DDEBUG") set(CMAKE_CXX_FLAGS_DEBUG "-g -DDEBUG") -find_package(CUDAToolkit REQUIRED) +if(USE_HIP) + find_package(hipblas REQUIRED) + find_package(hip REQUIRED) +else() + find_package(CUDAToolkit REQUIRED) +endif() add_executable(tiny-vllm src/main.cpp src/kernels.cu ) +if(USE_HIP) + # Both main.cpp and kernels.cu need HIP compilation because they use bfloat16 types + # which require the HIP compiler (hip_bf16.h uses clang-specific builtins) + set_source_files_properties(src/main.cpp src/kernels.cu PROPERTIES LANGUAGE HIP) + target_compile_definitions(tiny-vllm PRIVATE USE_HIP) + set_target_properties(tiny-vllm PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") +endif() + target_include_directories(tiny-vllm PRIVATE src) target_include_directories(tiny-vllm PRIVATE include) -target_link_libraries(tiny-vllm PRIVATE - CUDA::cublas - CUDA::cudart -) - +if(USE_HIP) + target_link_libraries(tiny-vllm PRIVATE + hip::host + roc::hipblas + ) +else() + target_link_libraries(tiny-vllm PRIVATE + CUDA::cublas + CUDA::cudart + ) +endif() diff --git a/README.md b/README.md index d7c7d94..ff41b1d 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,15 @@ The exact setup on which I develop and test it: Install the dependencies and run the program with `./test.sh` - it will build it and immediately execute it +It also runs on AMD GPUs through ROCm/HIP. Pass `-DUSE_HIP=ON` to CMake and it builds with hipcc against hipBLAS instead of nvcc and cuBLAS; the CUDA sources are reused as-is through a thin `src/cuda_to_hip.h` compatibility header. Pick your GPU's architecture with `-DCMAKE_HIP_ARCHITECTURES` (for example `gfx90a` for MI200, `gfx1100` for RDNA3, `gfx1201` for RDNA4) - it is not hardcoded, so set it to match your card: + +```bash +cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1100 -DCMAKE_PREFIX_PATH=/opt/rocm -G Ninja +cmake --build build +``` + +The `-DCMAKE_PREFIX_PATH=/opt/rocm` lets CMake find the hip and hipBLAS packages; drop it if `/opt/rocm/bin` is already on your `PATH`, or change it if ROCm lives elsewhere. I tested the AMD path on gfx90a, gfx1100, and gfx1201. The default build (no `-DUSE_HIP`) is unchanged and still targets NVIDIA through CUDA. + If you fail to build or run it and your AI of choice won't be able to help, please open an Issue on GitHub - I will try to help. Make sure to provide all useful context ## Safetensors and your model diff --git a/src/cuda_to_hip.h b/src/cuda_to_hip.h new file mode 100644 index 0000000..0a4f8e1 --- /dev/null +++ b/src/cuda_to_hip.h @@ -0,0 +1,64 @@ +#pragma once + +// Copyright (c) 2026 Advanced Micro Devices, Inc. +// Author: Jeff Daily +// +// CUDA-to-HIP compatibility header for tiny-vllm +// Keeps CUDA spellings in source and aliases them to HIP on AMD GPUs + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +#include +#include +#include + +// bfloat16 type mappings +#define __nv_bfloat16 __hip_bfloat16 +#define nv_bfloat16 __hip_bfloat16 + +// CUDA runtime -> HIP runtime +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaGetLastError hipGetLastError +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaMemGetInfo hipMemGetInfo +#define cudaDeviceProp hipDeviceProp_t +#define cudaError hipError_t +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess + +// cuBLAS -> hipBLAS +#define cublasHandle_t hipblasHandle_t +#define cublasStatus_t hipblasStatus_t +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasGemmEx hipblasGemmEx +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT + +// Data types for GEMM +#define CUDA_R_16BF HIP_R_16BF + +// Warp shuffle mask for HIP (64-bit required) +// HIP requires 64-bit masks for __shfl_* functions +#define WARP_FULL_MASK 0xffffffffffffffffULL + +#else + +#include +#include +#include + +// On CUDA, use the standard 32-bit mask +#define WARP_FULL_MASK 0xffffffff + +#endif diff --git a/src/kernels.cu b/src/kernels.cu index f3a5d98..c55fef3 100644 --- a/src/kernels.cu +++ b/src/kernels.cu @@ -1,3 +1,4 @@ +#include "cuda_to_hip.h" #include "kernels.cuh" #include @@ -407,11 +408,11 @@ __global__ void pagedAttentionKernel(int layer, int num_active_slots, __nv_bfloa float qk = (float)q * (float)*k; // tree reduction within current warp, thread 0 gets sum of all 32 elements within warp // could be done with __syncthreads but accessing memory of other threads in warp is op - qk += __shfl_down_sync(0xffffffff, qk, 16); - qk += __shfl_down_sync(0xffffffff, qk, 8); - qk += __shfl_down_sync(0xffffffff, qk, 4); - qk += __shfl_down_sync(0xffffffff, qk, 2); - qk += __shfl_down_sync(0xffffffff, qk, 1); + qk += __shfl_down_sync(WARP_FULL_MASK, qk, 16); + qk += __shfl_down_sync(WARP_FULL_MASK, qk, 8); + qk += __shfl_down_sync(WARP_FULL_MASK, qk, 4); + qk += __shfl_down_sync(WARP_FULL_MASK, qk, 2); + qk += __shfl_down_sync(WARP_FULL_MASK, qk, 1); if (thread_id == 0) { dot_products[0] = qk; diff --git a/src/kernels.cuh b/src/kernels.cuh index 19ff54e..958acd6 100644 --- a/src/kernels.cuh +++ b/src/kernels.cuh @@ -1,5 +1,12 @@ #pragma once + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#include +#define __nv_bfloat16 __hip_bfloat16 +#define nv_bfloat16 __hip_bfloat16 +#else #include +#endif // prefill void embeddingGather(int *gpu_input_tokens, __nv_bfloat16 *gpu_input_embeds, __nv_bfloat16 *embed_tokens, int num_input_tokens); diff --git a/src/main.cpp b/src/main.cpp index 3a901b3..db7072f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,8 +1,7 @@ #include #include #include -#include -#include +#include "cuda_to_hip.h" #include #define JSON_USE_IMPLICIT_CONVERSIONS 0 #include "json.hpp"