k2-fsa · csukuangfj · Jun 26, 2026 · Jun 11, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -29,30 +29,68 @@ endif()
 
 set(CMAKE_DISABLE_FIND_PACKAGE_MKL TRUE)
 
+# Build with HIP for AMD GPUs (ROCm). When ON, the CUDA toolchain is not used;
+# the .cu sources are compiled as HIP (see the K2_WITH_HIP blocks below) and the
+# two non-portable third-party deps (moderngpu, the vendored CUDPP segmented
+# scan) are replaced by hipCUB/rocThrust. See k2/csrc/cuda_to_hip.h.
+option(K2_WITH_HIP "Build k2 with HIP for AMD GPUs" OFF)
+
 set(languages CXX)
 set(_K2_WITH_CUDA ON)
 
-find_program(K2_HAS_NVCC nvcc)
-if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
-  message(STATUS "No NVCC detected. Disable CUDA support")
+if(K2_WITH_HIP)
+  # On the HIP build the CUDA toolchain is irrelevant.
   set(_K2_WITH_CUDA OFF)
-endif()
-
-if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
-  if(_K2_WITH_CUDA)
-    message(STATUS "Disable CUDA support")
+else()
+  find_program(K2_HAS_NVCC nvcc)
+  if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
+    message(STATUS "No NVCC detected. Disable CUDA support")
     set(_K2_WITH_CUDA OFF)
   endif()
+
+  if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
+    if(_K2_WITH_CUDA)
+      message(STATUS "Disable CUDA support")
+      set(_K2_WITH_CUDA OFF)
+    endif()
+  endif()
 endif()
 
 if(_K2_WITH_CUDA)
   set(languages ${languages} CUDA)
 endif()
 
+if(K2_WITH_HIP)
+  # Add HIP to the project() language list (mirrors how CUDA is added) so the
+  # compiler is enabled in the initial project() call; enabling it afterwards via
+  # enable_language() forces a mid-configure re-run that drops cache values. The
+  # project() probe then honors a user-supplied -DCMAKE_HIP_ARCHITECTURES or, when
+  # unset, auto-detects the host GPU; it is not pinned to any specific arch.
+  set(languages ${languages} HIP)
+endif()
+
 message(STATUS "Enabled languages: ${languages}")
 
 project(k2 ${languages})
 
+if(K2_WITH_HIP)
+  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
+  find_package(hip REQUIRED)
+  find_package(hipcub REQUIRED)
+  find_package(rocprim REQUIRED)
+  find_package(hiprand REQUIRED)
+  # These must be found before find_package(Torch) so that Caffe2Targets.cmake
+  # can resolve all ROCm interface targets it imports (hiprtc::hiprtc, roc::hipblas, etc.).
+  find_package(hiprtc QUIET)
+  find_package(hipblas QUIET)
+  find_package(hipblaslt QUIET)
+  find_package(hipsolver QUIET)
+  find_package(hipsparse QUIET)
+  find_package(rocblas QUIET)
+  find_package(rocsolver QUIET)
+  find_package(hipfft QUIET)
+endif()
+
 set(K2_VERSION "1.24.4")
 
 # ----------------- Supported build types for K2 project -----------------
@@ -107,7 +145,7 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
 
-if(BUILD_SHARED_LIBS AND MSVC)
+if(BUILD_SHARED_LIBS AND (MSVC OR (WIN32 AND K2_WITH_HIP)))
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
@@ -350,6 +388,35 @@ if(K2_WITH_CUDA)
   add_definitions(-DK2_WITH_CUDA)
 endif()
 
+if(K2_WITH_HIP)
+  # K2_WITH_HIP/USE_HIP gate the genuinely divergent code; K2_WITH_CUDA is ALSO
+  # defined as a compile macro so the many `#ifdef K2_WITH_CUDA` blocks that hold
+  # shared GPU/driver/torch code (c10::cuda bridge, cub.h, rand.cu, eval.h CG)
+  # compile -- cuda_to_hip.h retargets the cuda*/curand*/cub spellings to HIP.
+  # The CMake OPTION K2_WITH_CUDA stays OFF (no nvcc arch gencode, no moderngpu
+  # fetch, NVTX auto-off). (AutoDock-GPU lesson.)
+  add_definitions(-DK2_WITH_HIP)
+  add_definitions(-DUSE_HIP)
+  add_definitions(-DK2_WITH_CUDA)
+
+  # Force-include the single compat header on every HIP TU so its aliases
+  # precede all other includes regardless of per-file include order.
+  string(APPEND CMAKE_HIP_FLAGS
+    " -include ${CMAKE_SOURCE_DIR}/k2/csrc/cuda_to_hip.h")
+
+  # Vendored ROCm/libhipcxx supplies <cuda/std/*> (ROCm ships no cuda/std), used
+  # by cub.h and utils_inl.h (cuda::std::plus). Point at it if present.
+  if(DEFINED K2_LIBHIPCXX_INCLUDE_DIR AND
+     EXISTS "${K2_LIBHIPCXX_INCLUDE_DIR}/cuda/std/functional")
+    message(STATUS "libhipcxx include dir: ${K2_LIBHIPCXX_INCLUDE_DIR}")
+    include_directories(SYSTEM ${K2_LIBHIPCXX_INCLUDE_DIR})
+  else()
+    message(FATAL_ERROR
+      "K2_WITH_HIP=ON requires ROCm/libhipcxx (provides <cuda/std/*>). Pass "
+      "-DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include")
+  endif()
+endif()
+
 if(WIN32)
   add_definitions(-DNOMINMAX) # Otherwise, std::max() and std::min() won't work
 endif()
@@ -376,14 +443,22 @@ if(K2_WITH_CUDA AND NOT WIN32)
   message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
 endif()
 
+if(K2_WITH_HIP)
+  # hipcc is clang; these are clang flags (no nvcc --compiler-options wrapper).
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-strict-overflow ")
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-unknown-pragmas ")
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-unused-variable ")
+  message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
+endif()
+
 
 if(NOT WIN32)
   string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable ")
   string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow ")
 endif()
 
-if(WIN32)
-  # disable various warnings for MSVC
+if(WIN32 AND NOT K2_WITH_HIP)
+  # disable various warnings for MSVC (not applicable to the clang HIP build)
   # NOTE: Most of the warnings are from PyTorch C++ APIs
   # 4005: macro redefinition
   # 4018: signed/unsigned mismatch

diff --git a/README.md b/README.md
@@ -55,6 +55,11 @@ general and extensible framework to allow further development of ASR technology.
  done via the cub library, parts of which we wrap with our own convenient
  interface.
 
+ The same code also builds for AMD GPUs with ROCm/HIP, where the GPU primitives
+ are provided by hipCUB and rocThrust. See the
+ [installation docs](https://k2-fsa.github.io/k2/installation/from_source.html)
+ for how to build the ROCm backend.
+
  The Finite State Automaton object is then implemented as a Ragged tensor templated
  on a specific data type (a struct representing an arc in the automaton).
 

diff --git a/cmake/torch.cmake b/cmake/torch.cmake
@@ -71,6 +71,27 @@ execute_process(
 
 message(STATUS "PyTorch version: ${TORCH_VERSION}")
 
+if(K2_WITH_HIP)
+  # torch's source hipify (torch/utils/hipify) has two generations that disagree
+  # on the c10 device namespace.  Generation 1 RENAMED the device classes, so the
+  # hip-spelled symbols (c10::hip::*) are the only public ones.  Generation 2
+  # (pytorch#174087, version.py bumped 1.0.0 -> 2.0.0) STOPPED renaming: the CUDA
+  # spellings stay public as the masquerading API (c10::cuda::* on a ROCm build)
+  # while c10::hip::* survive only as thin wrappers.  k2 drives the .cu through
+  # CMake/USE_HIP and never runs torch source-hipify, so it must detect the
+  # generation itself and select the matching namespace.
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" -c "from packaging.version import Version; import torch.utils.hipify as h; print(1 if Version(getattr(h, '__version__', '1.0.0')) >= Version('2.0.0') else 0)"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE K2_TORCH_HIPIFY_V2
+    RESULT_VARIABLE _k2_hipify_probe_rc
+  )
+  if(NOT _k2_hipify_probe_rc EQUAL 0)
+    set(K2_TORCH_HIPIFY_V2 0)
+  endif()
+  message(STATUS "torch hipify generation v2 (masquerading c10::cuda): ${K2_TORCH_HIPIFY_V2}")
+endif()
+
 if(K2_WITH_CUDA)
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c "import torch; print(torch.version.cuda)"
@@ -105,3 +126,13 @@ if(K2_WITH_CUDA)
   )
 endif()
 
+if(K2_WITH_HIP)
+  # On a ROCm torch the GPU target is torch_hip; clear its (and torch_cpu's)
+  # interface compile options for the same reason as the CUDA path above.
+  foreach(_t torch_hip torch_cpu)
+    if(TARGET ${_t})
+      set_property(TARGET ${_t} PROPERTY INTERFACE_COMPILE_OPTIONS "")
+    endif()
+  endforeach()
+endif()
+
diff --git a/docs/source/installation/from_source.rst b/docs/source/installation/from_source.rst
@@ -12,7 +12,9 @@ Install from source
 
 .. hint::
 
-    It supports Linux (CPU + CUDA), macOS (CPU), and Windows (CPU + CUDA).
+    It supports Linux (CPU + CUDA + ROCm), macOS (CPU), and Windows (CPU + CUDA
+    + ROCm). For AMD GPUs via ROCm, see the "Building with ROCm (AMD GPUs)"
+    section below.
 
 .. hint::
 
@@ -81,6 +83,63 @@ That is all you need to run.
       e == cudaSuccess (98 vs. 0)  Error: invalid device function.
 
 
+Building with ROCm (AMD GPUs)
+-----------------------------
+
+k2 can also be built for AMD GPUs with ROCm/HIP. The ``.cu`` sources are
+compiled as HIP and the GPU primitives are provided by hipCUB and rocThrust.
+
+.. hint::
+
+  This builds the Python ``_k2`` extension module and the C++ gtest suite (the
+  FSA core that `icefall <https://github.com/k2-fsa/icefall>`_ and
+  `sherpa <https://github.com/k2-fsa/sherpa>`_ consume). The standalone
+  ``k2/torch`` C++ decoder layer is not yet built on the ROCm path.
+
+Before compiling, prepare the environment:
+
+  - Install ROCm (7.2 or newer) including hipCUB, rocPRIM, hipRAND and rocThrust.
+  - Install a ROCm build of PyTorch.
+  - libcu++ is not shipped by ROCm; vendor the ROCm fork
+    (``git clone --branch amd-develop https://github.com/ROCm/libhipcxx``) and
+    point ``K2_LIBHIPCXX_INCLUDE_DIR`` at its ``include`` directory.
+
+Then configure and build, selecting your GPU architecture(s) with
+``CMAKE_HIP_ARCHITECTURES`` (e.g. ``gfx90a`` for MI200, ``gfx1100`` for RDNA3;
+pass a semicolon-separated list to target several). When unset it defaults to
+``gfx90a``.
+
+.. code-block:: bash
+
+  git clone https://github.com/k2-fsa/k2.git
+  cd k2
+  mkdir build_rocm
+  cd build_rocm
+  cmake -DCMAKE_BUILD_TYPE=Release \
+        -DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF \
+        -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+        -DCMAKE_CXX_STANDARD=20 \
+        -DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include \
+        -DK2_ENABLE_TESTS=ON \
+        ..
+  make -j
+
+.. hint::
+
+  To build and install the Python package with ROCm, pass the same options
+  through ``K2_CMAKE_ARGS``:
+
+  .. code-block:: bash
+
+    export K2_CMAKE_ARGS="-DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF -DCMAKE_HIP_ARCHITECTURES=gfx90a"
+    python3 setup.py install
+
+.. hint::
+
+  Run the GPU tests on a single device, serially, by setting
+  ``HIP_VISIBLE_DEVICES`` to one GPU and running the ``cu_*_test`` executables
+  (or ``ctest``) from the build directory.
+
 To test that k2 is installed successfully, you can run:
 
 .. code-block::

diff --git a/k2/CMakeLists.txt b/k2/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(csrc)
 add_subdirectory(python)
 
-if(K2_USE_PYTORCH)
+if(K2_USE_PYTORCH AND NOT K2_WITH_HIP)
   # We use K2_TORCH_VERSION instead of TORCH_VERSION
   # since TORCH_VERSION may contain something like "+cpu", "+cu113"
   if(K2_TORCH_VERSION VERSION_GREATER_EQUAL 1.8 OR NOT K2_WITH_CUDA)
@@ -12,4 +12,11 @@ if(K2_USE_PYTORCH)
     message(WARNING "Please use at least torch 1.8.0 when CUDA \
     is enabled - skipping compiling k2/torch. Current torch version: ${TORCH_VERSION}")
   endif()
+elseif(K2_WITH_HIP)
+  # k2/torch is the standalone libtorch C++ decoder layer; it pulls in kaldifeat
+  # (itself a separate CUDA project). It is not part of the core _k2 / gtest port
+  # and is not yet built on the ROCm path. The Python _k2 module and the csrc
+  # gtest suite (the FSA core consumed by icefall/sherpa) are built and
+  # validated.
+  message(STATUS "K2_WITH_HIP: skipping k2/torch (standalone C++ decoders + kaldifeat)")
 endif()