Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 86 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,68 @@ endif()

set(CMAKE_DISABLE_FIND_PACKAGE_MKL TRUE)

# Build with HIP for AMD GPUs (ROCm). When ON, the CUDA toolchain is not used;
# the .cu sources are compiled as HIP (see the K2_WITH_HIP blocks below) and the
# two non-portable third-party deps (moderngpu, the vendored CUDPP segmented
# scan) are replaced by hipCUB/rocThrust. See k2/csrc/cuda_to_hip.h.
option(K2_WITH_HIP "Build k2 with HIP for AMD GPUs" OFF)

set(languages CXX)
set(_K2_WITH_CUDA ON)

find_program(K2_HAS_NVCC nvcc)
if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
message(STATUS "No NVCC detected. Disable CUDA support")
if(K2_WITH_HIP)
# On the HIP build the CUDA toolchain is irrelevant.
set(_K2_WITH_CUDA OFF)
endif()

if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
if(_K2_WITH_CUDA)
message(STATUS "Disable CUDA support")
else()
find_program(K2_HAS_NVCC nvcc)
if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
message(STATUS "No NVCC detected. Disable CUDA support")
set(_K2_WITH_CUDA OFF)
endif()

if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
if(_K2_WITH_CUDA)
message(STATUS "Disable CUDA support")
set(_K2_WITH_CUDA OFF)
endif()
endif()
endif()

if(_K2_WITH_CUDA)
set(languages ${languages} CUDA)
endif()

if(K2_WITH_HIP)
# Add HIP to the project() language list (mirrors how CUDA is added) so the
# compiler is enabled in the initial project() call; enabling it afterwards via
# enable_language() forces a mid-configure re-run that drops cache values. The
# project() probe then honors a user-supplied -DCMAKE_HIP_ARCHITECTURES or, when
# unset, auto-detects the host GPU; it is not pinned to any specific arch.
set(languages ${languages} HIP)
endif()

message(STATUS "Enabled languages: ${languages}")

project(k2 ${languages})

if(K2_WITH_HIP)
message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
find_package(hip REQUIRED)
find_package(hipcub REQUIRED)
find_package(rocprim REQUIRED)
find_package(hiprand REQUIRED)
# These must be found before find_package(Torch) so that Caffe2Targets.cmake
# can resolve all ROCm interface targets it imports (hiprtc::hiprtc, roc::hipblas, etc.).
find_package(hiprtc QUIET)
find_package(hipblas QUIET)
find_package(hipblaslt QUIET)
find_package(hipsolver QUIET)
find_package(hipsparse QUIET)
find_package(rocblas QUIET)
find_package(rocsolver QUIET)
find_package(hipfft QUIET)
endif()

set(K2_VERSION "1.24.4")

# ----------------- Supported build types for K2 project -----------------
Expand Down Expand Up @@ -107,7 +145,7 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

if(BUILD_SHARED_LIBS AND MSVC)
if(BUILD_SHARED_LIBS AND (MSVC OR (WIN32 AND K2_WITH_HIP)))
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

Expand Down Expand Up @@ -350,6 +388,35 @@ if(K2_WITH_CUDA)
add_definitions(-DK2_WITH_CUDA)
endif()

if(K2_WITH_HIP)
# K2_WITH_HIP/USE_HIP gate the genuinely divergent code; K2_WITH_CUDA is ALSO
# defined as a compile macro so the many `#ifdef K2_WITH_CUDA` blocks that hold
# shared GPU/driver/torch code (c10::cuda bridge, cub.h, rand.cu, eval.h CG)
# compile -- cuda_to_hip.h retargets the cuda*/curand*/cub spellings to HIP.
# The CMake OPTION K2_WITH_CUDA stays OFF (no nvcc arch gencode, no moderngpu
# fetch, NVTX auto-off). (AutoDock-GPU lesson.)
add_definitions(-DK2_WITH_HIP)
add_definitions(-DUSE_HIP)
add_definitions(-DK2_WITH_CUDA)

# Force-include the single compat header on every HIP TU so its aliases
# precede all other includes regardless of per-file include order.
string(APPEND CMAKE_HIP_FLAGS
" -include ${CMAKE_SOURCE_DIR}/k2/csrc/cuda_to_hip.h")

# Vendored ROCm/libhipcxx supplies <cuda/std/*> (ROCm ships no cuda/std), used
# by cub.h and utils_inl.h (cuda::std::plus). Point at it if present.
if(DEFINED K2_LIBHIPCXX_INCLUDE_DIR AND
EXISTS "${K2_LIBHIPCXX_INCLUDE_DIR}/cuda/std/functional")
message(STATUS "libhipcxx include dir: ${K2_LIBHIPCXX_INCLUDE_DIR}")
include_directories(SYSTEM ${K2_LIBHIPCXX_INCLUDE_DIR})
else()
message(FATAL_ERROR
"K2_WITH_HIP=ON requires ROCm/libhipcxx (provides <cuda/std/*>). Pass "
"-DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include")
endif()
endif()

if(WIN32)
add_definitions(-DNOMINMAX) # Otherwise, std::max() and std::min() won't work
endif()
Expand All @@ -376,14 +443,22 @@ if(K2_WITH_CUDA AND NOT WIN32)
message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
endif()

if(K2_WITH_HIP)
# hipcc is clang; these are clang flags (no nvcc --compiler-options wrapper).
string(APPEND CMAKE_HIP_FLAGS " -Wno-strict-overflow ")
string(APPEND CMAKE_HIP_FLAGS " -Wno-unknown-pragmas ")
string(APPEND CMAKE_HIP_FLAGS " -Wno-unused-variable ")
message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
endif()


if(NOT WIN32)
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable ")
string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow ")
endif()

if(WIN32)
# disable various warnings for MSVC
if(WIN32 AND NOT K2_WITH_HIP)
# disable various warnings for MSVC (not applicable to the clang HIP build)
# NOTE: Most of the warnings are from PyTorch C++ APIs
# 4005: macro redefinition
# 4018: signed/unsigned mismatch
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ general and extensible framework to allow further development of ASR technology.
done via the cub library, parts of which we wrap with our own convenient
interface.

The same code also builds for AMD GPUs with ROCm/HIP, where the GPU primitives
are provided by hipCUB and rocThrust. See the
[installation docs](https://k2-fsa.github.io/k2/installation/from_source.html)
for how to build the ROCm backend.

The Finite State Automaton object is then implemented as a Ragged tensor templated
on a specific data type (a struct representing an arc in the automaton).

Expand Down
31 changes: 31 additions & 0 deletions cmake/torch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,27 @@ execute_process(

message(STATUS "PyTorch version: ${TORCH_VERSION}")

if(K2_WITH_HIP)
# torch's source hipify (torch/utils/hipify) has two generations that disagree
# on the c10 device namespace. Generation 1 RENAMED the device classes, so the
# hip-spelled symbols (c10::hip::*) are the only public ones. Generation 2
# (pytorch#174087, version.py bumped 1.0.0 -> 2.0.0) STOPPED renaming: the CUDA
# spellings stay public as the masquerading API (c10::cuda::* on a ROCm build)
# while c10::hip::* survive only as thin wrappers. k2 drives the .cu through
# CMake/USE_HIP and never runs torch source-hipify, so it must detect the
# generation itself and select the matching namespace.
execute_process(
COMMAND "${PYTHON_EXECUTABLE}" -c "from packaging.version import Version; import torch.utils.hipify as h; print(1 if Version(getattr(h, '__version__', '1.0.0')) >= Version('2.0.0') else 0)"
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE K2_TORCH_HIPIFY_V2
RESULT_VARIABLE _k2_hipify_probe_rc
)
if(NOT _k2_hipify_probe_rc EQUAL 0)
set(K2_TORCH_HIPIFY_V2 0)
endif()
message(STATUS "torch hipify generation v2 (masquerading c10::cuda): ${K2_TORCH_HIPIFY_V2}")
endif()

if(K2_WITH_CUDA)
execute_process(
COMMAND "${PYTHON_EXECUTABLE}" -c "import torch; print(torch.version.cuda)"
Expand Down Expand Up @@ -105,3 +126,13 @@ if(K2_WITH_CUDA)
)
endif()

if(K2_WITH_HIP)
# On a ROCm torch the GPU target is torch_hip; clear its (and torch_cpu's)
# interface compile options for the same reason as the CUDA path above.
foreach(_t torch_hip torch_cpu)
if(TARGET ${_t})
set_property(TARGET ${_t} PROPERTY INTERFACE_COMPILE_OPTIONS "")
endif()
endforeach()
endif()

61 changes: 60 additions & 1 deletion docs/source/installation/from_source.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ Install from source

.. hint::

It supports Linux (CPU + CUDA), macOS (CPU), and Windows (CPU + CUDA).
It supports Linux (CPU + CUDA + ROCm), macOS (CPU), and Windows (CPU + CUDA
+ ROCm). For AMD GPUs via ROCm, see the "Building with ROCm (AMD GPUs)"
section below.

.. hint::

Expand Down Expand Up @@ -81,6 +83,63 @@ That is all you need to run.
e == cudaSuccess (98 vs. 0) Error: invalid device function.


Building with ROCm (AMD GPUs)
-----------------------------

k2 can also be built for AMD GPUs with ROCm/HIP. The ``.cu`` sources are
compiled as HIP and the GPU primitives are provided by hipCUB and rocThrust.

.. hint::

This builds the Python ``_k2`` extension module and the C++ gtest suite (the
FSA core that `icefall <https://github.com/k2-fsa/icefall>`_ and
`sherpa <https://github.com/k2-fsa/sherpa>`_ consume). The standalone
``k2/torch`` C++ decoder layer is not yet built on the ROCm path.

Before compiling, prepare the environment:

- Install ROCm (7.2 or newer) including hipCUB, rocPRIM, hipRAND and rocThrust.
- Install a ROCm build of PyTorch.
- libcu++ is not shipped by ROCm; vendor the ROCm fork
(``git clone --branch amd-develop https://github.com/ROCm/libhipcxx``) and
point ``K2_LIBHIPCXX_INCLUDE_DIR`` at its ``include`` directory.

Then configure and build, selecting your GPU architecture(s) with
``CMAKE_HIP_ARCHITECTURES`` (e.g. ``gfx90a`` for MI200, ``gfx1100`` for RDNA3;
pass a semicolon-separated list to target several). When unset it defaults to
``gfx90a``.

.. code-block:: bash

git clone https://github.com/k2-fsa/k2.git
cd k2
mkdir build_rocm
cd build_rocm
cmake -DCMAKE_BUILD_TYPE=Release \
-DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF \
-DCMAKE_HIP_ARCHITECTURES=gfx90a \
-DCMAKE_CXX_STANDARD=20 \
-DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include \
-DK2_ENABLE_TESTS=ON \
..
make -j

.. hint::

To build and install the Python package with ROCm, pass the same options
through ``K2_CMAKE_ARGS``:

.. code-block:: bash

export K2_CMAKE_ARGS="-DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF -DCMAKE_HIP_ARCHITECTURES=gfx90a"
python3 setup.py install

.. hint::

Run the GPU tests on a single device, serially, by setting
``HIP_VISIBLE_DEVICES`` to one GPU and running the ``cu_*_test`` executables
(or ``ctest``) from the build directory.

To test that k2 is installed successfully, you can run:

.. code-block::
Expand Down
9 changes: 8 additions & 1 deletion k2/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_subdirectory(csrc)
add_subdirectory(python)

if(K2_USE_PYTORCH)
if(K2_USE_PYTORCH AND NOT K2_WITH_HIP)
# We use K2_TORCH_VERSION instead of TORCH_VERSION
# since TORCH_VERSION may contain something like "+cpu", "+cu113"
if(K2_TORCH_VERSION VERSION_GREATER_EQUAL 1.8 OR NOT K2_WITH_CUDA)
Expand All @@ -12,4 +12,11 @@ if(K2_USE_PYTORCH)
message(WARNING "Please use at least torch 1.8.0 when CUDA \
is enabled - skipping compiling k2/torch. Current torch version: ${TORCH_VERSION}")
endif()
elseif(K2_WITH_HIP)
# k2/torch is the standalone libtorch C++ decoder layer; it pulls in kaldifeat
# (itself a separate CUDA project). It is not part of the core _k2 / gtest port
# and is not yet built on the ROCm path. The Python _k2 module and the csrc
# gtest suite (the FSA core consumed by icefall/sherpa) are built and
# validated.
message(STATUS "K2_WITH_HIP: skipping k2/torch (standalone C++ decoders + kaldifeat)")
endif()
Loading
Loading