diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d2eb41ca..6bae45b45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,30 +29,68 @@ endif() set(CMAKE_DISABLE_FIND_PACKAGE_MKL TRUE) +# Build with HIP for AMD GPUs (ROCm). When ON, the CUDA toolchain is not used; +# the .cu sources are compiled as HIP (see the K2_WITH_HIP blocks below) and the +# two non-portable third-party deps (moderngpu, the vendored CUDPP segmented +# scan) are replaced by hipCUB/rocThrust. See k2/csrc/cuda_to_hip.h. +option(K2_WITH_HIP "Build k2 with HIP for AMD GPUs" OFF) + set(languages CXX) set(_K2_WITH_CUDA ON) -find_program(K2_HAS_NVCC nvcc) -if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "") - message(STATUS "No NVCC detected. Disable CUDA support") +if(K2_WITH_HIP) + # On the HIP build the CUDA toolchain is irrelevant. set(_K2_WITH_CUDA OFF) -endif() - -if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA)) - if(_K2_WITH_CUDA) - message(STATUS "Disable CUDA support") +else() + find_program(K2_HAS_NVCC nvcc) + if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "") + message(STATUS "No NVCC detected. Disable CUDA support") set(_K2_WITH_CUDA OFF) endif() + + if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA)) + if(_K2_WITH_CUDA) + message(STATUS "Disable CUDA support") + set(_K2_WITH_CUDA OFF) + endif() + endif() endif() if(_K2_WITH_CUDA) set(languages ${languages} CUDA) endif() +if(K2_WITH_HIP) + # Add HIP to the project() language list (mirrors how CUDA is added) so the + # compiler is enabled in the initial project() call; enabling it afterwards via + # enable_language() forces a mid-configure re-run that drops cache values. The + # project() probe then honors a user-supplied -DCMAKE_HIP_ARCHITECTURES or, when + # unset, auto-detects the host GPU; it is not pinned to any specific arch. + set(languages ${languages} HIP) +endif() + message(STATUS "Enabled languages: ${languages}") project(k2 ${languages}) +if(K2_WITH_HIP) + message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") + find_package(hip REQUIRED) + find_package(hipcub REQUIRED) + find_package(rocprim REQUIRED) + find_package(hiprand REQUIRED) + # These must be found before find_package(Torch) so that Caffe2Targets.cmake + # can resolve all ROCm interface targets it imports (hiprtc::hiprtc, roc::hipblas, etc.). + find_package(hiprtc QUIET) + find_package(hipblas QUIET) + find_package(hipblaslt QUIET) + find_package(hipsolver QUIET) + find_package(hipsparse QUIET) + find_package(rocblas QUIET) + find_package(rocsolver QUIET) + find_package(hipfft QUIET) +endif() + set(K2_VERSION "1.24.4") # ----------------- Supported build types for K2 project ----------------- @@ -107,7 +145,7 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") -if(BUILD_SHARED_LIBS AND MSVC) +if(BUILD_SHARED_LIBS AND (MSVC OR (WIN32 AND K2_WITH_HIP))) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() @@ -350,6 +388,35 @@ if(K2_WITH_CUDA) add_definitions(-DK2_WITH_CUDA) endif() +if(K2_WITH_HIP) + # K2_WITH_HIP/USE_HIP gate the genuinely divergent code; K2_WITH_CUDA is ALSO + # defined as a compile macro so the many `#ifdef K2_WITH_CUDA` blocks that hold + # shared GPU/driver/torch code (c10::cuda bridge, cub.h, rand.cu, eval.h CG) + # compile -- cuda_to_hip.h retargets the cuda*/curand*/cub spellings to HIP. + # The CMake OPTION K2_WITH_CUDA stays OFF (no nvcc arch gencode, no moderngpu + # fetch, NVTX auto-off). (AutoDock-GPU lesson.) + add_definitions(-DK2_WITH_HIP) + add_definitions(-DUSE_HIP) + add_definitions(-DK2_WITH_CUDA) + + # Force-include the single compat header on every HIP TU so its aliases + # precede all other includes regardless of per-file include order. + string(APPEND CMAKE_HIP_FLAGS + " -include ${CMAKE_SOURCE_DIR}/k2/csrc/cuda_to_hip.h") + + # Vendored ROCm/libhipcxx supplies (ROCm ships no cuda/std), used + # by cub.h and utils_inl.h (cuda::std::plus). Point at it if present. + if(DEFINED K2_LIBHIPCXX_INCLUDE_DIR AND + EXISTS "${K2_LIBHIPCXX_INCLUDE_DIR}/cuda/std/functional") + message(STATUS "libhipcxx include dir: ${K2_LIBHIPCXX_INCLUDE_DIR}") + include_directories(SYSTEM ${K2_LIBHIPCXX_INCLUDE_DIR}) + else() + message(FATAL_ERROR + "K2_WITH_HIP=ON requires ROCm/libhipcxx (provides ). Pass " + "-DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include") + endif() +endif() + if(WIN32) add_definitions(-DNOMINMAX) # Otherwise, std::max() and std::min() won't work endif() @@ -376,14 +443,22 @@ if(K2_WITH_CUDA AND NOT WIN32) message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") endif() +if(K2_WITH_HIP) + # hipcc is clang; these are clang flags (no nvcc --compiler-options wrapper). + string(APPEND CMAKE_HIP_FLAGS " -Wno-strict-overflow ") + string(APPEND CMAKE_HIP_FLAGS " -Wno-unknown-pragmas ") + string(APPEND CMAKE_HIP_FLAGS " -Wno-unused-variable ") + message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}") +endif() + if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable ") string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow ") endif() -if(WIN32) - # disable various warnings for MSVC +if(WIN32 AND NOT K2_WITH_HIP) + # disable various warnings for MSVC (not applicable to the clang HIP build) # NOTE: Most of the warnings are from PyTorch C++ APIs # 4005: macro redefinition # 4018: signed/unsigned mismatch diff --git a/README.md b/README.md index 1fd10b4b8..68810cbe1 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,11 @@ general and extensible framework to allow further development of ASR technology. done via the cub library, parts of which we wrap with our own convenient interface. + The same code also builds for AMD GPUs with ROCm/HIP, where the GPU primitives + are provided by hipCUB and rocThrust. See the + [installation docs](https://k2-fsa.github.io/k2/installation/from_source.html) + for how to build the ROCm backend. + The Finite State Automaton object is then implemented as a Ragged tensor templated on a specific data type (a struct representing an arc in the automaton). diff --git a/cmake/torch.cmake b/cmake/torch.cmake index ceabe388f..ec306d702 100644 --- a/cmake/torch.cmake +++ b/cmake/torch.cmake @@ -71,6 +71,27 @@ execute_process( message(STATUS "PyTorch version: ${TORCH_VERSION}") +if(K2_WITH_HIP) + # torch's source hipify (torch/utils/hipify) has two generations that disagree + # on the c10 device namespace. Generation 1 RENAMED the device classes, so the + # hip-spelled symbols (c10::hip::*) are the only public ones. Generation 2 + # (pytorch#174087, version.py bumped 1.0.0 -> 2.0.0) STOPPED renaming: the CUDA + # spellings stay public as the masquerading API (c10::cuda::* on a ROCm build) + # while c10::hip::* survive only as thin wrappers. k2 drives the .cu through + # CMake/USE_HIP and never runs torch source-hipify, so it must detect the + # generation itself and select the matching namespace. + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "from packaging.version import Version; import torch.utils.hipify as h; print(1 if Version(getattr(h, '__version__', '1.0.0')) >= Version('2.0.0') else 0)" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE K2_TORCH_HIPIFY_V2 + RESULT_VARIABLE _k2_hipify_probe_rc + ) + if(NOT _k2_hipify_probe_rc EQUAL 0) + set(K2_TORCH_HIPIFY_V2 0) + endif() + message(STATUS "torch hipify generation v2 (masquerading c10::cuda): ${K2_TORCH_HIPIFY_V2}") +endif() + if(K2_WITH_CUDA) execute_process( COMMAND "${PYTHON_EXECUTABLE}" -c "import torch; print(torch.version.cuda)" @@ -105,3 +126,13 @@ if(K2_WITH_CUDA) ) endif() +if(K2_WITH_HIP) + # On a ROCm torch the GPU target is torch_hip; clear its (and torch_cpu's) + # interface compile options for the same reason as the CUDA path above. + foreach(_t torch_hip torch_cpu) + if(TARGET ${_t}) + set_property(TARGET ${_t} PROPERTY INTERFACE_COMPILE_OPTIONS "") + endif() + endforeach() +endif() + diff --git a/docs/source/installation/from_source.rst b/docs/source/installation/from_source.rst index 75af6bd58..3b74a7c1b 100644 --- a/docs/source/installation/from_source.rst +++ b/docs/source/installation/from_source.rst @@ -12,7 +12,9 @@ Install from source .. hint:: - It supports Linux (CPU + CUDA), macOS (CPU), and Windows (CPU + CUDA). + It supports Linux (CPU + CUDA + ROCm), macOS (CPU), and Windows (CPU + CUDA + + ROCm). For AMD GPUs via ROCm, see the "Building with ROCm (AMD GPUs)" + section below. .. hint:: @@ -81,6 +83,63 @@ That is all you need to run. e == cudaSuccess (98 vs. 0) Error: invalid device function. +Building with ROCm (AMD GPUs) +----------------------------- + +k2 can also be built for AMD GPUs with ROCm/HIP. The ``.cu`` sources are +compiled as HIP and the GPU primitives are provided by hipCUB and rocThrust. + +.. hint:: + + This builds the Python ``_k2`` extension module and the C++ gtest suite (the + FSA core that `icefall `_ and + `sherpa `_ consume). The standalone + ``k2/torch`` C++ decoder layer is not yet built on the ROCm path. + +Before compiling, prepare the environment: + + - Install ROCm (7.2 or newer) including hipCUB, rocPRIM, hipRAND and rocThrust. + - Install a ROCm build of PyTorch. + - libcu++ is not shipped by ROCm; vendor the ROCm fork + (``git clone --branch amd-develop https://github.com/ROCm/libhipcxx``) and + point ``K2_LIBHIPCXX_INCLUDE_DIR`` at its ``include`` directory. + +Then configure and build, selecting your GPU architecture(s) with +``CMAKE_HIP_ARCHITECTURES`` (e.g. ``gfx90a`` for MI200, ``gfx1100`` for RDNA3; +pass a semicolon-separated list to target several). When unset it defaults to +``gfx90a``. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/k2.git + cd k2 + mkdir build_rocm + cd build_rocm + cmake -DCMAKE_BUILD_TYPE=Release \ + -DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF \ + -DCMAKE_HIP_ARCHITECTURES=gfx90a \ + -DCMAKE_CXX_STANDARD=20 \ + -DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include \ + -DK2_ENABLE_TESTS=ON \ + .. + make -j + +.. hint:: + + To build and install the Python package with ROCm, pass the same options + through ``K2_CMAKE_ARGS``: + + .. code-block:: bash + + export K2_CMAKE_ARGS="-DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF -DCMAKE_HIP_ARCHITECTURES=gfx90a" + python3 setup.py install + +.. hint:: + + Run the GPU tests on a single device, serially, by setting + ``HIP_VISIBLE_DEVICES`` to one GPU and running the ``cu_*_test`` executables + (or ``ctest``) from the build directory. + To test that k2 is installed successfully, you can run: .. code-block:: diff --git a/k2/CMakeLists.txt b/k2/CMakeLists.txt index 6a7839d0e..8b03dcf3e 100644 --- a/k2/CMakeLists.txt +++ b/k2/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(csrc) add_subdirectory(python) -if(K2_USE_PYTORCH) +if(K2_USE_PYTORCH AND NOT K2_WITH_HIP) # We use K2_TORCH_VERSION instead of TORCH_VERSION # since TORCH_VERSION may contain something like "+cpu", "+cu113" if(K2_TORCH_VERSION VERSION_GREATER_EQUAL 1.8 OR NOT K2_WITH_CUDA) @@ -12,4 +12,11 @@ if(K2_USE_PYTORCH) message(WARNING "Please use at least torch 1.8.0 when CUDA \ is enabled - skipping compiling k2/torch. Current torch version: ${TORCH_VERSION}") endif() +elseif(K2_WITH_HIP) + # k2/torch is the standalone libtorch C++ decoder layer; it pulls in kaldifeat + # (itself a separate CUDA project). It is not part of the core _k2 / gtest port + # and is not yet built on the ROCm path. The Python _k2 module and the csrc + # gtest suite (the FSA core consumed by icefall/sherpa) are built and + # validated. + message(STATUS "K2_WITH_HIP: skipping k2/torch (standalone C++ decoders + kaldifeat)") endif() diff --git a/k2/csrc/CMakeLists.txt b/k2/csrc/CMakeLists.txt index 5b0442a47..8f9c57d11 100644 --- a/k2/csrc/CMakeLists.txt +++ b/k2/csrc/CMakeLists.txt @@ -91,7 +91,13 @@ else() list(APPEND context_srcs default_context.cu) endif() -if(NOT K2_WITH_CUDA) +if(K2_WITH_HIP) + # moderngpu and the warp-synchronous CUDPP segmented scan are not portable to + # ROCm; drop them and add the hipCUB/rocThrust-backed replacements (same API). + list(REMOVE_ITEM context_srcs moderngpu_allocator.cu) + list(APPEND context_srcs moderngpu_allocator_hip.cu) + list(APPEND context_srcs cudpp/cudpp_hip.cu) +elseif(NOT K2_WITH_CUDA) transform(OUTPUT_VARIABLE context_srcs SRCS ${context_srcs}) else() list(APPEND context_srcs cudpp/cudpp.cu) @@ -110,6 +116,21 @@ target_compile_definitions(context PUBLIC THRUST_NS_QUALIFIER=thrust) set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(context PROPERTIES OUTPUT_NAME "k2context") +if(K2_WITH_HIP) + # Compile the .cu translation units as HIP and turn on relocatable device code + # (k2 declares __device__ symbols in headers and defines them across .cu TUs). + set_source_files_properties(${context_srcs} PROPERTIES LANGUAGE HIP) + set_target_properties(context PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + HIP_SEPARABLE_COMPILATION ON) + # Select the c10 device namespace by the detected torch hipify generation, not + # by OS. PUBLIC so the consumers that select the same namespace (the gtest + # suite, the _k2 module's mutual_information TU) inherit the define. + if(K2_TORCH_HIPIFY_V2) + target_compile_definitions(context PUBLIC TORCH_HIPIFY_V2) + endif() +endif() + # lib deps if(K2_WITH_CUDA AND CUDA_VERSION VERSION_LESS 11.0) target_link_libraries(context PUBLIC cub) @@ -119,6 +140,13 @@ if(K2_WITH_CUDA) target_link_libraries(context PUBLIC moderngpu) endif() +if(K2_WITH_HIP) + # Link hip::host (NOT hip::device, whose --offload-arch propagation interferes + # with the per-target HIP_ARCHITECTURES). hipcub is an INTERFACE (header-only) + # target; hiprand provides the device RNG used by rand.cu. + target_link_libraries(context PUBLIC hip::host hip::hipcub hip::hiprand) +endif() + target_link_libraries(context PUBLIC fsa) target_link_libraries(context PUBLIC k2_log) if(CUDA_VERSION VERSION_LESS "12.6") @@ -128,16 +156,13 @@ if(K2_USE_PYTORCH) if(NOT WIN32) target_link_libraries(context PUBLIC ${TORCH_LIBRARIES}) else() - # see https://discuss.pytorch.org/t/nvcc-fatal-a-single-input-file-is-required-for-a-non-link-phase-when-an-outputfile-is-specified/142843/6 - # Depending on ${TORCH_LIBRARIES} will introduce a compile time option "/bigobj", - # which causes the error in the above link. - # - # It would be ideal to remove /bigobj so that we can use ${TORCH_LIBRARIES}. - # To make life simpler, we use the following approach. - # + # On Windows (both MSVC+CUDA and clang+HIP), CMake imported shared-library + # targets are not reliably expanded to their import .lib paths in the HIP or + # Ninja link rules. Use file(GLOB) over ${TORCH_DIR}/lib/*.lib instead; this + # mirrors what the original MSVC+CUDA path already did for the same reason. message(STATUS "TORCH_DIR: ${TORCH_DIR}") # TORCH_DIR is defined in cmake/torch.cmake - # target_link_libraries(context PUBLIC D:/software/anaconda3/envs/py38/Lib/site-packages/torch/lib/*.lib) - target_link_libraries(context PUBLIC ${TORCH_DIR}/lib/*.lib) + file(GLOB _torch_libs "${TORCH_DIR}/lib/*.lib") + target_link_libraries(context PUBLIC ${_torch_libs}) target_include_directories(context PUBLIC ${TORCH_DIR}/include) target_include_directories(context PUBLIC ${TORCH_DIR}/include/torch/csrc/api/include) endif() @@ -151,11 +176,17 @@ target_include_directories(context PUBLIC ${PYTHON_INCLUDE_DIRS}) if(K2_ENABLE_TESTS OR K2_ENABLE_BENCHMARK) set(test_utils_srcs test_utils.cu) - if(NOT K2_WITH_CUDA) + if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP) transform(OUTPUT_VARIABLE test_utils_srcs SRCS ${test_utils_srcs}) endif() add_library(test_utils ${test_utils_srcs}) + if(K2_WITH_HIP) + set_source_files_properties(${test_utils_srcs} PROPERTIES LANGUAGE HIP) + set_target_properties(test_utils PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + HIP_SEPARABLE_COMPILATION ON) + endif() target_link_libraries(test_utils PUBLIC context gtest) endif() @@ -194,7 +225,7 @@ if(K2_ENABLE_TESTS) top_sort_test.cu utils_test.cu ) - if(NOT K2_WITH_CUDA) + if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP) transform(OUTPUT_VARIABLE cuda_test_srcs SRCS ${cuda_test_srcs}) endif() @@ -206,6 +237,14 @@ if(K2_ENABLE_TESTS) set(target_name "cu_${name}") add_executable(${target_name} "${source}") set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + if(K2_WITH_HIP) + # Mark the test .cu LANGUAGE HIP and turn on RDC, else CMake drops them from + # the device link (RXMesh lesson: tests would link zero device objects). + set_source_files_properties("${source}" PROPERTIES LANGUAGE HIP) + set_target_properties(${target_name} PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + HIP_SEPARABLE_COMPILATION ON) + endif() target_link_libraries(${target_name} PRIVATE context diff --git a/k2/csrc/array_ops_inl.h b/k2/csrc/array_ops_inl.h index 94607ec2b..0c047e618 100644 --- a/k2/csrc/array_ops_inl.h +++ b/k2/csrc/array_ops_inl.h @@ -60,6 +60,15 @@ struct PtrPtr { tmp.data += n; return tmp; } +#if defined(USE_HIP) + // rocPRIM's scan dereferences `input - 1` on its offset path, so it needs + // operator-(int) on the iterator; NVIDIA cub does not call it. + __host__ __device__ __forceinline__ PtrPtr operator-(int32_t n) const { + PtrPtr tmp(*this); + tmp.data -= n; + return tmp; + } +#endif __host__ __device__ __forceinline__ const T &operator*() const { return **data; } @@ -86,6 +95,14 @@ struct ConstReversedPtr { tmp.data -= n; return tmp; } +#if defined(USE_HIP) + __host__ __device__ __forceinline__ ConstReversedPtr + operator-(int32_t n) const { + ConstReversedPtr tmp(*this); + tmp.data += n; // reversed: subtracting an offset moves forward in memory + return tmp; + } +#endif __host__ __device__ __forceinline__ const T &operator*() const { return *data; } @@ -108,6 +125,13 @@ struct ReversedPtr { tmp.data -= n; return tmp; } +#if defined(USE_HIP) + __host__ __device__ __forceinline__ ReversedPtr operator-(int32_t n) const { + ReversedPtr tmp(*this); + tmp.data += n; // reversed: subtracting an offset moves forward in memory + return tmp; + } +#endif __host__ __device__ __forceinline__ T &operator*() { return *data; } }; diff --git a/k2/csrc/cub.h b/k2/csrc/cub.h index 02b159983..291568258 100644 --- a/k2/csrc/cub.h +++ b/k2/csrc/cub.h @@ -20,7 +20,17 @@ #ifndef K2_CSRC_CUB_H_ #define K2_CSRC_CUB_H_ -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) + +// cuda_to_hip.h (force-included) already aliases `cub` -> `hipcub`. +// is provided by the vendored ROCm/libhipcxx on the include path, +// so the cuda::std::plus uses in utils_inl.h resolve unchanged. nvToolsExt has +// no ROCm equivalent and NVTX is disabled on HIP, so it is not included here. +#include + +#include + +#elif defined(K2_WITH_CUDA) #include diff --git a/k2/csrc/cuda_to_hip.h b/k2/csrc/cuda_to_hip.h new file mode 100644 index 000000000..ea32c08f1 --- /dev/null +++ b/k2/csrc/cuda_to_hip.h @@ -0,0 +1,128 @@ +/** + * Copyright (c) 2026 Advanced Micro Devices, Inc. (authors: Jeff Daily ) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The single CUDA->HIP compat header for the ROCm port. It is force-included on +// every HIP translation unit (CMAKE_HIP_FLAGS -include .../cuda_to_hip.h), so +// the aliases below precede all other includes regardless of file order. It is +// the only file that knows about HIP; everywhere else keeps the CUDA spelling. +// +// On the NVIDIA build this header is never compiled (it is only force-included +// under K2_WITH_HIP), so the CUDA path is byte-for-byte unchanged. + +#ifndef K2_CSRC_CUDA_TO_HIP_H_ +#define K2_CSRC_CUDA_TO_HIP_H_ + +#if defined(K2_WITH_HIP) + +// Pull in the libc host declarations BEFORE so that inside +// a .cu compiled as HIP a host-side memcpy/memset resolves to the libc host +// overload rather than HIP's __device__ overload (gpuRIR lesson). +#include +#include + +#include // NOLINT(build/include_order) + +// Note on device-vs-host dispatch under clang/HIP: k2 keys two different things +// on __CUDA_ARCH__: (a) the host/device DECORATOR K2_CUDA_HOSTDEV, and (b) +// intra-function `#ifdef __CUDA_ARCH__` device-intrinsic-vs-host dispatch. +// Under clang/HIP a __host__ __device__ function is preprocessed ONCE in the +// host pass (where __CUDA_ARCH__ is absent and a `#define` of it does not +// take), so we canNOT emulate (a) by defining __CUDA_ARCH__ (cudaKDTree +// lesson). Instead the decorator is unconditionally __host__ __device__ on +// HIP (log.h) and the (b) dispatch sites use K2_DEVICE_CODE (defined in +// macros.h), which keys on +// __HIP_DEVICE_COMPILE__ -- correct per-pass inside a __host__ __device__ body. + +// --------------------------------------------------------------------------- +// Runtime types and enums +// --------------------------------------------------------------------------- +#define cudaError_t hipError_t +#define cudaError hipError +#define cudaSuccess hipSuccess +#define cudaErrorNotReady hipErrorNotReady +#define cudaErrorAssert hipErrorAssert +#define cudaErrorMemoryAllocation hipErrorOutOfMemory +#define cudaErrorInitializationError hipErrorNotInitialized + +#define cudaStream_t hipStream_t +#define cudaEvent_t hipEvent_t + +#define cudaDeviceProp hipDeviceProp_t + +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice + +#define cudaEventDisableTiming hipEventDisableTiming + +// --------------------------------------------------------------------------- +// Runtime API +// --------------------------------------------------------------------------- +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync + +#define cudaSetDevice hipSetDevice +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDeviceSynchronize hipDeviceSynchronize + +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaRuntimeGetVersion hipRuntimeGetVersion +#define cudaDriverGetVersion hipDriverGetVersion + +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent + +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventRecord hipEventRecord +#define cudaEventDestroy hipEventDestroy +#define cudaEventQuery hipEventQuery +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime + +// --------------------------------------------------------------------------- +// CUB: k2 uses the CUDA spelling cub::. hipcub puts its API in the (inline, +// version-tagged, hidden-visibility) hipcub namespace, so aliasing cub -> +// hipcub lets every existing cub::DeviceScan/DeviceReduce/... call resolve to +// hipcub unchanged. (hipcub ignores CUB_WRAPPED_NAMESPACE; the inline namespace +// plus hidden visibility already prevent a clash with torch's bundled hipcub.) +// --------------------------------------------------------------------------- +#define cub hipcub + +// --------------------------------------------------------------------------- +// cuRAND device API (rand.cu): 1:1 with the hipRAND device API. +// --------------------------------------------------------------------------- +#define curandStatePhilox4_32_10_t hiprandStatePhilox4_32_10_t +#define curand_init hiprand_init +#define curand_uniform4 hiprand_uniform4 +#define curand_uniform2_double hiprand_uniform2_double +#define curand4 hiprand4 + +#endif // defined(K2_WITH_HIP) + +#endif // K2_CSRC_CUDA_TO_HIP_H_ diff --git a/k2/csrc/cudpp/cudpp_hip.cu b/k2/csrc/cudpp/cudpp_hip.cu new file mode 100644 index 000000000..e86d0f867 --- /dev/null +++ b/k2/csrc/cudpp/cudpp_hip.cu @@ -0,0 +1,98 @@ +/** + * k2/csrc/cudpp/cudpp_hip.cu + * + * Copyright (c) 2026 Advanced Micro Devices, Inc. (authors: Jeff Daily ) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// HIP replacement for k2/csrc/cudpp/cudpp.cu. The vendored CUDPP segmented scan +// is warp-synchronous (WARP_SIZE=32, two 32-lane warps per block) and so is a +// wave64 hazard on CDNA. SegmentedExclusiveSum is exactly a per-segment +// exclusive prefix sum, which hipCUB provides natively via +// DeviceScan::ExclusiveScanByKey. We turn the head-flags array into a monotone +// segment key (inclusive scan of the flags: the key increments by one at each +// segment start, so the per-key scan resets exactly at segment boundaries) and +// run an exclusive Sum scan keyed on it. Identical semantics, no warp +// arithmetic, correct on wave32 and wave64. + +#include + +#include "k2/csrc/array.h" +#include "k2/csrc/context.h" +#include "k2/csrc/cudpp/cudpp.h" +#include "k2/csrc/log.h" + +namespace k2 { + +template +void SegmentedExclusiveSum(ContextPtr context, const T *d_in, + int32_t num_elements, const uint32_t *d_iflags, + T *d_out) { + if (num_elements <= 0) return; + hipStream_t stream = context->GetCudaStream(); + + // keys[i] = inclusive sum of flags[0..i]; the key changes (by +1) exactly at + // each segment start, which is where ExclusiveScanByKey must reset. + Array1 keys(context, num_elements); + uint32_t *keys_data = keys.Data(); + + size_t temp_bytes = 0; + K2_CHECK_EQ(hipcub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_iflags, + keys_data, num_elements, stream), + hipSuccess); + { + Array1 d_temp(context, static_cast(temp_bytes)); + K2_CHECK_EQ( + hipcub::DeviceScan::InclusiveSum(d_temp.Data(), temp_bytes, d_iflags, + keys_data, num_elements, stream), + hipSuccess); + } + + temp_bytes = 0; + K2_CHECK_EQ( + hipcub::DeviceScan::ExclusiveScanByKey( + nullptr, temp_bytes, keys_data, d_in, d_out, hipcub::Sum(), T(0), + num_elements, hipcub::Equality(), stream), + hipSuccess); + { + Array1 d_temp(context, static_cast(temp_bytes)); + K2_CHECK_EQ( + hipcub::DeviceScan::ExclusiveScanByKey( + d_temp.Data(), temp_bytes, keys_data, d_in, d_out, hipcub::Sum(), + T(0), num_elements, hipcub::Equality(), stream), + hipSuccess); + } +} + +template void SegmentedExclusiveSum(ContextPtr context, + const int32_t *d_in, + int32_t num_elements, + const uint32_t *d_iflags, + int32_t *d_out); + +template void SegmentedExclusiveSum(ContextPtr context, + const float *d_in, + int32_t num_elements, + const uint32_t *d_iflags, + float *d_out); + +template void SegmentedExclusiveSum(ContextPtr context, + const double *d_in, + int32_t num_elements, + const uint32_t *d_iflags, + double *d_out); + +} // namespace k2 diff --git a/k2/csrc/eval.h b/k2/csrc/eval.h index 449636d6a..107ad6525 100644 --- a/k2/csrc/eval.h +++ b/k2/csrc/eval.h @@ -28,7 +28,9 @@ #include #include -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +#include +#elif defined(K2_WITH_CUDA) #include #endif diff --git a/k2/csrc/fsa_utils.cu b/k2/csrc/fsa_utils.cu index 9eb6231fe..1119da332 100644 --- a/k2/csrc/fsa_utils.cu +++ b/k2/csrc/fsa_utils.cu @@ -18,7 +18,9 @@ * limitations under the License. */ -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +#include +#elif defined(K2_WITH_CUDA) #include #endif diff --git a/k2/csrc/hash.h b/k2/csrc/hash.h index 5be08b574..ead2cacdf 100644 --- a/k2/csrc/hash.h +++ b/k2/csrc/hash.h @@ -41,7 +41,7 @@ unsigned long long int __forceinline__ __host__ __device__ AtomicCAS( unsigned long long int* address, unsigned long long int compare, unsigned long long int val) { -#ifdef __CUDA_ARCH__ +#if K2_DEVICE_CODE return atomicCAS(address, compare, val); #else // For host code, we assume single-threaded for now). diff --git a/k2/csrc/intersect.cu b/k2/csrc/intersect.cu index 625456bc7..9b0d4b655 100644 --- a/k2/csrc/intersect.cu +++ b/k2/csrc/intersect.cu @@ -16,7 +16,9 @@ * limitations under the License. */ -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +#include +#elif defined(K2_WITH_CUDA) #include #endif diff --git a/k2/csrc/log.h b/k2/csrc/log.h index c85cc6d40..85436568e 100644 --- a/k2/csrc/log.h +++ b/k2/csrc/log.h @@ -43,7 +43,17 @@ #include "k2/csrc/macros.h" -#ifdef __CUDA_ARCH__ +#if defined(__HIPCC__) +// Under clang/HIP a __host__ __device__ function is preprocessed once (host +// pass) where __CUDA_ARCH__ is absent, so gating the decorator on __CUDA_ARCH__ +// would make it host-only and break device callers. Make it unconditional when +// the HIP compiler is in use (cudaKDTree __both__ lesson); HIP's +// __host__ __device__ is valid in both passes. Per-pass dispatch INSIDE these +// functions uses K2_DEVICE_CODE. Keyed on __HIPCC__ (not the K2_WITH_HIP build +// flag) so the plain-C++ TUs (e.g. the k2_log .cc built by g++, and +// k2/csrc/host) that include this header do NOT see the HIP-only attributes. +#define K2_CUDA_HOSTDEV __host__ __device__ +#elif defined(__CUDA_ARCH__) #define K2_CUDA_HOSTDEV __host__ __device__ #else #define K2_CUDA_HOSTDEV @@ -156,7 +166,7 @@ class Logger { if (cur_level_ <= level_) { printf("%s:%u:%s ", filename, line_num, func_name); -#if defined(__CUDA_ARCH__) +#if K2_DEVICE_CODE printf("block:[%u,%u,%u], thread: [%u,%u,%u] ", blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); #endif @@ -179,7 +189,7 @@ class Logger { )"; printf("\n"); if (level_ == FATAL) { -#if defined(__CUDA_ARCH__) +#if K2_DEVICE_CODE // this is usually caused by one of the K2_CHECK macros and the detailed // error messages should have already been printed by the macro, so we // use an arbitrary string here. @@ -322,7 +332,7 @@ inline int64_t MaxCpuMemAllocate() { } inline K2_CUDA_HOSTDEV LogLevel GetCurrentLogLevel() { -#if defined(__CUDA_ARCH__) +#if K2_DEVICE_CODE return DEBUG; #else static LogLevel log_level = INFO; diff --git a/k2/csrc/macros.h b/k2/csrc/macros.h index 9d9c39084..8f95116bb 100644 --- a/k2/csrc/macros.h +++ b/k2/csrc/macros.h @@ -29,6 +29,21 @@ #define K2_FUNC __func__ #endif +// K2_DEVICE_CODE is 1 when compiling the device path of a __host__ __device__ +// function and 0 for the host path. On CUDA that is __CUDA_ARCH__; under the +// HIP compiler it is __HIP_DEVICE_COMPILE__ (clang/HIP does NOT define +// __CUDA_ARCH__). Use this for device-intrinsic-vs-host-fallback dispatch +// instead of a bare `#ifdef __CUDA_ARCH__`, which silently takes the host path +// in HIP device code. Keyed on __HIPCC__ (the compiler), not the K2_WITH_HIP +// build flag, so plain-C++ TUs that include this header evaluate to 0 cleanly. +#if defined(__HIPCC__) +#define K2_DEVICE_CODE __HIP_DEVICE_COMPILE__ +#elif defined(__CUDA_ARCH__) +#define K2_DEVICE_CODE 1 +#else +#define K2_DEVICE_CODE 0 +#endif + /* `K2_EVAL` simplifies the task of writing lambdas for CUDA as well as for CPU. diff --git a/k2/csrc/math.cu b/k2/csrc/math.cu index f53b854f0..5774e9cc3 100644 --- a/k2/csrc/math.cu +++ b/k2/csrc/math.cu @@ -50,7 +50,8 @@ int32_t HighestBitSet(int64_t i) { K2_CHECK_GE(i, 0); #if defined(__clang__) || defined (__GNUC__) if (i == 0) return -1; - return 63 - static_cast(__builtin_clzl(i)); + // Use __builtin_clzll (always 64-bit) not __builtin_clzl (32-bit on Windows) + return 63 - static_cast(__builtin_clzll(static_cast(i))); #else for (int64_t j = 0; j < 64; ++j) { if (i < ((int64_t)1 << j)) { diff --git a/k2/csrc/moderngpu.h b/k2/csrc/moderngpu.h index 6b7180be3..ea21b0c9b 100644 --- a/k2/csrc/moderngpu.h +++ b/k2/csrc/moderngpu.h @@ -19,7 +19,11 @@ #ifndef K2_CSRC_MODERNGPU_H_ #define K2_CSRC_MODERNGPU_H_ -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +// moderngpu is not portable to ROCm; use the HIP replacement that exposes the +// same mgpu:: API (see moderngpu_shim.h). +#include "k2/csrc/moderngpu_shim.h" +#elif defined(K2_WITH_CUDA) #include "moderngpu/context.hxx" #include "moderngpu/kernel_load_balance.hxx" #include "moderngpu/kernel_mergesort.hxx" diff --git a/k2/csrc/moderngpu_allocator_hip.cu b/k2/csrc/moderngpu_allocator_hip.cu new file mode 100644 index 000000000..27f6407c1 --- /dev/null +++ b/k2/csrc/moderngpu_allocator_hip.cu @@ -0,0 +1,56 @@ +/** + * Copyright (c) 2026 Advanced Micro Devices, Inc. (authors: Jeff Daily ) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// HIP replacement for moderngpu_allocator.cu. The HIP mgpu shim +// (moderngpu_shim.h) allocates and launches directly through the k2 Context, so +// the moderngpu allocator subclass is unnecessary; GetModernGpuAllocator just +// hands back a per-device shim context_t wrapping the k2 ContextPtr. + +#include // NOLINT +#include + +#include "k2/csrc/context.h" +#include "k2/csrc/moderngpu_allocator.h" + +namespace k2 { + +static mgpu::context_t *mgpu_contexts[kMaxNumGpus]; +static std::once_flag mgpu_once_flags[kMaxNumGpus]; + +static void InitModernGpuAllocator(ContextPtr context) { + int32_t device_index = context->GetDeviceId(); + K2_CHECK_GE(device_index, 0); + K2_CHECK_LT(device_index, kMaxNumGpus); + // It is never freed (same lifetime policy as the CUDA build). + mgpu_contexts[device_index] = new mgpu::context_t(context); +} + +mgpu::context_t *GetModernGpuAllocator(ContextPtr context) { + K2_CHECK_EQ(context->GetDeviceType(), kCuda); + + int32_t device_index = context->GetDeviceId(); + K2_CHECK_GE(device_index, 0); + K2_CHECK_LT(device_index, kMaxNumGpus); + + std::call_once(mgpu_once_flags[device_index], InitModernGpuAllocator, + context); + + return mgpu_contexts[device_index]; +} + +} // namespace k2 diff --git a/k2/csrc/moderngpu_shim.h b/k2/csrc/moderngpu_shim.h new file mode 100644 index 000000000..42d19c05f --- /dev/null +++ b/k2/csrc/moderngpu_shim.h @@ -0,0 +1,398 @@ +/** + * Copyright (c) 2026 Advanced Micro Devices, Inc. (authors: Jeff Daily ) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// HIP replacement for the small slice of moderngpu that k2 uses. moderngpu is +// not portable to ROCm (its intrinsics.hxx #errors under non-nvcc, hardcodes a +// 32-lane warp, and uses inline PTX), so on the HIP build we do NOT compile +// moderngpu at all. Instead this header provides the exact mgpu::-shaped API +// that k2's call sites expect, backed by rocThrust and a couple of small +// kernels, so every existing `mgpu::...` call compiles unchanged. The CUDA +// build never sees this file (moderngpu.h includes the real moderngpu there). +// +// Semantics are matched to moderngpu: +// - mergesort / segmented_sort* are STABLE (moderngpu's are, and k2's CPU +// reference uses std::stable_sort; index maps must be reproducible); +// - the comparators k2 passes are arbitrary device callables (Arc/ArcComparer, +// device lambdas, LessThan/GreaterThan), which rocThrust handles directly +// and a radix sort could not. + +#ifndef K2_CSRC_MODERNGPU_SHIM_H_ +#define K2_CSRC_MODERNGPU_SHIM_H_ + +#if !defined(K2_WITH_HIP) +#error "moderngpu_shim.h is only for the HIP build" +#endif + +#include +#include +#include +#include +#include + +#include +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) +#include // NOLINT(build/include_order) + +#include "k2/csrc/context.h" +#include "k2/csrc/log.h" + +namespace mgpu { + +// moderngpu's allocator/context base. Here it only needs to carry the k2 +// Context (for device allocate/deallocate) and the HIP stream. +struct context_t { + k2::ContextPtr k2_context; + + context_t() = default; + explicit context_t(k2::ContextPtr c) : k2_context(std::move(c)) {} + + // Re-query each time so a CudaStreamOverride in effect is honored (matches + // how the CUDA build reads the stream). + hipStream_t stream() const { return k2_context->GetCudaStream(); } +}; + +// k2's GetModernGpuAllocator returns a `standard_context_t`-derived object on +// CUDA; on HIP the plain context_t is enough. +using standard_context_t = context_t; + +// ---- transform_lbs caching-tuple support (CatWithOffsets) ----------------- +// k2 calls transform_lbs with an optional mgpu::make_tuple(ptr...) whose values +// are loaded per-segment and passed to the lambda as mgpu::tuple<...>. +template +struct tuple { + // Only the single-element case is exercised by k2, but keep it general. +}; + +template +struct tuple { + T v0; +}; + +template +__host__ __device__ __forceinline__ const T &get_impl( + const tuple &t, std::integral_constant) { + return t.v0; +} + +template +__host__ __device__ __forceinline__ auto get(const tuple &t) + -> decltype(get_impl(t, std::integral_constant())) { + return get_impl(t, std::integral_constant()); +} + +// A tuple of device pointers; indexing it by segment yields a tuple of values. +template +struct ptr_tuple {}; + +template +struct ptr_tuple { + const T *p0; + __device__ __forceinline__ tuple at(int32_t seg) const { + return tuple{p0[seg]}; + } +}; + +template +__host__ __forceinline__ ptr_tuple make_tuple(const T *p0) { + return ptr_tuple{p0}; +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- +namespace shim_internal { + +constexpr int32_t kBlockSize = 256; + +template +inline thrust::device_ptr dptr(T *p) { + return thrust::device_pointer_cast(p); +} + +// row_ids[i] = segment of element i = upper_bound(row_splits[1..nsegs], i), +// i.e. the moderngpu load-balance-search result. row_splits has nsegs+1 +// entries. +inline void ComputeRowIds(context_t &ctx, int32_t count, + const int32_t *row_splits, int32_t num_segments, + int32_t *row_ids) { + if (count <= 0) return; + auto policy = thrust::hip::par.on(ctx.stream()); + // ends = row_splits + 1 (the per-segment end offsets). + thrust::upper_bound( + policy, dptr(const_cast(row_splits)) + 1, + dptr(const_cast(row_splits)) + 1 + num_segments, + thrust::counting_iterator(0), + thrust::counting_iterator(count), dptr(row_ids)); +} + +// Grid-stride kernel that invokes the user lambda f(index, seg, rank) for the +// plain transform_lbs; row_ids/row_splits give seg and rank. +template +__global__ void TransformLbsKernel(int32_t count, const int32_t *row_ids, + const int32_t *row_splits, Lambda f) { + for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < count; + index += gridDim.x * blockDim.x) { + int32_t seg = row_ids[index]; + int32_t rank = index - row_splits[seg]; + f(index, seg, rank); + } +} + +// Same, with a per-segment cached tuple passed as the 4th lambda argument. +template +__global__ void TransformLbsTupleKernel(int32_t count, const int32_t *row_ids, + const int32_t *row_splits, + PtrTuple cached, Lambda f) { + for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < count; + index += gridDim.x * blockDim.x) { + int32_t seg = row_ids[index]; + int32_t rank = index - row_splits[seg]; + f(index, seg, rank, cached.at(seg)); + } +} + +// Scratch device buffer tied to a k2 Context (used for row_ids). +struct DeviceScratch { + k2::ContextPtr c; + void *data = nullptr; + void *deleter_context = nullptr; + DeviceScratch(k2::ContextPtr c, size_t bytes) : c(std::move(c)) { + data = this->c->Allocate(bytes, &deleter_context); + } + ~DeviceScratch() { c->Deallocate(data, deleter_context); } + template + T *as() { + return reinterpret_cast(data); + } +}; + +// Copy the nsegs+1 segment offsets to host so we can iterate segments. +inline std::vector SegmentsToHost(context_t &ctx, + const int32_t *segments, + int32_t num_segments) { + std::vector host(num_segments + 1); + K2_CHECK_EQ(hipMemcpyAsync(host.data(), segments, + (num_segments + 1) * sizeof(int32_t), + hipMemcpyDeviceToHost, ctx.stream()), + hipSuccess); + K2_CHECK_EQ(hipStreamSynchronize(ctx.stream()), hipSuccess); + return host; +} + +} // namespace shim_internal + +// --------------------------------------------------------------------------- +// transform_lbs: f(index, seg, rank) for each index in [0, count). +// --------------------------------------------------------------------------- +template +void transform_lbs(Lambda f, int32_t count, const int32_t *row_splits, + int32_t num_segments, context_t &ctx) { + if (count <= 0) return; + shim_internal::DeviceScratch row_ids_buf(ctx.k2_context, + count * sizeof(int32_t)); + int32_t *row_ids = row_ids_buf.as(); + shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments, row_ids); + + int32_t grid = (count + shim_internal::kBlockSize - 1) / + shim_internal::kBlockSize; + shim_internal::TransformLbsKernel<<>>(count, row_ids, + row_splits, f); +} + +template +void transform_lbs(Lambda f, int32_t count, const int32_t *row_splits, + int32_t num_segments, ptr_tuple cached, + context_t &ctx) { + if (count <= 0) return; + shim_internal::DeviceScratch row_ids_buf(ctx.k2_context, + count * sizeof(int32_t)); + int32_t *row_ids = row_ids_buf.as(); + shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments, row_ids); + + int32_t grid = (count + shim_internal::kBlockSize - 1) / + shim_internal::kBlockSize; + shim_internal::TransformLbsTupleKernel<<>>( + count, row_ids, row_splits, cached, f); +} + +// --------------------------------------------------------------------------- +// mergesort: stable sort of keys, optionally permuting an index/value array. +// --------------------------------------------------------------------------- +template +void mergesort(Key *keys, int32_t count, Comp comp, context_t &ctx) { + if (count <= 0) return; + auto policy = thrust::hip::par.on(ctx.stream()); + thrust::stable_sort(policy, shim_internal::dptr(keys), + shim_internal::dptr(keys) + count, comp); +} + +template +void mergesort(Key *keys, Value *vals, int32_t count, Comp comp, + context_t &ctx) { + if (count <= 0) return; + auto policy = thrust::hip::par.on(ctx.stream()); + thrust::stable_sort_by_key(policy, shim_internal::dptr(keys), + shim_internal::dptr(keys) + count, + shim_internal::dptr(vals), comp); +} + +// --------------------------------------------------------------------------- +// segmented_sort / segmented_sort_indices: per-segment stable sort. segments is +// a device array of nsegs+1 offsets (row_splits style). +// --------------------------------------------------------------------------- +template +void segmented_sort(Key *keys, int32_t count, const int32_t *segments, + int32_t num_segments, Comp comp, context_t &ctx) { + if (count <= 0) return; + std::vector off = + shim_internal::SegmentsToHost(ctx, segments, num_segments); + auto policy = thrust::hip::par.on(ctx.stream()); + for (int32_t s = 0; s < num_segments; ++s) { + int32_t begin = off[s], end = off[s + 1]; + if (end - begin > 1) + thrust::stable_sort(policy, shim_internal::dptr(keys) + begin, + shim_internal::dptr(keys) + end, comp); + } +} + +template +void segmented_sort_indices(Key *keys, Index *indices, int32_t count, + const int32_t *segments, int32_t num_segments, + Comp comp, context_t &ctx) { + if (count <= 0) return; + // moderngpu's segmented_sort_indices fills `indices` with the GLOBAL identity + // permutation (0..count-1) and then stable-sorts each segment's slice + // alongside the keys, so afterwards indices[p] is the original global index + // of the element now at p. k2 relies on this (it does NOT pre-seed `indices`, + // unlike mergesort which seeds with Range()): PruneRaggedAxis1 reads + // order_map[idx01] as a global original index. Seed the identity here. + auto policy = thrust::hip::par.on(ctx.stream()); + thrust::sequence(policy, shim_internal::dptr(indices), + shim_internal::dptr(indices) + count, Index(0)); + std::vector off = + shim_internal::SegmentsToHost(ctx, segments, num_segments); + for (int32_t s = 0; s < num_segments; ++s) { + int32_t begin = off[s], end = off[s + 1]; + if (end - begin > 1) + thrust::stable_sort_by_key(policy, shim_internal::dptr(keys) + begin, + shim_internal::dptr(keys) + end, + shim_internal::dptr(indices) + begin, comp); + } +} + +// --------------------------------------------------------------------------- +// load_balance_search: out_row_ids[i] = segment of element i. +// --------------------------------------------------------------------------- +inline void load_balance_search(int32_t count, const int32_t *row_splits, + int32_t num_segments, int32_t *out_row_ids, + context_t &ctx) { + shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments, + out_row_ids); +} + +// bounds_lower/bounds_upper are enum constants used as a non-type template +// argument, matching moderngpu's `sorted_search(...)`. +enum bounds_t { bounds_lower, bounds_upper }; + +// moderngpu's plus_t; only used as the scan op tag for transform_scan. +template +struct plus_t { + __host__ __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +namespace shim_internal { +template +__global__ void MaterializeKernel(int32_t count, Lambda f, T *out) { + for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; + i += gridDim.x * blockDim.x) { + out[i] = f(i); + } +} +} // namespace shim_internal + +// transform_scan(f, count, output, plus, reduction_out, ctx): exclusive sum +// of f(i) for i in [0,count) into output[0..count-1], with the grand total +// written to reduction_out (which k2 passes as output+count). Implemented as in +// k2's own ExclusiveSum: materialize f(.) into count+1 elements (the last is a +// don't-care input) and run hipcub ExclusiveSum, so output[count] == total. +template +void transform_scan(Lambda f, int32_t count, T *output, Op /*op*/, + T *reduction_out, context_t &ctx) { + if (count <= 0) { + if (count == 0) + K2_CHECK_EQ(hipMemsetAsync(reduction_out, 0, sizeof(T), ctx.stream()), + hipSuccess); + return; + } + // values: count+1 transformed inputs (index `count` is a readable + // don't-care). + shim_internal::DeviceScratch values_buf(ctx.k2_context, + (count + 1) * sizeof(T)); + T *values = values_buf.as(); + int32_t grid = (count + shim_internal::kBlockSize - 1) / + shim_internal::kBlockSize; + shim_internal::MaterializeKernel + <<>>(count, f, values); + + size_t temp_bytes = 0; + K2_CHECK_EQ(hipcub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, values, + output, count + 1, ctx.stream()), + hipSuccess); + shim_internal::DeviceScratch temp_buf(ctx.k2_context, temp_bytes); + K2_CHECK_EQ( + hipcub::DeviceScan::ExclusiveSum(temp_buf.data, temp_bytes, values, + output, count + 1, ctx.stream()), + hipSuccess); + // output[count] now holds the total; reduction_out is output + count for k2's + // K2_TRANS_EXCSUM, so it is already populated, but write it explicitly in + // case reduction_out aliases elsewhere. + if (reduction_out != output + count) + K2_CHECK_EQ(hipMemcpyAsync(reduction_out, output + count, sizeof(T), + hipMemcpyDeviceToDevice, ctx.stream()), + hipSuccess); +} + +// sorted_search: out[i] = lower_bound(haystack, needles[i]). +template +void sorted_search(const T *needles, int32_t num_needles, const T *haystack, + int32_t num_haystack, int32_t *out, Comp /*comp*/, + context_t &ctx) { + if (num_needles <= 0) return; + auto policy = thrust::hip::par.on(ctx.stream()); + auto hay_begin = shim_internal::dptr(const_cast(haystack)); + auto needle_begin = shim_internal::dptr(const_cast(needles)); + if (Bounds == bounds_lower) + thrust::lower_bound(policy, hay_begin, hay_begin + num_haystack, + needle_begin, needle_begin + num_needles, + shim_internal::dptr(out)); + else + thrust::upper_bound(policy, hay_begin, hay_begin + num_haystack, + needle_begin, needle_begin + num_needles, + shim_internal::dptr(out)); +} + +} // namespace mgpu + +#endif // K2_CSRC_MODERNGPU_SHIM_H_ diff --git a/k2/csrc/nvtx_test.cu b/k2/csrc/nvtx_test.cu index 8a6d99ff9..c6330cb70 100644 --- a/k2/csrc/nvtx_test.cu +++ b/k2/csrc/nvtx_test.cu @@ -47,7 +47,8 @@ * */ -#include // NOLINT +#include // NOLINT +#include // NOLINT #include "gtest/gtest.h" #include "k2/csrc/nvtx.h" diff --git a/k2/csrc/pytorch_context.cu b/k2/csrc/pytorch_context.cu index f732d9794..8a4788b3a 100644 --- a/k2/csrc/pytorch_context.cu +++ b/k2/csrc/pytorch_context.cu @@ -1,5 +1,6 @@ /** * Copyright 2020 Mobvoi Inc. (authors: Fangjun Kuang) + * Copyright (c) 2026 Advanced Micro Devices, Inc. (authors: Jeff Daily ) * * See LICENSE for clarification regarding multiple authors * @@ -19,7 +20,18 @@ #include #include // NOLINT -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +// On a ROCm torch the c10/cuda/* headers include a generated +// cuda_cmake_macros.h that only exists as the hip variant, so they do not +// compile; the c10/hip/* headers are correct on every torch hipify generation. +// They define the device APIs under c10::hip (hipify v1 rename) AND under +// c10::cuda (hipify v2 masquerading); the call sites pick the right namespace +// via TORCH_HIPIFY_V2. +#include "c10/hip/HIPCachingAllocator.h" +#include "c10/hip/HIPFunctions.h" +#include "c10/hip/HIPStream.h" +#include "torch/cuda.h" +#elif defined(K2_WITH_CUDA) #include "c10/cuda/CUDACachingAllocator.h" #include "c10/cuda/CUDAFunctions.h" #include "torch/cuda.h" @@ -145,21 +157,35 @@ class PytorchCudaContext : public Context { explicit PytorchCudaContext(int32_t gpu_id) : gpu_id_(gpu_id) { #ifdef K2_WITH_CUDA K2_CHECK_GE(gpu_id, 0); +#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2) + // torch hipify v1 renamed the device classes: c10::hip is the only + // spelling. + K2_CHECK_LT(gpu_id, c10::hip::device_count()); + c10::hip::set_device(static_cast(gpu_id)); +#else + // hipify v2 keeps the CUDA spelling as the masquerading API; pure CUDA too. K2_CHECK_LT(gpu_id, c10::cuda::device_count()); - c10::cuda::set_device(gpu_id); +#endif // The internals of `lazyInitCUDA` are executed only once // so it is fine to invoke lazyInitCUDA() multiple times. // The call will be inlined since it is defined in the header // aten/src/ATen/Context.h -#if K2_TORCH_VERSION_MAJOR > 2 || (K2_TORCH_VERSION_MAJOR == 2 && K2_TORCH_VERSION_MINOR >= 6) +#if K2_TORCH_VERSION_MAJOR > 2 || \ + (K2_TORCH_VERSION_MAJOR == 2 && K2_TORCH_VERSION_MINOR >= 6) at::globalContext().lazyInitDevice(torch::kCUDA); #else at::globalContext().lazyInitCUDA(); #endif +#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2) + // torch hipify v1 rename: c10::hip::HIPCachingAllocator. + allocator_ = c10::hip::HIPCachingAllocator::get(); +#else + // hipify v2 masquerading or pure CUDA: c10::cuda::CUDACachingAllocator. allocator_ = c10::cuda::CUDACachingAllocator::get(); +#endif K2_CHECK(allocator_->raw_deleter() != nullptr); #else K2_LOG(FATAL) << "Unreachable code."; @@ -172,8 +198,15 @@ class PytorchCudaContext : public Context { cudaStream_t GetCudaStream() const override { #ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2) + // torch hipify v1 rename: c10::hip::getCurrentHIPStream. + return g_stream_override.OverrideStream( + c10::hip::getCurrentHIPStream(static_cast(gpu_id_))); +#else + // hipify v2 masquerading or pure CUDA: c10::cuda::getCurrentCUDAStream. return g_stream_override.OverrideStream( c10::cuda::getCurrentCUDAStream(gpu_id_)); +#endif #else return cudaStream_t{}; #endif @@ -256,7 +289,14 @@ ContextPtr GetCudaContext(int32_t gpu_id /*= -1*/) { if (has_cuda) { #ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2) + // torch hipify v1 rename: c10::hip::current_device. + if (gpu_id < 0) + gpu_id = static_cast(c10::hip::current_device()); +#else + // hipify v2 masquerading or pure CUDA: c10::cuda::current_device. if (gpu_id < 0) gpu_id = c10::cuda::current_device(); +#endif DeviceGuard guard(gpu_id); return std::make_shared(gpu_id); #else diff --git a/k2/csrc/pytorch_context_test.cu b/k2/csrc/pytorch_context_test.cu index 217c89f3f..4eac3d241 100644 --- a/k2/csrc/pytorch_context_test.cu +++ b/k2/csrc/pytorch_context_test.cu @@ -16,7 +16,22 @@ * limitations under the License. */ +#if defined(K2_WITH_HIP) +// The c10/cuda/* headers pull a generated cuda_cmake_macros.h that only exists +// as the hip variant on a ROCm torch, so include the c10/hip/* header on every +// hipify generation; the device-fn namespace is selected below. +#include "c10/hip/HIPFunctions.h" +#else #include "c10/cuda/CUDAFunctions.h" +#endif + +#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2) +// torch hipify v1 renamed the device classes; c10::hip is the only spelling. +namespace c10_device = c10::hip; +#else +// hipify v2 masquerading or pure CUDA: the CUDA spelling is public. +namespace c10_device = c10::cuda; +#endif #include "gtest/gtest.h" #include "k2/csrc/test_utils.h" // @@ -28,17 +43,17 @@ namespace k2 { // Use a separate function because there is a lambda function inside K2_EVAL(). static void TestImpl() { - K2_LOG(INFO) << "Number of devices: " << c10::cuda::device_count(); + K2_LOG(INFO) << "Number of devices: " << c10_device::device_count(); // Set the default device to 1 - c10::cuda::set_device(1); - EXPECT_EQ(c10::cuda::current_device(), 1); + c10_device::set_device(1); + EXPECT_EQ(c10_device::current_device(), 1); ContextPtr c = GetCudaContext(0); EXPECT_EQ(c->GetDeviceId(), 0); // the default device should still be 1 - EXPECT_EQ(c10::cuda::current_device(), 1); + EXPECT_EQ(c10_device::current_device(), 1); Array1 a(c, "[1 2]"); EXPECT_EQ(a.Context()->GetDeviceId(), 0); @@ -73,7 +88,7 @@ TEST(PyTorchContext, GetCudaContext) { if (!torch::cuda::is_available()) return; // skip it if there are less than two CUDA GPUs. - if (c10::cuda::device_count() < 2) return; + if (c10_device::device_count() < 2) return; TestImpl(); } diff --git a/k2/csrc/ragged.h b/k2/csrc/ragged.h index d4bde0bfa..8020ea261 100644 --- a/k2/csrc/ragged.h +++ b/k2/csrc/ragged.h @@ -39,13 +39,14 @@ namespace k2 { // Note: row_splits is of size num_rows + 1 and row_ids is of size // num_elements. struct RaggedShapeLayer { - RaggedShapeLayer() = default; - - RaggedShapeLayer(const RaggedShapeLayer &) = default; - RaggedShapeLayer& operator=(const RaggedShapeLayer &) = default; - - RaggedShapeLayer(RaggedShapeLayer &&) = default; - RaggedShapeLayer& operator=(RaggedShapeLayer &&) = default; + // NOTE: deliberately no user-declared constructors/assignment. k2 + // brace-initializes this as an aggregate, e.g. RaggedShapeLayer{splits, ids, + // tot}. In C++20 a user-DECLARED (even = default) constructor disqualifies a + // type from aggregate initialization (the rule tightened from "user-provided" + // in C++17), and the ROCm torch 2.13 headers force a C++20 build, so the + // previously-defaulted copy/move/default members are left implicit (which are + // identical here -- all members are trivially copyable). The CUDA build is + // unaffected: the implicit members match the removed defaulted ones. // Search for "row_splits concept" in utils.h for explanation. row_splits // is required; it must always be nonempty for a RaggedShapeLayer to be valid. diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu index 5b73cdda7..77761fe08 100644 --- a/k2/csrc/ragged_ops.cu +++ b/k2/csrc/ragged_ops.cu @@ -2530,9 +2530,13 @@ struct HashOutputIterator { // outputs just the index of the pair. return HashOutputIteratorDeref(t_); } __host__ __device__ __forceinline__ HashOutputIterator - operator+(size_t offset) { + operator+(size_t offset) const { return HashOutputIterator{t_ + offset}; } + __host__ __device__ __forceinline__ HashOutputIterator + operator-(size_t offset) const { + return HashOutputIterator{t_ - offset}; + } __host__ __device__ __forceinline__ HashOutputIterator &operator+=( size_t offset) { t_ += offset; diff --git a/k2/csrc/ragged_ops_inl.h b/k2/csrc/ragged_ops_inl.h index 6f2a38c71..3b2f1425b 100644 --- a/k2/csrc/ragged_ops_inl.h +++ b/k2/csrc/ragged_ops_inl.h @@ -593,9 +593,17 @@ struct PairOutputIterator { // outputs just the index of the pair. return PairOutputIteratorDeref(i_); } __host__ __device__ __forceinline__ PairOutputIterator - operator+(int32_t offset) { + operator+(int32_t offset) const { return PairOutputIterator{i_ + offset}; } + __host__ __device__ __forceinline__ PairOutputIterator + operator+(size_t offset) const { + return PairOutputIterator{i_ + offset}; + } + __host__ __device__ __forceinline__ PairOutputIterator + operator-(int32_t offset) const { + return PairOutputIterator{i_ - offset}; + } __host__ __device__ __forceinline__ PairOutputIterator &operator+=( int32_t offset) { i_ += offset; diff --git a/k2/csrc/rand.cu b/k2/csrc/rand.cu index 426355baa..750ede335 100644 --- a/k2/csrc/rand.cu +++ b/k2/csrc/rand.cu @@ -20,7 +20,9 @@ #include #include -#ifdef K2_WITH_CUDA +#if defined(K2_WITH_HIP) +#include // NOLINT +#elif defined(K2_WITH_CUDA) #include "curand.h" // NOLINT #include "curand_kernel.h" // NOLINT #endif diff --git a/k2/csrc/utils.h b/k2/csrc/utils.h index 434748c01..c9e01bbc7 100644 --- a/k2/csrc/utils.h +++ b/k2/csrc/utils.h @@ -338,7 +338,7 @@ __host__ __device__ __forceinline__ float IntAsFloat(int32_t i) { is an error if it becomes less than zero). */ __host__ __device__ __forceinline__ bool AtomicDecAndCompareZero(int32_t *i) { -#ifdef __CUDA_ARCH__ +#if K2_DEVICE_CODE int32_t old = atomicAdd(i, -1); K2_CHECK_GT(old, 0); return old == 1; @@ -362,7 +362,7 @@ __host__ __device__ __forceinline__ bool AtomicDecAndCompareZero(int32_t *i) { */ template __host__ __device__ __forceinline__ void AtomicAdd(T *address, T value) { -#ifdef __CUDA_ARCH__ +#if K2_DEVICE_CODE atomicAdd(address, value); #else // For host code, we assume single-threaded for now). @@ -376,7 +376,15 @@ __host__ __device__ __forceinline__ void AtomicAdd(T *address, T value) { // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions __host__ __device__ __forceinline__ void AtomicAdd(double *address, double value) { -#if __CUDA_ARCH__ >= 600 +#if defined(K2_WITH_HIP) +#if K2_DEVICE_CODE + // gfx9 (CDNA) has native fp64 atomicAdd. + atomicAdd(address, value); +#else + // For host code, we assume single-threaded for now. + *address += value; +#endif +#elif __CUDA_ARCH__ >= 600 atomicAdd(address, value); #elif defined(__CUDA_ARCH__) // clang-format off @@ -418,7 +426,7 @@ __host__ __device__ __forceinline__ float OrderedIntToFloat(int32_t i) { */ __host__ __device__ __forceinline__ int32_t AtomicMax(int32_t *address, int32_t val) { -#if defined(__CUDA_ARCH__) +#if K2_DEVICE_CODE return atomicMax(address, val); #else int32_t old = *address; diff --git a/k2/python/csrc/CMakeLists.txt b/k2/python/csrc/CMakeLists.txt index 19e34fc0c..8682bfe69 100644 --- a/k2/python/csrc/CMakeLists.txt +++ b/k2/python/csrc/CMakeLists.txt @@ -16,7 +16,7 @@ else() message(FATAL_ERROR "Please select a framework.") endif() -if(NOT K2_WITH_CUDA) +if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP) transform(OUTPUT_VARIABLE k2_srcs SRCS ${k2_srcs}) endif() @@ -29,7 +29,19 @@ if(WIN32) set_property(TARGET pybind11::windows_extras PROPERTY INTERFACE_COMPILE_OPTIONS "") endif() -pybind11_add_module(_k2 ${k2_srcs}) +if(K2_WITH_HIP) + # NO_EXTRAS disables pybind11's LTO/-flto: under the HIP toolchain LTO leaves the + # module as slim bitcode with no PyInit__k2 (ImportError), since the HIP link does + # not finalize LTO (gpuRIR lesson). + pybind11_add_module(_k2 NO_EXTRAS ${k2_srcs}) + set_source_files_properties(${k2_srcs} PROPERTIES LANGUAGE HIP) + set_target_properties(_k2 PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + HIP_SEPARABLE_COMPILATION ON + INTERPROCEDURAL_OPTIMIZATION OFF) +else() + pybind11_add_module(_k2 ${k2_srcs}) +endif() target_link_libraries(_k2 PRIVATE context) target_link_libraries(_k2 PRIVATE fsa) diff --git a/k2/python/csrc/torch/CMakeLists.txt b/k2/python/csrc/torch/CMakeLists.txt index 8c6803f8a..5c64507d9 100644 --- a/k2/python/csrc/torch/CMakeLists.txt +++ b/k2/python/csrc/torch/CMakeLists.txt @@ -21,7 +21,7 @@ set(torch_srcs v2/ragged_shape.cu ) -if (K2_WITH_CUDA) +if (K2_WITH_CUDA OR K2_WITH_HIP) list(APPEND torch_srcs mutual_information_cuda.cu) endif() diff --git a/k2/python/csrc/torch/mutual_information_cuda.cu b/k2/python/csrc/torch/mutual_information_cuda.cu index 84e60871e..d9069fb10 100644 --- a/k2/python/csrc/torch/mutual_information_cuda.cu +++ b/k2/python/csrc/torch/mutual_information_cuda.cu @@ -18,8 +18,15 @@ * limitations under the License. */ +#if defined(K2_WITH_HIP) +// c10/cuda/* does not compile on a ROCm torch (generated cuda_cmake_macros.h is +// hip-only); the c10/hip/* stream header is correct on every hipify generation. +#include +#include +#else #include // for getCurrentCUDAStream() #include +#endif #include "k2/csrc/utils.h" // for LogAdd #include "k2/python/csrc/torch/mutual_information.h" diff --git a/k2/python/csrc/torch/ragged.cu b/k2/python/csrc/torch/ragged.cu index 31ea395b8..f4b25e0d0 100644 --- a/k2/python/csrc/torch/ragged.cu +++ b/k2/python/csrc/torch/ragged.cu @@ -162,7 +162,11 @@ static void PybindRaggedTpl(py::module &m, const char *name) { }); pyclass.def(py::pickle( - [](const PyClass &obj) { + [](const PyClass &obj) -> py::tuple { + // Explicit return type: clang/C++20 deduces a lambda's return type from + // all return statements and rejects the two py::make_tuple() calls + // below (different arities) as "different" deduced types; nvcc/gcc + // accept it. DeviceGuard guard(obj.Context()); K2_CHECK(obj.NumAxes() == 2 || obj.NumAxes() == 3) << "Only support Ragged with NumAxes() == 2 or 3 for now, given "