diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d2eb41ca..6bae45b45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,30 +29,68 @@ endif()
 
 set(CMAKE_DISABLE_FIND_PACKAGE_MKL TRUE)
 
+# Build with HIP for AMD GPUs (ROCm). When ON, the CUDA toolchain is not used;
+# the .cu sources are compiled as HIP (see the K2_WITH_HIP blocks below) and the
+# two non-portable third-party deps (moderngpu, the vendored CUDPP segmented
+# scan) are replaced by hipCUB/rocThrust. See k2/csrc/cuda_to_hip.h.
+option(K2_WITH_HIP "Build k2 with HIP for AMD GPUs" OFF)
+
 set(languages CXX)
 set(_K2_WITH_CUDA ON)
 
-find_program(K2_HAS_NVCC nvcc)
-if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
-  message(STATUS "No NVCC detected. Disable CUDA support")
+if(K2_WITH_HIP)
+  # On the HIP build the CUDA toolchain is irrelevant.
   set(_K2_WITH_CUDA OFF)
-endif()
-
-if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
-  if(_K2_WITH_CUDA)
-    message(STATUS "Disable CUDA support")
+else()
+  find_program(K2_HAS_NVCC nvcc)
+  if(NOT K2_HAS_NVCC AND "$ENV{CUDACXX}" STREQUAL "")
+    message(STATUS "No NVCC detected. Disable CUDA support")
     set(_K2_WITH_CUDA OFF)
   endif()
+
+  if(APPLE OR (DEFINED K2_WITH_CUDA AND NOT K2_WITH_CUDA))
+    if(_K2_WITH_CUDA)
+      message(STATUS "Disable CUDA support")
+      set(_K2_WITH_CUDA OFF)
+    endif()
+  endif()
 endif()
 
 if(_K2_WITH_CUDA)
   set(languages ${languages} CUDA)
 endif()
 
+if(K2_WITH_HIP)
+  # Add HIP to the project() language list (mirrors how CUDA is added) so the
+  # compiler is enabled in the initial project() call; enabling it afterwards via
+  # enable_language() forces a mid-configure re-run that drops cache values. The
+  # project() probe then honors a user-supplied -DCMAKE_HIP_ARCHITECTURES or, when
+  # unset, auto-detects the host GPU; it is not pinned to any specific arch.
+  set(languages ${languages} HIP)
+endif()
+
 message(STATUS "Enabled languages: ${languages}")
 
 project(k2 ${languages})
 
+if(K2_WITH_HIP)
+  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
+  find_package(hip REQUIRED)
+  find_package(hipcub REQUIRED)
+  find_package(rocprim REQUIRED)
+  find_package(hiprand REQUIRED)
+  # These must be found before find_package(Torch) so that Caffe2Targets.cmake
+  # can resolve all ROCm interface targets it imports (hiprtc::hiprtc, roc::hipblas, etc.).
+  find_package(hiprtc QUIET)
+  find_package(hipblas QUIET)
+  find_package(hipblaslt QUIET)
+  find_package(hipsolver QUIET)
+  find_package(hipsparse QUIET)
+  find_package(rocblas QUIET)
+  find_package(rocsolver QUIET)
+  find_package(hipfft QUIET)
+endif()
+
 set(K2_VERSION "1.24.4")
 
 # ----------------- Supported build types for K2 project -----------------
@@ -107,7 +145,7 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
 
-if(BUILD_SHARED_LIBS AND MSVC)
+if(BUILD_SHARED_LIBS AND (MSVC OR (WIN32 AND K2_WITH_HIP)))
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
@@ -350,6 +388,35 @@ if(K2_WITH_CUDA)
   add_definitions(-DK2_WITH_CUDA)
 endif()
 
+if(K2_WITH_HIP)
+  # K2_WITH_HIP/USE_HIP gate the genuinely divergent code; K2_WITH_CUDA is ALSO
+  # defined as a compile macro so the many `#ifdef K2_WITH_CUDA` blocks that hold
+  # shared GPU/driver/torch code (c10::cuda bridge, cub.h, rand.cu, eval.h CG)
+  # compile -- cuda_to_hip.h retargets the cuda*/curand*/cub spellings to HIP.
+  # The CMake OPTION K2_WITH_CUDA stays OFF (no nvcc arch gencode, no moderngpu
+  # fetch, NVTX auto-off). (AutoDock-GPU lesson.)
+  add_definitions(-DK2_WITH_HIP)
+  add_definitions(-DUSE_HIP)
+  add_definitions(-DK2_WITH_CUDA)
+
+  # Force-include the single compat header on every HIP TU so its aliases
+  # precede all other includes regardless of per-file include order.
+  string(APPEND CMAKE_HIP_FLAGS
+    " -include ${CMAKE_SOURCE_DIR}/k2/csrc/cuda_to_hip.h")
+
+  # Vendored ROCm/libhipcxx supplies <cuda/std/*> (ROCm ships no cuda/std), used
+  # by cub.h and utils_inl.h (cuda::std::plus). Point at it if present.
+  if(DEFINED K2_LIBHIPCXX_INCLUDE_DIR AND
+     EXISTS "${K2_LIBHIPCXX_INCLUDE_DIR}/cuda/std/functional")
+    message(STATUS "libhipcxx include dir: ${K2_LIBHIPCXX_INCLUDE_DIR}")
+    include_directories(SYSTEM ${K2_LIBHIPCXX_INCLUDE_DIR})
+  else()
+    message(FATAL_ERROR
+      "K2_WITH_HIP=ON requires ROCm/libhipcxx (provides <cuda/std/*>). Pass "
+      "-DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include")
+  endif()
+endif()
+
 if(WIN32)
   add_definitions(-DNOMINMAX) # Otherwise, std::max() and std::min() won't work
 endif()
@@ -376,14 +443,22 @@ if(K2_WITH_CUDA AND NOT WIN32)
   message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
 endif()
 
+if(K2_WITH_HIP)
+  # hipcc is clang; these are clang flags (no nvcc --compiler-options wrapper).
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-strict-overflow ")
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-unknown-pragmas ")
+  string(APPEND CMAKE_HIP_FLAGS " -Wno-unused-variable ")
+  message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
+endif()
+
 
 if(NOT WIN32)
   string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable ")
   string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-overflow ")
 endif()
 
-if(WIN32)
-  # disable various warnings for MSVC
+if(WIN32 AND NOT K2_WITH_HIP)
+  # disable various warnings for MSVC (not applicable to the clang HIP build)
   # NOTE: Most of the warnings are from PyTorch C++ APIs
   # 4005: macro redefinition
   # 4018: signed/unsigned mismatch
diff --git a/README.md b/README.md
index 1fd10b4b8..68810cbe1 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,11 @@ general and extensible framework to allow further development of ASR technology.
  done via the cub library, parts of which we wrap with our own convenient
  interface.
 
+ The same code also builds for AMD GPUs with ROCm/HIP, where the GPU primitives
+ are provided by hipCUB and rocThrust. See the
+ [installation docs](https://k2-fsa.github.io/k2/installation/from_source.html)
+ for how to build the ROCm backend.
+
  The Finite State Automaton object is then implemented as a Ragged tensor templated
  on a specific data type (a struct representing an arc in the automaton).
 
diff --git a/cmake/torch.cmake b/cmake/torch.cmake
index ceabe388f..ec306d702 100644
--- a/cmake/torch.cmake
+++ b/cmake/torch.cmake
@@ -71,6 +71,27 @@ execute_process(
 
 message(STATUS "PyTorch version: ${TORCH_VERSION}")
 
+if(K2_WITH_HIP)
+  # torch's source hipify (torch/utils/hipify) has two generations that disagree
+  # on the c10 device namespace.  Generation 1 RENAMED the device classes, so the
+  # hip-spelled symbols (c10::hip::*) are the only public ones.  Generation 2
+  # (pytorch#174087, version.py bumped 1.0.0 -> 2.0.0) STOPPED renaming: the CUDA
+  # spellings stay public as the masquerading API (c10::cuda::* on a ROCm build)
+  # while c10::hip::* survive only as thin wrappers.  k2 drives the .cu through
+  # CMake/USE_HIP and never runs torch source-hipify, so it must detect the
+  # generation itself and select the matching namespace.
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" -c "from packaging.version import Version; import torch.utils.hipify as h; print(1 if Version(getattr(h, '__version__', '1.0.0')) >= Version('2.0.0') else 0)"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE K2_TORCH_HIPIFY_V2
+    RESULT_VARIABLE _k2_hipify_probe_rc
+  )
+  if(NOT _k2_hipify_probe_rc EQUAL 0)
+    set(K2_TORCH_HIPIFY_V2 0)
+  endif()
+  message(STATUS "torch hipify generation v2 (masquerading c10::cuda): ${K2_TORCH_HIPIFY_V2}")
+endif()
+
 if(K2_WITH_CUDA)
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c "import torch; print(torch.version.cuda)"
@@ -105,3 +126,13 @@ if(K2_WITH_CUDA)
   )
 endif()
 
+if(K2_WITH_HIP)
+  # On a ROCm torch the GPU target is torch_hip; clear its (and torch_cpu's)
+  # interface compile options for the same reason as the CUDA path above.
+  foreach(_t torch_hip torch_cpu)
+    if(TARGET ${_t})
+      set_property(TARGET ${_t} PROPERTY INTERFACE_COMPILE_OPTIONS "")
+    endif()
+  endforeach()
+endif()
+
diff --git a/docs/source/installation/from_source.rst b/docs/source/installation/from_source.rst
index 75af6bd58..3b74a7c1b 100644
--- a/docs/source/installation/from_source.rst
+++ b/docs/source/installation/from_source.rst
@@ -12,7 +12,9 @@ Install from source
 
 .. hint::
 
-    It supports Linux (CPU + CUDA), macOS (CPU), and Windows (CPU + CUDA).
+    It supports Linux (CPU + CUDA + ROCm), macOS (CPU), and Windows (CPU + CUDA
+    + ROCm). For AMD GPUs via ROCm, see the "Building with ROCm (AMD GPUs)"
+    section below.
 
 .. hint::
 
@@ -81,6 +83,63 @@ That is all you need to run.
       e == cudaSuccess (98 vs. 0)  Error: invalid device function.
 
 
+Building with ROCm (AMD GPUs)
+-----------------------------
+
+k2 can also be built for AMD GPUs with ROCm/HIP. The ``.cu`` sources are
+compiled as HIP and the GPU primitives are provided by hipCUB and rocThrust.
+
+.. hint::
+
+  This builds the Python ``_k2`` extension module and the C++ gtest suite (the
+  FSA core that `icefall <https://github.com/k2-fsa/icefall>`_ and
+  `sherpa <https://github.com/k2-fsa/sherpa>`_ consume). The standalone
+  ``k2/torch`` C++ decoder layer is not yet built on the ROCm path.
+
+Before compiling, prepare the environment:
+
+  - Install ROCm (7.2 or newer) including hipCUB, rocPRIM, hipRAND and rocThrust.
+  - Install a ROCm build of PyTorch.
+  - libcu++ is not shipped by ROCm; vendor the ROCm fork
+    (``git clone --branch amd-develop https://github.com/ROCm/libhipcxx``) and
+    point ``K2_LIBHIPCXX_INCLUDE_DIR`` at its ``include`` directory.
+
+Then configure and build, selecting your GPU architecture(s) with
+``CMAKE_HIP_ARCHITECTURES`` (e.g. ``gfx90a`` for MI200, ``gfx1100`` for RDNA3;
+pass a semicolon-separated list to target several). When unset it defaults to
+``gfx90a``.
+
+.. code-block:: bash
+
+  git clone https://github.com/k2-fsa/k2.git
+  cd k2
+  mkdir build_rocm
+  cd build_rocm
+  cmake -DCMAKE_BUILD_TYPE=Release \
+        -DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF \
+        -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+        -DCMAKE_CXX_STANDARD=20 \
+        -DK2_LIBHIPCXX_INCLUDE_DIR=/path/to/libhipcxx/include \
+        -DK2_ENABLE_TESTS=ON \
+        ..
+  make -j
+
+.. hint::
+
+  To build and install the Python package with ROCm, pass the same options
+  through ``K2_CMAKE_ARGS``:
+
+  .. code-block:: bash
+
+    export K2_CMAKE_ARGS="-DK2_WITH_HIP=ON -DK2_WITH_CUDA=OFF -DCMAKE_HIP_ARCHITECTURES=gfx90a"
+    python3 setup.py install
+
+.. hint::
+
+  Run the GPU tests on a single device, serially, by setting
+  ``HIP_VISIBLE_DEVICES`` to one GPU and running the ``cu_*_test`` executables
+  (or ``ctest``) from the build directory.
+
 To test that k2 is installed successfully, you can run:
 
 .. code-block::
diff --git a/k2/CMakeLists.txt b/k2/CMakeLists.txt
index 6a7839d0e..8b03dcf3e 100644
--- a/k2/CMakeLists.txt
+++ b/k2/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(csrc)
 add_subdirectory(python)
 
-if(K2_USE_PYTORCH)
+if(K2_USE_PYTORCH AND NOT K2_WITH_HIP)
   # We use K2_TORCH_VERSION instead of TORCH_VERSION
   # since TORCH_VERSION may contain something like "+cpu", "+cu113"
   if(K2_TORCH_VERSION VERSION_GREATER_EQUAL 1.8 OR NOT K2_WITH_CUDA)
@@ -12,4 +12,11 @@ if(K2_USE_PYTORCH)
     message(WARNING "Please use at least torch 1.8.0 when CUDA \
     is enabled - skipping compiling k2/torch. Current torch version: ${TORCH_VERSION}")
   endif()
+elseif(K2_WITH_HIP)
+  # k2/torch is the standalone libtorch C++ decoder layer; it pulls in kaldifeat
+  # (itself a separate CUDA project). It is not part of the core _k2 / gtest port
+  # and is not yet built on the ROCm path. The Python _k2 module and the csrc
+  # gtest suite (the FSA core consumed by icefall/sherpa) are built and
+  # validated.
+  message(STATUS "K2_WITH_HIP: skipping k2/torch (standalone C++ decoders + kaldifeat)")
 endif()
diff --git a/k2/csrc/CMakeLists.txt b/k2/csrc/CMakeLists.txt
index 5b0442a47..8f9c57d11 100644
--- a/k2/csrc/CMakeLists.txt
+++ b/k2/csrc/CMakeLists.txt
@@ -91,7 +91,13 @@ else()
   list(APPEND context_srcs default_context.cu)
 endif()
 
-if(NOT K2_WITH_CUDA)
+if(K2_WITH_HIP)
+  # moderngpu and the warp-synchronous CUDPP segmented scan are not portable to
+  # ROCm; drop them and add the hipCUB/rocThrust-backed replacements (same API).
+  list(REMOVE_ITEM context_srcs moderngpu_allocator.cu)
+  list(APPEND context_srcs moderngpu_allocator_hip.cu)
+  list(APPEND context_srcs cudpp/cudpp_hip.cu)
+elseif(NOT K2_WITH_CUDA)
   transform(OUTPUT_VARIABLE context_srcs SRCS ${context_srcs})
 else()
   list(APPEND context_srcs cudpp/cudpp.cu)
@@ -110,6 +116,21 @@ target_compile_definitions(context PUBLIC THRUST_NS_QUALIFIER=thrust)
 set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 set_target_properties(context PROPERTIES OUTPUT_NAME "k2context")
 
+if(K2_WITH_HIP)
+  # Compile the .cu translation units as HIP and turn on relocatable device code
+  # (k2 declares __device__ symbols in headers and defines them across .cu TUs).
+  set_source_files_properties(${context_srcs} PROPERTIES LANGUAGE HIP)
+  set_target_properties(context PROPERTIES
+    HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+    HIP_SEPARABLE_COMPILATION ON)
+  # Select the c10 device namespace by the detected torch hipify generation, not
+  # by OS.  PUBLIC so the consumers that select the same namespace (the gtest
+  # suite, the _k2 module's mutual_information TU) inherit the define.
+  if(K2_TORCH_HIPIFY_V2)
+    target_compile_definitions(context PUBLIC TORCH_HIPIFY_V2)
+  endif()
+endif()
+
 # lib deps
 if(K2_WITH_CUDA AND CUDA_VERSION VERSION_LESS 11.0)
   target_link_libraries(context PUBLIC cub)
@@ -119,6 +140,13 @@ if(K2_WITH_CUDA)
   target_link_libraries(context PUBLIC moderngpu)
 endif()
 
+if(K2_WITH_HIP)
+  # Link hip::host (NOT hip::device, whose --offload-arch propagation interferes
+  # with the per-target HIP_ARCHITECTURES). hipcub is an INTERFACE (header-only)
+  # target; hiprand provides the device RNG used by rand.cu.
+  target_link_libraries(context PUBLIC hip::host hip::hipcub hip::hiprand)
+endif()
+
 target_link_libraries(context PUBLIC fsa)
 target_link_libraries(context PUBLIC k2_log)
 if(CUDA_VERSION VERSION_LESS "12.6")
@@ -128,16 +156,13 @@ if(K2_USE_PYTORCH)
   if(NOT WIN32)
     target_link_libraries(context PUBLIC ${TORCH_LIBRARIES})
   else()
-    # see https://discuss.pytorch.org/t/nvcc-fatal-a-single-input-file-is-required-for-a-non-link-phase-when-an-outputfile-is-specified/142843/6
-    # Depending on ${TORCH_LIBRARIES} will introduce a compile time option "/bigobj",
-    # which causes the error in the above link.
-    #
-    # It would be ideal to remove /bigobj so that we can use ${TORCH_LIBRARIES}.
-    # To make life simpler, we use the following approach.
-    #
+    # On Windows (both MSVC+CUDA and clang+HIP), CMake imported shared-library
+    # targets are not reliably expanded to their import .lib paths in the HIP or
+    # Ninja link rules. Use file(GLOB) over ${TORCH_DIR}/lib/*.lib instead; this
+    # mirrors what the original MSVC+CUDA path already did for the same reason.
     message(STATUS "TORCH_DIR: ${TORCH_DIR}") # TORCH_DIR is defined in cmake/torch.cmake
-    # target_link_libraries(context PUBLIC D:/software/anaconda3/envs/py38/Lib/site-packages/torch/lib/*.lib)
-    target_link_libraries(context PUBLIC ${TORCH_DIR}/lib/*.lib)
+    file(GLOB _torch_libs "${TORCH_DIR}/lib/*.lib")
+    target_link_libraries(context PUBLIC ${_torch_libs})
     target_include_directories(context PUBLIC ${TORCH_DIR}/include)
     target_include_directories(context PUBLIC ${TORCH_DIR}/include/torch/csrc/api/include)
   endif()
@@ -151,11 +176,17 @@ target_include_directories(context PUBLIC ${PYTHON_INCLUDE_DIRS})
 
 if(K2_ENABLE_TESTS OR K2_ENABLE_BENCHMARK)
   set(test_utils_srcs test_utils.cu)
-  if(NOT K2_WITH_CUDA)
+  if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP)
     transform(OUTPUT_VARIABLE test_utils_srcs SRCS ${test_utils_srcs})
   endif()
 
   add_library(test_utils ${test_utils_srcs})
+  if(K2_WITH_HIP)
+    set_source_files_properties(${test_utils_srcs} PROPERTIES LANGUAGE HIP)
+    set_target_properties(test_utils PROPERTIES
+      HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+      HIP_SEPARABLE_COMPILATION ON)
+  endif()
   target_link_libraries(test_utils PUBLIC context gtest)
 endif()
 
@@ -194,7 +225,7 @@ if(K2_ENABLE_TESTS)
     top_sort_test.cu
     utils_test.cu
   )
-  if(NOT K2_WITH_CUDA)
+  if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP)
     transform(OUTPUT_VARIABLE cuda_test_srcs SRCS ${cuda_test_srcs})
   endif()
 
@@ -206,6 +237,14 @@ if(K2_ENABLE_TESTS)
     set(target_name "cu_${name}")
     add_executable(${target_name} "${source}")
     set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    if(K2_WITH_HIP)
+      # Mark the test .cu LANGUAGE HIP and turn on RDC, else CMake drops them from
+      # the device link (RXMesh lesson: tests would link zero device objects).
+      set_source_files_properties("${source}" PROPERTIES LANGUAGE HIP)
+      set_target_properties(${target_name} PROPERTIES
+        HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+        HIP_SEPARABLE_COMPILATION ON)
+    endif()
     target_link_libraries(${target_name}
       PRIVATE
       context
diff --git a/k2/csrc/array_ops_inl.h b/k2/csrc/array_ops_inl.h
index 94607ec2b..0c047e618 100644
--- a/k2/csrc/array_ops_inl.h
+++ b/k2/csrc/array_ops_inl.h
@@ -60,6 +60,15 @@ struct PtrPtr {
     tmp.data += n;
     return tmp;
   }
+#if defined(USE_HIP)
+  // rocPRIM's scan dereferences `input - 1` on its offset path, so it needs
+  // operator-(int) on the iterator; NVIDIA cub does not call it.
+  __host__ __device__ __forceinline__ PtrPtr operator-(int32_t n) const {
+    PtrPtr tmp(*this);
+    tmp.data -= n;
+    return tmp;
+  }
+#endif
   __host__ __device__ __forceinline__ const T &operator*() const {
     return **data;
   }
@@ -86,6 +95,14 @@ struct ConstReversedPtr {
     tmp.data -= n;
     return tmp;
   }
+#if defined(USE_HIP)
+  __host__ __device__ __forceinline__ ConstReversedPtr
+  operator-(int32_t n) const {
+    ConstReversedPtr tmp(*this);
+    tmp.data += n;  // reversed: subtracting an offset moves forward in memory
+    return tmp;
+  }
+#endif
   __host__ __device__ __forceinline__ const T &operator*() const {
     return *data;
   }
@@ -108,6 +125,13 @@ struct ReversedPtr {
     tmp.data -= n;
     return tmp;
   }
+#if defined(USE_HIP)
+  __host__ __device__ __forceinline__ ReversedPtr operator-(int32_t n) const {
+    ReversedPtr tmp(*this);
+    tmp.data += n;  // reversed: subtracting an offset moves forward in memory
+    return tmp;
+  }
+#endif
   __host__ __device__ __forceinline__ T &operator*() { return *data; }
 };
 
diff --git a/k2/csrc/cub.h b/k2/csrc/cub.h
index 02b159983..291568258 100644
--- a/k2/csrc/cub.h
+++ b/k2/csrc/cub.h
@@ -20,7 +20,17 @@
 #ifndef K2_CSRC_CUB_H_
 #define K2_CSRC_CUB_H_
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+
+// cuda_to_hip.h (force-included) already aliases `cub` -> `hipcub`.
+// <cuda/std/*> is provided by the vendored ROCm/libhipcxx on the include path,
+// so the cuda::std::plus uses in utils_inl.h resolve unchanged. nvToolsExt has
+// no ROCm equivalent and NVTX is disabled on HIP, so it is not included here.
+#include <cuda/std/functional>
+
+#include <hipcub/hipcub.hpp>
+
+#elif defined(K2_WITH_CUDA)
 
 #include <cuda/std/functional>
 
diff --git a/k2/csrc/cuda_to_hip.h b/k2/csrc/cuda_to_hip.h
new file mode 100644
index 000000000..ea32c08f1
--- /dev/null
+++ b/k2/csrc/cuda_to_hip.h
@@ -0,0 +1,128 @@
+/**
+ * Copyright (c) 2026  Advanced Micro Devices, Inc. (authors: Jeff Daily <jeff.daily@amd.com>)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The single CUDA->HIP compat header for the ROCm port. It is force-included on
+// every HIP translation unit (CMAKE_HIP_FLAGS -include .../cuda_to_hip.h), so
+// the aliases below precede all other includes regardless of file order. It is
+// the only file that knows about HIP; everywhere else keeps the CUDA spelling.
+//
+// On the NVIDIA build this header is never compiled (it is only force-included
+// under K2_WITH_HIP), so the CUDA path is byte-for-byte unchanged.
+
+#ifndef K2_CSRC_CUDA_TO_HIP_H_
+#define K2_CSRC_CUDA_TO_HIP_H_
+
+#if defined(K2_WITH_HIP)
+
+// Pull in the libc host declarations BEFORE <hip/hip_runtime.h> so that inside
+// a .cu compiled as HIP a host-side memcpy/memset resolves to the libc host
+// overload rather than HIP's __device__ overload (gpuRIR lesson).
+#include <cstdlib>
+#include <cstring>
+
+#include <hip/hip_runtime.h>  // NOLINT(build/include_order)
+
+// Note on device-vs-host dispatch under clang/HIP: k2 keys two different things
+// on __CUDA_ARCH__: (a) the host/device DECORATOR K2_CUDA_HOSTDEV, and (b)
+// intra-function `#ifdef __CUDA_ARCH__` device-intrinsic-vs-host dispatch.
+// Under clang/HIP a __host__ __device__ function is preprocessed ONCE in the
+// host pass (where __CUDA_ARCH__ is absent and a `#define` of it does not
+// take), so we canNOT emulate (a) by defining __CUDA_ARCH__ (cudaKDTree
+// lesson). Instead the decorator is unconditionally __host__ __device__ on
+// HIP (log.h) and the (b) dispatch sites use K2_DEVICE_CODE (defined in
+// macros.h), which keys on
+// __HIP_DEVICE_COMPILE__ -- correct per-pass inside a __host__ __device__ body.
+
+// ---------------------------------------------------------------------------
+// Runtime types and enums
+// ---------------------------------------------------------------------------
+#define cudaError_t hipError_t
+#define cudaError hipError
+#define cudaSuccess hipSuccess
+#define cudaErrorNotReady hipErrorNotReady
+#define cudaErrorAssert hipErrorAssert
+#define cudaErrorMemoryAllocation hipErrorOutOfMemory
+#define cudaErrorInitializationError hipErrorNotInitialized
+
+#define cudaStream_t hipStream_t
+#define cudaEvent_t hipEvent_t
+
+#define cudaDeviceProp hipDeviceProp_t
+
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+
+#define cudaEventDisableTiming hipEventDisableTiming
+
+// ---------------------------------------------------------------------------
+// Runtime API
+// ---------------------------------------------------------------------------
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipHostMalloc
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+
+#define cudaSetDevice hipSetDevice
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaRuntimeGetVersion hipRuntimeGetVersion
+#define cudaDriverGetVersion hipDriverGetVersion
+
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+
+#define cudaEventCreate hipEventCreate
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventRecord hipEventRecord
+#define cudaEventDestroy hipEventDestroy
+#define cudaEventQuery hipEventQuery
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventElapsedTime hipEventElapsedTime
+
+// ---------------------------------------------------------------------------
+// CUB: k2 uses the CUDA spelling cub::. hipcub puts its API in the (inline,
+// version-tagged, hidden-visibility) hipcub namespace, so aliasing cub ->
+// hipcub lets every existing cub::DeviceScan/DeviceReduce/... call resolve to
+// hipcub unchanged. (hipcub ignores CUB_WRAPPED_NAMESPACE; the inline namespace
+// plus hidden visibility already prevent a clash with torch's bundled hipcub.)
+// ---------------------------------------------------------------------------
+#define cub hipcub
+
+// ---------------------------------------------------------------------------
+// cuRAND device API (rand.cu): 1:1 with the hipRAND device API.
+// ---------------------------------------------------------------------------
+#define curandStatePhilox4_32_10_t hiprandStatePhilox4_32_10_t
+#define curand_init hiprand_init
+#define curand_uniform4 hiprand_uniform4
+#define curand_uniform2_double hiprand_uniform2_double
+#define curand4 hiprand4
+
+#endif  // defined(K2_WITH_HIP)
+
+#endif  // K2_CSRC_CUDA_TO_HIP_H_
diff --git a/k2/csrc/cudpp/cudpp_hip.cu b/k2/csrc/cudpp/cudpp_hip.cu
new file mode 100644
index 000000000..e86d0f867
--- /dev/null
+++ b/k2/csrc/cudpp/cudpp_hip.cu
@@ -0,0 +1,98 @@
+/**
+ * k2/csrc/cudpp/cudpp_hip.cu
+ *
+ * Copyright (c) 2026  Advanced Micro Devices, Inc. (authors: Jeff Daily <jeff.daily@amd.com>)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// HIP replacement for k2/csrc/cudpp/cudpp.cu. The vendored CUDPP segmented scan
+// is warp-synchronous (WARP_SIZE=32, two 32-lane warps per block) and so is a
+// wave64 hazard on CDNA. SegmentedExclusiveSum is exactly a per-segment
+// exclusive prefix sum, which hipCUB provides natively via
+// DeviceScan::ExclusiveScanByKey. We turn the head-flags array into a monotone
+// segment key (inclusive scan of the flags: the key increments by one at each
+// segment start, so the per-key scan resets exactly at segment boundaries) and
+// run an exclusive Sum scan keyed on it. Identical semantics, no warp
+// arithmetic, correct on wave32 and wave64.
+
+#include <hipcub/hipcub.hpp>
+
+#include "k2/csrc/array.h"
+#include "k2/csrc/context.h"
+#include "k2/csrc/cudpp/cudpp.h"
+#include "k2/csrc/log.h"
+
+namespace k2 {
+
+template <typename T>
+void SegmentedExclusiveSum(ContextPtr context, const T *d_in,
+                           int32_t num_elements, const uint32_t *d_iflags,
+                           T *d_out) {
+  if (num_elements <= 0) return;
+  hipStream_t stream = context->GetCudaStream();
+
+  // keys[i] = inclusive sum of flags[0..i]; the key changes (by +1) exactly at
+  // each segment start, which is where ExclusiveScanByKey must reset.
+  Array1<uint32_t> keys(context, num_elements);
+  uint32_t *keys_data = keys.Data();
+
+  size_t temp_bytes = 0;
+  K2_CHECK_EQ(hipcub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_iflags,
+                                               keys_data, num_elements, stream),
+              hipSuccess);
+  {
+    Array1<int8_t> d_temp(context, static_cast<int32_t>(temp_bytes));
+    K2_CHECK_EQ(
+        hipcub::DeviceScan::InclusiveSum(d_temp.Data(), temp_bytes, d_iflags,
+                                         keys_data, num_elements, stream),
+        hipSuccess);
+  }
+
+  temp_bytes = 0;
+  K2_CHECK_EQ(
+      hipcub::DeviceScan::ExclusiveScanByKey(
+          nullptr, temp_bytes, keys_data, d_in, d_out, hipcub::Sum(), T(0),
+          num_elements, hipcub::Equality(), stream),
+      hipSuccess);
+  {
+    Array1<int8_t> d_temp(context, static_cast<int32_t>(temp_bytes));
+    K2_CHECK_EQ(
+        hipcub::DeviceScan::ExclusiveScanByKey(
+            d_temp.Data(), temp_bytes, keys_data, d_in, d_out, hipcub::Sum(),
+            T(0), num_elements, hipcub::Equality(), stream),
+        hipSuccess);
+  }
+}
+
+template void SegmentedExclusiveSum<int32_t>(ContextPtr context,
+                                             const int32_t *d_in,
+                                             int32_t num_elements,
+                                             const uint32_t *d_iflags,
+                                             int32_t *d_out);
+
+template void SegmentedExclusiveSum<float>(ContextPtr context,
+                                           const float *d_in,
+                                           int32_t num_elements,
+                                           const uint32_t *d_iflags,
+                                           float *d_out);
+
+template void SegmentedExclusiveSum<double>(ContextPtr context,
+                                            const double *d_in,
+                                            int32_t num_elements,
+                                            const uint32_t *d_iflags,
+                                            double *d_out);
+
+}  // namespace k2
diff --git a/k2/csrc/eval.h b/k2/csrc/eval.h
index 449636d6a..107ad6525 100644
--- a/k2/csrc/eval.h
+++ b/k2/csrc/eval.h
@@ -28,7 +28,9 @@
 #include <type_traits>
 #include <vector>
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+#include <hip/hip_cooperative_groups.h>
+#elif defined(K2_WITH_CUDA)
 #include <cooperative_groups.h>
 #endif
 
diff --git a/k2/csrc/fsa_utils.cu b/k2/csrc/fsa_utils.cu
index 9eb6231fe..1119da332 100644
--- a/k2/csrc/fsa_utils.cu
+++ b/k2/csrc/fsa_utils.cu
@@ -18,7 +18,9 @@
  * limitations under the License.
  */
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+#include <hip/hip_cooperative_groups.h>
+#elif defined(K2_WITH_CUDA)
 #include <cooperative_groups.h>
 #endif
 
diff --git a/k2/csrc/hash.h b/k2/csrc/hash.h
index 5be08b574..ead2cacdf 100644
--- a/k2/csrc/hash.h
+++ b/k2/csrc/hash.h
@@ -41,7 +41,7 @@ unsigned long long int __forceinline__ __host__ __device__ AtomicCAS(
     unsigned long long int* address,
     unsigned long long int compare,
     unsigned long long int val) {
-#ifdef __CUDA_ARCH__
+#if K2_DEVICE_CODE
   return atomicCAS(address, compare, val);
 #else
   // For host code, we assume single-threaded for now).
diff --git a/k2/csrc/intersect.cu b/k2/csrc/intersect.cu
index 625456bc7..9b0d4b655 100644
--- a/k2/csrc/intersect.cu
+++ b/k2/csrc/intersect.cu
@@ -16,7 +16,9 @@
  * limitations under the License.
  */
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+#include <hip/hip_cooperative_groups.h>
+#elif defined(K2_WITH_CUDA)
 #include <cooperative_groups.h>
 #endif
 
diff --git a/k2/csrc/log.h b/k2/csrc/log.h
index c85cc6d40..85436568e 100644
--- a/k2/csrc/log.h
+++ b/k2/csrc/log.h
@@ -43,7 +43,17 @@
 
 #include "k2/csrc/macros.h"
 
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__)
+// Under clang/HIP a __host__ __device__ function is preprocessed once (host
+// pass) where __CUDA_ARCH__ is absent, so gating the decorator on __CUDA_ARCH__
+// would make it host-only and break device callers. Make it unconditional when
+// the HIP compiler is in use (cudaKDTree __both__ lesson); HIP's
+// __host__ __device__ is valid in both passes. Per-pass dispatch INSIDE these
+// functions uses K2_DEVICE_CODE. Keyed on __HIPCC__ (not the K2_WITH_HIP build
+// flag) so the plain-C++ TUs (e.g. the k2_log .cc built by g++, and
+// k2/csrc/host) that include this header do NOT see the HIP-only attributes.
+#define K2_CUDA_HOSTDEV __host__ __device__
+#elif defined(__CUDA_ARCH__)
 #define K2_CUDA_HOSTDEV __host__ __device__
 #else
 #define K2_CUDA_HOSTDEV
@@ -156,7 +166,7 @@ class Logger {
 
     if (cur_level_ <= level_) {
       printf("%s:%u:%s ", filename, line_num, func_name);
-#if defined(__CUDA_ARCH__)
+#if K2_DEVICE_CODE
       printf("block:[%u,%u,%u], thread: [%u,%u,%u] ", blockIdx.x, blockIdx.y,
              blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
 #endif
@@ -179,7 +189,7 @@ class Logger {
     )";
     printf("\n");
     if (level_ == FATAL) {
-#if defined(__CUDA_ARCH__)
+#if K2_DEVICE_CODE
       // this is usually caused by one of the K2_CHECK macros and the detailed
       // error messages should have already been printed by the macro, so we
       // use an arbitrary string here.
@@ -322,7 +332,7 @@ inline int64_t MaxCpuMemAllocate() {
 }
 
 inline K2_CUDA_HOSTDEV LogLevel GetCurrentLogLevel() {
-#if defined(__CUDA_ARCH__)
+#if K2_DEVICE_CODE
   return DEBUG;
 #else
   static LogLevel log_level = INFO;
diff --git a/k2/csrc/macros.h b/k2/csrc/macros.h
index 9d9c39084..8f95116bb 100644
--- a/k2/csrc/macros.h
+++ b/k2/csrc/macros.h
@@ -29,6 +29,21 @@
 #define K2_FUNC __func__
 #endif
 
+// K2_DEVICE_CODE is 1 when compiling the device path of a __host__ __device__
+// function and 0 for the host path. On CUDA that is __CUDA_ARCH__; under the
+// HIP compiler it is __HIP_DEVICE_COMPILE__ (clang/HIP does NOT define
+// __CUDA_ARCH__). Use this for device-intrinsic-vs-host-fallback dispatch
+// instead of a bare `#ifdef __CUDA_ARCH__`, which silently takes the host path
+// in HIP device code. Keyed on __HIPCC__ (the compiler), not the K2_WITH_HIP
+// build flag, so plain-C++ TUs that include this header evaluate to 0 cleanly.
+#if defined(__HIPCC__)
+#define K2_DEVICE_CODE __HIP_DEVICE_COMPILE__
+#elif defined(__CUDA_ARCH__)
+#define K2_DEVICE_CODE 1
+#else
+#define K2_DEVICE_CODE 0
+#endif
+
 /*
 `K2_EVAL` simplifies the task of writing lambdas
 for CUDA as well as for CPU.
diff --git a/k2/csrc/math.cu b/k2/csrc/math.cu
index f53b854f0..5774e9cc3 100644
--- a/k2/csrc/math.cu
+++ b/k2/csrc/math.cu
@@ -50,7 +50,8 @@ int32_t HighestBitSet(int64_t i) {
   K2_CHECK_GE(i, 0);
 #if defined(__clang__) || defined (__GNUC__)
   if (i == 0) return -1;
-  return 63 - static_cast<int32_t>(__builtin_clzl(i));
+  // Use __builtin_clzll (always 64-bit) not __builtin_clzl (32-bit on Windows)
+  return 63 - static_cast<int32_t>(__builtin_clzll(static_cast<uint64_t>(i)));
 #else
   for (int64_t j = 0; j < 64; ++j) {
     if (i < ((int64_t)1 << j)) {
diff --git a/k2/csrc/moderngpu.h b/k2/csrc/moderngpu.h
index 6b7180be3..ea21b0c9b 100644
--- a/k2/csrc/moderngpu.h
+++ b/k2/csrc/moderngpu.h
@@ -19,7 +19,11 @@
 #ifndef K2_CSRC_MODERNGPU_H_
 #define K2_CSRC_MODERNGPU_H_
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+// moderngpu is not portable to ROCm; use the HIP replacement that exposes the
+// same mgpu:: API (see moderngpu_shim.h).
+#include "k2/csrc/moderngpu_shim.h"
+#elif defined(K2_WITH_CUDA)
 #include "moderngpu/context.hxx"
 #include "moderngpu/kernel_load_balance.hxx"
 #include "moderngpu/kernel_mergesort.hxx"
diff --git a/k2/csrc/moderngpu_allocator_hip.cu b/k2/csrc/moderngpu_allocator_hip.cu
new file mode 100644
index 000000000..27f6407c1
--- /dev/null
+++ b/k2/csrc/moderngpu_allocator_hip.cu
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) 2026  Advanced Micro Devices, Inc. (authors: Jeff Daily <jeff.daily@amd.com>)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// HIP replacement for moderngpu_allocator.cu. The HIP mgpu shim
+// (moderngpu_shim.h) allocates and launches directly through the k2 Context, so
+// the moderngpu allocator subclass is unnecessary; GetModernGpuAllocator just
+// hands back a per-device shim context_t wrapping the k2 ContextPtr.
+
+#include <mutex>  // NOLINT
+#include <utility>
+
+#include "k2/csrc/context.h"
+#include "k2/csrc/moderngpu_allocator.h"
+
+namespace k2 {
+
+static mgpu::context_t *mgpu_contexts[kMaxNumGpus];
+static std::once_flag mgpu_once_flags[kMaxNumGpus];
+
+static void InitModernGpuAllocator(ContextPtr context) {
+  int32_t device_index = context->GetDeviceId();
+  K2_CHECK_GE(device_index, 0);
+  K2_CHECK_LT(device_index, kMaxNumGpus);
+  // It is never freed (same lifetime policy as the CUDA build).
+  mgpu_contexts[device_index] = new mgpu::context_t(context);
+}
+
+mgpu::context_t *GetModernGpuAllocator(ContextPtr context) {
+  K2_CHECK_EQ(context->GetDeviceType(), kCuda);
+
+  int32_t device_index = context->GetDeviceId();
+  K2_CHECK_GE(device_index, 0);
+  K2_CHECK_LT(device_index, kMaxNumGpus);
+
+  std::call_once(mgpu_once_flags[device_index], InitModernGpuAllocator,
+                 context);
+
+  return mgpu_contexts[device_index];
+}
+
+}  // namespace k2
diff --git a/k2/csrc/moderngpu_shim.h b/k2/csrc/moderngpu_shim.h
new file mode 100644
index 000000000..42d19c05f
--- /dev/null
+++ b/k2/csrc/moderngpu_shim.h
@@ -0,0 +1,398 @@
+/**
+ * Copyright (c) 2026  Advanced Micro Devices, Inc. (authors: Jeff Daily <jeff.daily@amd.com>)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// HIP replacement for the small slice of moderngpu that k2 uses. moderngpu is
+// not portable to ROCm (its intrinsics.hxx #errors under non-nvcc, hardcodes a
+// 32-lane warp, and uses inline PTX), so on the HIP build we do NOT compile
+// moderngpu at all. Instead this header provides the exact mgpu::-shaped API
+// that k2's call sites expect, backed by rocThrust and a couple of small
+// kernels, so every existing `mgpu::...` call compiles unchanged. The CUDA
+// build never sees this file (moderngpu.h includes the real moderngpu there).
+//
+// Semantics are matched to moderngpu:
+//  - mergesort / segmented_sort* are STABLE (moderngpu's are, and k2's CPU
+//    reference uses std::stable_sort; index maps must be reproducible);
+//  - the comparators k2 passes are arbitrary device callables (Arc/ArcComparer,
+//    device lambdas, LessThan/GreaterThan), which rocThrust handles directly
+//    and a radix sort could not.
+
+#ifndef K2_CSRC_MODERNGPU_SHIM_H_
+#define K2_CSRC_MODERNGPU_SHIM_H_
+
+#if !defined(K2_WITH_HIP)
+#error "moderngpu_shim.h is only for the HIP build"
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <hipcub/hipcub.hpp>
+#include <thrust/binary_search.h>  // NOLINT(build/include_order)
+#include <thrust/execution_policy.h>  // NOLINT(build/include_order)
+#include <thrust/iterator/counting_iterator.h>  // NOLINT(build/include_order)
+#include <thrust/sequence.h>  // NOLINT(build/include_order)
+#include <thrust/sort.h>  // NOLINT(build/include_order)
+
+#include "k2/csrc/context.h"
+#include "k2/csrc/log.h"
+
+namespace mgpu {
+
+// moderngpu's allocator/context base. Here it only needs to carry the k2
+// Context (for device allocate/deallocate) and the HIP stream.
+struct context_t {
+  k2::ContextPtr k2_context;
+
+  context_t() = default;
+  explicit context_t(k2::ContextPtr c) : k2_context(std::move(c)) {}
+
+  // Re-query each time so a CudaStreamOverride in effect is honored (matches
+  // how the CUDA build reads the stream).
+  hipStream_t stream() const { return k2_context->GetCudaStream(); }
+};
+
+// k2's GetModernGpuAllocator returns a `standard_context_t`-derived object on
+// CUDA; on HIP the plain context_t is enough.
+using standard_context_t = context_t;
+
+// ---- transform_lbs caching-tuple support (CatWithOffsets) -----------------
+// k2 calls transform_lbs with an optional mgpu::make_tuple(ptr...) whose values
+// are loaded per-segment and passed to the lambda as mgpu::tuple<...>.
+template <typename... Ts>
+struct tuple {
+  // Only the single-element case is exercised by k2, but keep it general.
+};
+
+template <typename T>
+struct tuple<T> {
+  T v0;
+};
+
+template <typename T>
+__host__ __device__ __forceinline__ const T &get_impl(
+    const tuple<T> &t, std::integral_constant<int, 0>) {
+  return t.v0;
+}
+
+template <int I, typename... Ts>
+__host__ __device__ __forceinline__ auto get(const tuple<Ts...> &t)
+    -> decltype(get_impl(t, std::integral_constant<int, I>())) {
+  return get_impl(t, std::integral_constant<int, I>());
+}
+
+// A tuple of device pointers; indexing it by segment yields a tuple of values.
+template <typename... Ts>
+struct ptr_tuple {};
+
+template <typename T>
+struct ptr_tuple<T> {
+  const T *p0;
+  __device__ __forceinline__ tuple<T> at(int32_t seg) const {
+    return tuple<T>{p0[seg]};
+  }
+};
+
+template <typename T>
+__host__ __forceinline__ ptr_tuple<T> make_tuple(const T *p0) {
+  return ptr_tuple<T>{p0};
+}
+
+// ---------------------------------------------------------------------------
+// Internal helpers
+// ---------------------------------------------------------------------------
+namespace shim_internal {
+
+constexpr int32_t kBlockSize = 256;
+
+template <typename T>
+inline thrust::device_ptr<T> dptr(T *p) {
+  return thrust::device_pointer_cast(p);
+}
+
+// row_ids[i] = segment of element i = upper_bound(row_splits[1..nsegs], i),
+// i.e. the moderngpu load-balance-search result. row_splits has nsegs+1
+// entries.
+inline void ComputeRowIds(context_t &ctx, int32_t count,
+                          const int32_t *row_splits, int32_t num_segments,
+                          int32_t *row_ids) {
+  if (count <= 0) return;
+  auto policy = thrust::hip::par.on(ctx.stream());
+  // ends = row_splits + 1 (the per-segment end offsets).
+  thrust::upper_bound(
+      policy, dptr(const_cast<int32_t *>(row_splits)) + 1,
+      dptr(const_cast<int32_t *>(row_splits)) + 1 + num_segments,
+      thrust::counting_iterator<int32_t>(0),
+      thrust::counting_iterator<int32_t>(count), dptr(row_ids));
+}
+
+// Grid-stride kernel that invokes the user lambda f(index, seg, rank) for the
+// plain transform_lbs; row_ids/row_splits give seg and rank.
+template <typename Lambda>
+__global__ void TransformLbsKernel(int32_t count, const int32_t *row_ids,
+                                    const int32_t *row_splits, Lambda f) {
+  for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < count;
+       index += gridDim.x * blockDim.x) {
+    int32_t seg = row_ids[index];
+    int32_t rank = index - row_splits[seg];
+    f(index, seg, rank);
+  }
+}
+
+// Same, with a per-segment cached tuple passed as the 4th lambda argument.
+template <typename Lambda, typename PtrTuple>
+__global__ void TransformLbsTupleKernel(int32_t count, const int32_t *row_ids,
+                                        const int32_t *row_splits,
+                                        PtrTuple cached, Lambda f) {
+  for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < count;
+       index += gridDim.x * blockDim.x) {
+    int32_t seg = row_ids[index];
+    int32_t rank = index - row_splits[seg];
+    f(index, seg, rank, cached.at(seg));
+  }
+}
+
+// Scratch device buffer tied to a k2 Context (used for row_ids).
+struct DeviceScratch {
+  k2::ContextPtr c;
+  void *data = nullptr;
+  void *deleter_context = nullptr;
+  DeviceScratch(k2::ContextPtr c, size_t bytes) : c(std::move(c)) {
+    data = this->c->Allocate(bytes, &deleter_context);
+  }
+  ~DeviceScratch() { c->Deallocate(data, deleter_context); }
+  template <typename T>
+  T *as() {
+    return reinterpret_cast<T *>(data);
+  }
+};
+
+// Copy the nsegs+1 segment offsets to host so we can iterate segments.
+inline std::vector<int32_t> SegmentsToHost(context_t &ctx,
+                                           const int32_t *segments,
+                                           int32_t num_segments) {
+  std::vector<int32_t> host(num_segments + 1);
+  K2_CHECK_EQ(hipMemcpyAsync(host.data(), segments,
+                             (num_segments + 1) * sizeof(int32_t),
+                             hipMemcpyDeviceToHost, ctx.stream()),
+              hipSuccess);
+  K2_CHECK_EQ(hipStreamSynchronize(ctx.stream()), hipSuccess);
+  return host;
+}
+
+}  // namespace shim_internal
+
+// ---------------------------------------------------------------------------
+// transform_lbs: f(index, seg, rank) for each index in [0, count).
+// ---------------------------------------------------------------------------
+template <typename Lambda>
+void transform_lbs(Lambda f, int32_t count, const int32_t *row_splits,
+                   int32_t num_segments, context_t &ctx) {
+  if (count <= 0) return;
+  shim_internal::DeviceScratch row_ids_buf(ctx.k2_context,
+                                           count * sizeof(int32_t));
+  int32_t *row_ids = row_ids_buf.as<int32_t>();
+  shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments, row_ids);
+
+  int32_t grid = (count + shim_internal::kBlockSize - 1) /
+                 shim_internal::kBlockSize;
+  shim_internal::TransformLbsKernel<<<grid, shim_internal::kBlockSize, 0,
+                                      ctx.stream()>>>(count, row_ids,
+                                                      row_splits, f);
+}
+
+template <typename Lambda, typename... Ts>
+void transform_lbs(Lambda f, int32_t count, const int32_t *row_splits,
+                   int32_t num_segments, ptr_tuple<Ts...> cached,
+                   context_t &ctx) {
+  if (count <= 0) return;
+  shim_internal::DeviceScratch row_ids_buf(ctx.k2_context,
+                                           count * sizeof(int32_t));
+  int32_t *row_ids = row_ids_buf.as<int32_t>();
+  shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments, row_ids);
+
+  int32_t grid = (count + shim_internal::kBlockSize - 1) /
+                 shim_internal::kBlockSize;
+  shim_internal::TransformLbsTupleKernel<<<grid, shim_internal::kBlockSize, 0,
+                                           ctx.stream()>>>(
+      count, row_ids, row_splits, cached, f);
+}
+
+// ---------------------------------------------------------------------------
+// mergesort: stable sort of keys, optionally permuting an index/value array.
+// ---------------------------------------------------------------------------
+template <typename Key, typename Comp>
+void mergesort(Key *keys, int32_t count, Comp comp, context_t &ctx) {
+  if (count <= 0) return;
+  auto policy = thrust::hip::par.on(ctx.stream());
+  thrust::stable_sort(policy, shim_internal::dptr(keys),
+                      shim_internal::dptr(keys) + count, comp);
+}
+
+template <typename Key, typename Value, typename Comp>
+void mergesort(Key *keys, Value *vals, int32_t count, Comp comp,
+               context_t &ctx) {
+  if (count <= 0) return;
+  auto policy = thrust::hip::par.on(ctx.stream());
+  thrust::stable_sort_by_key(policy, shim_internal::dptr(keys),
+                             shim_internal::dptr(keys) + count,
+                             shim_internal::dptr(vals), comp);
+}
+
+// ---------------------------------------------------------------------------
+// segmented_sort / segmented_sort_indices: per-segment stable sort. segments is
+// a device array of nsegs+1 offsets (row_splits style).
+// ---------------------------------------------------------------------------
+template <typename Key, typename Comp>
+void segmented_sort(Key *keys, int32_t count, const int32_t *segments,
+                    int32_t num_segments, Comp comp, context_t &ctx) {
+  if (count <= 0) return;
+  std::vector<int32_t> off =
+      shim_internal::SegmentsToHost(ctx, segments, num_segments);
+  auto policy = thrust::hip::par.on(ctx.stream());
+  for (int32_t s = 0; s < num_segments; ++s) {
+    int32_t begin = off[s], end = off[s + 1];
+    if (end - begin > 1)
+      thrust::stable_sort(policy, shim_internal::dptr(keys) + begin,
+                          shim_internal::dptr(keys) + end, comp);
+  }
+}
+
+template <typename Key, typename Index, typename Comp>
+void segmented_sort_indices(Key *keys, Index *indices, int32_t count,
+                            const int32_t *segments, int32_t num_segments,
+                            Comp comp, context_t &ctx) {
+  if (count <= 0) return;
+  // moderngpu's segmented_sort_indices fills `indices` with the GLOBAL identity
+  // permutation (0..count-1) and then stable-sorts each segment's slice
+  // alongside the keys, so afterwards indices[p] is the original global index
+  // of the element now at p. k2 relies on this (it does NOT pre-seed `indices`,
+  // unlike mergesort which seeds with Range()): PruneRaggedAxis1 reads
+  // order_map[idx01] as a global original index. Seed the identity here.
+  auto policy = thrust::hip::par.on(ctx.stream());
+  thrust::sequence(policy, shim_internal::dptr(indices),
+                   shim_internal::dptr(indices) + count, Index(0));
+  std::vector<int32_t> off =
+      shim_internal::SegmentsToHost(ctx, segments, num_segments);
+  for (int32_t s = 0; s < num_segments; ++s) {
+    int32_t begin = off[s], end = off[s + 1];
+    if (end - begin > 1)
+      thrust::stable_sort_by_key(policy, shim_internal::dptr(keys) + begin,
+                                 shim_internal::dptr(keys) + end,
+                                 shim_internal::dptr(indices) + begin, comp);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// load_balance_search: out_row_ids[i] = segment of element i.
+// ---------------------------------------------------------------------------
+inline void load_balance_search(int32_t count, const int32_t *row_splits,
+                                int32_t num_segments, int32_t *out_row_ids,
+                                context_t &ctx) {
+  shim_internal::ComputeRowIds(ctx, count, row_splits, num_segments,
+                               out_row_ids);
+}
+
+// bounds_lower/bounds_upper are enum constants used as a non-type template
+// argument, matching moderngpu's `sorted_search<mgpu::bounds_lower>(...)`.
+enum bounds_t { bounds_lower, bounds_upper };
+
+// moderngpu's plus_t<T>; only used as the scan op tag for transform_scan.
+template <typename T>
+struct plus_t {
+  __host__ __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+namespace shim_internal {
+template <typename T, typename Lambda>
+__global__ void MaterializeKernel(int32_t count, Lambda f, T *out) {
+  for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
+       i += gridDim.x * blockDim.x) {
+    out[i] = f(i);
+  }
+}
+}  // namespace shim_internal
+
+// transform_scan<T>(f, count, output, plus, reduction_out, ctx): exclusive sum
+// of f(i) for i in [0,count) into output[0..count-1], with the grand total
+// written to reduction_out (which k2 passes as output+count). Implemented as in
+// k2's own ExclusiveSum: materialize f(.) into count+1 elements (the last is a
+// don't-care input) and run hipcub ExclusiveSum, so output[count] == total.
+template <typename T, typename Lambda, typename Op>
+void transform_scan(Lambda f, int32_t count, T *output, Op /*op*/,
+                    T *reduction_out, context_t &ctx) {
+  if (count <= 0) {
+    if (count == 0)
+      K2_CHECK_EQ(hipMemsetAsync(reduction_out, 0, sizeof(T), ctx.stream()),
+                  hipSuccess);
+    return;
+  }
+  // values: count+1 transformed inputs (index `count` is a readable
+  // don't-care).
+  shim_internal::DeviceScratch values_buf(ctx.k2_context,
+                                          (count + 1) * sizeof(T));
+  T *values = values_buf.as<T>();
+  int32_t grid = (count + shim_internal::kBlockSize - 1) /
+                 shim_internal::kBlockSize;
+  shim_internal::MaterializeKernel<T, Lambda>
+      <<<grid, shim_internal::kBlockSize, 0, ctx.stream()>>>(count, f, values);
+
+  size_t temp_bytes = 0;
+  K2_CHECK_EQ(hipcub::DeviceScan::ExclusiveSum(nullptr, temp_bytes, values,
+                                               output, count + 1, ctx.stream()),
+              hipSuccess);
+  shim_internal::DeviceScratch temp_buf(ctx.k2_context, temp_bytes);
+  K2_CHECK_EQ(
+      hipcub::DeviceScan::ExclusiveSum(temp_buf.data, temp_bytes, values,
+                                       output, count + 1, ctx.stream()),
+      hipSuccess);
+  // output[count] now holds the total; reduction_out is output + count for k2's
+  // K2_TRANS_EXCSUM, so it is already populated, but write it explicitly in
+  // case reduction_out aliases elsewhere.
+  if (reduction_out != output + count)
+    K2_CHECK_EQ(hipMemcpyAsync(reduction_out, output + count, sizeof(T),
+                               hipMemcpyDeviceToDevice, ctx.stream()),
+                hipSuccess);
+}
+
+// sorted_search<bounds_lower>: out[i] = lower_bound(haystack, needles[i]).
+template <bounds_t Bounds, typename T, typename Comp>
+void sorted_search(const T *needles, int32_t num_needles, const T *haystack,
+                   int32_t num_haystack, int32_t *out, Comp /*comp*/,
+                   context_t &ctx) {
+  if (num_needles <= 0) return;
+  auto policy = thrust::hip::par.on(ctx.stream());
+  auto hay_begin = shim_internal::dptr(const_cast<T *>(haystack));
+  auto needle_begin = shim_internal::dptr(const_cast<T *>(needles));
+  if (Bounds == bounds_lower)
+    thrust::lower_bound(policy, hay_begin, hay_begin + num_haystack,
+                        needle_begin, needle_begin + num_needles,
+                        shim_internal::dptr(out));
+  else
+    thrust::upper_bound(policy, hay_begin, hay_begin + num_haystack,
+                        needle_begin, needle_begin + num_needles,
+                        shim_internal::dptr(out));
+}
+
+}  // namespace mgpu
+
+#endif  // K2_CSRC_MODERNGPU_SHIM_H_
diff --git a/k2/csrc/nvtx_test.cu b/k2/csrc/nvtx_test.cu
index 8a6d99ff9..c6330cb70 100644
--- a/k2/csrc/nvtx_test.cu
+++ b/k2/csrc/nvtx_test.cu
@@ -47,7 +47,8 @@
  *
  */
 
-#include <thread>  // NOLINT
+#include <chrono>   // NOLINT
+#include <thread>   // NOLINT
 
 #include "gtest/gtest.h"
 #include "k2/csrc/nvtx.h"
diff --git a/k2/csrc/pytorch_context.cu b/k2/csrc/pytorch_context.cu
index f732d9794..8a4788b3a 100644
--- a/k2/csrc/pytorch_context.cu
+++ b/k2/csrc/pytorch_context.cu
@@ -1,5 +1,6 @@
 /**
  * Copyright      2020  Mobvoi Inc.        (authors: Fangjun Kuang)
+ * Copyright (c) 2026  Advanced Micro Devices, Inc. (authors: Jeff Daily <jeff.daily@amd.com>)
  *
  * See LICENSE for clarification regarding multiple authors
  *
@@ -19,7 +20,18 @@
 #include <memory>
 #include <mutex>  // NOLINT
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+// On a ROCm torch the c10/cuda/* headers include a generated
+// cuda_cmake_macros.h that only exists as the hip variant, so they do not
+// compile; the c10/hip/* headers are correct on every torch hipify generation.
+// They define the device APIs under c10::hip (hipify v1 rename) AND under
+// c10::cuda (hipify v2 masquerading); the call sites pick the right namespace
+// via TORCH_HIPIFY_V2.
+#include "c10/hip/HIPCachingAllocator.h"
+#include "c10/hip/HIPFunctions.h"
+#include "c10/hip/HIPStream.h"
+#include "torch/cuda.h"
+#elif defined(K2_WITH_CUDA)
 #include "c10/cuda/CUDACachingAllocator.h"
 #include "c10/cuda/CUDAFunctions.h"
 #include "torch/cuda.h"
@@ -145,21 +157,35 @@ class PytorchCudaContext : public Context {
   explicit PytorchCudaContext(int32_t gpu_id) : gpu_id_(gpu_id) {
 #ifdef K2_WITH_CUDA
     K2_CHECK_GE(gpu_id, 0);
+#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2)
+    // torch hipify v1 renamed the device classes: c10::hip is the only
+    // spelling.
+    K2_CHECK_LT(gpu_id, c10::hip::device_count());
+    c10::hip::set_device(static_cast<c10::DeviceIndex>(gpu_id));
+#else
+    // hipify v2 keeps the CUDA spelling as the masquerading API; pure CUDA too.
     K2_CHECK_LT(gpu_id, c10::cuda::device_count());
-
     c10::cuda::set_device(gpu_id);
+#endif
 
     // The internals of `lazyInitCUDA` are executed only once
     // so it is fine to invoke lazyInitCUDA() multiple times.
     // The call will be inlined since it is defined in the header
     // aten/src/ATen/Context.h
-#if  K2_TORCH_VERSION_MAJOR > 2 || (K2_TORCH_VERSION_MAJOR == 2 && K2_TORCH_VERSION_MINOR >= 6)
+#if K2_TORCH_VERSION_MAJOR > 2 || \
+    (K2_TORCH_VERSION_MAJOR == 2 && K2_TORCH_VERSION_MINOR >= 6)
     at::globalContext().lazyInitDevice(torch::kCUDA);
 #else
     at::globalContext().lazyInitCUDA();
 #endif
 
+#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2)
+    // torch hipify v1 rename: c10::hip::HIPCachingAllocator.
+    allocator_ = c10::hip::HIPCachingAllocator::get();
+#else
+    // hipify v2 masquerading or pure CUDA: c10::cuda::CUDACachingAllocator.
     allocator_ = c10::cuda::CUDACachingAllocator::get();
+#endif
     K2_CHECK(allocator_->raw_deleter() != nullptr);
 #else
     K2_LOG(FATAL) << "Unreachable code.";
@@ -172,8 +198,15 @@ class PytorchCudaContext : public Context {
 
   cudaStream_t GetCudaStream() const override {
 #ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2)
+    // torch hipify v1 rename: c10::hip::getCurrentHIPStream.
+    return g_stream_override.OverrideStream(
+        c10::hip::getCurrentHIPStream(static_cast<c10::DeviceIndex>(gpu_id_)));
+#else
+    // hipify v2 masquerading or pure CUDA: c10::cuda::getCurrentCUDAStream.
     return g_stream_override.OverrideStream(
         c10::cuda::getCurrentCUDAStream(gpu_id_));
+#endif
 #else
     return cudaStream_t{};
 #endif
@@ -256,7 +289,14 @@ ContextPtr GetCudaContext(int32_t gpu_id /*= -1*/) {
 
   if (has_cuda) {
 #ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2)
+    // torch hipify v1 rename: c10::hip::current_device.
+    if (gpu_id < 0)
+      gpu_id = static_cast<int32_t>(c10::hip::current_device());
+#else
+    // hipify v2 masquerading or pure CUDA: c10::cuda::current_device.
     if (gpu_id < 0) gpu_id = c10::cuda::current_device();
+#endif
     DeviceGuard guard(gpu_id);
     return std::make_shared<PytorchCudaContext>(gpu_id);
 #else
diff --git a/k2/csrc/pytorch_context_test.cu b/k2/csrc/pytorch_context_test.cu
index 217c89f3f..4eac3d241 100644
--- a/k2/csrc/pytorch_context_test.cu
+++ b/k2/csrc/pytorch_context_test.cu
@@ -16,7 +16,22 @@
  * limitations under the License.
  */
 
+#if defined(K2_WITH_HIP)
+// The c10/cuda/* headers pull a generated cuda_cmake_macros.h that only exists
+// as the hip variant on a ROCm torch, so include the c10/hip/* header on every
+// hipify generation; the device-fn namespace is selected below.
+#include "c10/hip/HIPFunctions.h"
+#else
 #include "c10/cuda/CUDAFunctions.h"
+#endif
+
+#if defined(K2_WITH_HIP) && !defined(TORCH_HIPIFY_V2)
+// torch hipify v1 renamed the device classes; c10::hip is the only spelling.
+namespace c10_device = c10::hip;
+#else
+// hipify v2 masquerading or pure CUDA: the CUDA spelling is public.
+namespace c10_device = c10::cuda;
+#endif
 #include "gtest/gtest.h"
 #include "k2/csrc/test_utils.h"
 //
@@ -28,17 +43,17 @@ namespace k2 {
 
 // Use a separate function because there is a lambda function inside K2_EVAL().
 static void TestImpl() {
-  K2_LOG(INFO) << "Number of devices: " << c10::cuda::device_count();
+  K2_LOG(INFO) << "Number of devices: " << c10_device::device_count();
 
   // Set the default device to 1
-  c10::cuda::set_device(1);
-  EXPECT_EQ(c10::cuda::current_device(), 1);
+  c10_device::set_device(1);
+  EXPECT_EQ(c10_device::current_device(), 1);
 
   ContextPtr c = GetCudaContext(0);
   EXPECT_EQ(c->GetDeviceId(), 0);
 
   // the default device should still be 1
-  EXPECT_EQ(c10::cuda::current_device(), 1);
+  EXPECT_EQ(c10_device::current_device(), 1);
 
   Array1<int32_t> a(c, "[1 2]");
   EXPECT_EQ(a.Context()->GetDeviceId(), 0);
@@ -73,7 +88,7 @@ TEST(PyTorchContext, GetCudaContext) {
   if (!torch::cuda::is_available()) return;
 
   // skip it if there are less than two CUDA GPUs.
-  if (c10::cuda::device_count() < 2) return;
+  if (c10_device::device_count() < 2) return;
 
   TestImpl();
 }
diff --git a/k2/csrc/ragged.h b/k2/csrc/ragged.h
index d4bde0bfa..8020ea261 100644
--- a/k2/csrc/ragged.h
+++ b/k2/csrc/ragged.h
@@ -39,13 +39,14 @@ namespace k2 {
 // Note: row_splits is of size num_rows + 1 and row_ids is of size
 // num_elements.
 struct RaggedShapeLayer {
-  RaggedShapeLayer() = default;
-
-  RaggedShapeLayer(const RaggedShapeLayer &) = default;
-  RaggedShapeLayer& operator=(const RaggedShapeLayer &) = default;
-
-  RaggedShapeLayer(RaggedShapeLayer &&) = default;
-  RaggedShapeLayer& operator=(RaggedShapeLayer &&) = default;
+  // NOTE: deliberately no user-declared constructors/assignment. k2
+  // brace-initializes this as an aggregate, e.g. RaggedShapeLayer{splits, ids,
+  // tot}. In C++20 a user-DECLARED (even = default) constructor disqualifies a
+  // type from aggregate initialization (the rule tightened from "user-provided"
+  // in C++17), and the ROCm torch 2.13 headers force a C++20 build, so the
+  // previously-defaulted copy/move/default members are left implicit (which are
+  // identical here -- all members are trivially copyable). The CUDA build is
+  // unaffected: the implicit members match the removed defaulted ones.
 
   // Search for "row_splits concept" in utils.h for explanation.  row_splits
   // is required; it must always be nonempty for a RaggedShapeLayer to be valid.
diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu
index 5b73cdda7..77761fe08 100644
--- a/k2/csrc/ragged_ops.cu
+++ b/k2/csrc/ragged_ops.cu
@@ -2530,9 +2530,13 @@ struct HashOutputIterator {  // outputs just the index of the pair.
     return HashOutputIteratorDeref<T>(t_);
   }
   __host__ __device__ __forceinline__ HashOutputIterator
-  operator+(size_t offset) {
+  operator+(size_t offset) const {
     return HashOutputIterator{t_ + offset};
   }
+  __host__ __device__ __forceinline__ HashOutputIterator
+  operator-(size_t offset) const {
+    return HashOutputIterator{t_ - offset};
+  }
   __host__ __device__ __forceinline__ HashOutputIterator &operator+=(
       size_t offset) {
     t_ += offset;
diff --git a/k2/csrc/ragged_ops_inl.h b/k2/csrc/ragged_ops_inl.h
index 6f2a38c71..3b2f1425b 100644
--- a/k2/csrc/ragged_ops_inl.h
+++ b/k2/csrc/ragged_ops_inl.h
@@ -593,9 +593,17 @@ struct PairOutputIterator {  // outputs just the index of the pair.
     return PairOutputIteratorDeref<T>(i_);
   }
   __host__ __device__ __forceinline__ PairOutputIterator
-  operator+(int32_t offset) {
+  operator+(int32_t offset) const {
     return PairOutputIterator{i_ + offset};
   }
+  __host__ __device__ __forceinline__ PairOutputIterator
+  operator+(size_t offset) const {
+    return PairOutputIterator{i_ + offset};
+  }
+  __host__ __device__ __forceinline__ PairOutputIterator
+  operator-(int32_t offset) const {
+    return PairOutputIterator{i_ - offset};
+  }
   __host__ __device__ __forceinline__ PairOutputIterator &operator+=(
       int32_t offset) {
     i_ += offset;
diff --git a/k2/csrc/rand.cu b/k2/csrc/rand.cu
index 426355baa..750ede335 100644
--- a/k2/csrc/rand.cu
+++ b/k2/csrc/rand.cu
@@ -20,7 +20,9 @@
 #include <random>
 #include <type_traits>
 
-#ifdef K2_WITH_CUDA
+#if defined(K2_WITH_HIP)
+#include <hiprand/hiprand_kernel.h>  // NOLINT
+#elif defined(K2_WITH_CUDA)
 #include "curand.h"         // NOLINT
 #include "curand_kernel.h"  // NOLINT
 #endif
diff --git a/k2/csrc/utils.h b/k2/csrc/utils.h
index 434748c01..c9e01bbc7 100644
--- a/k2/csrc/utils.h
+++ b/k2/csrc/utils.h
@@ -338,7 +338,7 @@ __host__ __device__ __forceinline__ float IntAsFloat(int32_t i) {
    is an error if it becomes less than zero).
 */
 __host__ __device__ __forceinline__ bool AtomicDecAndCompareZero(int32_t *i) {
-#ifdef __CUDA_ARCH__
+#if K2_DEVICE_CODE
   int32_t old = atomicAdd(i, -1);
   K2_CHECK_GT(old, 0);
   return old == 1;
@@ -362,7 +362,7 @@ __host__ __device__ __forceinline__ bool AtomicDecAndCompareZero(int32_t *i) {
  */
 template <typename T>
 __host__ __device__ __forceinline__ void AtomicAdd(T *address, T value) {
-#ifdef __CUDA_ARCH__
+#if K2_DEVICE_CODE
   atomicAdd(address, value);
 #else
   // For host code, we assume single-threaded for now).
@@ -376,7 +376,15 @@ __host__ __device__ __forceinline__ void AtomicAdd(T *address, T value) {
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 __host__ __device__ __forceinline__ void AtomicAdd(double *address,
                                                      double value) {
-#if __CUDA_ARCH__ >= 600
+#if defined(K2_WITH_HIP)
+#if K2_DEVICE_CODE
+  // gfx9 (CDNA) has native fp64 atomicAdd.
+  atomicAdd(address, value);
+#else
+  // For host code, we assume single-threaded for now.
+  *address += value;
+#endif
+#elif __CUDA_ARCH__ >= 600
   atomicAdd(address, value);
 #elif defined(__CUDA_ARCH__)
   // clang-format off
@@ -418,7 +426,7 @@ __host__ __device__ __forceinline__ float OrderedIntToFloat(int32_t i) {
  */
 __host__ __device__ __forceinline__ int32_t AtomicMax(int32_t *address,
                                                       int32_t val) {
-#if defined(__CUDA_ARCH__)
+#if K2_DEVICE_CODE
   return atomicMax(address, val);
 #else
   int32_t old = *address;
diff --git a/k2/python/csrc/CMakeLists.txt b/k2/python/csrc/CMakeLists.txt
index 19e34fc0c..8682bfe69 100644
--- a/k2/python/csrc/CMakeLists.txt
+++ b/k2/python/csrc/CMakeLists.txt
@@ -16,7 +16,7 @@ else()
   message(FATAL_ERROR "Please select a framework.")
 endif()
 
-if(NOT K2_WITH_CUDA)
+if(NOT K2_WITH_CUDA AND NOT K2_WITH_HIP)
   transform(OUTPUT_VARIABLE k2_srcs SRCS ${k2_srcs})
 endif()
 
@@ -29,7 +29,19 @@ if(WIN32)
   set_property(TARGET pybind11::windows_extras PROPERTY INTERFACE_COMPILE_OPTIONS "")
 endif()
 
-pybind11_add_module(_k2 ${k2_srcs})
+if(K2_WITH_HIP)
+  # NO_EXTRAS disables pybind11's LTO/-flto: under the HIP toolchain LTO leaves the
+  # module as slim bitcode with no PyInit__k2 (ImportError), since the HIP link does
+  # not finalize LTO (gpuRIR lesson).
+  pybind11_add_module(_k2 NO_EXTRAS ${k2_srcs})
+  set_source_files_properties(${k2_srcs} PROPERTIES LANGUAGE HIP)
+  set_target_properties(_k2 PROPERTIES
+    HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+    HIP_SEPARABLE_COMPILATION ON
+    INTERPROCEDURAL_OPTIMIZATION OFF)
+else()
+  pybind11_add_module(_k2 ${k2_srcs})
+endif()
 target_link_libraries(_k2 PRIVATE context)
 target_link_libraries(_k2 PRIVATE fsa)
 
diff --git a/k2/python/csrc/torch/CMakeLists.txt b/k2/python/csrc/torch/CMakeLists.txt
index 8c6803f8a..5c64507d9 100644
--- a/k2/python/csrc/torch/CMakeLists.txt
+++ b/k2/python/csrc/torch/CMakeLists.txt
@@ -21,7 +21,7 @@ set(torch_srcs
   v2/ragged_shape.cu
 )
 
-if (K2_WITH_CUDA)
+if (K2_WITH_CUDA OR K2_WITH_HIP)
   list(APPEND torch_srcs mutual_information_cuda.cu)
 endif()
 
diff --git a/k2/python/csrc/torch/mutual_information_cuda.cu b/k2/python/csrc/torch/mutual_information_cuda.cu
index 84e60871e..d9069fb10 100644
--- a/k2/python/csrc/torch/mutual_information_cuda.cu
+++ b/k2/python/csrc/torch/mutual_information_cuda.cu
@@ -18,8 +18,15 @@
  * limitations under the License.
  */
 
+#if defined(K2_WITH_HIP)
+// c10/cuda/* does not compile on a ROCm torch (generated cuda_cmake_macros.h is
+// hip-only); the c10/hip/* stream header is correct on every hipify generation.
+#include <c10/hip/HIPStream.h>
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <c10/cuda/CUDAStream.h>  // for getCurrentCUDAStream()
 #include <cooperative_groups.h>
+#endif
 
 #include "k2/csrc/utils.h"  // for LogAdd
 #include "k2/python/csrc/torch/mutual_information.h"
diff --git a/k2/python/csrc/torch/ragged.cu b/k2/python/csrc/torch/ragged.cu
index 31ea395b8..f4b25e0d0 100644
--- a/k2/python/csrc/torch/ragged.cu
+++ b/k2/python/csrc/torch/ragged.cu
@@ -162,7 +162,11 @@ static void PybindRaggedTpl(py::module &m, const char *name) {
   });
 
   pyclass.def(py::pickle(
-      [](const PyClass &obj) {
+      [](const PyClass &obj) -> py::tuple {
+        // Explicit return type: clang/C++20 deduces a lambda's return type from
+        // all return statements and rejects the two py::make_tuple() calls
+        // below (different arities) as "different" deduced types; nvcc/gcc
+        // accept it.
         DeviceGuard guard(obj.Context());
         K2_CHECK(obj.NumAxes() == 2 || obj.NumAxes() == 3)
             << "Only support Ragged with NumAxes() == 2 or 3 for now, given "