ml-explore · NripeshN · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
diff --git a/.github/workflows/build_rocm.yml b/.github/workflows/build_rocm.yml
@@ -0,0 +1,126 @@
+# Builds the MLX wheel with the ROCm backend, smoke-tests it on the GPU, runs
+# short mlx-lm generations under lldb, and uploads the wheel and logs.
+name: Build ROCm and Test
+
+on:
+  push:
+    branches: [ rocm-support ]
+  workflow_dispatch:
+
+# Nothing here writes to the repository, so a read-only token is enough.
+permissions:
+  contents: read
+
+# The scripts use bash-only syntax; an explicit bash shell also makes GitHub
+# run them with `-eo pipefail`.
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  build-and-test:
+    runs-on: strix-halo
+    # Cap the run so a hung GPU job cannot hold the runner for the default
+    # 360 minutes.
+    timeout-minutes: 180
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      run: |
+        uv venv venv
+        source venv/bin/activate
+        uv pip install --upgrade mlx-lm
+
+    - name: Build and install MLX ROCm wheel
+      run: |
+        source venv/bin/activate
+        export CMAKE_ARGS="-DMLX_BUILD_ROCM=ON -DMLX_ROCM_ARCHITECTURES=gfx1151 -DBLA_VENDOR=OpenBLAS -DCMAKE_BUILD_TYPE=RelWithDebInfo"
+        # Start from an empty wheelhouse so the glob below only matches the
+        # wheel built by this run.
+        rm -rf wheelhouse
+        mkdir -p wheelhouse
+        uv build --wheel --out-dir wheelhouse .
+        uv pip install --force-reinstall wheelhouse/mlx-*.whl
+
+    - name: Basic MLX GPU test
+      run: |
+        source venv/bin/activate
+        python3 -c "
+        import mlx.core as mx
+        print('MLX version:', mx.__version__)
+        print('Default device:', mx.default_device())
+        mx.set_default_device(mx.gpu)
+        print('GPU device set')
+
+        # Test basic operations
+        a = mx.ones((10, 10))
+        mx.eval(a)
+        print('Basic array creation: OK')
+
+        # Test matmul
+        b = mx.random.normal((256, 256))
+        c = mx.matmul(b, b)
+        mx.eval(c)
+        print('Matmul test: OK')
+
+        # Test softmax
+        d = mx.softmax(b, axis=-1)
+        mx.eval(d)
+        print('Softmax test: OK')
+
+        print('All basic tests passed!')
+        "
+
+    - name: Run inference tests
+      run: |
+        source venv/bin/activate
+        set -o pipefail
+        export HIP_LAUNCH_BLOCKING=1
+        export PYTHONFAULTHANDLER=1
+        trace_dir="${GITHUB_WORKSPACE}/rocm-stacktraces"
+        mkdir -p "${trace_dir}"
+
+        # run_and_trace NAME ARGS...
+        # Runs mlx_lm.generate ARGS under lldb and tees all output to
+        # ${trace_dir}/NAME.log. On a crash lldb prints a backtrace and quits
+        # with status 1. Batch-mode lldb exits 0 when the program merely
+        # returns a non-zero status (e.g. an uncaught Python exception), so
+        # the log is also checked for a clean exit.
+        run_and_trace() {
+          local name="$1"
+          shift
+          local log="${trace_dir}/${name}.log"
+          lldb -Q -b \
+            -o "run" \
+            -k "bt" \
+            -k "quit 1" \
+            -- python3 "$(which mlx_lm.generate)" "$@" 2>&1 | tee "${log}"
+          if ! grep -Eq 'Process [0-9]+ exited with status = 0 \(' "${log}"; then
+            echo "::error::${name}: mlx_lm.generate did not exit cleanly"
+            return 1
+          fi
+        }
+
+        run_and_trace qwen3_bf16 --model mlx-community/Qwen3-0.6B-bf16 --prompt "Hi" --max-tokens 5
+        run_and_trace qwen3_8bit --model mlx-community/Qwen3-0.6B-8bit --prompt "How tall is Mt Everest?" --max-tokens 128
+
+    - name: Upload ROCm wheel artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v6
+      with:
+        name: rocm-wheel-${{ github.run_attempt }}
+        path: wheelhouse/mlx-*.whl
+        if-no-files-found: warn
+        retention-days: 14
+
+    - name: Upload ROCm stacktrace artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v6
+      with:
+        name: rocm-stacktraces-${{ github.run_attempt }}
+        path: ${{ github.workspace }}/rocm-stacktraces/*
+        if-no-files-found: warn
+        retention-days: 14
diff --git a/.gitignore b/.gitignore
@@ -79,3 +79,10 @@ uv.lock
 .cache/
 # vim
 *.swp
+
+# keys
+*.pem
+
+build.sh
+github-runner/
+sync_fork.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -36,6 +36,7 @@ option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
 option(MLX_BUILD_CUDA "Build cuda backend" OFF)
+option(MLX_BUILD_ROCM "Build rocm backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
@@ -164,6 +165,43 @@ if(MLX_BUILD_CUDA)
   endif()
 endif()
 
+if(MLX_BUILD_ROCM)
+  # HIP architectures used by the ROCm backend CMakeLists.txt. An explicit
+  # CMAKE_HIP_ARCHITECTURES wins, then MLX_ROCM_ARCHITECTURES, then the default.
+  # ROCm 6.4.0 - 7.2.0 compatibility matrix: CDNA gfx908 (MI100), gfx90a
+  # (MI200), gfx942 (MI300), gfx950 (MI350); RDNA2 gfx1030 (RX 6000); RDNA3
+  # gfx1100 (RX 7900), gfx1101 (RX 7700/7800); RDNA4 gfx1200, gfx1201 (RX 9000).
+  # NOTE: the default list below is not that matrix (no gfx950 or gfx12xx).
+  if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
+    if(DEFINED MLX_ROCM_ARCHITECTURES)
+      set(CMAKE_HIP_ARCHITECTURES
+          "${MLX_ROCM_ARCHITECTURES}"
+          CACHE STRING "HIP architectures")
+    else()
+      set(CMAKE_HIP_ARCHITECTURES
+          "gfx908;gfx90a;gfx942;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102"
+          CACHE STRING "HIP architectures")
+    endif()
+  endif()
+  message(
+    STATUS "Setting CMAKE_HIP_ARCHITECTURES to: ${CMAKE_HIP_ARCHITECTURES}")
+  # HIP is deliberately not enabled as a CMake language: doing so makes CMake
+  # add -x hip to every CXX file in targets that link to HIP libraries. The ROCm
+  # backend compiles HIP sources with custom commands instead, so only locate
+  # the compiler here; ROCM_PATH / HIP_PATH are honoured when set.
+  find_program(
+    CMAKE_HIP_COMPILER
+    NAMES hipcc clang++
+    HINTS ENV ROCM_PATH ENV HIP_PATH
+    PATHS /opt/rocm/bin /opt/rocm-6.0.0/bin /opt/rocm/llvm/bin
+    PATH_SUFFIXES bin llvm/bin
+    DOC "HIP compiler")
+  if(NOT CMAKE_HIP_COMPILER)
+    message(FATAL_ERROR "Could not find HIP compiler (hipcc or clang++)")
+  endif()
+  message(STATUS "Found HIP compiler: ${CMAKE_HIP_COMPILER}")
+endif()
+
 if(MLX_BUILD_METAL)
   find_library(METAL_LIB Metal)
   find_library(FOUNDATION_LIB Foundation)
@@ -310,10 +348,12 @@ if(MLX_BUILD_CPU)
       message(FATAL_ERROR "Must have LAPACK installed")
     endif()
     find_path(LAPACK_INCLUDE_DIRS lapacke.h /usr/include /usr/local/include
-              /usr/local/opt/openblas/include)
+              /usr/local/opt/openblas/include /usr/include/openblas)
     message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
     message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
-    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
+    if(LAPACK_INCLUDE_DIRS)
+      target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
+    endif()
     target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
     # List blas after lapack otherwise we may accidentally incldue an old
     # version of lapack.h from the include dirs of blas.