bjacob · bjacob · Apr 22, 2026
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir
@@ -175,23 +175,24 @@ func.func @pack_gemm_fill_dynamic_inner_tiled_avx512(%arg0 : tensor<?x?xf32>, %a
   %5 = iree_encoding.unset_encoding %4 encoding_dims{%m, %n, %k} : tensor<?x?xf32, #encoding_result_it> -> tensor<?x?xf32>{%d0, %d1}
   return %5 : tensor<?x?xf32>
 }
-//   CHECK-DAG: #[[$MAP_N:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
+//   CHECK-DAG: #[[$MAP_INNER:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
 // CHECK-LABEL: func @pack_gemm_fill_dynamic_inner_tiled_avx512(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
 //   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[ARG1]], %[[C1]]
-//   CHECK-DAG:   %[[OUT_D1:.+]] = affine.apply #[[$MAP_N]]()[%[[D1]]]
+//   CHECK-DAG:   %[[OUT_D0:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D0]]]
+//   CHECK-DAG:   %[[OUT_D1:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D1]]]
 //   CHECK-DAG:   %[[PACK_LHS:.+]] = linalg.pack {{.*}}%[[ARG0]]
 //       CHECK:   %[[PACK_RHS:.+]] = linalg.pack
 //  CHECK-SAME:     %[[ARG1]]
-//   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[OUT_D1]]) : tensor<?x?x1x16xf32>
+//   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty(%[[OUT_D0]], %[[OUT_D1]]) : tensor<?x?x16x16xf32>
 //       CHECK:   %[[FILL:.+]] = linalg.fill
 //  CHECK-SAME:       outs(%[[EMPTY]] :
 //       CHECK:   %[[INNER:.+]] = iree_codegen.inner_tiled ins(%[[PACK_LHS]], %[[PACK_RHS]]) outs(%[[FILL]])
-//  CHECK-SAME:       kind = #iree_cpu.data_tiled_mma_layout<intrinsic = MMA_X86_AVX512_1x16x1_F32_F32>, semantics = #iree_cpu.mma_semantics<>
+//  CHECK-SAME:       kind = #iree_cpu.data_tiled_mma_layout<intrinsic = MMA_X86_AVX512_1x16x1_F32_F32, intrinsics_m = 16>, semantics = #iree_cpu.mma_semantics<>
 //       CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[INNER]]
 //       CHECK:   return %[[UNPACK]]
 
@@ -223,19 +224,20 @@ func.func @unset_encoding_matmul_RESULT_inner_tiled_avx512(%arg0: tensor<127x255
 }
 // CHECK-LABEL: func @set_encoding_matmul_LHS_inner_tiled_avx512(
 //  CHECK-SAME:   %[[INPUT:[a-zA-Z0-9]+]]: tensor<127x255xf32>
-//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<127x255x1x1xf32>
-//       CHECK:   %[[PACK:.+]] = linalg.pack %[[INPUT]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<127x255x1x1xf32>
-//       CHECK:   return %[[PACK]] : tensor<127x255x1x1xf32>
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.0
+//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<8x255x16x1xf32>
+//       CHECK:   %[[PACK:.+]] = linalg.pack %[[INPUT]] padding_value(%[[CST]] : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<8x255x16x1xf32>
+//       CHECK:   return %[[PACK]] : tensor<8x255x16x1xf32>
 // CHECK-LABEL: func @set_encoding_matmul_RHS_inner_tiled_avx512(
 //  CHECK-SAME:   %[[INPUT_R:[a-zA-Z0-9]+]]: tensor<127x255xf32>
 //   CHECK-DAG:   %[[CST_R:.+]] = arith.constant 0.0
 //       CHECK:   %[[EMPTY_R:.+]] = tensor.empty() : tensor<16x127x16x1xf32>
 //       CHECK:   %[[PACK_R:.+]] = linalg.pack %[[INPUT_R]] padding_value(%[[CST_R]] : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %[[EMPTY_R]] : tensor<127x255xf32> -> tensor<16x127x16x1xf32>
 //       CHECK:   return %[[PACK_R]] : tensor<16x127x16x1xf32>
 // CHECK-LABEL: func @unset_encoding_matmul_RESULT_inner_tiled_avx512(
-//  CHECK-SAME:   %[[PACKED:[a-zA-Z0-9]+]]: tensor<127x16x1x16xf32>
+//  CHECK-SAME:   %[[PACKED:[a-zA-Z0-9]+]]: tensor<8x16x16x16xf32>
 //       CHECK:   %[[EMPTY_U:.+]] = tensor.empty() : tensor<127x255xf32>
-//       CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %[[EMPTY_U]] : tensor<127x16x1x16xf32> -> tensor<127x255xf32>
+//       CHECK:   %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %[[EMPTY_U]] : tensor<8x16x16x16xf32> -> tensor<127x255xf32>
 //       CHECK:   return %[[UNPACK]]
 
 // -----

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp
@@ -352,6 +352,28 @@ getRowMajorTilesMNKShape(MMAIntrinsic intrinsic) {
   }
 }
 
+int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic) {
+  // Total architectural vector register file size, in bytes. The inner-tiled
+  // cost model uses this as the capacity for the union of the ACC, LHS and
+  // RHS tiles. The architecture is selected by masking the intrinsic's enum
+  // value with 0xFF00. For scalable ISAs the vector length is treated as its
+  // minimum (1 × 128 bits = 16 bytes per register), a deliberate
+  // simplification: the resulting `intrinsics_m`/`intrinsics_n` choices are
+  // good enough in practice and scalability stays out of the cost model.
+  uint32_t arch = static_cast<uint32_t>(intrinsic) & 0xFF00;
+  switch (arch) {
+  case 0x1200: // AVX/AVX2: 16 YMM × 32 B.
+    return 16 * 32;
+  case 0x1300: // AVX-512: 32 ZMM × 64 B.
+    return 32 * 64;
+  case 0x2200: // Arm SVE/SVE2: 32 Z × 16 B (VL treated as 128 bits).
+    return 32 * 16;
+  default:
+    // Unlisted arch: AVX2-sized fallback; add a case for archs you care about.
+    return 16 * 32;
+  }
+}
+
 /// Helper for `getIntrinsicSwizzle`:
 /// Ensures every `expandShape` row has at least one piece (unit
 /// dims that received no explicit `expand` get a non-scalable size-1 Internal

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h
@@ -73,6 +73,15 @@ Codegen::TileSwizzle getIntrinsicSwizzle(MMAIntrinsic mma, bool transposed,
 // Returns the TileSwizzle for the given MMA attr and operand index.
 Codegen::TileSwizzle getSwizzle(DataTiledMMAAttr mma, int operandIdx);
 
+// Returns the architectural vector register file capacity, in bytes, that the
+// inner-tiled MMA cost model may use to fit the union of ACC, LHS and RHS
+// tiles. For scalable-vector ISAs (e.g. SVE/SVE2) the vector length is treated
+// as its 128-bit minimum — a deliberate simplification that gives good-enough
+// `intrinsics_m`/`intrinsics_n` choices without leaking scalability into the
+// cost model. Values: AVX/AVX2 = 16 × 32 B, AVX-512 = 32 × 64 B,
+// SVE/SVE2 = 32 × 16 B; any other architecture falls back to 16 × 32 B.
+int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic);
+
 } // namespace mlir::iree_compiler::IREE::CPU
 
 // clang-format on

diff --git a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp
@@ -52,6 +52,7 @@
 #include "iree/compiler/Dialect/Encoding/Utils/Utils.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/bit.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
 #include "mlir/IR/AffineMap.h"
@@ -469,11 +470,78 @@ chooseIntrinsic(MLIRContext *ctx, ArrayRef<Type> elementTypes,
   return best;
 }
 
+/// Power-of-two cap on the unroll factor along one dimension. For a static
+/// extent `matmulSize`, returns ceil(matmulSize / intrinsicSize), clamped to
+/// at least 1 and then rounded down to a power of two. For a dynamic extent,
+/// returns `fallback` unchanged; callers pass a value at least as large as
+/// the register budget, and that budget itself terminates the enumeration
+/// in `chooseUnrolling`, so no tighter static cap is needed.
+static int64_t po2UnrollCap(int64_t matmulSize, int64_t intrinsicSize,
+                            int64_t fallback) {
+  if (ShapedType::isDynamic(matmulSize)) {
+    return fallback;
+  }
+  uint64_t cover =
+      std::max<int64_t>(1, llvm::divideCeil(matmulSize, intrinsicSize));
+  return llvm::bit_floor(cover);
+}
+
+// Phase 2 of `chooseCpuInnerTiledMmaForEncoding`: for an already-chosen
+// (intrinsic, transposed) pair, enumerate power-of-two intrinsicsM and pair
+// each with the largest power-of-two intrinsicsN, within the static M/N caps,
+// such that the three tiles (ACC + LHS + RHS) still fit in the target's
+// vector register file. Among these candidates, return the one with the
+// highest arithmetic intensity (effM*effN)/(effM+effN), so
+// approximately-square tiles win; equal intensity keeps the smaller im.
+// Returns nullopt if the intrinsic info is unavailable or no (im, in) fits.
+// The returned pair is (intrinsicsM, intrinsicsN).
+static std::optional<std::pair<int64_t, int64_t>>
+chooseUnrolling(MLIRContext *ctx, ArrayRef<Type> elementTypes,
+                IREE::CPU::MMAIntrinsic intr, bool transposed,
+                const IREE::Encoding::BxMxNxKxKb &matmulSizes) {
+  std::optional<IntrinsicInfo> info =
+      getIntrinsicInfo(ctx, elementTypes, intr, transposed);
+  if (!info) {
+    return std::nullopt;
+  }
+  int64_t regBitBudget = IREE::CPU::getRegisterSpaceBytes(intr) * 8;
+  int64_t capMPo2 =
+      po2UnrollCap(matmulSizes.M, info->intrinsicM, regBitBudget);
+  int64_t capNPo2 =
+      po2UnrollCap(matmulSizes.N, info->intrinsicN, regBitBudget);
+  int64_t accTerm = info->intrinsicM * info->intrinsicN * info->accBits;
+  int64_t lhsTerm = info->intrinsicM * info->intrinsicK * info->lhsBits;
+  int64_t rhsTerm = info->intrinsicN * info->intrinsicK * info->rhsBits;
+  std::optional<std::pair<int64_t, int64_t>> best;
+  double bestIntensity = -1.0;
+  // Enumerate power-of-two intrinsicsM; for each, take the largest
+  // power-of-two intrinsicsN within the N cap that satisfies
+  //   im*lhsTerm + in*(im*accTerm + rhsTerm) <= regBitBudget.
+  // im*lhsTerm alone must stay below the budget, which ends the loop.
+  for (int64_t im = 1; im <= capMPo2; im *= 2) {
+    int64_t remaining = regBitBudget - im * lhsTerm;
+    if (remaining <= 0) {
+      break;
+    }
+    int64_t inMaxBudget = remaining / (im * accTerm + rhsTerm);
+    uint64_t inCap = std::min<int64_t>(capNPo2, inMaxBudget);
+    if (inCap < 1) {
+      continue;
+    }
+    int64_t in = llvm::bit_floor(inCap);
+    double effM = static_cast<double>(im) * info->intrinsicM;
+    double effN = static_cast<double>(in) * info->intrinsicN;
+    double intensity = effM * effN / (effM + effN);
+    if (intensity > bestIntensity) {
+      bestIntensity = intensity;
+      best = {im, in};
+    }
+  }
+  return best;
+}
+
 // Picks a CPU `DataTiledMMAAttr` for `iree_codegen.inner_tiled` given an
-// encoding and target config. Picks the best (intrinsic, transposed)
-// orientation via `chooseIntrinsic`, but leaves unroll factors at 1: an
-// actual cost model for intrinsics_m/intrinsics_n is added in a follow-up
-// commit.
+// encoding and target config: picks the intrinsic, then its M/N unrolling.
 static IREE::CPU::DataTiledMMAAttr
 chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx,
                                   IREE::Encoding::EncodingAttr encoding,
@@ -493,8 +561,13 @@ chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx,
     return {};
   }
   auto [intr, transposed] = *intrChoice;
-  return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, /*intrinsics_m=*/1,
-                                          /*intrinsics_n=*/1,
+  std::optional<std::pair<int64_t, int64_t>> unroll =
+      chooseUnrolling(ctx, elementTypes, intr, transposed, *matmulSizes);
+  if (!unroll) {
+    return {};
+  }
+  auto [intrinsicsM, intrinsicsN] = *unroll;
+  return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, intrinsicsM, intrinsicsN,
                                           /*intrinsics_k=*/1, transposed);
 }