diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir index 159f1b6612e8..0b500eff467a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_x86_64.mlir @@ -175,7 +175,7 @@ func.func @pack_gemm_fill_dynamic_inner_tiled_avx512(%arg0 : tensor, %a %5 = iree_encoding.unset_encoding %4 encoding_dims{%m, %n, %k} : tensor -> tensor{%d0, %d1} return %5 : tensor } -// CHECK-DAG: #[[$MAP_N:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> +// CHECK-DAG: #[[$MAP_INNER:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @pack_gemm_fill_dynamic_inner_tiled_avx512( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor @@ -183,15 +183,16 @@ func.func @pack_gemm_fill_dynamic_inner_tiled_avx512(%arg0 : tensor, %a // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG1]], %[[C1]] -// CHECK-DAG: %[[OUT_D1:.+]] = affine.apply #[[$MAP_N]]()[%[[D1]]] +// CHECK-DAG: %[[OUT_D0:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D0]]] +// CHECK-DAG: %[[OUT_D1:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D1]]] // CHECK-DAG: %[[PACK_LHS:.+]] = linalg.pack {{.*}}%[[ARG0]] // CHECK: %[[PACK_RHS:.+]] = linalg.pack // CHECK-SAME: %[[ARG1]] -// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[OUT_D1]]) : tensor +// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[OUT_D0]], %[[OUT_D1]]) : tensor // CHECK: %[[FILL:.+]] = linalg.fill // CHECK-SAME: outs(%[[EMPTY]] : // CHECK: %[[INNER:.+]] = iree_codegen.inner_tiled ins(%[[PACK_LHS]], %[[PACK_RHS]]) outs(%[[FILL]]) -// CHECK-SAME: kind = #iree_cpu.data_tiled_mma_layout, semantics = #iree_cpu.mma_semantics<> +// CHECK-SAME: kind = #iree_cpu.data_tiled_mma_layout, semantics = 
#iree_cpu.mma_semantics<> // CHECK: %[[UNPACK:.+]] = linalg.unpack %[[INNER]] // CHECK: return %[[UNPACK]] @@ -223,9 +224,10 @@ func.func @unset_encoding_matmul_RESULT_inner_tiled_avx512(%arg0: tensor<127x255 } // CHECK-LABEL: func @set_encoding_matmul_LHS_inner_tiled_avx512( // CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<127x255xf32> -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<127x255x1x1xf32> -// CHECK: %[[PACK:.+]] = linalg.pack %[[INPUT]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<127x255x1x1xf32> -// CHECK: return %[[PACK]] : tensor<127x255x1x1xf32> +// CHECK-DAG: %[[CST:.+]] = arith.constant 0.0 +// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x255x16x1xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[INPUT]] padding_value(%[[CST]] : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<8x255x16x1xf32> +// CHECK: return %[[PACK]] : tensor<8x255x16x1xf32> // CHECK-LABEL: func @set_encoding_matmul_RHS_inner_tiled_avx512( // CHECK-SAME: %[[INPUT_R:[a-zA-Z0-9]+]]: tensor<127x255xf32> // CHECK-DAG: %[[CST_R:.+]] = arith.constant 0.0 @@ -233,9 +235,9 @@ func.func @unset_encoding_matmul_RESULT_inner_tiled_avx512(%arg0: tensor<127x255 // CHECK: %[[PACK_R:.+]] = linalg.pack %[[INPUT_R]] padding_value(%[[CST_R]] : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %[[EMPTY_R]] : tensor<127x255xf32> -> tensor<16x127x16x1xf32> // CHECK: return %[[PACK_R]] : tensor<16x127x16x1xf32> // CHECK-LABEL: func @unset_encoding_matmul_RESULT_inner_tiled_avx512( -// CHECK-SAME: %[[PACKED:[a-zA-Z0-9]+]]: tensor<127x16x1x16xf32> +// CHECK-SAME: %[[PACKED:[a-zA-Z0-9]+]]: tensor<8x16x16x16xf32> // CHECK: %[[EMPTY_U:.+]] = tensor.empty() : tensor<127x255xf32> -// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %[[EMPTY_U]] : 
tensor<127x16x1x16xf32> -> tensor<127x255xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %[[EMPTY_U]] : tensor<8x16x16x16xf32> -> tensor<127x255xf32> // CHECK: return %[[UNPACK]] // ----- diff --git a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp index 8716af67f6e9..92522cc863a5 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp @@ -352,6 +352,28 @@ getRowMajorTilesMNKShape(MMAIntrinsic intrinsic) { } } +int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic) { + // Total architectural vector register file size, in bytes. The inner-tiled + // cost model uses this as the capacity for the union of the ACC, LHS and + // RHS tiles. For scalable ISAs we treat the vector length as its minimum + // (1 × 128 bits = 16 bytes per register); this is a deliberate + // simplification — the resulting `intrinsics_m`/`intrinsics_n` choices are + // good enough in practice and avoid propagating scalability into the cost + // model. Bits 8-15 of the enum value select the ISA case below. + uint32_t arch = static_cast(intrinsic) & 0xFF00; + switch (arch) { + case 0x1200: // AVX/AVX2: 16 YMM × 32 B. + return 16 * 32; + case 0x1300: // AVX-512: 32 ZMM × 64 B. + return 32 * 64; + case 0x2200: // Arm SVE/SVE2: 32 Z × (VL treated as 128 bits). + return 32 * 16; + default: + // Unlisted ISA: same as AVX/AVX2. Add a case per arch you care about. 
+ return 16 * 32; + } +} + /// Helper for `getIntrinsicSwizzle`: /// Ensures every `expandShape` row has at least one piece (unit /// dims that received no explicit `expand` get a non-scalable size-1 Internal diff --git a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h index 7d8e6498c08d..787b726608e3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h @@ -73,6 +73,15 @@ Codegen::TileSwizzle getIntrinsicSwizzle(MMAIntrinsic mma, bool transposed, // Returns the TileSwizzle for the given MMA attr and operand index. Codegen::TileSwizzle getSwizzle(DataTiledMMAAttr mma, int operandIdx); +// Returns the architectural vector register file capacity, in bytes, that the +// inner-tiled MMA cost model may use to fit the union of ACC, LHS and RHS +// tiles. For ISAs with scalable vectors (e.g. SVE/SVE2) the vector length is +// treated as its 128-bit minimum — a deliberate simplification that produces +// good-enough `intrinsics_m`/`intrinsics_n` choices without leaking +// scalability into the cost model. Unlisted ISAs use the AVX/AVX2 value. +// Values: AVX/AVX2 = 16 × 32 B, AVX-512 = 32 × 64 B, SVE/SVE2 = 32 × 16 B. 
+int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic); + } // namespace mlir::iree_compiler::IREE::CPU // clang-format on diff --git a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp index cb126fb4cd4a..e4b31845e8d3 100644 --- a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp +++ b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp @@ -52,6 +52,7 @@ #include "iree/compiler/Dialect/Encoding/Utils/Utils.h" #include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/bit.h" #include "llvm/Support/DebugLog.h" #include "llvm/Support/InterleavedRange.h" #include "mlir/IR/AffineMap.h" @@ -469,11 +470,78 @@ chooseIntrinsic(MLIRContext *ctx, ArrayRef elementTypes, return best; } +/// Power-of-two cap on one unroll dim. For a static matmul extent +/// `matmulSize`, the cover count ceil(matmulSize / intrinsicSize), clamped +/// to at least 1, is rounded down to a power of two. For dynamic dims we +/// fall back to `fallback`, which callers set to something at least as +/// large as the register budget — that budget itself terminates the +/// enumeration below, so no tighter static cap is needed. +static int64_t po2UnrollCap(int64_t matmulSize, int64_t intrinsicSize, + int64_t fallback) { + if (ShapedType::isDynamic(matmulSize)) { + return fallback; + } + uint64_t cover = + std::max(1, llvm::divideCeil(matmulSize, intrinsicSize)); + return llvm::bit_floor(cover); +} + +// Phase 2 of `chooseCpuInnerTiledMmaForEncoding`: for an already-chosen +// (intrinsic, transposed) pair, match each power-of-two intrinsicsM with +// the largest power-of-two intrinsicsN for which the three tiles +// (ACC + LHS + RHS) still fit in the target's vector register file, then +// keep the candidate with the highest arithmetic intensity +// (effM*effN)/(effM+effN), so approximately-square tiles win. 
+// +// Returns nullopt if `getIntrinsicInfo` fails or no feasible (im, in) +// exists. The returned pair is (intrinsicsM, intrinsicsN). +static std::optional> +chooseUnrolling(MLIRContext *ctx, ArrayRef elementTypes, + IREE::CPU::MMAIntrinsic intr, bool transposed, + const IREE::Encoding::BxMxNxKxKb &matmulSizes) { + std::optional info = + getIntrinsicInfo(ctx, elementTypes, intr, transposed); + if (!info) { + return std::nullopt; + } + int64_t regBitBudget = IREE::CPU::getRegisterSpaceBytes(intr) * 8; // bits + int64_t capMPo2 = + po2UnrollCap(matmulSizes.M, info->intrinsicM, regBitBudget); + int64_t capNPo2 = + po2UnrollCap(matmulSizes.N, info->intrinsicN, regBitBudget); + int64_t accTerm = info->intrinsicM * info->intrinsicN * info->accBits; + int64_t lhsTerm = info->intrinsicM * info->intrinsicK * info->lhsBits; + int64_t rhsTerm = info->intrinsicN * info->intrinsicK * info->rhsBits; + std::optional> best; + double bestIntensity = -1.0; + // Enumerate power-of-two intrinsicsM; for each, pick the largest feasible + // power-of-two intrinsicsN under the bit budget and the static N cap. + // The budget bounds im on its own (im*lhsTerm alone must be < budget), + // which terminates the loop without any numRegs-style cap. + for (int64_t im = 1; im <= capMPo2; im *= 2) { + int64_t remaining = regBitBudget - im * lhsTerm; + if (remaining <= 0) { + break; + } + int64_t inMaxBudget = remaining / (im * accTerm + rhsTerm); + uint64_t inCap = std::min(capNPo2, inMaxBudget); + if (inCap < 1) { + continue; + } + int64_t in = llvm::bit_floor(inCap); + double effM = static_cast(im) * info->intrinsicM; + double effN = static_cast(in) * info->intrinsicN; + double intensity = effM * effN / (effM + effN); + if (intensity > bestIntensity) { // Strict: ties keep the smaller im. + bestIntensity = intensity; + best = {im, in}; + } + } + return best; +} + // Picks a CPU `DataTiledMMAAttr` for `iree_codegen.inner_tiled` given an -// encoding and target config. 
Picks the best (intrinsic, transposed) -// orientation via `chooseIntrinsic`, but leaves unroll factors at 1: an -// actual cost model for intrinsics_m/intrinsics_n is added in a follow-up -// commit. +// encoding and target config. static IREE::CPU::DataTiledMMAAttr chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx, IREE::Encoding::EncodingAttr encoding, @@ -493,8 +561,13 @@ chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx, return {}; } auto [intr, transposed] = *intrChoice; - return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, /*intrinsics_m=*/1, - /*intrinsics_n=*/1, + std::optional> unroll = + chooseUnrolling(ctx, elementTypes, intr, transposed, *matmulSizes); + if (!unroll) { + return {}; + } + auto [intrinsicsM, intrinsicsN] = *unroll; + return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, intrinsicsM, intrinsicsN, /*intrinsics_k=*/1, transposed); }