Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -175,23 +175,24 @@ func.func @pack_gemm_fill_dynamic_inner_tiled_avx512(%arg0 : tensor<?x?xf32>, %a
%5 = iree_encoding.unset_encoding %4 encoding_dims{%m, %n, %k} : tensor<?x?xf32, #encoding_result_it> -> tensor<?x?xf32>{%d0, %d1}
return %5 : tensor<?x?xf32>
}
// CHECK-DAG: #[[$MAP_N:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-DAG: #[[$MAP_INNER:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-LABEL: func @pack_gemm_fill_dynamic_inner_tiled_avx512(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK-DAG: %[[OUT_D1:.+]] = affine.apply #[[$MAP_N]]()[%[[D1]]]
// CHECK-DAG: %[[OUT_D0:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D0]]]
// CHECK-DAG: %[[OUT_D1:.+]] = affine.apply #[[$MAP_INNER]]()[%[[D1]]]
// CHECK-DAG: %[[PACK_LHS:.+]] = linalg.pack {{.*}}%[[ARG0]]
// CHECK: %[[PACK_RHS:.+]] = linalg.pack
// CHECK-SAME: %[[ARG1]]
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[OUT_D1]]) : tensor<?x?x1x16xf32>
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[OUT_D0]], %[[OUT_D1]]) : tensor<?x?x16x16xf32>
// CHECK: %[[FILL:.+]] = linalg.fill
// CHECK-SAME: outs(%[[EMPTY]] :
// CHECK: %[[INNER:.+]] = iree_codegen.inner_tiled ins(%[[PACK_LHS]], %[[PACK_RHS]]) outs(%[[FILL]])
// CHECK-SAME: kind = #iree_cpu.data_tiled_mma_layout<intrinsic = MMA_X86_AVX512_1x16x1_F32_F32>, semantics = #iree_cpu.mma_semantics<>
// CHECK-SAME: kind = #iree_cpu.data_tiled_mma_layout<intrinsic = MMA_X86_AVX512_1x16x1_F32_F32, intrinsics_m = 16>, semantics = #iree_cpu.mma_semantics<>
// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[INNER]]
// CHECK: return %[[UNPACK]]

Expand Down Expand Up @@ -223,19 +224,20 @@ func.func @unset_encoding_matmul_RESULT_inner_tiled_avx512(%arg0: tensor<127x255
}
// CHECK-LABEL: func @set_encoding_matmul_LHS_inner_tiled_avx512(
// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<127x255xf32>
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<127x255x1x1xf32>
// CHECK: %[[PACK:.+]] = linalg.pack %[[INPUT]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<127x255x1x1xf32>
// CHECK: return %[[PACK]] : tensor<127x255x1x1xf32>
// CHECK-DAG: %[[CST:.+]] = arith.constant 0.0
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x255x16x1xf32>
// CHECK: %[[PACK:.+]] = linalg.pack %[[INPUT]] padding_value(%[[CST]] : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %[[EMPTY]] : tensor<127x255xf32> -> tensor<8x255x16x1xf32>
// CHECK: return %[[PACK]] : tensor<8x255x16x1xf32>
// CHECK-LABEL: func @set_encoding_matmul_RHS_inner_tiled_avx512(
// CHECK-SAME: %[[INPUT_R:[a-zA-Z0-9]+]]: tensor<127x255xf32>
// CHECK-DAG: %[[CST_R:.+]] = arith.constant 0.0
// CHECK: %[[EMPTY_R:.+]] = tensor.empty() : tensor<16x127x16x1xf32>
// CHECK: %[[PACK_R:.+]] = linalg.pack %[[INPUT_R]] padding_value(%[[CST_R]] : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %[[EMPTY_R]] : tensor<127x255xf32> -> tensor<16x127x16x1xf32>
// CHECK: return %[[PACK_R]] : tensor<16x127x16x1xf32>
// CHECK-LABEL: func @unset_encoding_matmul_RESULT_inner_tiled_avx512(
// CHECK-SAME: %[[PACKED:[a-zA-Z0-9]+]]: tensor<127x16x1x16xf32>
// CHECK-SAME: %[[PACKED:[a-zA-Z0-9]+]]: tensor<8x16x16x16xf32>
// CHECK: %[[EMPTY_U:.+]] = tensor.empty() : tensor<127x255xf32>
// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %[[EMPTY_U]] : tensor<127x16x1x16xf32> -> tensor<127x255xf32>
// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PACKED]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %[[EMPTY_U]] : tensor<8x16x16x16xf32> -> tensor<127x255xf32>
// CHECK: return %[[UNPACK]]

// -----
Expand Down
22 changes: 22 additions & 0 deletions compiler/src/iree/compiler/Codegen/Dialect/CPU/IR/IREECPUAttrs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,28 @@ getRowMajorTilesMNKShape(MMAIntrinsic intrinsic) {
}
}

int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic) {
  // Size of the whole architectural vector register file, in bytes. The
  // inner-tiled cost model treats this number as the capacity that the ACC,
  // LHS and RHS tiles must share.
  //
  // Scalable ISAs are modelled at their minimum vector length (128 bits, i.e.
  // 16 bytes per register). That is an intentional simplification: the
  // `intrinsics_m`/`intrinsics_n` values derived from it are good enough in
  // practice, and the cost model does not have to reason about scalability.
  constexpr int64_t kAvx2Bytes = 16 * 32;   // AVX/AVX2: 16 YMM × 32 B.
  constexpr int64_t kAvx512Bytes = 32 * 64; // AVX-512: 32 ZMM × 64 B.
  constexpr int64_t kSveBytes = 32 * 16;    // Arm SVE/SVE2: 32 Z × 16 B.
  // Used for any architecture not listed here. A plausible value, but each
  // architecture that matters should get its own entry.
  constexpr int64_t kFallbackBytes = 16 * 32;

  // The architecture is identified by bits 8..15 of the intrinsic's value.
  const uint32_t archBits = static_cast<uint32_t>(intrinsic) & 0xFF00;
  if (archBits == 0x1200) {
    return kAvx2Bytes;
  }
  if (archBits == 0x1300) {
    return kAvx512Bytes;
  }
  if (archBits == 0x2200) {
    return kSveBytes;
  }
  return kFallbackBytes;
}

/// Helper for `getIntrinsicSwizzle`:
/// Ensures every `expandShape` row has at least one piece (unit
/// dims that received no explicit `expand` get a non-scalable size-1 Internal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ Codegen::TileSwizzle getIntrinsicSwizzle(MMAIntrinsic mma, bool transposed,
// Returns the TileSwizzle for the given MMA attr and operand index.
Codegen::TileSwizzle getSwizzle(DataTiledMMAAttr mma, int operandIdx);

// Returns the architectural vector register file capacity, in bytes, that the
// inner-tiled MMA cost model may use to fit the union of ACC, LHS and RHS
// tiles. The architecture is derived from `intrinsic`.
//
// For ISAs with scalable vectors (e.g. SVE/SVE2) the vector length is treated
// as its 128-bit minimum — a deliberate simplification that produces
// good-enough `intrinsics_m`/`intrinsics_n` choices without leaking
// scalability into the cost model.
//
// Values: AVX/AVX2 = 16 × 32 B, AVX-512 = 32 × 64 B, SVE/SVE2 = 32 × 16 B.
// Any other architecture falls back to 16 × 32 B.
int64_t getRegisterSpaceBytes(MMAIntrinsic intrinsic);

} // namespace mlir::iree_compiler::IREE::CPU

// clang-format on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include "iree/compiler/Dialect/Encoding/Utils/Utils.h"
#include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/InterleavedRange.h"
#include "mlir/IR/AffineMap.h"
Expand Down Expand Up @@ -469,11 +470,78 @@ chooseIntrinsic(MLIRContext *ctx, ArrayRef<Type> elementTypes,
return best;
}

/// Upper bound on the unroll factor along a single matmul dimension.
///
/// When `matmulSize` is static, the result is the number of
/// `intrinsicSize`-wide intrinsics needed to cover it (rounded up, and at
/// least 1), floored to a power of two. When `matmulSize` is dynamic,
/// `fallback` is returned unchanged; callers pass a value no smaller than the
/// register budget, and since that budget already stops the enumeration in
/// the caller, a tighter bound is not required here.
static int64_t po2UnrollCap(int64_t matmulSize, int64_t intrinsicSize,
                            int64_t fallback) {
  const bool hasStaticExtent = !ShapedType::isDynamic(matmulSize);
  if (hasStaticExtent) {
    // Number of intrinsics needed to span the extent, never less than one.
    const int64_t tilesToCover = llvm::divideCeil(matmulSize, intrinsicSize);
    const uint64_t clampedTiles = std::max<int64_t>(1, tilesToCover);
    return llvm::bit_floor(clampedTiles);
  }
  return fallback;
}

// Second phase of `chooseCpuInnerTiledMmaForEncoding`. Given an (intrinsic,
// transposed) pair that has already been selected, choose power-of-two unroll
// factors (intrinsicsM, intrinsicsN) for it.
//
// A candidate is feasible when the ACC, LHS and RHS tiles together fit in the
// target's vector register file:
//   im * in * accBits + im * lhsBits + in * rhsBits <= budget
// where each `*Bits` term is the size of one intrinsic-sized tile. Every
// power-of-two `im` is paired with the largest feasible power-of-two `in`,
// and the pair with the highest arithmetic intensity
// (effM * effN) / (effM + effN) is kept, which favours roughly square tiles.
// On equal intensity the candidate found first (smaller `im`) is retained.
//
// Returns (intrinsicsM, intrinsicsN), or nullopt when the intrinsic info is
// unavailable or no feasible pair exists.
static std::optional<std::pair<int64_t, int64_t>>
chooseUnrolling(MLIRContext *ctx, ArrayRef<Type> elementTypes,
                IREE::CPU::MMAIntrinsic intr, bool transposed,
                const IREE::Encoding::BxMxNxKxKb &matmulSizes) {
  std::optional<IntrinsicInfo> info =
      getIntrinsicInfo(ctx, elementTypes, intr, transposed);
  if (!info) {
    return std::nullopt;
  }

  // Register file capacity expressed in bits; it doubles as the cap for
  // dynamic dimensions.
  const int64_t budgetBits = IREE::CPU::getRegisterSpaceBytes(intr) * 8;
  const int64_t maxUnrollM =
      po2UnrollCap(matmulSizes.M, info->intrinsicM, budgetBits);
  const int64_t maxUnrollN =
      po2UnrollCap(matmulSizes.N, info->intrinsicN, budgetBits);

  // Bit footprint of a single (non-unrolled) tile of each operand.
  const int64_t accTileBits =
      info->intrinsicM * info->intrinsicN * info->accBits;
  const int64_t lhsTileBits =
      info->intrinsicM * info->intrinsicK * info->lhsBits;
  const int64_t rhsTileBits =
      info->intrinsicN * info->intrinsicK * info->rhsBits;

  std::optional<std::pair<int64_t, int64_t>> winner;
  double winnerIntensity = -1.0;
  // Walk the power-of-two values of intrinsicsM. No separate register-count
  // cap is needed: once the LHS tiles alone use up the budget, the loop ends.
  for (int64_t unrollM = 1; unrollM <= maxUnrollM; unrollM *= 2) {
    const int64_t bitsLeft = budgetBits - unrollM * lhsTileBits;
    if (bitsLeft <= 0) {
      break;
    }
    // Largest intrinsicsN allowed by the remaining budget, then by the N cap.
    const int64_t budgetUnrollN =
        bitsLeft / (unrollM * accTileBits + rhsTileBits);
    const uint64_t limitN = std::min<int64_t>(maxUnrollN, budgetUnrollN);
    if (limitN >= 1) {
      const int64_t unrollN = llvm::bit_floor(limitN);
      double effM = static_cast<double>(unrollM) * info->intrinsicM;
      double effN = static_cast<double>(unrollN) * info->intrinsicN;
      double intensity = effM * effN / (effM + effN);
      if (intensity > winnerIntensity) {
        winnerIntensity = intensity;
        winner.emplace(unrollM, unrollN);
      }
    }
  }
  return winner;
}

// Picks a CPU `DataTiledMMAAttr` for `iree_codegen.inner_tiled` given an
// encoding and target config. Picks the best (intrinsic, transposed)
// orientation via `chooseIntrinsic`, but leaves unroll factors at 1: an
// actual cost model for intrinsics_m/intrinsics_n is added in a follow-up
// commit.
// encoding and target config.
static IREE::CPU::DataTiledMMAAttr
chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx,
IREE::Encoding::EncodingAttr encoding,
Expand All @@ -493,8 +561,13 @@ chooseCpuInnerTiledMmaForEncoding(MLIRContext *ctx,
return {};
}
auto [intr, transposed] = *intrChoice;
return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, /*intrinsics_m=*/1,
/*intrinsics_n=*/1,
std::optional<std::pair<int64_t, int64_t>> unroll =
chooseUnrolling(ctx, elementTypes, intr, transposed, *matmulSizes);
if (!unroll) {
return {};
}
auto [intrinsicsM, intrinsicsN] = *unroll;
return IREE::CPU::DataTiledMMAAttr::get(ctx, intr, intrinsicsM, intrinsicsN,
/*intrinsics_k=*/1, transposed);
}

Expand Down
Loading