Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6787163
starting fresh
ahurta92 Apr 15, 2026
098668e
Add multi-level transform benchmarks and validation tests
ahurta92 Apr 16, 2026
063425c
Update .clangd configuration to enhance HIP file parsing and include …
ahurta92 Apr 16, 2026
03b89a1
Add block-distributed 3D transform reference
ahurta92 Apr 22, 2026
faa0e79
Right-multiply pass 3 to eliminate the post-transform un-shuffle
ahurta92 Apr 22, 2026
2833f06
Add L7: block-distributed GPU transform (scalar, single-buffer)
ahurta92 Apr 22, 2026
6a7b1be
L7: add MFMA path and document actual GFX90A f64 16x16x4 layout
ahurta92 Apr 22, 2026
7f5027b
L7: pad per-wave LDS region to kill 16-way bank conflicts
ahurta92 Apr 22, 2026
04d98d6
L7: drop barriers that only synced within-wave same-region accesses
ahurta92 Apr 22, 2026
77113d4
L7: fuse pass-2 store with corner-turn write
ahurta92 Apr 22, 2026
c8cbe43
L7: swap pass-1 MFMA operands so its output feeds pass 2 in-register
ahurta92 Apr 22, 2026
6f75394
L7: cooperative coalesced distribute (2.1-2.7x perf jump)
ahurta92 Apr 22, 2026
0a9a647
L7: one block per tensor (grid = nfuncs), drop the cube loop
ahurta92 Apr 22, 2026
aef9bfb
L7: use double4 for distribute to emit 128-bit HBM loads
ahurta92 Apr 22, 2026
cb2d71a
L7: wide double4 loads for B cache
ahurta92 Apr 22, 2026
da31eee
BLOCKED_TRANSFORM: add measured perf table and lessons learned
ahurta92 Apr 22, 2026
3196d2c
BLOCKED_TRANSFORM: add K=32 design note
ahurta92 Apr 23, 2026
fcb8988
Add L8: rocWMMA variant of the blocked transform
ahurta92 Apr 23, 2026
31b8d8a
L8: fuse passes 1-2 by populating matrix_a fragment from accumulator
ahurta92 Apr 23, 2026
1ddc782
BLOCKED_TRANSFORM: K=20/K=24 design note, use v_mfma_f64_4x4x4f64
ahurta92 Apr 23, 2026
ce56820
Add v_mfma_f64_4x4x4f64 layout probes
ahurta92 Apr 23, 2026
dda9eaf
K=20: blocked MFMA kernel (hybrid 16x16x4 + 4x4x4 + pass 1/2 fusion)
ahurta92 Apr 23, 2026
614c518
BLOCKED_TRANSFORM: K=20 measured perf table + lessons
ahurta92 Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .clangd
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
CompileFlags:
# compile_commands.json already has all flags; nothing to add/remove
Remove:
- -O3 # clangd doesn't need optimization; speeds up indexing
- -O3 # clangd doesn't need optimization; speeds up indexing
- --offload-arch=* # offload flags confuse clangd's host-side indexer
Add:
- -x
- hip # ensure .hip/.h files are parsed as HIP (C++)
- -DMRA_HAVE_HIP=1
- -I/opt/rocm-7.2.2/include

Index:
Background: Build
Expand All @@ -10,3 +15,4 @@ Diagnostics:
Suppress:
- pp_including_mainfile_in_preamble
- unknown_builtin
- err_implicit_function_declaration # HIP device builtins not visible to host parser
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,4 @@ clangd*
compile_commands.json
rocroof/

prof/
508 changes: 508 additions & 0 deletions BLOCKED_TRANSFORM.md

Large diffs are not rendered by default.

113 changes: 14 additions & 99 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,114 +2,29 @@ cmake_minimum_required(VERSION 3.10)

project(transformbench LANGUAGES CXX)

set(CMAKE_CUDA_ARCHITECTURES 80)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
set(TTG_ENABLE_CUDA ON)
else(CMAKE_CUDA_COMPILER)
message(WARNING "CUDA compiler not found")
endif(CMAKE_CUDA_COMPILER)
set(HAVE_CUDA ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if we can compile .cu files")

check_language(HIP)
if(CMAKE_HIP_COMPILER)
enable_language(HIP)
set(TTG_ENABLE_HIP ON)
else(CMAKE_HIP_COMPILER)
message(WARNING "HIP compiler not found")
endif(CMAKE_HIP_COMPILER)
set(HAVE_HIP ${CMAKE_HIP_COMPILER} CACHE BOOL "True if we can compile .hip files")

option(USE_SUGGEST_LAYOUT "Use suggested layout instead of get_layout" ON)
option(DEBUG_TENSOR_TYPE "Compile-time print cute tensor types (breaks build)" OFF)
set(USE_CUBLASDX_VERSION "25.06" CACHE STRING "Version of cublasDx to use")


# fetch cublasDx
if (CMAKE_CUDA_COMPILER)
include(FetchContent)
FetchContent_Declare(
cublasdx
URL https://developer.download.nvidia.com/compute/cublasdx/redist/cublasdx/nvidia-mathdx-${USE_CUBLASDX_VERSION}.0.tar.gz
)
FetchContent_MakeAvailable(cublasdx)
FetchContent_GetProperties(cublasdx
SOURCE_DIR CUBLASDX_SOURCE_DIR
BINARY_DIR CUBLASDX_BINARY_DIR
)

# look for cublasDx
find_package(mathdx REQUIRED COMPONENTS cublasdx HINTS ${CUBLASDX_SOURCE_DIR}/nvidia/mathdx/25.06/)
if (TARGET mathdx::cublasdx)
message(STATUS "Found cublasDx at ${mathdx_CUBLASDX_DIR}")
else()
message(FATAL_ERROR "cublasDx not found")
endif()

endif(CMAKE_CUDA_COMPILER)

# Simple interface that holds cublasDx and CUDA settings
add_library(libmra INTERFACE)
if (CMAKE_CUDA_COMPILER)
# Link against cublasDx and CUDA
target_link_libraries(libmra INTERFACE mathdx::cublasdx)
# Set the CUDA architecture
target_compile_definitions(libmra INTERFACE MRA_CUDA_ARCH=${CMAKE_CUDA_ARCHITECTURES} MRA_HAVE_CUDA=1)
# Enable support for constexpr and extended lambdas
target_compile_options(libmra INTERFACE --expt-relaxed-constexpr --extended-lambda)

# Add the transformbench executable
add_executable(transformbench_cuda transformbench.cu)

# Link against the MRA interface
target_link_libraries(transformbench_cuda PUBLIC libmra)


if (USE_SUGGEST_LAYOUT)
# Enable using suggested layout instead of get_layout
target_compile_definitions(transformbench_cuda PUBLIC USE_SUGGEST_LAYOUT)
endif (USE_SUGGEST_LAYOUT)

if (DEBUG_TENSOR_TYPE)
# Enable compile-time printing of cute tensor types (breaks build)
target_compile_definitions(transformbench_cuda PUBLIC DEBUG_TENSOR_TYPE)
endif (DEBUG_TENSOR_TYPE)
else()
message(FATAL_ERROR "HIP compiler not found")
endif()

if (CMAKE_HIP_COMPILER)
# Ensure ROCm cmake configs (hip, hipblas, etc.) are findable
list(APPEND CMAKE_PREFIX_PATH /opt/rocm-6.4.3 /opt/rocm)
find_package(hipblas REQUIRED)

#target_link_libraries(libmra INTERFACE mathdx::cublasdx)
# Set the CUDA architecture
target_compile_definitions(libmra INTERFACE MRA_HAVE_HIP=1)
# Enable support for constexpr and extended lambdas
#target_compile_options(libmra INTERFACE --expt-relaxed-constexpr --extended-lambda)

# Add the transformbench executable
add_executable(transformbench_hip transformbench.hip)

# Link against the MRA interface and hipBLAS (for level 6 Kronecker GEMM)
target_link_libraries(transformbench_hip PUBLIC libmra roc::hipblas)
# Simple interface that holds HIP settings
add_library(libmra INTERFACE)
target_compile_definitions(libmra INTERFACE MRA_HAVE_HIP=1)

# Correctness test: validate any optimization level against the L1 reference
add_executable(validate_levels validate_levels.hip)
target_link_libraries(validate_levels PUBLIC libmra roc::hipblas)
# Add the transformbench executable
add_executable(transformbench_hip transformbench.hip)
target_link_libraries(transformbench_hip PUBLIC libmra)

if (USE_SUGGEST_LAYOUT)
# Enable using suggested layout instead of get_layout
target_compile_definitions(transformbench_hip PUBLIC USE_SUGGEST_LAYOUT)
endif (USE_SUGGEST_LAYOUT)
# Correctness test: GPU L1 vs CPU reference (mirrors transform3d.cc)
add_executable(validate validate.hip)
target_link_libraries(validate PUBLIC libmra)

if (DEBUG_TENSOR_TYPE)
# Enable compile-time printing of cute tensor types (breaks build)
target_compile_definitions(transformbench_hip PUBLIC DEBUG_TENSOR_TYPE)
endif (DEBUG_TENSOR_TYPE)
endif ()
# Multi-level correctness test (-l selects level, -K and -N override defaults)
add_executable(validate_levels validate_levels.hip)
target_link_libraries(validate_levels PUBLIC libmra)
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,6 @@ ln -sf build/compile_commands.json compile_commands.json
```bash
cmake .. -DMRA_HAVE_HIP=1 -DCMAKE_CXX_COMPILER=hipcc \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH=/opt/rocm-6.4.3 \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
```

Expand Down
8 changes: 8 additions & 0 deletions counters.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pmc: SQ_WAVES GRBM_GUI_ACTIVE
pmc: SQ_INSTS_VALU SQ_INSTS_MFMA SQ_INSTS_LDS
pmc: SQ_WAIT_INST_LDS SQ_LDS_BANK_CONFLICT
pmc: SQ_VALU_MFMA_BUSY_CYCLES
pmc: SQ_INSTS_VALU_MFMA_MOPS_F64 SQ_INSTS_VALU_FMA_F64
pmc: TCC_HIT_sum TCC_MISS_sum
pmc: TCC_EA_RDREQ_sum TCC_EA_WRREQ_sum
pmc: TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum
Empty file added frames/.gitkeep
Empty file.
Loading